Index: head/sys/amd64/amd64/genassym.c =================================================================== --- head/sys/amd64/amd64/genassym.c (revision 82308) +++ head/sys/amd64/amd64/genassym.c (revision 82309) @@ -1,206 +1,208 @@ /*- * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD$ */ +#include "opt_upages.h" + #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #ifdef KTR_PERCPU #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_ADDR, offsetof(struct proc, p_addr)); ASSYM(P_INTR_NESTING_LEVEL, offsetof(struct proc, p_intr_nesting_level)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); ASSYM(P_STAT, offsetof(struct proc, p_stat)); ASSYM(P_WCHAN, offsetof(struct proc, p_wchan)); ASSYM(PS_ASTPENDING, PS_ASTPENDING); ASSYM(PS_NEEDRESCHED, PS_NEEDRESCHED); ASSYM(SSLEEP, SSLEEP); ASSYM(SRUN, SRUN); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); ASSYM(UPAGES, UPAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(PDESIZE, PDESIZE); ASSYM(PTESIZE, PTESIZE); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(MCLBYTES, MCLBYTES); 
ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); ASSYM(PCB_USERLDT, offsetof(struct pcb, pcb_ldt)); ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu)); ASSYM(PCB_SAVE87_SIZE, sizeof(struct save87)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); #ifdef SMP ASSYM(PCB_SIZE, sizeof(struct pcb)); #endif ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, 
ENAMETOOLONG); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(GD_SIZEOF, sizeof(struct globaldata)); ASSYM(GD_PRVSPACE, offsetof(struct globaldata, gd_prvspace)); ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); ASSYM(GD_NPXPROC, offsetof(struct globaldata, gd_npxproc)); ASSYM(GD_IDLEPROC, offsetof(struct globaldata, gd_idleproc)); ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); ASSYM(GD_COMMON_TSS, offsetof(struct globaldata, gd_common_tss)); ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); ASSYM(GD_SWITCHTICKS, offsetof(struct globaldata, gd_switchticks)); ASSYM(GD_COMMON_TSSD, offsetof(struct globaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct globaldata, gd_tss_gdt)); ASSYM(GD_CURRENTLDT, offsetof(struct globaldata, gd_currentldt)); /* XXX */ #ifdef KTR_PERCPU ASSYM(GD_KTR_IDX, offsetof(struct globaldata, gd_ktr_idx)); ASSYM(GD_KTR_BUF, offsetof(struct globaldata, gd_ktr_buf)); ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data)); #endif ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid)); #ifdef SMP ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, 
GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); ASSYM(MTX_RECURSECNT, offsetof(struct mtx, mtx_recurse)); ASSYM(MTX_SAVECRIT, offsetof(struct mtx, mtx_savecrit)); Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c (revision 82308) +++ head/sys/amd64/amd64/machdep.c (revision 82309) @@ -1,2530 +1,2534 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD$ */ #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" +#include "opt_upages.h" /* #include "opt_userconfig.h" */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #include #ifdef PERFMON #include +#endif +#ifdef SMP +#include #endif #include #include #include #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup __P((void *)); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm __P((struct save87 *, struct 
savexmm *)); static void fill_fpregs_xmm __P((struct savexmm *, struct save87 *)); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int physmem = 0; int cold = 1; #ifdef COMPAT_43 static void osendsig __P((sig_t catcher, int sig, sigset_t *mask, u_long code)); #endif static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "IU", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "IU", ""); static int sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); int Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; #ifndef SMP static struct globaldata __globaldata; #endif struct mtx sched_lock; struct mtx Giant; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. 
*/ earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { unsigned int size1; size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %u bytes (%u pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); #if 0 /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); /* * Discount the physical memory larger than the size of kernel_map * to avoid eating up all of KVA space. 
*/ if (kernel_map->first_free == NULL) { printf("Warning: no free entries in kernel_map.\n"); physmem_est = physmem; } else { physmem_est = min(physmem, btoc(kernel_map->max_offset - kernel_map->min_offset)); } /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / PAGE_SIZE; nbuf = 50; if (physmem_est > 1024) nbuf += min((physmem_est - 1024) / factor, 16384 / factor); if (physmem_est > 16384) nbuf += (physmem_est - 16384) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; } /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. */ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } nswbuf = max(min(nbuf/4, 256), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); buffer_map->system_map = 1; pager_map = kmem_suballoc(clean_map, 
&pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * XXX: Mbuf system machine-specific initializations should * go here, if anywhere. */ /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i], 0); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } mtx_init(&callout_lock, "callout", MTX_SPIN | MTX_RECURSE); #endif #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); globaldata_register(GLOBALDATA); #ifndef SMP /* For SMP, we delay the cpu_setregs() until after SMP startup. */ cpu_setregs(); #endif } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct osigframe sf; struct osigframe *fp; struct proc *p; struct sigacts *psp; struct trapframe *regs; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; regs = p->p_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate and validate space for the signal handler context. 
*/ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *fp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)fp) == 0 || !useracc((caddr_t)fp, sizeof(*fp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); PROC_UNLOCK(p); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* Save most if not all of trap frame. 
*/ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. 
*/ PROC_LOCK(p); sigexit(p, SIGILL); /* NOTREACHED */ } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } #endif void sendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct sigframe sf; struct proc *p; struct sigacts *psp; struct trapframe *regs; struct sigframe *sfp; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { PROC_UNLOCK(p); osendsig(catcher, sig, mask, code); return; } #endif regs = p->p_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = p->p_sigstk; sf.sf_uc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct sigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *sfp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)sfp) == 0 || !useracc((caddr_t)sfp, sizeof(*sfp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. 
*/ #ifdef DEBUG printf("process %d has trashed its stack\n", p->p_pid); #endif PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); PROC_UNLOCK(p); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill siginfo structure. */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = code; sf.sf_si.si_addr = (void *)regs->tf_err; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. * * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. 
PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ PROC_LOCK(p); sigexit(p, SIGILL); /* NOTREACHED */ } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ #ifdef COMPAT_43 int osigreturn(p, uap) struct proc *p; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct trapframe *regs; struct osigcontext *scp; int eflags; regs = p->p_frame; scp = uap->sigcntxp; if (!useracc((caddr_t)scp, sizeof(*scp), VM_PROT_READ)) return (EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. 
*/ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { trapsignal(p, SIGBUS, T_PROTFLT); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. 
*/
	regs->tf_eax = scp->sc_eax;
	regs->tf_ebx = scp->sc_ebx;
	regs->tf_ecx = scp->sc_ecx;
	regs->tf_edx = scp->sc_edx;
	regs->tf_esi = scp->sc_esi;
	regs->tf_edi = scp->sc_edi;
	regs->tf_cs = scp->sc_cs;
	regs->tf_ss = scp->sc_ss;
	regs->tf_isp = scp->sc_isp;

	PROC_LOCK(p);
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	if (scp->sc_onstack & 1)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	SIGSETOLD(p->p_sigmask, scp->sc_mask);
	SIG_CANTMASK(p->p_sigmask);
	PROC_UNLOCK(p);
	regs->tf_ebp = scp->sc_fp;
	regs->tf_esp = scp->sc_sp;
	regs->tf_eip = scp->sc_pc;
	regs->tf_eflags = eflags;
	return (EJUSTRETURN);
}
#endif

/*
 * Return from a signal handler.
 *
 * Reads the user-supplied context at uap->sigcntxp, validates it, copies
 * the saved register state back into the process trap frame and restores
 * the signal mask (and, when built with COMPAT_43 or COMPAT_SUNOS, the
 * on-signal-stack flag).
 *
 * Returns EFAULT if the context is not readable, EINVAL if the saved
 * eflags or %cs fail the security checks or if vm86 mode is requested
 * without the vm86 area having been set up, and EJUSTRETURN on success.
 * With COMPAT_43, a context whose sc_trapno field holds 0x01d516 is
 * handed to osigreturn() instead.
 */
int
sigreturn(p, uap)
	struct proc *p;
	struct sigreturn_args /* {
		ucontext_t *sigcntxp;
	} */ *uap;
{
	struct trapframe *regs;
	ucontext_t *ucp;
	int cs, eflags;

	ucp = uap->sigcntxp;
#ifdef COMPAT_43
	if (!useracc((caddr_t)ucp, sizeof(struct osigcontext), VM_PROT_READ))
		return (EFAULT);
	if (((struct osigcontext *)ucp)->sc_trapno == 0x01d516)
		return (osigreturn(p, (struct osigreturn_args *)uap));
	/*
	 * Since ucp is not an osigcontext but a ucontext_t, we have to
	 * check again if all of it is accessible.  A ucontext_t is
	 * much larger, so instead of just checking for the pointer
	 * being valid for the size of an osigcontext, now check for
	 * it being valid for a whole, new-style ucontext_t.
	 */
#endif
	if (!useracc((caddr_t)ucp, sizeof(*ucp), VM_PROT_READ))
		return (EFAULT);

	regs = p->p_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		/* The saved context asks to resume in vm86 mode. */
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (p->p_addr->u_pcb.pcb_ext == 0)
			return (EINVAL);
		vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(p, SIGBUS, 0);

		/*
		 * Only the bits in the *_USERCHANGE mask are taken from the
		 * user-supplied eflags; the rest come from the current frame.
		 */
		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		/* Move the restored segment values into the vm86 slots. */
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			printf("sigreturn: eflags = 0x%x\n", eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			printf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(p, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}

		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

	/* Signal state is updated under the process lock. */
	PROC_LOCK(p);
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	if (ucp->uc_mcontext.mc_onstack & 1)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	p->p_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(p->p_sigmask);
	PROC_UNLOCK(p);
	return (EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 *
 * Executes "hlt" in an endless loop; this function never returns.
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ ("hlt");
}

/*
 * Hook to idle the CPU when possible.  This currently only works in
 * the !SMP case, as there is no clean way to ensure that a CPU will be
 * woken when there is work available for it.
 */
/* Non-zero lets cpu_idle() halt; exported as sysctl machdep.cpu_idle_hlt. */
static int	cpu_idle_hlt = 1;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");

/*
 * Note that we have to be careful here to avoid a race between checking
 * procrunnable() and actually halting.  If we don't do this, we may waste
 * the time between calling hlt and the next interrupt even though there
 * is a runnable process.
*/ void cpu_idle(void) { #ifndef SMP if (cpu_idle_hlt) { disable_intr(); if (procrunnable()) enable_intr(); else { enable_intr(); __asm __volatile("hlt"); } } #endif } /* * Clear registers on exec */ void setregs(p, entry, stack, ps_strings) struct proc *p; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = p->p_frame; struct pcb *pcb = &p->p_addr->u_pcb; if (pcb->pcb_ldt) user_ldt_free(pcb); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* reset %gs as well */ if (pcb == PCPU_GET(curpcb)) load_gs(_udatasel); else pcb->pcb_gs = _udatasel; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). 
This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #ifdef DEV_NPX /* Initialize the npx (if any) for the current process. */ npxinit(__INITIAL_NPXCW__); #endif /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ p->p_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); cr0 |= CR0_NE; /* Done by npxinit() */ cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ #ifndef I386_CPU cr0 |= CR0_WP | CR0_AM; #endif load_cr0(cr0); load_gs(_udatasel); } static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int private_tss; /* flag indicating private tss */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; 
static char dblfault_stack[PAGE_SIZE];

extern struct user *proc0paddr;


/*
 * software prototypes -- in more palatable form
 *
 * One entry per GDT slot, in selector order; the per-field comments give
 * the meaning of each positional initializer.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GPRIV_SEL	3 SMP Per-Processor Private Data Descriptor */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	4 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct i386tss)-1,/* length - size of one TSS */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GLDT_SEL	5 LDT Descriptor */
{	(int) ldt,		/* segment base address  */
	sizeof(ldt)-1,		/* length - size of the ldt[] array */
	SDT_SYSLDT,		/* segment type */
	SEL_UPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GUSERLDT_SEL	6 User LDT Descriptor per process */
{	(int) ldt,		/* segment base address  */
	(512 * sizeof(union descriptor)-1),		/* length */
	SDT_SYSLDT,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GTGATE_SEL	7 Null Descriptor - Placeholder */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{	0x400,			/* segment base address */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GPANIC_SEL	9 Panic Tss Descriptor */
{	(int) &dblfault_tss,	/* segment base address  */
	sizeof(struct i386tss)-1,/* length - size of one TSS */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
};

/*
 * Prototype entries for the default LDT, in slot order.  The null slots
 * are placeholders that are later overwritten by call gates.
 */
static struct soft_segment_descriptor ldt_segs[] = {
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
};

/*
 * Fill in entry `idx' of the interrupt descriptor table.
 *
 *	idx	index of the IDT entry to write
 *	func	handler entry point; its address is split across the
 *		low-offset and high-offset fields of the gate
 *	typ	gate type
 *	dpl	descriptor privilege level of the gate
 *	selec	segment selector stored in the gate
 *
 * The entry is always marked present.
 */
void
setidt(idx, func, typ, dpl, selec)
	int idx;
	inthand_t *func;
	int typ;
	int dpl;
	int selec;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16 ;
}

/* Trap/interrupt entry points are named X<name>. */
#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);

/*
 * Convert the segment descriptor `sd' into the software form `ssd',
 * reassembling the base and limit from their split high/low fields and
 * copying the type, dpl, present, def32 and granularity fields.
 */
void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
#define PHYSMAP_SIZE (2 * 8) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. */ static void getmemsize(int first) { int i, physmap_idx, pa_indx; u_int basemem, extmem; struct vm86frame vmf; struct vm86context vmc; vm_offset_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t pte; const char *cp; struct bios_smap *smap; bzero(&vmf, sizeof(struct vm86frame)); bzero(physmap, sizeof(physmap)); /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. 
*/ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); goto next_run; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; goto next_run; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); if (physmap[1] != 0) goto 
physmap_done; /* * If we failed above, try memory map with INT 15:E801 */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory. */ extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ i386_mp_probe(); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif /* * hw.physmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. This overrides MAXMEM. 
*/ if ((cp = getenv("hw.physmem")) != NULL) { u_int64_t AllowMem, sanity; char *ep; sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Ignoring invalid memory size of '%s'\n", cp); else Maxmem = atop(AllowMem); } if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %uK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa(Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. 
*/ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, off, x; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* Init basic tunables, hz etc */ init_param(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. 
*/ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); #ifdef SMP gdt_segs[GPRIV_SEL].ssd_limit = atop(sizeof(struct privatespace) - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[0].globaldata.gd_common_tss; SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0].globaldata; #else gdt_segs[GPRIV_SEL].ssd_limit = atop(sizeof(struct globaldata) - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) &__globaldata; gdt_segs[GPROC0_SEL].ssd_base = (int) &__globaldata.gd_common_tss; __globaldata.gd_prvspace = &__globaldata; #endif for (x = 0; x < NGDT; x++) { #ifdef BDE_DEBUGGER /* avoid overwriting db entries with APM ones */ if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL) continue; #endif ssdtosd(&gdt_segs[x], &gdt[x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); /* setup curproc so that mutexes work */ PCPU_SET(curproc, &proc0); PCPU_SET(spinlocks, NULL); LIST_INIT(&proc0.p_contested); /* * Initialize mutexes. */ mtx_init(&Giant, "Giant", MTX_DEF | MTX_RECURSE); mtx_init(&sched_lock, "sched lock", MTX_SPIN | MTX_RECURSE); mtx_init(&proc0.p_mtx, "process lock", MTX_DEF); mtx_init(&clock_lock, "clk", MTX_SPIN | MTX_RECURSE); #ifdef SMP mtx_init(&imen_mtx, "imen", MTX_SPIN); #endif mtx_lock(&Giant); /* make ldt memory segments */ /* * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... 
*/ ldt_segs[LUCODE_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1); for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL , GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), 
SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the console before we print anything out. */ cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); #ifdef DEV_ISA isa_defaultirq(); #endif #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ PCPU_SET(common_tss.tss_esp0, (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. 
*/ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(lcall_syscall); gdp->gd_looffset = x; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = x >> 16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; proc0.p_addr->u_pcb.pcb_ext = 0; proc0.p_frame = &proc0_tf; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; #ifndef SMP struct region_descriptor r_idt; #endif vm_offset_t tmp; if (!has_f00f_bug) return; GIANT_REQUIRED; printf("Intel Pentium detected, installing workaround for F00F bug\n"); r_idt.rd_limit = sizeof(idt0) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned long addr; { p->p_frame->tf_eip = addr; return (0); } int ptrace_single_step(p) struct 
proc *p; { p->p_frame->tf_eflags |= PSL_T; return (0); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_frame; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } #ifdef CPU_ENABLE_SSE static void fill_fpregs_xmm(sv_xmm, sv_87) struct savexmm *sv_xmm; struct save87 *sv_87; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_tw = penv_xmm->en_tw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; } static void set_fpregs_xmm(sv_87, sv_xmm) struct save87 
*sv_87; struct savexmm *sv_xmm; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_tw = penv_87->en_tw; penv_xmm->en_fip = penv_87->en_fip; penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; } #endif /* CPU_ENABLE_SSE */ int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { fill_fpregs_xmm(&p->p_addr->u_pcb.pcb_save.sv_xmm, (struct save87 *)fpregs); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(&p->p_addr->u_pcb.pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { set_fpregs_xmm((struct save87 *)fpregs, &p->p_addr->u_pcb.pcb_save.sv_xmm); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &p->p_addr->u_pcb.pcb_save.sv_87, sizeof *fpregs); return (0); } int fill_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; if (p == NULL) { dbregs->dr0 = rdr0(); dbregs->dr1 = rdr1(); dbregs->dr2 = rdr2(); dbregs->dr3 = rdr3(); dbregs->dr4 = rdr4(); dbregs->dr5 = rdr5(); dbregs->dr6 = rdr6(); dbregs->dr7 = rdr7(); } else { pcb = &p->p_addr->u_pcb; dbregs->dr0 = pcb->pcb_dr0; dbregs->dr1 = pcb->pcb_dr1; dbregs->dr2 = pcb->pcb_dr2; dbregs->dr3 = pcb->pcb_dr3; dbregs->dr4 = 0; dbregs->dr5 = 0; dbregs->dr6 = pcb->pcb_dr6; dbregs->dr7 = pcb->pcb_dr7; } return (0); } int set_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; int i; u_int32_t mask1, mask2; if (p == NULL) { load_dr0(dbregs->dr0); load_dr1(dbregs->dr1); load_dr2(dbregs->dr2); load_dr3(dbregs->dr3); load_dr4(dbregs->dr4); 
load_dr5(dbregs->dr5); load_dr6(dbregs->dr6); load_dr7(dbregs->dr7); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8; i++, mask1 <<= 2, mask2 <<= 2) if ((dbregs->dr7 & mask1) == mask2) return (EINVAL); pcb = &p->p_addr->u_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space, unless, perhaps, we were called by * uid 0. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (suser(p) != 0) { if (dbregs->dr7 & 0x3) { /* dr0 is enabled */ if (dbregs->dr0 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<2)) { /* dr1 is enabled */ if (dbregs->dr1 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<4)) { /* dr2 is enabled */ if (dbregs->dr2 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<6)) { /* dr3 is enabled */ if (dbregs->dr3 >= VM_MAXUSER_ADDRESS) return (EINVAL); } } pcb->pcb_dr0 = dbregs->dr0; pcb->pcb_dr1 = dbregs->dr1; pcb->pcb_dr2 = dbregs->dr2; pcb->pcb_dr3 = dbregs->dr3; pcb->pcb_dr6 = dbregs->dr6; pcb->pcb_dr7 = dbregs->dr7; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. 
*/ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i=0; i /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct bio *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->bio_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->bio_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->bio_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->bio_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->bio_blkno + p->p_offset <= DOSBBSECTOR && (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #endif /* beyond partition? 
*/ if (bp->bio_blkno < 0 || bp->bio_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->bio_blkno == maxsz) { bp->bio_resid = bp->bio_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->bio_blkno; if (sz <= 0) { bp->bio_error = EINVAL; goto bad; } bp->bio_bcount = sz << DEV_BSHIFT; } bp->bio_pblkno = bp->bio_blkno + p->p_offset; return(1); bad: bp->bio_flags |= BIO_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: head/sys/amd64/amd64/mp_machdep.c =================================================================== --- head/sys/amd64/amd64/mp_machdep.c (revision 82308) +++ head/sys/amd64/amd64/mp_machdep.c (revision 82309) @@ -1,2440 +1,2442 @@ /* * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "opt_cpu.h" +#include "opt_upages.h" #ifdef SMP #include #else #error #endif #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** TEST_DEFAULT_CONFIG, TEST_TEST1 */ #include #include #include +#include #if defined(APIC_IO) #include /* setidt() */ #include /* IPIs */ #include /* IPIs */ #endif /* APIC_IO */ #if defined(TEST_DEFAULT_CONFIG) #define MPFPS_MPFB1 TEST_DEFAULT_CONFIG #else #define MPFPS_MPFB1 mpfps->mpfb1 #endif /* TEST_DEFAULT_CONFIG */ #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #ifdef PC98 #define BIOS_BASE (0xe8000) #define BIOS_SIZE (0x18000) #else #define BIOS_BASE (0xf0000) #define BIOS_SIZE (0x10000) #endif #define BIOS_COUNT (BIOS_SIZE/4) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) #define PROCENTRY_FLAG_EN 0x01 #define PROCENTRY_FLAG_BP 0x02 #define IOAPICENTRY_FLAG_EN 0x01 /* MP Floating Pointer Structure */ typedef struct MPFPS { char signature[4]; void *pap; u_char length; u_char spec_rev; u_char checksum; u_char mpfb1; u_char mpfb2; u_char mpfb3; u_char mpfb4; u_char mpfb5; } *mpfps_t; /* MP Configuration Table Header */ typedef struct MPCTH { char signature[4]; u_short base_table_length; u_char spec_rev; u_char checksum; u_char oem_id[8]; u_char product_id[12]; void *oem_table_pointer; u_short oem_table_size; u_short entry_count; void *apic_address; u_short extended_table_length; u_char extended_table_checksum; u_char reserved; } *mpcth_t; typedef struct PROCENTRY { u_char type; u_char apic_id; u_char apic_version; u_char cpu_flags; u_long cpu_signature; u_long feature_flags; u_long reserved1; u_long reserved2; } *proc_entry_ptr; typedef struct 
BUSENTRY { u_char type; u_char bus_id; char bus_type[6]; } *bus_entry_ptr; typedef struct IOAPICENTRY { u_char type; u_char apic_id; u_char apic_version; u_char apic_flags; void *apic_address; } *io_apic_entry_ptr; typedef struct INTENTRY { u_char type; u_char int_type; u_short int_flags; u_char src_bus_id; u_char src_bus_irq; u_char dst_apic_id; u_char dst_apic_int; } *int_entry_ptr; /* descriptions of MP basetable entries */ typedef struct BASETABLE_ENTRY { u_char type; u_char length; char name[16]; } basetable_entry; /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) && !defined(PC98) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #endif /* CHECK_POINTS */ /* * Values to send to the POST hardware. */ #define MP_BOOTADDRESS_POST 0x10 #define MP_PROBE_POST 0x11 #define MPTABLE_PASS1_POST 0x12 #define MP_START_POST 0x13 #define MP_ENABLE_POST 0x14 #define MPTABLE_PASS2_POST 0x15 #define START_ALL_APS_POST 0x16 #define INSTALL_AP_TRAMP_POST 0x17 #define START_AP_POST 0x18 #define MP_ANNOUNCE_POST 0x19 /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; /** XXX FIXME: what system files declare these??? 
*/ extern struct region_descriptor r_gdt, r_idt; int bsp_apic_ready = 0; /* flags useability of BSP apic */ int mp_naps; /* # of Applications processors */ int mp_nbusses; /* # of busses */ int mp_napics; /* # of IO APICs */ int boot_cpu_id; /* designated BSP */ vm_offset_t cpu_apic_address; vm_offset_t io_apic_address[NAPICID]; /* NAPICID is more than enough */ extern int nkpt; u_int32_t cpu_apic_versions[MAXCPU]; u_int32_t *io_apic_versions; #ifdef APIC_INTR_REORDER struct { volatile int *location; int bit; } apic_isrbit_location[32]; #endif struct apic_intmapinfo int_to_apicintpin[APIC_INTMAPSIZE]; /* * APIC ID logical/physical mapping structures. * We oversize these to simplify boot-time config. */ int cpu_num_to_apic_id[NAPICID]; int io_num_to_apic_id[NAPICID]; int apic_id_to_logical[NAPICID]; /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; static int bootAP; /* Hotwire a 0->4MB V==P mapping */ extern pt_entry_t *KPTphys; /* SMP page table page */ extern pt_entry_t *SMPpt; struct pcb stoppcbs[MAXCPU]; int invltlb_ok = 0; /* throttle smp_invltlb() till safe */ SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, ""); /* * Local data and functions. */ /* Set to 1 once we're ready to let the APs out of the pen. 
*/ static volatile int aps_ready = 0; static int mp_capable; static u_int boot_address; static u_int base_memory; static int picmode; /* 0: virtual wire mode, 1: PIC mode */ static mpfps_t mpfps; static int search_for_sig(u_int32_t target, int count); static void mp_enable(u_int boot_addr); static void mptable_pass1(void); static int mptable_pass2(void); static void default_mp_table(int type); static void fix_mp_table(void); static void setup_apic_irq_mapping(void); static void init_locks(void); static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); void ap_init(void); static int apic_int_is_bus_type(int intr, int bus_type); static void release_aps(void *dummy); /* * initialize all the SMP locks */ /* critical region around IO APIC, apic_imen */ struct mtx imen_mtx; /* lock region used by kernel profiling */ int mcount_lock; #ifdef USE_COMLOCK /* locks com (tty) data/hardware accesses: a FASTINTR() */ struct mtx com_mtx; #endif /* USE_COMLOCK */ static void init_locks(void) { #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", MTX_SPIN); #endif /* USE_COMLOCK */ } /* * Calculate usable address in base memory for AP trampoline code. */ u_int mp_bootaddress(u_int basemem) { POSTCODE(MP_BOOTADDRESS_POST); base_memory = basemem * 1024; /* convert to bytes */ boot_address = base_memory & ~0xfff; /* round down to 4k boundary */ if ((base_memory - boot_address) < bootMP_size) boot_address -= 4096; /* not enough, lower by 4k */ return boot_address; } /* * Look for an Intel MP spec table (ie, SMP capable hardware). 
*/ void i386_mp_probe(void) { int x; u_long segment; u_int32_t target; POSTCODE(MP_PROBE_POST); /* see if EBDA exists */ if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) { /* search first 1K of EBDA */ target = (u_int32_t) (segment << 4); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } else { /* last 1K of base memory, effective 'top of base' passed in */ target = (u_int32_t) (base_memory - 0x400); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } /* search the BIOS */ target = (u_int32_t) BIOS_BASE; if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) goto found; /* nothing found */ mpfps = (mpfps_t)0; mp_capable = 0; return; found: /* calculate needed resources */ mpfps = (mpfps_t)x; mptable_pass1(); /* flag fact that we are running multiple processors */ mp_capable = 1; } int cpu_mp_probe(void) { /* * Record BSP in CPU map * This is done here so that MBUF init code works correctly. */ all_cpus = 1; return (mp_capable); } /* * Initialize the SMP hardware and the APIC and start up the AP's. */ void cpu_mp_start(void) { POSTCODE(MP_START_POST); /* look for MP capable motherboard */ if (mp_capable) mp_enable(boot_address); else panic("MP hardware not found!"); cpu_setregs(); } /* * Print various information about the SMP system hardware and setup. 
*/ void cpu_mp_announce(void) { int x; POSTCODE(MP_ANNOUNCE_POST); printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0)); printf(", version: 0x%08x", cpu_apic_versions[0]); printf(", at 0x%08x\n", cpu_apic_address); for (x = 1; x <= mp_naps; ++x) { printf(" cpu%d (AP): apic id: %2d", x, CPU_TO_ID(x)); printf(", version: 0x%08x", cpu_apic_versions[x]); printf(", at 0x%08x\n", cpu_apic_address); } #if defined(APIC_IO) for (x = 0; x < mp_napics; ++x) { printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x)); printf(", version: 0x%08x", io_apic_versions[x]); printf(", at 0x%08x\n", io_apic_address[x]); } #else printf(" Warning: APIC I/O disabled\n"); #endif /* APIC_IO */ } /* * AP cpu's call this to sync up protected mode. */ void init_secondary(void) { int gsel_tss; int x, myid = bootAP; gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[myid].globaldata.gd_common_tss; SMP_prvspace[myid].globaldata.gd_prvspace = &SMP_prvspace[myid].globaldata; for (x = 0; x < NGDT; x++) { ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) &gdt[myid * NGDT]; lgdt(&r_gdt); /* does magic intra-segment return */ lidt(&r_idt); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); pmap_set_opt(); } #if defined(APIC_IO) /* * Final configuration of the BSP's local APIC: * - disable 'pic mode'. * - disable 'virtual wire mode'. * - enable NMI. 
*/ void bsp_apic_configure(void) { u_char byte; u_int32_t temp; /* leave 'pic mode' if necessary */ if (picmode) { outb(0x22, 0x70); /* select IMCR */ byte = inb(0x23); /* current contents */ byte |= 0x01; /* mask external INTR */ outb(0x23, byte); /* disconnect 8259s/NMI */ } /* mask lint0 (the 8259 'virtual wire' connection) */ temp = lapic.lvt_lint0; temp |= APIC_LVT_M; /* set the mask */ lapic.lvt_lint0 = temp; /* setup lint1 to handle NMI */ temp = lapic.lvt_lint1; temp &= ~APIC_LVT_M; /* clear the mask */ lapic.lvt_lint1 = temp; if (bootverbose) apic_dump("bsp_apic_configure()"); } #endif /* APIC_IO */ /******************************************************************* * local functions and data */ /* * start the SMP system */ static void mp_enable(u_int boot_addr) { int x; #if defined(APIC_IO) int apic; u_int ux; #endif /* APIC_IO */ POSTCODE(MP_ENABLE_POST); /* turn on 4MB of V == P addressing so we can get to MP table */ *(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME); invltlb(); /* examine the MP table for needed info, uses physical addresses */ x = mptable_pass2(); *(int *)PTD = 0; invltlb(); /* can't process default configs till the CPU APIC is pmapped */ if (x) default_mp_table(x); /* post scan cleanup */ fix_mp_table(); setup_apic_irq_mapping(); #if defined(APIC_IO) /* fill the LOGICAL io_apic_versions table */ for (apic = 0; apic < mp_napics; ++apic) { ux = io_apic_read(apic, IOAPIC_VER); io_apic_versions[apic] = ux; io_apic_set_id(apic, IO_TO_ID(apic)); } /* program each IO APIC in the system */ for (apic = 0; apic < mp_napics; ++apic) if (io_apic_setup(apic) < 0) panic("IO APIC setup failure"); /* install a 'Spurious INTerrupt' vector */ setidt(XSPURIOUSINT_OFFSET, Xspuriousint, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for TLB invalidation */ setidt(XINVLTLB_OFFSET, Xinvltlb, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for forwarding hardclock() */ 
setidt(XHARDCLOCK_OFFSET, Xhardclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for forwarding statclock() */ setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for forcing an additional software trap */ setidt(XCPUAST_OFFSET, Xcpuast, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* install an inter-CPU IPI for CPU stop/restart */ setidt(XCPUSTOP_OFFSET, Xcpustop, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #if defined(TEST_TEST1) /* install a "fake hardware INTerrupt" vector */ setidt(XTEST1_OFFSET, Xtest1, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif /** TEST_TEST1 */ #endif /* APIC_IO */ /* initialize all SMP locks */ init_locks(); /* start each Application Processor */ start_all_aps(boot_addr); } /* * look for the MP spec signature */ /* string defined by the Intel MP Spec as identifying the MP table */ #define MP_SIG 0x5f504d5f /* _MP_ */ #define NEXT(X) ((X) += 4) static int search_for_sig(u_int32_t target, int count) { int x; u_int32_t *addr = (u_int32_t *) (KERNBASE + target); for (x = 0; x < count; NEXT(x)) if (addr[x] == MP_SIG) /* make array index a byte index */ return (target + (x * sizeof(u_int32_t))); return -1; } static basetable_entry basetable_entry_types[] = { {0, 20, "Processor"}, {1, 8, "Bus"}, {2, 8, "I/O APIC"}, {3, 8, "I/O INT"}, {4, 8, "Local INT"} }; typedef struct BUSDATA { u_char bus_id; enum busTypes bus_type; } bus_datum; typedef struct INTDATA { u_char int_type; u_short int_flags; u_char src_bus_id; u_char src_bus_irq; u_char dst_apic_id; u_char dst_apic_int; u_char int_vector; } io_int, local_int; typedef struct BUSTYPENAME { u_char type; char name[7]; } bus_type_name; static bus_type_name bus_type_table[] = { {CBUS, "CBUS"}, {CBUSII, "CBUSII"}, {EISA, "EISA"}, 
{MCA, "MCA"}, {UNKNOWN_BUSTYPE, "---"}, {ISA, "ISA"}, {MCA, "MCA"}, {UNKNOWN_BUSTYPE, "---"}, {UNKNOWN_BUSTYPE, "---"}, {UNKNOWN_BUSTYPE, "---"}, {UNKNOWN_BUSTYPE, "---"}, {UNKNOWN_BUSTYPE, "---"}, {PCI, "PCI"}, {UNKNOWN_BUSTYPE, "---"}, {UNKNOWN_BUSTYPE, "---"}, {UNKNOWN_BUSTYPE, "---"}, {UNKNOWN_BUSTYPE, "---"}, {XPRESS, "XPRESS"}, {UNKNOWN_BUSTYPE, "---"} }; /* from MP spec v1.4, table 5-1 */ static int default_data[7][5] = { /* nbus, id0, type0, id1, type1 */ {1, 0, ISA, 255, 255}, {1, 0, EISA, 255, 255}, {1, 0, EISA, 255, 255}, {1, 0, MCA, 255, 255}, {2, 0, ISA, 1, PCI}, {2, 0, EISA, 1, PCI}, {2, 0, MCA, 1, PCI} }; /* the bus data */ static bus_datum *bus_data; /* the IO INT data, one entry per possible APIC INTerrupt */ static io_int *io_apic_ints; static int nintrs; static int processor_entry __P((proc_entry_ptr entry, int cpu)); static int bus_entry __P((bus_entry_ptr entry, int bus)); static int io_apic_entry __P((io_apic_entry_ptr entry, int apic)); static int int_entry __P((int_entry_ptr entry, int intr)); static int lookup_bus_type __P((char *name)); /* * 1st pass on motherboard's Intel MP specification table. 
* * initializes: * mp_ncpus = 1 * * determines: * cpu_apic_address (common to all CPUs) * io_apic_address[N] * mp_naps * mp_nbusses * mp_napics * nintrs */ static void mptable_pass1(void) { int x; mpcth_t cth; int totalSize; void* position; int count; int type; POSTCODE(MPTABLE_PASS1_POST); /* clear various tables */ for (x = 0; x < NAPICID; ++x) { io_apic_address[x] = ~0; /* IO APIC address table */ } /* init everything to empty */ mp_naps = 0; mp_nbusses = 0; mp_napics = 0; nintrs = 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) { /* use default addresses */ cpu_apic_address = DEFAULT_APIC_BASE; io_apic_address[0] = DEFAULT_IO_APIC_BASE; /* fill in with defaults */ mp_naps = 2; /* includes BSP */ mp_nbusses = default_data[MPFPS_MPFB1 - 1][0]; #if defined(APIC_IO) mp_napics = 1; nintrs = 16; #endif /* APIC_IO */ } else { if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); cpu_apic_address = (vm_offset_t) cth->apic_address; /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; while (count--) { switch (type = *(u_char *) position) { case 0: /* processor_entry */ if (((proc_entry_ptr)position)->cpu_flags & PROCENTRY_FLAG_EN) ++mp_naps; break; case 1: /* bus_entry */ ++mp_nbusses; break; case 2: /* io_apic_entry */ if (((io_apic_entry_ptr)position)->apic_flags & IOAPICENTRY_FLAG_EN) io_apic_address[mp_napics++] = (vm_offset_t)((io_apic_entry_ptr) position)->apic_address; break; case 3: /* int_entry */ ++nintrs; break; case 4: /* int_entry */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char*)position += basetable_entry_types[type].length; } } /* qualify the numbers */ if (mp_naps > MAXCPU) { printf("Warning: only using %d of %d available CPUs!\n", MAXCPU, mp_naps); mp_naps = MAXCPU; } /* * Count the BSP. 
* This is also used as a counter while starting the APs. */ mp_ncpus = 1; --mp_naps; /* subtract the BSP */ } /* * 2nd pass on motherboard's Intel MP specification table. * * sets: * boot_cpu_id * ID_TO_IO(N), phy APIC ID to log CPU/IO table * CPU_TO_ID(N), logical CPU to APIC ID table * IO_TO_ID(N), logical IO to APIC ID table * bus_data[N] * io_apic_ints[N] */ static int mptable_pass2(void) { int x; mpcth_t cth; int totalSize; void* position; int count; int type; int apic, bus, cpu, intr; int i, j; int pgeflag; POSTCODE(MPTABLE_PASS2_POST); pgeflag = 0; /* XXX - Not used under SMP yet. */ MALLOC(io_apic_versions, u_int32_t *, sizeof(u_int32_t) * mp_napics, M_DEVBUF, M_WAITOK); MALLOC(ioapic, volatile ioapic_t **, sizeof(ioapic_t *) * mp_napics, M_DEVBUF, M_WAITOK); MALLOC(io_apic_ints, io_int *, sizeof(io_int) * (nintrs + 1), M_DEVBUF, M_WAITOK); MALLOC(bus_data, bus_datum *, sizeof(bus_datum) * mp_nbusses, M_DEVBUF, M_WAITOK); bzero(ioapic, sizeof(ioapic_t *) * mp_napics); for (i = 0; i < mp_napics; i++) { for (j = 0; j < mp_napics; j++) { /* same page frame as a previous IO apic? 
*/ if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == (io_apic_address[i] & PG_FRAME)) { ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace + (NPTEPG-2-j) * PAGE_SIZE + (io_apic_address[i] & PAGE_MASK)); break; } /* use this slot if available */ if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == 0) { SMPpt[NPTEPG-2-j] = (pt_entry_t)(PG_V | PG_RW | pgeflag | (io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace + (NPTEPG-2-j) * PAGE_SIZE + (io_apic_address[i] & PAGE_MASK)); break; } } } /* clear various tables */ for (x = 0; x < NAPICID; ++x) { ID_TO_IO(x) = -1; /* phy APIC ID to log CPU/IO table */ CPU_TO_ID(x) = -1; /* logical CPU to APIC ID table */ IO_TO_ID(x) = -1; /* logical IO to APIC ID table */ } /* clear bus data table */ for (x = 0; x < mp_nbusses; ++x) bus_data[x].bus_id = 0xff; /* clear IO APIC INT table */ for (x = 0; x < (nintrs + 1); ++x) { io_apic_ints[x].int_type = 0xff; io_apic_ints[x].int_vector = 0xff; } /* setup the cpu/apic mapping arrays */ boot_cpu_id = -1; /* record whether PIC or virtual-wire mode */ picmode = (mpfps->mpfb2 & 0x80) ? 
1 : 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) return MPFPS_MPFB1; /* return default configuration type */ if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; apic = bus = intr = 0; cpu = 1; /* pre-count the BSP */ while (count--) { switch (type = *(u_char *) position) { case 0: if (processor_entry(position, cpu)) ++cpu; break; case 1: if (bus_entry(position, bus)) ++bus; break; case 2: if (io_apic_entry(position, apic)) ++apic; break; case 3: if (int_entry(position, intr)) ++intr; break; case 4: /* int_entry(position); */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char *) position += basetable_entry_types[type].length; } if (boot_cpu_id == -1) panic("NO BSP found!"); /* report fact that its NOT a default configuration */ return 0; } void assign_apic_irq(int apic, int intpin, int irq) { int x; if (int_to_apicintpin[irq].ioapic != -1) panic("assign_apic_irq: inconsistent table"); int_to_apicintpin[irq].ioapic = apic; int_to_apicintpin[irq].int_pin = intpin; int_to_apicintpin[irq].apic_address = ioapic[apic]; int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) && io_apic_ints[x].dst_apic_int == intpin) io_apic_ints[x].int_vector = irq; } } void revoke_apic_irq(int irq) { int x; int oldapic; int oldintpin; if (int_to_apicintpin[irq].ioapic == -1) panic("assign_apic_irq: inconsistent table"); oldapic = int_to_apicintpin[irq].ioapic; oldintpin = int_to_apicintpin[irq].int_pin; int_to_apicintpin[irq].ioapic = -1; int_to_apicintpin[irq].int_pin = 0; 
int_to_apicintpin[irq].apic_address = NULL;
	int_to_apicintpin[irq].redirindex = 0;

	/*
	 * NOTE(review): the head of this function lies outside the reviewed
	 * chunk.  As written, the test below only matches entries whose
	 * int_vector is already 0xff and then stores 0xff again, so the loop
	 * changes nothing -- confirm whether the comparison was meant to be
	 * against irq.
	 */
	for (x = 0; x < nintrs; x++) {
		if ((io_apic_ints[x].int_type == 0 ||
		     io_apic_ints[x].int_type == 3) &&
		    io_apic_ints[x].int_vector == 0xff &&
		    io_apic_ints[x].dst_apic_id == IO_TO_ID(oldapic) &&
		    io_apic_ints[x].dst_apic_int == oldintpin)
			io_apic_ints[x].int_vector = 0xff;
	}
}

/*
 * Give MP-table interrupt entry 'intr' the first unused slot of
 * int_to_apicintpin[] and program the corresponding IO APIC pin.
 *
 * Nothing is done when the entry already has a vector, when it is neither
 * a standard INT (type 0) nor an ExtInt (type 3) that sits somewhere other
 * than pin 0 of IO APIC 0, or when every slot is taken.
 */
static void
allocate_apic_irq(int intr)
{
	int apic;
	int intpin;
	int irq;

	if (io_apic_ints[intr].int_vector != 0xff)
		return;		/* Interrupt handler already assigned */

	if (io_apic_ints[intr].int_type != 0 &&
	    (io_apic_ints[intr].int_type != 3 ||
	     (io_apic_ints[intr].dst_apic_id == IO_TO_ID(0) &&
	      io_apic_ints[intr].dst_apic_int == 0)))
		return;		/* Not INT or ExtInt on != (0, 0) */

	/* linear search for the first slot whose ioapic field is still -1 */
	irq = 0;
	while (irq < APIC_INTMAPSIZE &&
	       int_to_apicintpin[irq].ioapic != -1)
		irq++;

	if (irq >= APIC_INTMAPSIZE)
		return;		/* No free interrupt handlers */

	apic = ID_TO_IO(io_apic_ints[intr].dst_apic_id);
	intpin = io_apic_ints[intr].dst_apic_int;

	assign_apic_irq(apic, intpin, irq);
	io_apic_setup_intpin(apic, intpin);
}

/*
 * Record 'newid' as the physical ID of logical IO APIC 'apic' (previously
 * 'oldid').  The two IDs are exchanged in every interrupt entry, and if
 * another IO APIC currently owns 'newid' it is handed 'oldid' in return.
 * Each change is announced with printf().
 */
static void
swap_apic_id(int apic, int oldid, int newid)
{
	int x;
	int oapic;

	if (oldid == newid)
		return;		/* Nothing to do */

	printf("Changing APIC ID for IO APIC #%d from %d to %d in MP table\n",
	       apic, oldid, newid);

	/* Swap physical APIC IDs in interrupt entries */
	for (x = 0; x < nintrs; x++) {
		if (io_apic_ints[x].dst_apic_id == oldid)
			io_apic_ints[x].dst_apic_id = newid;
		else if (io_apic_ints[x].dst_apic_id == newid)
			io_apic_ints[x].dst_apic_id = oldid;
	}

	/* Swap physical APIC IDs in IO_TO_ID mappings */
	for (oapic = 0; oapic < mp_napics; oapic++)
		if (IO_TO_ID(oapic) == newid)
			break;

	/* oapic < mp_napics means some IO APIC was found holding newid */
	if (oapic < mp_napics) {
		printf("Changing APIC ID for IO APIC #%d from "
		       "%d to %d in MP table\n",
		       oapic, newid, oldid);
		IO_TO_ID(oapic) = oldid;
	}
	IO_TO_ID(apic) = newid;
}

/*
 * Rebuild the ID_TO_IO() table from scratch: every entry is reset to -1,
 * then in-range CPU IDs are entered, then in-range IO APIC IDs.  The IO
 * APIC pass runs last, so it wins if an ID appears in both lists.
 */
static void
fix_id_to_io_mapping(void)
{
	int x;

	for (x = 0; x < NAPICID; x++)
		ID_TO_IO(x) = -1;

	/* logical CPUs are numbered 0..mp_naps inclusive */
	for (x = 0; x <= mp_naps; x++)
		if (CPU_TO_ID(x) < NAPICID)
			ID_TO_IO(CPU_TO_ID(x)) = x;

	for (x = 0; x < mp_napics; x++)
		if (IO_TO_ID(x) < NAPICID)
			ID_TO_IO(IO_TO_ID(x)) = x;
}

/*
 * Return the lowest physical APIC ID below NAPICID that is used neither by
 * a CPU nor by an IO APIC, or NAPICID when every ID is taken.
 */
static int
first_free_apic_id(void)
{
	int freeid, x;

	for (freeid = 0; freeid < NAPICID; freeid++) {
		for (x = 0; x <= mp_naps; x++)
			if (CPU_TO_ID(x) == freeid)
				break;
		if (x <= mp_naps)
			continue;	/* a CPU owns this ID */
		for (x = 0; x < mp_napics; x++)
			if (IO_TO_ID(x) == freeid)
				break;
		if (x < mp_napics)
			continue;	/* an IO APIC owns this ID */
		return freeid;
	}
	return freeid;
}

/*
 * Decide whether physical ID 'id' may be used by logical IO APIC 'apic'.
 * Returns 1 if it may, 0 if the ID is out of range, belongs to a CPU, or
 * is already held by a lower-numbered IO APIC.  IO APICs numbered above
 * 'apic' are deliberately not consulted.
 */
static int
io_apic_id_acceptable(int apic, int id)
{
	int cpu;		/* Logical CPU number */
	int oapic;		/* Logical IO APIC number for other IO APIC */

	if (id >= NAPICID)
		return 0;	/* Out of range */

	for (cpu = 0; cpu <= mp_naps; cpu++)
		if (CPU_TO_ID(cpu) == id)
			return 0;	/* Conflict with CPU */

	for (oapic = 0; oapic < mp_napics && oapic < apic; oapic++)
		if (IO_TO_ID(oapic) == id)
			return 0;	/* Conflict with other APIC */

	return 1;		/* ID is acceptable for IO APIC */
}

/*
 * Repair known defects in the tables built from the Intel MP
 * specification table:
 *   1. a single PCI bus that the BIOS did not number 0,
 *   2. IO APIC physical IDs that collide with CPUs or other IO APICs,
 *   3. a missing 8259->APIC (ExtInt) entry on IO APIC 0, pin 0.
 */
static void
fix_mp_table(void)
{
	int	x;
	int	id;
	int	bus_0 = 0;	/* Stop GCC warning */
	int	bus_pci = 0;	/* Stop GCC warning */
	int	num_pci_bus;
	int	apic;		/* IO APIC unit number */
	int	freeid;		/* Free physical APIC ID */
	int	physid;		/* Current physical IO APIC ID */

	/*
	 * Fix mis-numbering of the PCI bus and its INT entries if the BIOS
	 * did it wrong.  The MP spec says that when more than 1 PCI bus
	 * exists the BIOS must begin with bus entries for the PCI bus and use
	 * actual PCI bus numbering.  This implies that when only 1 PCI bus
	 * exists the BIOS can choose to ignore this ordering, and indeed many
	 * MP motherboards do ignore it.  This causes a problem when the PCI
	 * sub-system makes requests of the MP sub-system based on PCI bus
	 * numbers.	So here we look for the situation and renumber the
	 * busses and associated INTs in an effort to "make it right".
	 */

	/* find bus 0, PCI bus, count the number of PCI busses */
	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
		if (bus_data[x].bus_id == 0) {
			bus_0 = x;
		}
		if (bus_data[x].bus_type == PCI) {
			++num_pci_bus;
			bus_pci = x;
		}
	}
	/*
	 * bus_0 == slot of bus with ID of 0
	 * bus_pci == slot of last PCI bus encountered
	 */

	/* check the 1 PCI bus case for sanity */
	/* if it is number 0 all is well */
	if (num_pci_bus == 1 &&
	    bus_data[bus_pci].bus_id != 0) {

		/* mis-numbered, swap with whichever bus uses slot 0 */

		/* swap the bus entry types */
		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
		bus_data[bus_0].bus_type = PCI;

		/* swap each relevant INTerrupt entry */
		id = bus_data[bus_pci].bus_id;
		for (x = 0; x < nintrs; ++x) {
			if (io_apic_ints[x].src_bus_id == id) {
				io_apic_ints[x].src_bus_id = 0;
			}
			else if (io_apic_ints[x].src_bus_id == 0) {
				io_apic_ints[x].src_bus_id = id;
			}
		}
	}

	/* Assign IO APIC IDs.
	 *
	 * First try the existing ID. If a conflict is detected, try
	 * the ID in the MP table.  If a conflict is still detected, find
	 * a free id.
	 *
	 * We cannot use the ID_TO_IO table before all conflicts have been
	 * resolved and the table has been corrected.
	 */
	for (apic = 0; apic < mp_napics; ++apic) { /* For all IO APICs */

		/* First try to use the value set by the BIOS */
		physid = io_apic_get_id(apic);
		if (io_apic_id_acceptable(apic, physid)) {
			if (IO_TO_ID(apic) != physid)
				swap_apic_id(apic, IO_TO_ID(apic), physid);
			continue;
		}

		/* Then check if the value in the MP table is acceptable */
		if (io_apic_id_acceptable(apic, IO_TO_ID(apic)))
			continue;

		/* Last resort, find a free APIC ID and use it */
		freeid = first_free_apic_id();
		if (freeid >= NAPICID)
			panic("No free physical APIC IDs found");

		if (io_apic_id_acceptable(apic, freeid)) {
			swap_apic_id(apic, IO_TO_ID(apic), freeid);
			continue;
		}
		panic("Free physical APIC ID not usable");
	}
	/* IDs are final now, so the reverse table can be rebuilt */
	fix_id_to_io_mapping();

	/* detect and fix broken Compaq MP table */
	if (apic_int_type(0, 0) == -1) {
		printf("APIC_IO: MP table broken: 8259->APIC entry missing!\n");
		/* append a synthetic entry at the end of io_apic_ints[] */
		io_apic_ints[nintrs].int_type = 3;	/* ExtInt */
		io_apic_ints[nintrs].int_vector = 0xff;	/* Unassigned */
		/* XXX fixme, set src bus id etc, but it doesn't seem to hurt */
		io_apic_ints[nintrs].dst_apic_id = IO_TO_ID(0);
		io_apic_ints[nintrs].dst_apic_int = 0;	/* Pin 0 */
		nintrs++;
	}
}

/*
 * Assign low level interrupt handlers.
 *
 * Resets int_to_apicintpin[], then maps each unassigned ISA/EISA standard
 * INT to the slot equal to its source bus IRQ, and finally gives slot 0 to
 * the ExtInt on IO APIC 0 pin 0 if nothing claimed it.
 */
static void
setup_apic_irq_mapping(void)
{
	int	x;
	int	int_vector;

	/* Clear array */
	for (x = 0; x < APIC_INTMAPSIZE; x++) {
		int_to_apicintpin[x].ioapic = -1;
		int_to_apicintpin[x].int_pin = 0;
		int_to_apicintpin[x].apic_address = NULL;
		int_to_apicintpin[x].redirindex = 0;
	}

	/* First assign ISA/EISA interrupts */
	for (x = 0; x < nintrs; x++) {
		/* the source bus IRQ doubles as the slot index here */
		int_vector = io_apic_ints[x].src_bus_irq;
		if (int_vector < APIC_INTMAPSIZE &&
		    io_apic_ints[x].int_vector == 0xff &&
		    int_to_apicintpin[int_vector].ioapic == -1 &&
		    (apic_int_is_bus_type(x, ISA) ||
		     apic_int_is_bus_type(x, EISA)) &&
		    io_apic_ints[x].int_type == 0) {
			assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
					io_apic_ints[x].dst_apic_int,
					int_vector);
		}
	}

	/* Assign ExtInt entry if no ISA/EISA interrupt 0 entry */
	for (x = 0; x < nintrs; x++) {
		if (io_apic_ints[x].dst_apic_int == 0 &&
		    io_apic_ints[x].dst_apic_id == IO_TO_ID(0) &&
		    io_apic_ints[x].int_vector == 0xff &&
		    int_to_apicintpin[0].ioapic == -1 &&
		    io_apic_ints[x].int_type == 3) {
			assign_apic_irq(0, 0, 0);
			break;
		}
	}
	/* PCI interrupt assignment is deferred */
}

/*
 * Handle one processor entry of the MP table.  'cpu' is the logical number
 * to give the processor if it turns out to be an AP.
 * Returns 1 when an AP was recorded, 0 otherwise (disabled entry, the BSP,
 * or cpu >= MAXCPU).  Panics on an APIC ID >= NAPICID.
 */
static int
processor_entry(proc_entry_ptr entry, int cpu)
{
	/* check for usability */
	if (!(entry->cpu_flags & PROCENTRY_FLAG_EN))
		return 0;

	if(entry->apic_id >= NAPICID)
		panic("CPU APIC ID out of range (0..%d)", NAPICID - 1);
	/* check for BSP flag */
	if (entry->cpu_flags & PROCENTRY_FLAG_BP) {
		boot_cpu_id = entry->apic_id;
		CPU_TO_ID(0) = entry->apic_id;
		ID_TO_CPU(entry->apic_id) = 0;
		return 0;	/* its already been counted */
	}

	/* add another AP to list, if less than max number of CPUs */
	else if (cpu < MAXCPU) {
		CPU_TO_ID(cpu) = entry->apic_id;
		ID_TO_CPU(entry->apic_id) = cpu;
		return 1;
	}

	return 0;
}

/*
 * Handle one bus entry of the MP table: store its ID and type in
 * bus_data[bus].  Always returns 1; panics on an unrecognised bus name.
 */
static int
bus_entry(bus_entry_ptr entry, int bus)
{
	int     x;
	char    c, name[8];

	/* encode the name into an index */
	/* copy at most 6 characters, stopping at the first blank */
	for (x = 0; x < 6; ++x) {
		if ((c = entry->bus_type[x]) == ' ')
			break;
		name[x] = c;
	}
	name[x] = '\0';

	/* x is reused from here on to hold the bus type code */
	if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE)
		panic("unknown bus type: '%s'", name);

	bus_data[bus].bus_id = entry->bus_id;
	bus_data[bus].bus_type = x;

	return 1;
}

/*
 * Handle one IO APIC entry of the MP table.  Returns 0 for a disabled
 * entry, otherwise records the logical<->physical ID mapping and returns 1.
 * The reverse mapping is only written when the physical ID is < NAPICID.
 */
static int
io_apic_entry(io_apic_entry_ptr entry, int apic)
{
	if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN))
		return 0;

	IO_TO_ID(apic) = entry->apic_id;
	if (entry->apic_id < NAPICID)
		ID_TO_IO(entry->apic_id) = apic;

	return 1;
}

/*
 * Translate a NUL-terminated bus name into its type code by exact match
 * against bus_type_table[]; returns UNKNOWN_BUSTYPE if there is no match.
 */
static int
lookup_bus_type(char *name)
{
	int     x;

	for (x = 0; x < MAX_BUSTYPE; ++x)
		if (strcmp(bus_type_table[x].name, name) == 0)
			return bus_type_table[x].type;

	return UNKNOWN_BUSTYPE;
}

/*
 * Handle one interrupt entry of the MP table by copying it into
 * io_apic_ints[intr].  Always returns 1.
 */
static int
int_entry(int_entry_ptr entry, int intr)
{
	int apic;

	io_apic_ints[intr].int_type = entry->int_type;
	io_apic_ints[intr].int_flags = entry->int_flags;
	io_apic_ints[intr].src_bus_id = entry->src_bus_id;
	io_apic_ints[intr].src_bus_irq = entry->src_bus_irq;
	if (entry->dst_apic_id == 255) {
		/* This signal goes to all IO APICS.  Select an IO APIC
		   with sufficient number of interrupt pins */
		for (apic = 0; apic < mp_napics; apic++)
			if (((io_apic_read(apic, IOAPIC_VER) &
			      IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >=
			    entry->dst_apic_int)
				break;
		/* fall back to the raw 255 if no IO APIC has enough pins */
		if (apic < mp_napics)
			io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic);
		else
			io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
	} else
		io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
	io_apic_ints[intr].dst_apic_int = entry->dst_apic_int;

	return 1;
}

/*
 * Return 1 if the source bus of interrupt entry 'intr' is listed in
 * bus_data[] with type 'bus_type', else 0.
 */
static int
apic_int_is_bus_type(int intr, int bus_type)
{
	int     bus;

	for (bus = 0; bus < mp_nbusses; ++bus)
		if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id)
		    && ((int) bus_data[bus].bus_type == bus_type))
			return 1;

	return 0;
}

/*
 * Given a traditional ISA INT mask, return an APIC mask.
 *
 * Only the lowest set bit of isa_mask is considered.  Returns 0 when the
 * mask is empty or isa_apic_irq() reports no mapping for that IRQ.
 */
u_int
isa_apic_mask(u_int isa_mask)
{
	int     isa_irq;
	int     apic_pin;

#if defined(SKIP_IRQ15_REDIRECT)
	if (isa_mask == (1 << 15)) {
		printf("skipping ISA IRQ15 redirect\n");
		return isa_mask;
	}
#endif  /* SKIP_IRQ15_REDIRECT */

	isa_irq = ffs(isa_mask);		/* find its bit position */
	if (isa_irq == 0)			/* doesn't exist */
		return 0;
	--isa_irq;				/* make it zero based */

	apic_pin = isa_apic_irq(isa_irq);	/* look for APIC connection */
	if (apic_pin == -1)
		return 0;

	return (1 << apic_pin);			/* convert pin# to a mask */
}

/*
 * Determine which APIC pin an ISA/EISA INT is attached to.
*/ #define INTTYPE(I) (io_apic_ints[(I)].int_type) #define INTPIN(I) (io_apic_ints[(I)].dst_apic_int) #define INTIRQ(I) (io_apic_ints[(I)].int_vector) #define INTAPIC(I) (ID_TO_IO(io_apic_ints[(I)].dst_apic_id)) #define SRCBUSIRQ(I) (io_apic_ints[(I)].src_bus_irq) int isa_apic_irq(int isa_irq) { int intr; for (intr = 0; intr < nintrs; ++intr) { /* check each record */ if (INTTYPE(intr) == 0) { /* standard INT */ if (SRCBUSIRQ(intr) == isa_irq) { if (apic_int_is_bus_type(intr, ISA) || apic_int_is_bus_type(intr, EISA)) { if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* found */ } } } } return -1; /* NOT found */ } /* * Determine which APIC pin a PCI INT is attached to. */ #define SRCBUSID(I) (io_apic_ints[(I)].src_bus_id) #define SRCBUSDEVICE(I) ((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f) #define SRCBUSLINE(I) (io_apic_ints[(I)].src_bus_irq & 0x03) int pci_apic_irq(int pciBus, int pciDevice, int pciInt) { int intr; --pciInt; /* zero based */ for (intr = 0; intr < nintrs; ++intr) /* check each record */ if ((INTTYPE(intr) == 0) /* standard INT */ && (SRCBUSID(intr) == pciBus) && (SRCBUSDEVICE(intr) == pciDevice) && (SRCBUSLINE(intr) == pciInt)) /* a candidate IRQ */ if (apic_int_is_bus_type(intr, PCI)) { if (INTIRQ(intr) == 0xff) allocate_apic_irq(intr); if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* exact match */ } return -1; /* NOT found */ } int next_apic_irq(int irq) { int intr, ointr; int bus, bustype; bus = 0; bustype = 0; for (intr = 0; intr < nintrs; intr++) { if (INTIRQ(intr) != irq || INTTYPE(intr) != 0) continue; bus = SRCBUSID(intr); bustype = apic_bus_type(bus); if (bustype != ISA && bustype != EISA && bustype != PCI) continue; break; } if (intr >= nintrs) { return -1; } for (ointr = intr + 1; ointr < nintrs; ointr++) { if (INTTYPE(ointr) != 0) continue; if (bus != SRCBUSID(ointr)) continue; if (bustype == PCI) { if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr)) continue; if (SRCBUSLINE(intr) != 
SRCBUSLINE(ointr)) continue; } if (bustype == ISA || bustype == EISA) { if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr)) continue; } if (INTPIN(intr) == INTPIN(ointr)) continue; break; } if (ointr >= nintrs) { return -1; } return INTIRQ(ointr); } #undef SRCBUSLINE #undef SRCBUSDEVICE #undef SRCBUSID #undef SRCBUSIRQ #undef INTPIN #undef INTIRQ #undef INTAPIC #undef INTTYPE /* * Reprogram the MB chipset to NOT redirect an ISA INTerrupt. * * XXX FIXME: * Exactly what this means is unclear at this point. It is a solution * for motherboards that redirect the MBIRQ0 pin. Generically a motherboard * could route any of the ISA INTs to upper (>15) IRQ values. But most would * NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an * option. */ int undirect_isa_irq(int rirq) { #if defined(READY) if (bootverbose) printf("Freeing redirected ISA irq %d.\n", rirq); /** FIXME: tickle the MB redirector chip */ return -1; #else if (bootverbose) printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq); return 0; #endif /* READY */ } /* * Reprogram the MB chipset to NOT redirect a PCI INTerrupt */ int undirect_pci_irq(int rirq) { #if defined(READY) if (bootverbose) printf("Freeing redirected PCI irq %d.\n", rirq); /** FIXME: tickle the MB redirector chip */ return -1; #else if (bootverbose) printf("Freeing (NOT implemented) redirected PCI irq %d.\n", rirq); return 0; #endif /* READY */ } /* * given a bus ID, return: * the bus type if found * -1 if NOT found */ int apic_bus_type(int id) { int x; for (x = 0; x < mp_nbusses; ++x) if (bus_data[x].bus_id == id) return bus_data[x].bus_type; return -1; } /* * given a LOGICAL APIC# and pin#, return: * the associated src bus ID if found * -1 if NOT found */ int apic_src_bus_id(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].src_bus_id); return -1; 
/* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated src bus IRQ if found * -1 if NOT found */ int apic_src_bus_irq(int apic, int pin) { int x; for (x = 0; x < nintrs; x++) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].src_bus_irq); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated INTerrupt type if found * -1 if NOT found */ int apic_int_type(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_type); return -1; /* NOT found */ } int apic_irq(int apic, int pin) { int x; int res; for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) { res = io_apic_ints[x].int_vector; if (res == 0xff) return -1; if (apic != int_to_apicintpin[res].ioapic) panic("apic_irq: inconsistent table"); if (pin != int_to_apicintpin[res].int_pin) panic("apic_irq inconsistent table (2)"); return res; } return -1; } /* * given a LOGICAL APIC# and pin#, return: * the associated trigger mode if found * -1 if NOT found */ int apic_trigger(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return ((io_apic_ints[x].int_flags >> 2) & 0x03); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated 'active' level if found * -1 if NOT found */ int apic_polarity(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_flags & 0x03); return -1; /* NOT found */ } /* * set 
data according to MP defaults
 * FIXME: probably not complete yet...
 *
 * 'type' is the MP default configuration number (1..7).  The function
 * fills in the CPU ID maps for one BSP plus one AP, the two bus_data[]
 * slots from default_data[], and (with APIC_IO) the IO APIC ID and the
 * first 16 io_apic_ints[] entries.  Any other type panics.
 */
static void
default_mp_table(int type)
{
	int     ap_cpu_id;
#if defined(APIC_IO)
	int     io_apic_id;
	int     pin;
#endif	/* APIC_IO */

#if 0
	printf(" MP default config type: %d\n", type);
	switch (type) {
	case 1:
		printf(" bus: ISA, APIC: 82489DX\n");
		break;
	case 2:
		printf(" bus: EISA, APIC: 82489DX\n");
		break;
	case 3:
		printf(" bus: EISA, APIC: 82489DX\n");
		break;
	case 4:
		printf(" bus: MCA, APIC: 82489DX\n");
		break;
	case 5:
		printf(" bus: ISA+PCI, APIC: Integrated\n");
		break;
	case 6:
		printf(" bus: EISA+PCI, APIC: Integrated\n");
		break;
	case 7:
		printf(" bus: MCA+PCI, APIC: Integrated\n");
		break;
	default:
		printf(" future type\n");
		break;
		/* NOTREACHED */
	}
#endif	/* 0 */

	/* the local APIC ID sits in bits 24 and up of lapic.id */
	boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24;
	/* the AP gets whichever of IDs 0/1 the BSP is not using */
	ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0;

	/* BSP */
	CPU_TO_ID(0) = boot_cpu_id;
	ID_TO_CPU(boot_cpu_id) = 0;

	/* one and only AP */
	CPU_TO_ID(1) = ap_cpu_id;
	ID_TO_CPU(ap_cpu_id) = 1;

#if defined(APIC_IO)
	/* one and only IO APIC */
	io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24;

	/*
	 * sanity check, refer to MP spec section 3.6.6, last paragraph
	 * necessary as some hardware isn't properly setting up the IO APIC
	 */
#if defined(REALLY_ANAL_IOAPICID_VALUE)
	if (io_apic_id != 2) {
#else
	if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) {
#endif	/* REALLY_ANAL_IOAPICID_VALUE */
		/* force the IO APIC onto ID 2 */
		io_apic_set_id(0, 2);
		io_apic_id = 2;
	}
	IO_TO_ID(0) = io_apic_id;
	ID_TO_IO(io_apic_id) = 0;
#endif	/* APIC_IO */

	/* fill out bus entries */
	switch (type) {
	case 1:
	case 2:
	case 3:
	case 4:
	case 5:
	case 6:
	case 7:
		/* row type-1 holds: [1],[2] = bus 0 id/type, [3],[4] = bus 1 */
		bus_data[0].bus_id = default_data[type - 1][1];
		bus_data[0].bus_type = default_data[type - 1][2];
		bus_data[1].bus_id = default_data[type - 1][3];
		bus_data[1].bus_type = default_data[type - 1][4];
		break;

	/*
	 * case 4: case 7:		   MCA NOT supported
	 * NOTE(review): types 4 and 7 are accepted by the case list above,
	 * so this remark looks stale -- confirm.
	 */
	default:		/* illegal/reserved */
		panic("BAD default MP config: %d", type);
		/* NOTREACHED */
	}

#if defined(APIC_IO)
	/* general cases from MP v1.4, table 5-2 */
	for (pin = 0; pin < 16; ++pin) {
		io_apic_ints[pin].int_type = 0;
		io_apic_ints[pin].int_flags = 0x05;	/* edge/active-hi */
		io_apic_ints[pin].src_bus_id = 0;
		io_apic_ints[pin].src_bus_irq = pin;	/* IRQ2 caught below */
		io_apic_ints[pin].dst_apic_id = io_apic_id;
		io_apic_ints[pin].dst_apic_int = pin;	/* 1-to-1 */
	}

	/* special cases from MP v1.4, table 5-2 */
	if (type == 2) {
		io_apic_ints[2].int_type = 0xff;	/* N/C */
		io_apic_ints[13].int_type = 0xff;	/* N/C */
#if !defined(APIC_MIXED_MODE)
		/** FIXME: ??? */
		panic("sorry, can't support type 2 default yet");
#endif	/* APIC_MIXED_MODE */
	}
	else
		io_apic_ints[2].src_bus_irq = 0;	/* ISA IRQ0 is on APIC INT 2 */

	if (type == 7)
		io_apic_ints[0].int_type = 0xff;	/* N/C */
	else
		io_apic_ints[0].int_type = 3;	/* vectored 8259 */
#endif	/* APIC_IO */
}


/*
 * start each AP in our list
 *
 * 'boot_addr' is the base-memory address that receives the AP trampoline.
 * For every AP (logical 1..mp_naps) a private data page and an idle stack
 * are allocated and wired into SMPpt[], the BIOS warm-boot vector is
 * pointed at the trampoline, and start_ap() is called.  Afterwards the
 * warm-boot state is restored and the temporary low mapping is removed.
 * Returns mp_ncpus - 1.
 */
static int
start_all_aps(u_int boot_addr)
{
	int     x, i, pg;
	u_char  mpbiosreason;
	u_long  mpbioswarmvec;
	struct globaldata *gd;
	char *stack;
	uintptr_t kptbase;

	POSTCODE(START_ALL_APS_POST);

	mtx_init(&ap_boot_mtx, "ap boot", MTX_SPIN);

	/* initialize BSP's local APIC */
	apic_initialize();
	bsp_apic_ready = 1;

	/* install the AP 1st level boot code */
	install_ap_tramp(boot_addr);


	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
#ifndef PC98
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);
#endif

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
	kptbase = (uintptr_t)(void *)KPTphys;
	for (x = 0; x < NKPT; x++)
		PTD[x] = (pd_entry_t)(PG_V | PG_RW |
		    ((kptbase + x * PAGE_SIZE) & PG_FRAME));
	invltlb();

	/* start each AP */
	for (x = 1; x <= mp_naps; ++x) {

		/* This is a bit verbose, it will go away soon.  */

		/* first page of AP's private space */
		pg = x * i386_btop(sizeof(struct privatespace));

		/* allocate a new private data page */
		gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE);

		/* wire it into the private page table page */
		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd));

		/* allocate and set up an idle stack data page */
		stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE);
		for (i = 0; i < UPAGES; i++)
			SMPpt[pg + 1 + i] = (pt_entry_t)
			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));

		/* prime data page for it to use */
		gd->gd_cpuid = x;
		globaldata_register(gd);

		/* setup a vector to our boot code */
		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
		*((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
#ifndef PC98
		outb(CMOS_REG, BIOS_RESET);
		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
#endif

		/* hand the AP its stack top and logical number */
		bootSTK = &SMP_prvspace[x].idlestack[UPAGES*PAGE_SIZE];
		bootAP = x;

		/* attempt to start the Application Processor */
		CHECK_INIT(99);	/* setup checkpoints */
		if (!start_ap(x, boot_addr)) {
			printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x));
			CHECK_PRINT("trace");	/* show checkpoints */
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}
		CHECK_PRINT("trace");		/* show checkpoints */

		/*
		 * record its version info
		 * NOTE(review): cpu_apic_versions[0] is only assigned from
		 * lapic.version after this loop, so the value copied here is
		 * whatever slot 0 held beforehand -- confirm this is intended.
		 */
		cpu_apic_versions[x] = cpu_apic_versions[0];

		all_cpus |= (1 << x);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid)));

	/* fill in our (BSP) APIC version */
	cpu_apic_versions[0] = lapic.version;

	/* restore the warmstart vector */
	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
#ifndef PC98
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);
#endif

	/*
	 * Set up the idle context for the BSP.  Similar to above except
	 * that some was done by locore, some by pmap.c and some is implicit
	 * because the BSP is cpu#0 and the page is initially zero, and also
	 * because we can refer to variables by name on the BSP..
	 */

	/* Allocate and setup BSP idle stack */
	stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE);
	for (i = 0; i < UPAGES; i++)
		SMPpt[1 + i] = (pt_entry_t)
		    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));

	/* tear down the temporary P==V mapping installed above */
	for (x = 0; x < NKPT; x++)
		PTD[x] = 0;
	pmap_set_opt();

	/* number of APs actually started */
	return mp_ncpus - 1;
}


/*
 * load the 1st level AP boot code into base memory.
 */

/* targets for relocation */
extern void bigJump(void);
extern void bootCodeSeg(void);
extern void bootDataSeg(void);
extern void MPentry(void);
extern u_int MP_GDT;
extern u_int mp_gdtbase;

/*
 * Copy bootMP_size bytes of trampoline code from bootMP to physical
 * address 'boot_addr' (written through its KERNBASE alias) and patch the
 * copy so that its lgdt operand, ljmp target and code/data segment bases
 * refer to the new location.
 */
static void
install_ap_tramp(u_int boot_addr)
{
	int     x;
	int     size = *(int *) ((u_long) & bootMP_size);
	u_char *src = (u_char *) ((u_long) bootMP);
	u_char *dst = (u_char *) boot_addr + KERNBASE;
	u_int   boot_base = (u_int) bootMP;
	u_int8_t *dst8;
	u_int16_t *dst16;
	u_int32_t *dst32;

	POSTCODE(INSTALL_AP_TRAMP_POST);

	/* byte-wise copy of the trampoline */
	for (x = 0; x < size; ++x)
		*dst++ = *src++;

	/*
	 * modify addresses in code we just moved to basemem. unfortunately we
	 * need fairly detailed info about mpboot.s for this to work.  changes
	 * to mpboot.s might require changes here.
	 */

	/* boot code is located in KERNEL space */
	dst = (u_char *) boot_addr + KERNBASE;

	/* modify the lgdt arg */
	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
	*dst32 = boot_addr + ((u_int) & MP_GDT - boot_base);

	/* modify the ljmp target for MPentry() */
	/* the +1 skips one byte at bigJump before the 32-bit operand */
	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
	*dst32 = ((u_int) MPentry - KERNBASE);

	/* modify the target for boot code segment */
	/* low 16 bits go in one word, bits 16..23 in the byte after it */
	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_addr & 0xffff;
	*dst8 = ((u_int) boot_addr >> 16) & 0xff;

	/* modify the target for boot data segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_addr & 0xffff;
	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
}


/*
 * this function starts the AP (application processor) whose logical
 * number is 'logical_cpu'; the physical APIC ID is looked up through
 * CPU_TO_ID().  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It ain't pretty,
 * but it seems to work.
 *
 * Returns 1 if mp_ncpus grew before the 5 second APIC timer ran out,
 * 0 otherwise.
 */
static int
start_ap(int logical_cpu, u_int boot_addr)
{
	int     physical_cpu;
	int     vector;
	int     cpus;
	u_long  icr_lo, icr_hi;

	POSTCODE(START_AP_POST);

	/* get the PHYSICAL APIC ID# */
	physical_cpu = CPU_TO_ID(logical_cpu);

	/* calculate the vector */
	vector = (boot_addr >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_ncpus;

	/*
	 * first we do an INIT/RESET IPI this INIT IPI might be run, resetting
	 * and running the target CPU. OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
	 * ignored.
	 */

	/* setup the address for the target AP */
	icr_hi = lapic.icr_hi & ~APIC_ID_MASK;
	icr_hi |= (physical_cpu << 24);
	lapic.icr_hi = icr_hi;

	/* do an INIT IPI: assert RESET */
	icr_lo = lapic.icr_lo & 0xfff00000;
	lapic.icr_lo = icr_lo | 0x0000c500;

	/* wait for pending status end */
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;

	/* do an INIT IPI: deassert RESET */
	lapic.icr_lo = icr_lo | 0x00008500;

	/* wait for pending status end */
	u_sleep(10000);		/* wait ~10mS */
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run. and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
	 * will run.
	 */

	/* do a STARTUP IPI */
	lapic.icr_lo = icr_lo | 0x00000600 | vector;
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;
	u_sleep(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */

	lapic.icr_lo = icr_lo | 0x00000600 | vector;
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;
	u_sleep(200);		/* wait ~200uS */

	/* wait for it to start */
	set_apic_timer(5000000);/* == 5 seconds */
	while (read_apic_timer())
		if (mp_ncpus > cpus)
			return 1;	/* return SUCCESS */

	return 0;		/* return FAILURE */
}

/*
 * Flush the TLB on all other CPU's
 *
 * XXX: Needs to handshake and wait for completion before proceeding.
*/
void
smp_invltlb(void)
{
#if defined(APIC_IO)
	/* only once the APs are running and the flush IPI has been enabled */
	if (smp_started && invltlb_ok)
		ipi_all_but_self(IPI_INVLTLB);
#endif  /* APIC_IO */
}

/*
 * Invalidate the TLB entry for 'addr' on this CPU with the invlpg
 * instruction, then ask the other CPUs to flush via smp_invltlb().
 */
void
invlpg(u_int addr)
{
	__asm   __volatile("invlpg (%0)"::"r"(addr):"memory");

	/* send a message to the other CPUs */
	smp_invltlb();
}

/*
 * Flush this CPU's TLB by reloading %cr3 with its own value, then ask the
 * other CPUs to flush via smp_invltlb().
 */
void
invltlb(void)
{
	u_long  temp;

	/*
	 * This should be implemented as load_cr3(rcr3()) when load_cr3() is
	 * inlined.
	 */
	__asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");

	/* send a message to the other CPUs */
	smp_invltlb();
}


/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the AP's out of the pen.
 *
 * Runs on each AP: waits for aps_ready, then, serialised by ap_boot_mtx,
 * counts itself in smp_cpus, sets up CPU/FPU/SSE/APIC state and checks
 * that its per-CPU cpuid agrees with the local APIC ID.  The AP that
 * brings smp_cpus up to mp_ncpus sets smp_started; every AP waits for that
 * flag before entering the scheduler.  Does not return.
 */
extern void	enable_sse(void);

void
ap_init(void)
{
	u_int	apic_id;

	/* spin until all the AP's are ready */
	while (!aps_ready)
		/* spin */ ;

	/*
	 * Set curproc to our per-cpu idleproc so that mutexes have
	 * something unique to lock with.
	 */
	PCPU_SET(curproc, PCPU_GET(idleproc));
	PCPU_SET(spinlocks, NULL);

	/* lock against other AP's that are waking up */
	mtx_lock_spin(&ap_boot_mtx);

	/* BSP may have changed PTD while we're waiting for the lock */
	cpu_invltlb();

	smp_cpus++;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
	lidt(&r_idt);
#endif

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid)));

	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up FPU state on the AP */
	npxinit(__INITIAL_NPXCW__);

	/* set up SSE registers */
	enable_sse();

	/* A quick check from sanity claus */
	/* translate the physical ID in lapic.id bits 27:24 to a logical one */
	apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
	if (PCPU_GET(cpuid) != apic_id) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: apic_id = %d\n", apic_id);
		printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]);
		panic("cpuid mismatch! boom!!");
	}

	/* Init local apic for irq's */
	apic_initialize();

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	/*
	 * Activate smp_invltlb, although strictly speaking, this isn't
	 * quite correct yet.  We should have a bitfield for cpus willing
	 * to accept TLB flush IPI's or something and sync them.
	 */
	if (smp_cpus == mp_ncpus) {
		invltlb_ok = 1;
		smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */
		smp_active = 1;	 /* historic */
	}

	/* let other AP's wake up now */
	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the AP's are up */
	while (smp_started == 0)
		; /* nothing */

	microuptime(PCPU_PTR(switchtime));
	PCPU_SET(switchticks, ticks);

	/* ok, now grab sched_lock and enter the scheduler */
	enable_intr();
	mtx_lock_spin(&sched_lock);
	cpu_throw();	/* doesn't return */

	panic("scheduler returned us to ap_init");
}

/*
 * For statclock, we send an IPI to all CPU's to have them call this
 * function.
 *
 * Runs statclock_process() for curproc under sched_lock, passing the PC
 * and user-mode flag taken from the trap frame.
 */
void
forwarded_statclock(struct trapframe frame)
{

	mtx_lock_spin(&sched_lock);
	statclock_process(curproc, TRAPF_PC(&frame), TRAPF_USERMODE(&frame));
	mtx_unlock_spin(&sched_lock);
}

/*
 * Send IPI_STATCLOCK to every other CPU that is not stopped.  Does nothing
 * until SMP is started and invltlb_ok is set, or while cold or panicking.
 */
void
forward_statclock(void)
{
	int map;

	CTR0(KTR_SMP, "forward_statclock");

	if (!smp_started || !invltlb_ok || cold || panicstr)
		return;

	/* bit mask of target CPUs */
	map = PCPU_GET(other_cpus) & ~stopped_cpus ;
	if (map != 0)
		ipi_selected(map, IPI_STATCLOCK);
}

/*
 * For each hardclock(), we send an IPI to all other CPU's to have them
 * execute this function.  It would be nice to reduce contention on
 * sched_lock if we could simply peek at the CPU to determine the user/kernel
 * state and call hardclock_process() on the CPU receiving the clock interrupt
 * and then just use a simple IPI to handle any ast's if needed.
*/ void forwarded_hardclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); hardclock_process(curproc, TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_hardclock(void) { u_int map; CTR0(KTR_SMP, "forward_hardclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_HARDCLOCK); } #ifdef APIC_INTR_REORDER /* * Maintain mapping from softintr vector to isr bit in local apic. */ void set_lapic_isrloc(int intr, int vector) { if (intr < 0 || intr > 32) panic("set_apic_isrloc: bad intr argument: %d",intr); if (vector < ICU_OFFSET || vector > 255) panic("set_apic_isrloc: bad vector argument: %d",vector); apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2); apic_isrbit_location[intr].bit = (1<<(vector & 31)); } #endif /* * send an IPI to a set of cpus. */ void ipi_selected(u_int32_t cpus, u_int ipi) { CTR2(KTR_SMP, __func__ ": cpus: %x ipi: %x", cpus, ipi); selected_apic_ipi(cpus, ipi, APIC_DELMODE_FIXED); } /* * send an IPI INTerrupt containing 'vector' to all CPUs, including myself */ void ipi_all(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLISELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLESELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to myself */ void ipi_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_SELF, ipi, APIC_DELMODE_FIXED); } void release_aps(void *dummy __unused) { atomic_store_rel_int(&aps_ready, 1); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: head/sys/amd64/amd64/mptable.c =================================================================== --- head/sys/amd64/amd64/mptable.c (revision 82308) +++ head/sys/amd64/amd64/mptable.c (revision 82309) @@ -1,2440 +1,2442 @@ /* * Copyright (c) 1996, by Steve 
Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "opt_cpu.h" +#include "opt_upages.h" #ifdef SMP #include #else #error #endif #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** TEST_DEFAULT_CONFIG, TEST_TEST1 */ #include #include #include +#include #if defined(APIC_IO) #include /* setidt() */ #include /* IPIs */ #include /* IPIs */ #endif /* APIC_IO */ #if defined(TEST_DEFAULT_CONFIG) #define MPFPS_MPFB1 TEST_DEFAULT_CONFIG #else #define MPFPS_MPFB1 mpfps->mpfb1 #endif /* TEST_DEFAULT_CONFIG */ #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #ifdef PC98 #define BIOS_BASE (0xe8000) #define BIOS_SIZE (0x18000) #else #define BIOS_BASE (0xf0000) #define BIOS_SIZE (0x10000) #endif #define BIOS_COUNT (BIOS_SIZE/4) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) #define PROCENTRY_FLAG_EN 0x01 #define PROCENTRY_FLAG_BP 0x02 #define IOAPICENTRY_FLAG_EN 0x01 /* MP Floating Pointer Structure */ typedef struct MPFPS { char signature[4]; void *pap; u_char length; u_char spec_rev; u_char checksum; u_char mpfb1; u_char mpfb2; u_char mpfb3; u_char mpfb4; u_char mpfb5; } *mpfps_t; /* MP Configuration Table Header */ typedef struct MPCTH { char signature[4]; u_short base_table_length; u_char spec_rev; u_char checksum; u_char oem_id[8]; u_char product_id[12]; void *oem_table_pointer; u_short oem_table_size; u_short entry_count; void *apic_address; u_short extended_table_length; u_char extended_table_checksum; u_char reserved; } *mpcth_t; typedef struct PROCENTRY { u_char type; u_char apic_id; u_char apic_version; u_char cpu_flags; u_long cpu_signature; u_long feature_flags; u_long reserved1; u_long reserved2; } *proc_entry_ptr; typedef struct 
BUSENTRY {
	u_char	type;
	u_char	bus_id;		/* stored in bus_data[].bus_id by bus_entry() */
	char	bus_type[6];	/* bus name; bus_entry() reads it up to the first ' ' */
}      *bus_entry_ptr;

/* I/O APIC entry of the MP base table (entry type 2 in the table walkers). */
typedef struct IOAPICENTRY {
	u_char	type;
	u_char	apic_id;
	u_char	apic_version;
	u_char	apic_flags;	/* tested against IOAPICENTRY_FLAG_EN */
	void   *apic_address;
}      *io_apic_entry_ptr;

/*
 * Interrupt entry of the MP base table (entry type 3 in the table walkers);
 * int_entry() copies these fields into io_apic_ints[].
 */
typedef struct INTENTRY {
	u_char	type;
	u_char	int_type;
	u_short	int_flags;
	u_char	src_bus_id;
	u_char	src_bus_irq;
	u_char	dst_apic_id;
	u_char	dst_apic_int;
}      *int_entry_ptr;

/* descriptions of MP basetable entries */
typedef struct BASETABLE_ENTRY {
	u_char	type;
	u_char	length;		/* entry size in bytes; used to step to the next entry */
	char	name[16];
}       basetable_entry;

/*
 * this code MUST be enabled here and in mpboot.s.
 * it follows the very early stages of AP boot by placing values in CMOS ram.
 * it NORMALLY will never be needed and thus the primitive method for enabling.
 *
#define CHECK_POINTS
 */

#if defined(CHECK_POINTS) && !defined(PC98)

/* write the register index A to CMOS_REG, then read or write CMOS_DATA */
#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))

/* store D in each of the registers 0x34 through 0x39 */
#define CHECK_INIT(D);				\
	CHECK_WRITE(0x34, (D));			\
	CHECK_WRITE(0x35, (D));			\
	CHECK_WRITE(0x36, (D));			\
	CHECK_WRITE(0x37, (D));			\
	CHECK_WRITE(0x38, (D));			\
	CHECK_WRITE(0x39, (D));

/* print the current contents of registers 0x34 through 0x39, prefixed by S */
#define CHECK_PRINT(S);				\
	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
	   (S),					\
	   CHECK_READ(0x34),			\
	   CHECK_READ(0x35),			\
	   CHECK_READ(0x36),			\
	   CHECK_READ(0x37),			\
	   CHECK_READ(0x38),			\
	   CHECK_READ(0x39));

#else				/* CHECK_POINTS */

/* check points disabled: both macros expand to nothing */
#define CHECK_INIT(D)
#define CHECK_PRINT(S)

#endif				/* CHECK_POINTS */

/*
 * Values to send to the POST hardware.
 * Each value is passed to POSTCODE() at the start of the routine it names.
 */
#define MP_BOOTADDRESS_POST	0x10
#define MP_PROBE_POST		0x11
#define MPTABLE_PASS1_POST	0x12

#define MP_START_POST		0x13
#define MP_ENABLE_POST		0x14
#define MPTABLE_PASS2_POST	0x15

#define START_ALL_APS_POST	0x16
#define INSTALL_AP_TRAMP_POST	0x17
#define START_AP_POST		0x18

#define MP_ANNOUNCE_POST	0x19

/* used to hold the AP's until we are ready to release them */
static struct mtx	ap_boot_mtx;

/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
int	current_postcode;

/** XXX FIXME: what system files declare these???
*/
extern struct region_descriptor r_gdt, r_idt;

int	bsp_apic_ready = 0;	/* flags usability of BSP apic */
int	mp_naps;		/* # of Application Processors */
int	mp_nbusses;		/* # of busses */
int	mp_napics;		/* # of IO APICs */
int	boot_cpu_id;		/* designated BSP */

/* local APIC address; set by mptable_pass1() from the MP table or the default */
vm_offset_t cpu_apic_address;
vm_offset_t io_apic_address[NAPICID];	/* NAPICID is more than enough */
extern int nkpt;

/* version values printed by cpu_mp_announce() */
u_int32_t cpu_apic_versions[MAXCPU];
/* allocated in mptable_pass2() (mp_napics entries), filled in by mp_enable() */
u_int32_t *io_apic_versions;

#ifdef APIC_INTR_REORDER
struct {
	volatile int *location;
	int bit;
} apic_isrbit_location[32];
#endif

/* per-IRQ record of the IO APIC and pin serving it; ioapic == -1 means unused */
struct apic_intmapinfo	int_to_apicintpin[APIC_INTMAPSIZE];

/*
 * APIC ID logical/physical mapping structures.
 * We oversize these to simplify boot-time config.
 * NOTE(review): presumably these back the CPU_TO_ID()/IO_TO_ID()/ID_TO_IO()
 * macros used throughout this file; the macro definitions are not in this
 * file, so confirm against the header.
 */
int     cpu_num_to_apic_id[NAPICID];
int     io_num_to_apic_id[NAPICID];
int     apic_id_to_logical[NAPICID];


/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
/* read by init_secondary() to select the per-CPU private space and GDT slots */
static int bootAP;

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* SMP page table page */
extern pt_entry_t *SMPpt;

struct pcb stoppcbs[MAXCPU];

int	invltlb_ok = 0;		/* throttle smp_invltlb() till safe */
SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, "");

/*
 * Local data and functions.
 */

/* Set to 1 once we're ready to let the APs out of the pen.
*/
static volatile int aps_ready = 0;

static int	mp_capable;	/* set by i386_mp_probe(): 1 if an MP table was found */
static u_int	boot_address;	/* AP trampoline address computed by mp_bootaddress() */
static u_int	base_memory;	/* base memory size in bytes, set by mp_bootaddress() */

static int	picmode;		/* 0: virtual wire mode, 1: PIC mode */
static mpfps_t	mpfps;		/* MP floating pointer found by i386_mp_probe(), or 0 */
static int	search_for_sig(u_int32_t target, int count);
static void	mp_enable(u_int boot_addr);

static void	mptable_pass1(void);
static int	mptable_pass2(void);
static void	default_mp_table(int type);
static void	fix_mp_table(void);
static void	setup_apic_irq_mapping(void);
static void	init_locks(void);
static int	start_all_aps(u_int boot_addr);
static void	install_ap_tramp(u_int boot_addr);
static int	start_ap(int logicalCpu, u_int boot_addr);
void		ap_init(void);
static int	apic_int_is_bus_type(int intr, int bus_type);
static void	release_aps(void *dummy);

/*
 * initialize all the SMP locks
 */

/* critical region around IO APIC, apic_imen */
struct mtx		imen_mtx;

/* lock region used by kernel profiling */
int			mcount_lock;

#ifdef USE_COMLOCK
/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct mtx		com_mtx;
#endif /* USE_COMLOCK */

/*
 * Called from mp_enable() just before the APs are started.
 * Only the com spin mutex is initialized here, and only when USE_COMLOCK
 * is defined; otherwise the function body is empty.
 */
static void
init_locks(void)
{

#ifdef USE_COMLOCK
	mtx_init(&com_mtx, "com", MTX_SPIN);
#endif /* USE_COMLOCK */
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 *
 * basemem is the base memory size in kilobytes.  Records the size in bytes
 * in base_memory and returns (and records in boot_address) the highest
 * 4k-aligned address that leaves at least bootMP_size bytes below the top
 * of base memory.
 */
u_int
mp_bootaddress(u_int basemem)
{
	POSTCODE(MP_BOOTADDRESS_POST);

	base_memory = basemem * 1024;	/* convert to bytes */

	boot_address = base_memory & ~0xfff;	/* round down to 4k boundary */
	if ((base_memory - boot_address) < bootMP_size)
		boot_address -= 4096;	/* not enough, lower by 4k */

	return boot_address;
}


/*
 * Look for an Intel MP spec table (ie, SMP capable hardware).
*/ void i386_mp_probe(void) { int x; u_long segment; u_int32_t target; POSTCODE(MP_PROBE_POST); /* see if EBDA exists */ if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) { /* search first 1K of EBDA */ target = (u_int32_t) (segment << 4); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } else { /* last 1K of base memory, effective 'top of base' passed in */ target = (u_int32_t) (base_memory - 0x400); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } /* search the BIOS */ target = (u_int32_t) BIOS_BASE; if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) goto found; /* nothing found */ mpfps = (mpfps_t)0; mp_capable = 0; return; found: /* calculate needed resources */ mpfps = (mpfps_t)x; mptable_pass1(); /* flag fact that we are running multiple processors */ mp_capable = 1; } int cpu_mp_probe(void) { /* * Record BSP in CPU map * This is done here so that MBUF init code works correctly. */ all_cpus = 1; return (mp_capable); } /* * Initialize the SMP hardware and the APIC and start up the AP's. */ void cpu_mp_start(void) { POSTCODE(MP_START_POST); /* look for MP capable motherboard */ if (mp_capable) mp_enable(boot_address); else panic("MP hardware not found!"); cpu_setregs(); } /* * Print various information about the SMP system hardware and setup. 
*/
/*
 * One line is printed for the BSP (cpu0), one for each of the mp_naps APs,
 * and, when APIC_IO is defined, one for each IO APIC.  Every CPU line shows
 * the same cpu_apic_address.
 */
void
cpu_mp_announce(void)
{
	int     x;

	POSTCODE(MP_ANNOUNCE_POST);

	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
	printf(", version: 0x%08x", cpu_apic_versions[0]);
	printf(", at 0x%08x\n", cpu_apic_address);
	/* APs are logical CPUs 1..mp_naps (mp_naps excludes the BSP) */
	for (x = 1; x <= mp_naps; ++x) {
		printf(" cpu%d (AP): apic id: %2d", x, CPU_TO_ID(x));
		printf(", version: 0x%08x", cpu_apic_versions[x]);
		printf(", at 0x%08x\n", cpu_apic_address);
	}

#if defined(APIC_IO)
	for (x = 0; x < mp_napics; ++x) {
		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
		printf(", version: 0x%08x", io_apic_versions[x]);
		printf(", at 0x%08x\n", io_apic_address[x]);
	}
#else
	printf(" Warning: APIC I/O disabled\n");
#endif	/* APIC_IO */
}

/*
 * AP cpu's call this to sync up protected mode.
 *
 * The CPU being brought up is identified by bootAP.  Its private-space and
 * TSS descriptors are pointed at SMP_prvspace[bootAP], its own NGDT-entry
 * slice of gdt[] is built and loaded, then the IDT, LDT and task register
 * are loaded.
 */
void
init_secondary(void)
{
	int	gsel_tss;
	int	x, myid = bootAP;

	/* point the per-CPU segments at this CPU's private space */
	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &SMP_prvspace[myid].globaldata.gd_common_tss;
	SMP_prvspace[myid].globaldata.gd_prvspace =
		&SMP_prvspace[myid].globaldata;

	/* build this CPU's descriptors in its own slice of gdt[] */
	for (x = 0; x < NGDT; x++) {
		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int) &gdt[myid * NGDT];
	lgdt(&r_gdt);			/* does magic intra-segment return */

	lidt(&r_idt);

	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* set up the per-CPU TSS and load the task register */
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	ltr(gsel_tss);

	pmap_set_opt();
}


#if defined(APIC_IO)
/*
 * Final configuration of the BSP's local APIC:
 *  - disable 'pic mode'.
 *  - disable 'virtual wire mode'.
 *  - enable NMI.
*/
void
bsp_apic_configure(void)
{
	u_char		byte;
	u_int32_t	temp;

	/* leave 'pic mode' if necessary */
	if (picmode) {
		outb(0x22, 0x70);	/* select IMCR */
		byte = inb(0x23);	/* current contents */
		byte |= 0x01;		/* mask external INTR */
		outb(0x23, byte);	/* disconnect 8259s/NMI */
	}

	/* mask lint0 (the 8259 'virtual wire' connection) */
	temp = lapic.lvt_lint0;
	temp |= APIC_LVT_M;		/* set the mask */
	lapic.lvt_lint0 = temp;

	/* setup lint1 to handle NMI */
	temp = lapic.lvt_lint1;
	temp &= ~APIC_LVT_M;		/* clear the mask */
	lapic.lvt_lint1 = temp;

	if (bootverbose)
		apic_dump("bsp_apic_configure()");
}
#endif  /* APIC_IO */


/*******************************************************************
 * local functions and data
 */

/*
 * start the SMP system
 *
 * boot_addr is handed unchanged to start_all_aps().  Sequence: map the low
 * 4MB V == P, run mptable_pass2(), unmap again, apply the default
 * configuration if pass 2 reported one, clean up the tables, and (with
 * APIC_IO) program the IO APICs and install the IPI vectors before the
 * APs are started.
 */
static void
mp_enable(u_int boot_addr)
{
	int     x;
#if defined(APIC_IO)
	int     apic;
	u_int   ux;
#endif	/* APIC_IO */

	POSTCODE(MP_ENABLE_POST);

	/* turn on 4MB of V == P addressing so we can get to MP table */
	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
	invltlb();

	/* examine the MP table for needed info, uses physical addresses */
	x = mptable_pass2();

	/* remove the temporary mapping again */
	*(int *)PTD = 0;
	invltlb();

	/* can't process default configs till the CPU APIC is pmapped */
	if (x)
		default_mp_table(x);

	/* post scan cleanup */
	fix_mp_table();
	setup_apic_irq_mapping();

#if defined(APIC_IO)

	/* fill the LOGICAL io_apic_versions table */
	for (apic = 0; apic < mp_napics; ++apic) {
		ux = io_apic_read(apic, IOAPIC_VER);
		io_apic_versions[apic] = ux;
		io_apic_set_id(apic, IO_TO_ID(apic));
	}

	/* program each IO APIC in the system */
	for (apic = 0; apic < mp_napics; ++apic)
		if (io_apic_setup(apic) < 0)
			panic("IO APIC setup failure");

	/* install a 'Spurious INTerrupt' vector */
	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for TLB invalidation */
	setidt(XINVLTLB_OFFSET, Xinvltlb,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding hardclock() */
	setidt(XHARDCLOCK_OFFSET, Xhardclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding statclock() */
	setidt(XSTATCLOCK_OFFSET, Xstatclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for all-CPU rendezvous */
	setidt(XRENDEZVOUS_OFFSET, Xrendezvous,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forcing an additional software trap */
	setidt(XCPUAST_OFFSET, Xcpuast,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for CPU stop/restart */
	setidt(XCPUSTOP_OFFSET, Xcpustop,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

#if defined(TEST_TEST1)
	/* install a "fake hardware INTerrupt" vector */
	setidt(XTEST1_OFFSET, Xtest1,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif  /** TEST_TEST1 */

#endif	/* APIC_IO */

	/* initialize all SMP locks */
	init_locks();

	/* start each Application Processor */
	start_all_aps(boot_addr);
}


/*
 * look for the MP spec signature
 */

/* string defined by the Intel MP Spec as identifying the MP table */
#define MP_SIG		0x5f504d5f	/* _MP_ */
/* advance a 32-bit word index by 4 words, i.e. 16 bytes */
#define NEXT(X)		((X) += 4)
/*
 * Examine every fourth 32-bit word of the `count' words starting at address
 * `target' (read through KERNBASE + target).  Returns the byte address
 * (target plus byte offset) of the first word equal to MP_SIG, or -1 if
 * none matches.
 */
static int
search_for_sig(u_int32_t target, int count)
{
	int     x;
	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);

	for (x = 0; x < count; NEXT(x))
		if (addr[x] == MP_SIG)
			/* make array index a byte index */
			return (target + (x * sizeof(u_int32_t)));

	return -1;
}


/*
 * Per-type description of base table entries, indexed by entry type.
 * The length column is what the table walkers use to step to the next entry.
 */
static basetable_entry basetable_entry_types[] =
{
	{0, 20, "Processor"},
	{1, 8, "Bus"},
	{2, 8, "I/O APIC"},
	{3, 8, "I/O INT"},
	{4, 8, "Local INT"}
};

/* one record per bus entry: the table's bus ID and the decoded bus type */
typedef struct BUSDATA {
	u_char  bus_id;
	enum busTypes bus_type;
}       bus_datum;

/* in-kernel copy of an interrupt entry plus the IRQ assigned to it */
typedef struct INTDATA {
	u_char  int_type;
	u_short int_flags;
	u_char  src_bus_id;
	u_char  src_bus_irq;
	u_char  dst_apic_id;
	u_char  dst_apic_int;
	u_char	int_vector;	/* assigned IRQ, 0xff while unassigned */
}       io_int, local_int;

typedef struct BUSTYPENAME {
	u_char  type;
	char    name[7];
}       bus_type_name;

/* name table searched by lookup_bus_type(); "---" rows are placeholders */
static bus_type_name bus_type_table[] =
{
	{CBUS, "CBUS"},
	{CBUSII, "CBUSII"},
	{EISA, "EISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{ISA, "ISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{PCI, "PCI"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{XPRESS, "XPRESS"},
	{UNKNOWN_BUSTYPE, "---"}
};
/* from MP spec v1.4, table 5-1 */
/* row index is the default configuration type minus 1 */
static int default_data[7][5] =
{
/*   nbus, id0, type0, id1, type1 */
	{1, 0, ISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, MCA, 255, 255},
	{2, 0, ISA, 1, PCI},
	{2, 0, EISA, 1, PCI},
	{2, 0, MCA, 1, PCI}
};


/* the bus data */
static bus_datum *bus_data;

/* the IO INT data, one entry per possible APIC INTerrupt */
static io_int  *io_apic_ints;

/* number of valid entries in io_apic_ints[] */
static int nintrs;

static int processor_entry	__P((proc_entry_ptr entry, int cpu));
static int bus_entry		__P((bus_entry_ptr entry, int bus));
static int io_apic_entry	__P((io_apic_entry_ptr entry, int apic));
static int int_entry		__P((int_entry_ptr entry, int intr));
static int lookup_bus_type	__P((char *name));


/*
 * 1st pass on motherboard's Intel MP specification table.
* * initializes: * mp_ncpus = 1 * * determines: * cpu_apic_address (common to all CPUs) * io_apic_address[N] * mp_naps * mp_nbusses * mp_napics * nintrs */ static void mptable_pass1(void) { int x; mpcth_t cth; int totalSize; void* position; int count; int type; POSTCODE(MPTABLE_PASS1_POST); /* clear various tables */ for (x = 0; x < NAPICID; ++x) { io_apic_address[x] = ~0; /* IO APIC address table */ } /* init everything to empty */ mp_naps = 0; mp_nbusses = 0; mp_napics = 0; nintrs = 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) { /* use default addresses */ cpu_apic_address = DEFAULT_APIC_BASE; io_apic_address[0] = DEFAULT_IO_APIC_BASE; /* fill in with defaults */ mp_naps = 2; /* includes BSP */ mp_nbusses = default_data[MPFPS_MPFB1 - 1][0]; #if defined(APIC_IO) mp_napics = 1; nintrs = 16; #endif /* APIC_IO */ } else { if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); cpu_apic_address = (vm_offset_t) cth->apic_address; /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; while (count--) { switch (type = *(u_char *) position) { case 0: /* processor_entry */ if (((proc_entry_ptr)position)->cpu_flags & PROCENTRY_FLAG_EN) ++mp_naps; break; case 1: /* bus_entry */ ++mp_nbusses; break; case 2: /* io_apic_entry */ if (((io_apic_entry_ptr)position)->apic_flags & IOAPICENTRY_FLAG_EN) io_apic_address[mp_napics++] = (vm_offset_t)((io_apic_entry_ptr) position)->apic_address; break; case 3: /* int_entry */ ++nintrs; break; case 4: /* int_entry */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char*)position += basetable_entry_types[type].length; } } /* qualify the numbers */ if (mp_naps > MAXCPU) { printf("Warning: only using %d of %d available CPUs!\n", MAXCPU, mp_naps); mp_naps = MAXCPU; } /* * Count the BSP. 
* This is also used as a counter while starting the APs. */ mp_ncpus = 1; --mp_naps; /* subtract the BSP */ } /* * 2nd pass on motherboard's Intel MP specification table. * * sets: * boot_cpu_id * ID_TO_IO(N), phy APIC ID to log CPU/IO table * CPU_TO_ID(N), logical CPU to APIC ID table * IO_TO_ID(N), logical IO to APIC ID table * bus_data[N] * io_apic_ints[N] */ static int mptable_pass2(void) { int x; mpcth_t cth; int totalSize; void* position; int count; int type; int apic, bus, cpu, intr; int i, j; int pgeflag; POSTCODE(MPTABLE_PASS2_POST); pgeflag = 0; /* XXX - Not used under SMP yet. */ MALLOC(io_apic_versions, u_int32_t *, sizeof(u_int32_t) * mp_napics, M_DEVBUF, M_WAITOK); MALLOC(ioapic, volatile ioapic_t **, sizeof(ioapic_t *) * mp_napics, M_DEVBUF, M_WAITOK); MALLOC(io_apic_ints, io_int *, sizeof(io_int) * (nintrs + 1), M_DEVBUF, M_WAITOK); MALLOC(bus_data, bus_datum *, sizeof(bus_datum) * mp_nbusses, M_DEVBUF, M_WAITOK); bzero(ioapic, sizeof(ioapic_t *) * mp_napics); for (i = 0; i < mp_napics; i++) { for (j = 0; j < mp_napics; j++) { /* same page frame as a previous IO apic? 
*/ if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == (io_apic_address[i] & PG_FRAME)) { ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace + (NPTEPG-2-j) * PAGE_SIZE + (io_apic_address[i] & PAGE_MASK)); break; } /* use this slot if available */ if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == 0) { SMPpt[NPTEPG-2-j] = (pt_entry_t)(PG_V | PG_RW | pgeflag | (io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace + (NPTEPG-2-j) * PAGE_SIZE + (io_apic_address[i] & PAGE_MASK)); break; } } } /* clear various tables */ for (x = 0; x < NAPICID; ++x) { ID_TO_IO(x) = -1; /* phy APIC ID to log CPU/IO table */ CPU_TO_ID(x) = -1; /* logical CPU to APIC ID table */ IO_TO_ID(x) = -1; /* logical IO to APIC ID table */ } /* clear bus data table */ for (x = 0; x < mp_nbusses; ++x) bus_data[x].bus_id = 0xff; /* clear IO APIC INT table */ for (x = 0; x < (nintrs + 1); ++x) { io_apic_ints[x].int_type = 0xff; io_apic_ints[x].int_vector = 0xff; } /* setup the cpu/apic mapping arrays */ boot_cpu_id = -1; /* record whether PIC or virtual-wire mode */ picmode = (mpfps->mpfb2 & 0x80) ? 
1 : 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) return MPFPS_MPFB1; /* return default configuration type */ if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; apic = bus = intr = 0; cpu = 1; /* pre-count the BSP */ while (count--) { switch (type = *(u_char *) position) { case 0: if (processor_entry(position, cpu)) ++cpu; break; case 1: if (bus_entry(position, bus)) ++bus; break; case 2: if (io_apic_entry(position, apic)) ++apic; break; case 3: if (int_entry(position, intr)) ++intr; break; case 4: /* int_entry(position); */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char *) position += basetable_entry_types[type].length; } if (boot_cpu_id == -1) panic("NO BSP found!"); /* report fact that its NOT a default configuration */ return 0; } void assign_apic_irq(int apic, int intpin, int irq) { int x; if (int_to_apicintpin[irq].ioapic != -1) panic("assign_apic_irq: inconsistent table"); int_to_apicintpin[irq].ioapic = apic; int_to_apicintpin[irq].int_pin = intpin; int_to_apicintpin[irq].apic_address = ioapic[apic]; int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) && io_apic_ints[x].dst_apic_int == intpin) io_apic_ints[x].int_vector = irq; } } void revoke_apic_irq(int irq) { int x; int oldapic; int oldintpin; if (int_to_apicintpin[irq].ioapic == -1) panic("assign_apic_irq: inconsistent table"); oldapic = int_to_apicintpin[irq].ioapic; oldintpin = int_to_apicintpin[irq].int_pin; int_to_apicintpin[irq].ioapic = -1; int_to_apicintpin[irq].int_pin = 0; 
int_to_apicintpin[irq].apic_address = NULL; int_to_apicintpin[irq].redirindex = 0; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(oldapic) && io_apic_ints[x].dst_apic_int == oldintpin) io_apic_ints[x].int_vector = 0xff; } } static void allocate_apic_irq(int intr) { int apic; int intpin; int irq; if (io_apic_ints[intr].int_vector != 0xff) return; /* Interrupt handler already assigned */ if (io_apic_ints[intr].int_type != 0 && (io_apic_ints[intr].int_type != 3 || (io_apic_ints[intr].dst_apic_id == IO_TO_ID(0) && io_apic_ints[intr].dst_apic_int == 0))) return; /* Not INT or ExtInt on != (0, 0) */ irq = 0; while (irq < APIC_INTMAPSIZE && int_to_apicintpin[irq].ioapic != -1) irq++; if (irq >= APIC_INTMAPSIZE) return; /* No free interrupt handlers */ apic = ID_TO_IO(io_apic_ints[intr].dst_apic_id); intpin = io_apic_ints[intr].dst_apic_int; assign_apic_irq(apic, intpin, irq); io_apic_setup_intpin(apic, intpin); } static void swap_apic_id(int apic, int oldid, int newid) { int x; int oapic; if (oldid == newid) return; /* Nothing to do */ printf("Changing APIC ID for IO APIC #%d from %d to %d in MP table\n", apic, oldid, newid); /* Swap physical APIC IDs in interrupt entries */ for (x = 0; x < nintrs; x++) { if (io_apic_ints[x].dst_apic_id == oldid) io_apic_ints[x].dst_apic_id = newid; else if (io_apic_ints[x].dst_apic_id == newid) io_apic_ints[x].dst_apic_id = oldid; } /* Swap physical APIC IDs in IO_TO_ID mappings */ for (oapic = 0; oapic < mp_napics; oapic++) if (IO_TO_ID(oapic) == newid) break; if (oapic < mp_napics) { printf("Changing APIC ID for IO APIC #%d from " "%d to %d in MP table\n", oapic, newid, oldid); IO_TO_ID(oapic) = oldid; } IO_TO_ID(apic) = newid; } static void fix_id_to_io_mapping(void) { int x; for (x = 0; x < NAPICID; x++) ID_TO_IO(x) = -1; for (x = 0; x <= mp_naps; x++) if (CPU_TO_ID(x) < NAPICID) ID_TO_IO(CPU_TO_ID(x)) 
= x;

	for (x = 0; x < mp_napics; x++)
		if (IO_TO_ID(x) < NAPICID)
			ID_TO_IO(IO_TO_ID(x)) = x;
}


/*
 * Return the lowest physical APIC ID used by neither a CPU nor an IO APIC,
 * or NAPICID when every ID is taken.
 */
static int
first_free_apic_id(void)
{
	int freeid, x;

	for (freeid = 0; freeid < NAPICID; freeid++) {
		for (x = 0; x <= mp_naps; x++)
			if (CPU_TO_ID(x) == freeid)
				break;
		if (x <= mp_naps)
			continue;	/* taken by a CPU */
		for (x = 0; x < mp_napics; x++)
			if (IO_TO_ID(x) == freeid)
				break;
		if (x < mp_napics)
			continue;	/* taken by an IO APIC */
		return freeid;
	}
	return freeid;
}


/*
 * Return 1 if physical ID `id' may be used for logical IO APIC `apic',
 * 0 otherwise.  Only IO APICs numbered below `apic' are checked for
 * conflicts.
 */
static int
io_apic_id_acceptable(int apic, int id)
{
	int cpu;		/* Logical CPU number */
	int oapic;		/* Logical IO APIC number for other IO APIC */

	if (id >= NAPICID)
		return 0;	/* Out of range */

	for (cpu = 0; cpu <= mp_naps; cpu++)
		if (CPU_TO_ID(cpu) == id)
			return 0;	/* Conflict with CPU */

	for (oapic = 0; oapic < mp_napics && oapic < apic; oapic++)
		if (IO_TO_ID(oapic) == id)
			return 0;	/* Conflict with other APIC */

	return 1;		/* ID is acceptable for IO APIC */
}


/*
 * Post-scan cleanup of the tables built from the MP specification table:
 * renumber a lone mis-numbered PCI bus, resolve IO APIC ID conflicts,
 * rebuild the ID_TO_IO table and add a missing 8259 ExtInt entry.
 */
static void
fix_mp_table(void)
{
	int	x;
	int	id;
	int	bus_0 = 0;	/* Stop GCC warning */
	int	bus_pci = 0;	/* Stop GCC warning */
	int	num_pci_bus;
	int	apic;		/* IO APIC unit number */
	int     freeid;		/* Free physical APIC ID */
	int	physid;		/* Current physical IO APIC ID */

	/*
	 * Fix mis-numbering of the PCI bus and its INT entries if the BIOS
	 * did it wrong.  The MP spec says that when more than 1 PCI bus
	 * exists the BIOS must begin with bus entries for the PCI bus and use
	 * actual PCI bus numbering.  This implies that when only 1 PCI bus
	 * exists the BIOS can choose to ignore this ordering, and indeed many
	 * MP motherboards do ignore it.  This causes a problem when the PCI
	 * sub-system makes requests of the MP sub-system based on PCI bus
	 * numbers.	So here we look for the situation and renumber the
	 * busses and associated INTs in an effort to "make it right".
	 */

	/* find bus 0, PCI bus, count the number of PCI busses */
	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
		if (bus_data[x].bus_id == 0) {
			bus_0 = x;
		}
		if (bus_data[x].bus_type == PCI) {
			++num_pci_bus;
			bus_pci = x;
		}
	}
	/*
	 * bus_0 == slot of bus with ID of 0
	 * bus_pci == slot of last PCI bus encountered
	 */

	/* check the 1 PCI bus case for sanity */
	/* if it is number 0 all is well */
	if (num_pci_bus == 1 &&
	    bus_data[bus_pci].bus_id != 0) {

		/* mis-numbered, swap with whichever bus uses slot 0 */

		/* swap the bus entry types */
		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
		bus_data[bus_0].bus_type = PCI;

		/* swap each relevant INTerrupt entry */
		id = bus_data[bus_pci].bus_id;
		for (x = 0; x < nintrs; ++x) {
			if (io_apic_ints[x].src_bus_id == id) {
				io_apic_ints[x].src_bus_id = 0;
			}
			else if (io_apic_ints[x].src_bus_id == 0) {
				io_apic_ints[x].src_bus_id = id;
			}
		}
	}

	/* Assign IO APIC IDs.
	 *
	 * First try the existing ID. If a conflict is detected, try
	 * the ID in the MP table.  If a conflict is still detected, find
	 * a free id.
	 *
	 * We cannot use the ID_TO_IO table before all conflicts have been
	 * resolved and the table has been corrected.
	 */
	for (apic = 0; apic < mp_napics; ++apic) { /* For all IO APICs */

		/* First try to use the value set by the BIOS */
		physid = io_apic_get_id(apic);
		if (io_apic_id_acceptable(apic, physid)) {
			if (IO_TO_ID(apic) != physid)
				swap_apic_id(apic, IO_TO_ID(apic), physid);
			continue;
		}

		/* Then check if the value in the MP table is acceptable */
		if (io_apic_id_acceptable(apic, IO_TO_ID(apic)))
			continue;

		/* Last resort, find a free APIC ID and use it */
		freeid = first_free_apic_id();
		if (freeid >= NAPICID)
			panic("No free physical APIC IDs found");

		if (io_apic_id_acceptable(apic, freeid)) {
			swap_apic_id(apic, IO_TO_ID(apic), freeid);
			continue;
		}
		panic("Free physical APIC ID not usable");
	}
	fix_id_to_io_mapping();

	/* detect and fix broken Compaq MP table */
	if (apic_int_type(0, 0) == -1) {
		printf("APIC_IO: MP table broken: 8259->APIC entry missing!\n");
		/* uses the spare slot mptable_pass2() allocated (nintrs + 1) */
		io_apic_ints[nintrs].int_type = 3;	/* ExtInt */
		io_apic_ints[nintrs].int_vector = 0xff;	/* Unassigned */
		/* XXX fixme, set src bus id etc, but it doesn't seem to hurt */
		io_apic_ints[nintrs].dst_apic_id = IO_TO_ID(0);
		io_apic_ints[nintrs].dst_apic_int = 0;	/* Pin 0 */
		nintrs++;
	}
}


/* Assign low level interrupt handlers */
/*
 * Clears int_to_apicintpin[], then binds each unassigned ISA/EISA INT entry
 * to the IRQ equal to its source bus IRQ.  If IRQ 0 is still free, binds
 * it to pin 0 of IO APIC 0 when an ExtInt entry targets that pin.
 */
static void
setup_apic_irq_mapping(void)
{
	int	x;
	int	int_vector;

	/* Clear array */
	for (x = 0; x < APIC_INTMAPSIZE; x++) {
		int_to_apicintpin[x].ioapic = -1;
		int_to_apicintpin[x].int_pin = 0;
		int_to_apicintpin[x].apic_address = NULL;
		int_to_apicintpin[x].redirindex = 0;
	}

	/* First assign ISA/EISA interrupts */
	for (x = 0; x < nintrs; x++) {
		int_vector = io_apic_ints[x].src_bus_irq;
		if (int_vector < APIC_INTMAPSIZE &&
		    io_apic_ints[x].int_vector == 0xff &&
		    int_to_apicintpin[int_vector].ioapic == -1 &&
		    (apic_int_is_bus_type(x, ISA) ||
		     apic_int_is_bus_type(x, EISA)) &&
		    io_apic_ints[x].int_type == 0) {
			assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
					io_apic_ints[x].dst_apic_int,
					int_vector);
		}
	}

	/* Assign ExtInt entry if no ISA/EISA interrupt 0 entry */
	for (x = 0; x < nintrs; x++) {
		if
(io_apic_ints[x].dst_apic_int == 0 &&
		    io_apic_ints[x].dst_apic_id == IO_TO_ID(0) &&
		    io_apic_ints[x].int_vector == 0xff &&
		    int_to_apicintpin[0].ioapic == -1 &&
		    io_apic_ints[x].int_type == 3) {
			assign_apic_irq(0, 0, 0);
			break;
		}
	}
	/* PCI interrupt assignment is deferred */
}


/*
 * Record one processor entry of the MP table.
 *
 * The BSP is always stored as logical CPU 0; an enabled AP is stored as
 * logical CPU `cpu'.  Returns 1 only when an AP slot was consumed, 0 for a
 * disabled processor, for the BSP, or when `cpu' has reached MAXCPU.
 * Panics on an APIC ID of NAPICID or more.
 */
static int
processor_entry(proc_entry_ptr entry, int cpu)
{
	/* check for usability */
	if (!(entry->cpu_flags & PROCENTRY_FLAG_EN))
		return 0;

	if(entry->apic_id >= NAPICID)
		panic("CPU APIC ID out of range (0..%d)", NAPICID - 1);
	/* check for BSP flag */
	if (entry->cpu_flags & PROCENTRY_FLAG_BP) {
		boot_cpu_id = entry->apic_id;
		CPU_TO_ID(0) = entry->apic_id;
		ID_TO_CPU(entry->apic_id) = 0;
		return 0;	/* its already been counted */
	}

	/* add another AP to list, if less than max number of CPUs */
	else if (cpu < MAXCPU) {
		CPU_TO_ID(cpu) = entry->apic_id;
		ID_TO_CPU(entry->apic_id) = cpu;
		return 1;
	}

	return 0;
}


/*
 * Record one bus entry of the MP table in bus_data[bus].
 * Always returns 1; panics when the bus type name is not recognized.
 */
static int
bus_entry(bus_entry_ptr entry, int bus)
{
	int     x;
	char    c, name[8];

	/* encode the name into an index */
	/* the name is at most 6 characters and ends at the first ' ' */
	for (x = 0; x < 6; ++x) {
		if ((c = entry->bus_type[x]) == ' ')
			break;
		name[x] = c;
	}
	name[x] = '\0';

	if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE)
		panic("unknown bus type: '%s'", name);

	bus_data[bus].bus_id = entry->bus_id;
	bus_data[bus].bus_type = x;

	return 1;
}


/*
 * Record one IO APIC entry of the MP table as logical IO APIC `apic'.
 * Returns 0 for a disabled IO APIC, 1 otherwise.
 */
static int
io_apic_entry(io_apic_entry_ptr entry, int apic)
{
	if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN))
		return 0;

	IO_TO_ID(apic) = entry->apic_id;
	/* the reverse mapping is only recorded for IDs that fit the table */
	if (entry->apic_id < NAPICID)
		ID_TO_IO(entry->apic_id) = apic;

	return 1;
}


/*
 * Translate a bus type name into the type stored in the first matching
 * row of bus_type_table[]; returns UNKNOWN_BUSTYPE when no row matches.
 */
static int
lookup_bus_type(char *name)
{
	int     x;

	for (x = 0; x < MAX_BUSTYPE; ++x)
		if (strcmp(bus_type_table[x].name, name) == 0)
			return bus_type_table[x].type;

	return UNKNOWN_BUSTYPE;
}


/*
 * Copy one interrupt entry of the MP table into io_apic_ints[intr].
 * Always returns 1.
 */
static int
int_entry(int_entry_ptr entry, int intr)
{
	int apic;

	io_apic_ints[intr].int_type = entry->int_type;
	io_apic_ints[intr].int_flags = entry->int_flags;
	io_apic_ints[intr].src_bus_id = entry->src_bus_id;
	io_apic_ints[intr].src_bus_irq = entry->src_bus_irq;
	if (entry->dst_apic_id == 255) {
		/* This signal goes to all IO APICS.  Select an IO APIC
		   with sufficient number of interrupt pins */
		for (apic = 0; apic < mp_napics; apic++)
			if (((io_apic_read(apic, IOAPIC_VER) &
			      IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >=
			    entry->dst_apic_int)
				break;
		/* no suitable IO APIC: keep the destination ID from the table */
		if (apic < mp_napics)
			io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic);
		else
			io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
	} else
		io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
	io_apic_ints[intr].dst_apic_int = entry->dst_apic_int;

	return 1;
}


/*
 * Return 1 if the source bus of interrupt entry `intr' has type
 * `bus_type', 0 otherwise (including when no bus has that ID).
 */
static int
apic_int_is_bus_type(int intr, int bus_type)
{
	int     bus;

	for (bus = 0; bus < mp_nbusses; ++bus)
		if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id)
		    && ((int) bus_data[bus].bus_type == bus_type))
			return 1;

	return 0;
}


/*
 * Given a traditional ISA INT mask, return an APIC mask.
 *
 * Only the lowest set bit of isa_mask is considered.  Returns 0 when the
 * mask is empty or isa_apic_irq() finds nothing for that IRQ.
 */
u_int
isa_apic_mask(u_int isa_mask)
{
	int     isa_irq;
	int     apic_pin;

#if defined(SKIP_IRQ15_REDIRECT)
	if (isa_mask == (1 << 15)) {
		printf("skipping ISA IRQ15 redirect\n");
		return isa_mask;
	}
#endif  /* SKIP_IRQ15_REDIRECT */

	isa_irq = ffs(isa_mask);		/* find its bit position */
	if (isa_irq == 0)			/* doesn't exist */
		return 0;
	--isa_irq;				/* make it zero based */

	apic_pin = isa_apic_irq(isa_irq);	/* look for APIC connection */
	if (apic_pin == -1)
		return 0;

	return (1 << apic_pin);			/* convert pin# to a mask */
}


/*
 * Determine which APIC pin an ISA/EISA INT is attached to.
*/ #define INTTYPE(I) (io_apic_ints[(I)].int_type) #define INTPIN(I) (io_apic_ints[(I)].dst_apic_int) #define INTIRQ(I) (io_apic_ints[(I)].int_vector) #define INTAPIC(I) (ID_TO_IO(io_apic_ints[(I)].dst_apic_id)) #define SRCBUSIRQ(I) (io_apic_ints[(I)].src_bus_irq) int isa_apic_irq(int isa_irq) { int intr; for (intr = 0; intr < nintrs; ++intr) { /* check each record */ if (INTTYPE(intr) == 0) { /* standard INT */ if (SRCBUSIRQ(intr) == isa_irq) { if (apic_int_is_bus_type(intr, ISA) || apic_int_is_bus_type(intr, EISA)) { if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* found */ } } } } return -1; /* NOT found */ } /* * Determine which APIC pin a PCI INT is attached to. */ #define SRCBUSID(I) (io_apic_ints[(I)].src_bus_id) #define SRCBUSDEVICE(I) ((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f) #define SRCBUSLINE(I) (io_apic_ints[(I)].src_bus_irq & 0x03) int pci_apic_irq(int pciBus, int pciDevice, int pciInt) { int intr; --pciInt; /* zero based */ for (intr = 0; intr < nintrs; ++intr) /* check each record */ if ((INTTYPE(intr) == 0) /* standard INT */ && (SRCBUSID(intr) == pciBus) && (SRCBUSDEVICE(intr) == pciDevice) && (SRCBUSLINE(intr) == pciInt)) /* a candidate IRQ */ if (apic_int_is_bus_type(intr, PCI)) { if (INTIRQ(intr) == 0xff) allocate_apic_irq(intr); if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* exact match */ } return -1; /* NOT found */ } int next_apic_irq(int irq) { int intr, ointr; int bus, bustype; bus = 0; bustype = 0; for (intr = 0; intr < nintrs; intr++) { if (INTIRQ(intr) != irq || INTTYPE(intr) != 0) continue; bus = SRCBUSID(intr); bustype = apic_bus_type(bus); if (bustype != ISA && bustype != EISA && bustype != PCI) continue; break; } if (intr >= nintrs) { return -1; } for (ointr = intr + 1; ointr < nintrs; ointr++) { if (INTTYPE(ointr) != 0) continue; if (bus != SRCBUSID(ointr)) continue; if (bustype == PCI) { if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr)) continue; if (SRCBUSLINE(intr) != 
SRCBUSLINE(ointr)) continue; } if (bustype == ISA || bustype == EISA) { if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr)) continue; } if (INTPIN(intr) == INTPIN(ointr)) continue; break; } if (ointr >= nintrs) { return -1; } return INTIRQ(ointr); } #undef SRCBUSLINE #undef SRCBUSDEVICE #undef SRCBUSID #undef SRCBUSIRQ #undef INTPIN #undef INTIRQ #undef INTAPIC #undef INTTYPE /* * Reprogram the MB chipset to NOT redirect an ISA INTerrupt. * * XXX FIXME: * Exactly what this means is unclear at this point. It is a solution * for motherboards that redirect the MBIRQ0 pin. Generically a motherboard * could route any of the ISA INTs to upper (>15) IRQ values. But most would * NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an * option. */ int undirect_isa_irq(int rirq) { #if defined(READY) if (bootverbose) printf("Freeing redirected ISA irq %d.\n", rirq); /** FIXME: tickle the MB redirector chip */ return -1; #else if (bootverbose) printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq); return 0; #endif /* READY */ } /* * Reprogram the MB chipset to NOT redirect a PCI INTerrupt */ int undirect_pci_irq(int rirq) { #if defined(READY) if (bootverbose) printf("Freeing redirected PCI irq %d.\n", rirq); /** FIXME: tickle the MB redirector chip */ return -1; #else if (bootverbose) printf("Freeing (NOT implemented) redirected PCI irq %d.\n", rirq); return 0; #endif /* READY */ } /* * given a bus ID, return: * the bus type if found * -1 if NOT found */ int apic_bus_type(int id) { int x; for (x = 0; x < mp_nbusses; ++x) if (bus_data[x].bus_id == id) return bus_data[x].bus_type; return -1; } /* * given a LOGICAL APIC# and pin#, return: * the associated src bus ID if found * -1 if NOT found */ int apic_src_bus_id(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].src_bus_id); return -1; 
/* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated src bus IRQ if found * -1 if NOT found */ int apic_src_bus_irq(int apic, int pin) { int x; for (x = 0; x < nintrs; x++) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].src_bus_irq); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated INTerrupt type if found * -1 if NOT found */ int apic_int_type(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_type); return -1; /* NOT found */ } int apic_irq(int apic, int pin) { int x; int res; for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) { res = io_apic_ints[x].int_vector; if (res == 0xff) return -1; if (apic != int_to_apicintpin[res].ioapic) panic("apic_irq: inconsistent table"); if (pin != int_to_apicintpin[res].int_pin) panic("apic_irq inconsistent table (2)"); return res; } return -1; } /* * given a LOGICAL APIC# and pin#, return: * the associated trigger mode if found * -1 if NOT found */ int apic_trigger(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return ((io_apic_ints[x].int_flags >> 2) & 0x03); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated 'active' level if found * -1 if NOT found */ int apic_polarity(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_flags & 0x03); return -1; /* NOT found */ } /* * set 
data according to MP defaults
 * FIXME: probably not complete yet...
 *
 * 'type' is the MP default configuration number; values 1 through 7
 * are accepted, anything else panics.
 */
static void
default_mp_table(int type)
{
	int     ap_cpu_id;
#if defined(APIC_IO)
	int     io_apic_id;
	int     pin;
#endif	/* APIC_IO */

#if 0
	printf(" MP default config type: %d\n", type);
	switch (type) {
	case 1:
		printf(" bus: ISA, APIC: 82489DX\n");
		break;
	case 2:
		printf(" bus: EISA, APIC: 82489DX\n");
		break;
	case 3:
		printf(" bus: EISA, APIC: 82489DX\n");
		break;
	case 4:
		printf(" bus: MCA, APIC: 82489DX\n");
		break;
	case 5:
		printf(" bus: ISA+PCI, APIC: Integrated\n");
		break;
	case 6:
		printf(" bus: EISA+PCI, APIC: Integrated\n");
		break;
	case 7:
		printf(" bus: MCA+PCI, APIC: Integrated\n");
		break;
	default:
		printf(" future type\n");
		break;
		/* NOTREACHED */
	}
#endif	/* 0 */

	/* the APIC ID is taken from the field selected by APIC_ID_MASK */
	boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24;
	/* two-CPU default: the AP gets whichever of IDs 0/1 the BSP lacks */
	ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0;

	/* BSP */
	CPU_TO_ID(0) = boot_cpu_id;
	ID_TO_CPU(boot_cpu_id) = 0;

	/* one and only AP */
	CPU_TO_ID(1) = ap_cpu_id;
	ID_TO_CPU(ap_cpu_id) = 1;

#if defined(APIC_IO)
	/* one and only IO APIC */
	io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24;

	/*
	 * sanity check, refer to MP spec section 3.6.6, last paragraph
	 * necessary as some hardware isn't properly setting up the IO APIC
	 */
#if defined(REALLY_ANAL_IOAPICID_VALUE)
	if (io_apic_id != 2) {
#else
	if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) {
#endif	/* REALLY_ANAL_IOAPICID_VALUE */
		/* force the IO APIC ID to 2 */
		io_apic_set_id(0, 2);
		io_apic_id = 2;
	}
	IO_TO_ID(0) = io_apic_id;
	ID_TO_IO(io_apic_id) = 0;
#endif	/* APIC_IO */

	/* fill out bus entries */
	switch (type) {
	case 1:
	case 2:
	case 3:
	case 4:
	case 5:
	case 6:
	case 7:
		bus_data[0].bus_id = default_data[type - 1][1];
		bus_data[0].bus_type = default_data[type - 1][2];
		bus_data[1].bus_id = default_data[type - 1][3];
		bus_data[1].bus_type = default_data[type - 1][4];
		break;

		/* case 4: case 7:		   MCA NOT supported */
	default:		/* illegal/reserved */
		panic("BAD default MP config: %d", type);
		/* NOTREACHED */
	}

#if defined(APIC_IO)
	/* general cases from MP v1.4, table 5-2 */
	for (pin = 0; pin < 16; ++pin) {
		io_apic_ints[pin].int_type = 0;
		io_apic_ints[pin].int_flags = 0x05;	/* edge/active-hi */
		io_apic_ints[pin].src_bus_id = 0;
		io_apic_ints[pin].src_bus_irq = pin;	/* IRQ2 caught below */
		io_apic_ints[pin].dst_apic_id = io_apic_id;
		io_apic_ints[pin].dst_apic_int = pin;	/* 1-to-1 */
	}

	/* special cases from MP v1.4, table 5-2 */
	if (type == 2) {
		io_apic_ints[2].int_type = 0xff;	/* N/C */
		io_apic_ints[13].int_type = 0xff;	/* N/C */
#if !defined(APIC_MIXED_MODE)
		/** FIXME: ??? */
		panic("sorry, can't support type 2 default yet");
#endif	/* APIC_MIXED_MODE */
	}
	else
		io_apic_ints[2].src_bus_irq = 0;	/* ISA IRQ0 is on APIC INT 2 */

	if (type == 7)
		io_apic_ints[0].int_type = 0xff;	/* N/C */
	else
		io_apic_ints[0].int_type = 3;	/* vectored 8259 */
#endif	/* APIC_IO */
}


/*
 * start each AP in our list
 *
 * 'boot_addr' is the address at which the AP boot trampoline is
 * installed (see install_ap_tramp()).  Returns mp_ncpus - 1, i.e. the
 * number of APs counted as started.
 */
static int
start_all_aps(u_int boot_addr)
{
	int     x, i, pg;
	u_char  mpbiosreason;
	u_long  mpbioswarmvec;
	struct globaldata *gd;
	char *stack;
	uintptr_t kptbase;

	POSTCODE(START_ALL_APS_POST);

	mtx_init(&ap_boot_mtx, "ap boot", MTX_SPIN);

	/* initialize BSP's local APIC */
	apic_initialize();
	bsp_apic_ready = 1;

	/* install the AP 1st level boot code */
	install_ap_tramp(boot_addr);


	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
#ifndef PC98
	/* save the CMOS shutdown reason too; both are restored below */
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);
#endif

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
	kptbase = (uintptr_t)(void *)KPTphys;
	for (x = 0; x < NKPT; x++)
		PTD[x] = (pd_entry_t)(PG_V | PG_RW |
		    ((kptbase + x * PAGE_SIZE) & PG_FRAME));
	invltlb();

	/* start each AP */
	for (x = 1; x <= mp_naps; ++x) {

		/* This is a bit verbose, it will go away soon.  */

		/* first page of AP's private space */
		pg = x * i386_btop(sizeof(struct privatespace));

		/* allocate a new private data page */
		gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE);

		/* wire it into the private page table page */
		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd));

		/* allocate and set up an idle stack data page */
		stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE);
		for (i = 0; i < UPAGES; i++)
			SMPpt[pg + 1 + i] = (pt_entry_t)
			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));

		/* prime data page for it to use */
		gd->gd_cpuid = x;
		globaldata_register(gd);

		/* setup a vector to our boot code */
		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
		*((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
#ifndef PC98
		outb(CMOS_REG, BIOS_RESET);
		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
#endif

		/* the AP's initial stack pointer: one past its idle stack */
		bootSTK = &SMP_prvspace[x].idlestack[UPAGES*PAGE_SIZE];
		bootAP = x;

		/* attempt to start the Application Processor */
		CHECK_INIT(99);	/* setup checkpoints */
		if (!start_ap(x, boot_addr)) {
			printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x));
			CHECK_PRINT("trace");	/* show checkpoints */
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}
		CHECK_PRINT("trace");		/* show checkpoints */

		/* record its version info */
		cpu_apic_versions[x] = cpu_apic_versions[0];

		all_cpus |= (1 << x);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid)));

	/* fill in our (BSP) APIC version */
	cpu_apic_versions[0] = lapic.version;

	/* restore the warmstart vector */
	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
#ifndef PC98
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);
#endif

	/*
	 * Set up the idle context for the BSP.  Similar to above except
	 * that some was done by locore, some by pmap.c and some is implicit
	 * because the BSP is cpu#0 and the page is initially zero, and also
	 * because we can refer to variables by name on the BSP..
	 */

	/* Allocate and setup BSP idle stack */
	stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE);
	for (i = 0; i < UPAGES; i++)
		SMPpt[1 + i] = (pt_entry_t)
		    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));

	/* tear down the temporary P==V mapping installed above */
	for (x = 0; x < NKPT; x++)
		PTD[x] = 0;
	pmap_set_opt();

	/* number of APs actually started */
	return mp_ncpus - 1;
}


/*
 * load the 1st level AP boot code into base memory.
 */

/* targets for relocation */
extern void bigJump(void);
extern void bootCodeSeg(void);
extern void bootDataSeg(void);
extern void MPentry(void);
extern u_int MP_GDT;
extern u_int mp_gdtbase;

/*
 * Copy the boot code (bootMP, bootMP_size bytes) to boot_addr + KERNBASE
 * and patch the copied code's lgdt argument, ljmp target and code/data
 * segment bases for its new location.
 */
static void
install_ap_tramp(u_int boot_addr)
{
	int     x;
	int     size = *(int *) ((u_long) & bootMP_size);
	u_char *src = (u_char *) ((u_long) bootMP);
	u_char *dst = (u_char *) boot_addr + KERNBASE;
	u_int   boot_base = (u_int) bootMP;
	u_int8_t *dst8;
	u_int16_t *dst16;
	u_int32_t *dst32;

	POSTCODE(INSTALL_AP_TRAMP_POST);

	/* byte-wise copy of the boot code to its destination */
	for (x = 0; x < size; ++x)
		*dst++ = *src++;

	/*
	 * modify addresses in code we just moved to basemem. unfortunately we
	 * need fairly detailed info about mpboot.s for this to work.  changes
	 * to mpboot.s might require changes here.
	 */

	/* boot code is located in KERNEL space */
	dst = (u_char *) boot_addr + KERNBASE;

	/* modify the lgdt arg */
	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
	*dst32 = boot_addr + ((u_int) & MP_GDT - boot_base);

	/* modify the ljmp target for MPentry() */
	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
	*dst32 = ((u_int) MPentry - KERNBASE);

	/* modify the target for boot code segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_addr & 0xffff;		/* low 16 bits */
	*dst8 = ((u_int) boot_addr >> 16) & 0xff;	/* next 8 bits */

	/* modify the target for boot data segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_addr & 0xffff;		/* low 16 bits */
	*dst8 = ((u_int) boot_addr >> 16) & 0xff;	/* next 8 bits */
}


/*
 * this function starts the AP (application processor) identified
 * by the logical CPU number 'logical_cpu' (translated to its APIC ID
 * with CPU_TO_ID()).  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It ain't pretty,
 * but it seems to work.
 *
 * Returns 1 if mp_ncpus increased before the APIC timer (armed for
 * 5 seconds) ran out, 0 otherwise.
 */
static int
start_ap(int logical_cpu, u_int boot_addr)
{
	int     physical_cpu;
	int     vector;
	int     cpus;
	u_long  icr_lo, icr_hi;

	POSTCODE(START_AP_POST);

	/* get the PHYSICAL APIC ID# */
	physical_cpu = CPU_TO_ID(logical_cpu);

	/* calculate the vector */
	vector = (boot_addr >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_ncpus;

	/*
	 * first we do an INIT/RESET IPI this INIT IPI might be run, resetting
	 * and running the target CPU. OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
	 * ignored.
	 */

	/* setup the address for the target AP */
	icr_hi = lapic.icr_hi & ~APIC_ID_MASK;
	icr_hi |= (physical_cpu << 24);
	lapic.icr_hi = icr_hi;

	/* do an INIT IPI: assert RESET */
	icr_lo = lapic.icr_lo & 0xfff00000;
	lapic.icr_lo = icr_lo | 0x0000c500;

	/* wait for pending status end */
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;

	/* do an INIT IPI: deassert RESET */
	lapic.icr_lo = icr_lo | 0x00008500;

	/* wait for pending status end */
	u_sleep(10000);		/* wait ~10mS */
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run. and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
	 * will run.
	 */

	/* do a STARTUP IPI */
	lapic.icr_lo = icr_lo | 0x00000600 | vector;
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;
	u_sleep(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */

	lapic.icr_lo = icr_lo | 0x00000600 | vector;
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;
	u_sleep(200);		/* wait ~200uS */

	/* wait for it to start */
	set_apic_timer(5000000);/* == 5 seconds */
	while (read_apic_timer())
		if (mp_ncpus > cpus)
			return 1;	/* return SUCCESS */

	return 0;		/* return FAILURE */
}

/*
 * Flush the TLB on all other CPU's
 *
 * XXX: Needs to handshake and wait for completion before proceeding.
*/ void smp_invltlb(void) { #if defined(APIC_IO) if (smp_started && invltlb_ok) ipi_all_but_self(IPI_INVLTLB); #endif /* APIC_IO */ } void invlpg(u_int addr) { __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); /* send a message to the other CPUs */ smp_invltlb(); } void invltlb(void) { u_long temp; /* * This should be implemented as load_cr3(rcr3()) when load_cr3() is * inlined. */ __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); /* send a message to the other CPUs */ smp_invltlb(); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ extern void enable_sse(void); void ap_init(void) { u_int apic_id; /* spin until all the AP's are ready */ while (!aps_ready) /* spin */ ; /* * Set curproc to our per-cpu idleproc so that mutexes have * something unique to lock with. */ PCPU_SET(curproc, PCPU_GET(idleproc)); PCPU_SET(spinlocks, NULL); /* lock against other AP's that are waking up */ mtx_lock_spin(&ap_boot_mtx); /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); smp_cpus++; #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid))); printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); /* set up CPU registers and state */ cpu_setregs(); /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); /* set up SSE registers */ enable_sse(); /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); printf("SMP: apic_id = %d\n", apic_id); printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]); panic("cpuid mismatch! 
boom!!"); } /* Init local apic for irq's */ apic_initialize(); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); /* * Activate smp_invltlb, although strictly speaking, this isn't * quite correct yet. We should have a bitfield for cpus willing * to accept TLB flush IPI's or something and sync them. */ if (smp_cpus == mp_ncpus) { invltlb_ok = 1; smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } /* let other AP's wake up now */ mtx_unlock_spin(&ap_boot_mtx); /* wait until all the AP's are up */ while (smp_started == 0) ; /* nothing */ microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* ok, now grab sched_lock and enter the scheduler */ enable_intr(); mtx_lock_spin(&sched_lock); cpu_throw(); /* doesn't return */ panic("scheduler returned us to ap_init"); } /* * For statclock, we send an IPI to all CPU's to have them call this * function. */ void forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); statclock_process(curproc, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_statclock(void) { int map; CTR0(KTR_SMP, "forward_statclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_STATCLOCK); } /* * For each hardclock(), we send an IPI to all other CPU's to have them * execute this function. It would be nice to reduce contention on * sched_lock if we could simply peek at the CPU to determine the user/kernel * state and call hardclock_process() on the CPU receiving the clock interrupt * and then just use a simple IPI to handle any ast's if needed. 
*/ void forwarded_hardclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); hardclock_process(curproc, TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_hardclock(void) { u_int map; CTR0(KTR_SMP, "forward_hardclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_HARDCLOCK); } #ifdef APIC_INTR_REORDER /* * Maintain mapping from softintr vector to isr bit in local apic. */ void set_lapic_isrloc(int intr, int vector) { if (intr < 0 || intr > 32) panic("set_apic_isrloc: bad intr argument: %d",intr); if (vector < ICU_OFFSET || vector > 255) panic("set_apic_isrloc: bad vector argument: %d",vector); apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2); apic_isrbit_location[intr].bit = (1<<(vector & 31)); } #endif /* * send an IPI to a set of cpus. */ void ipi_selected(u_int32_t cpus, u_int ipi) { CTR2(KTR_SMP, __func__ ": cpus: %x ipi: %x", cpus, ipi); selected_apic_ipi(cpus, ipi, APIC_DELMODE_FIXED); } /* * send an IPI INTerrupt containing 'vector' to all CPUs, including myself */ void ipi_all(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLISELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLESELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to myself */ void ipi_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_SELF, ipi, APIC_DELMODE_FIXED); } void release_aps(void *dummy __unused) { atomic_store_rel_int(&aps_ready, 1); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 82308) +++ head/sys/amd64/amd64/pmap.c (revision 82309) @@ -1,3397 +1,3398 @@ /* * Copyright (c) 1991 Regents of the 
University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * $FreeBSD$ */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include "opt_disable_pse.h" #include "opt_pmap.h" #include "opt_msgbuf.h" +#include "opt_upages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #include #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; static struct pmap kernel_pmap_store; pmap_t kernel_pmap; LIST_HEAD(pmaplist, pmap); struct pmaplist allpmaps; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ static int pgeflag; /* PG_G or-in */ static int pseflag; /* PG_PS or-in */ static vm_object_t kptobj; static int nkpt; vm_offset_t kernel_vm_end; /* * Data for the pv entry allocation mechanism */ static vm_zone_t pvzone; static struct vm_zone pvzone_store; static struct vm_object pvzone_obj; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; static int pmap_pagedaemon_waken = 0; static struct pv_entry *pvinit; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *ptmmap; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp=0; /* * Crashdump maps. */ static pt_entry_t *pt_crashdumpmap; static caddr_t crashdumpmap; #ifdef SMP extern pt_entry_t *SMPpt; #endif static pt_entry_t *PMAP1 = 0; static unsigned *PADDR1 = 0; static PMAP_INLINE void free_pv_entry __P((pv_entry_t pv)); static unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static __inline void pmap_changebit __P((vm_page_t m, int bit, boolean_t setem)); static void pmap_remove_all __P((vm_page_t m)); static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)); static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); static int pmap_remove_entry __P((struct pmap *pmap, vm_page_t m, vm_offset_t va)); static boolean_t pmap_testbit __P((vm_page_t m, int bit)); static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)); static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex)); static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); static vm_page_t pmap_page_lookup 
__P((vm_object_t object, vm_pindex_t pindex)); static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); static unsigned pdir4mb; /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. */ PMAP_INLINE unsigned * pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned *pdeaddr; if (pmap) { pdeaddr = (unsigned *) pmap_pde(pmap, va); if (*pdeaddr & PG_PS) return pdeaddr; if (*pdeaddr) { return get_ptbase(pmap) + i386_btop(va); } } return (0); } /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) { newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); } #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; int i; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. 
*/ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); kernel_pmap->pm_count = 1; kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(MSGBUF_SIZE))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = 0; for (i = 0; i < NKPT; i++) PTD[i] = 0; pgeflag = 0; #if !defined(SMP) /* XXX - see also mp_machdep.c */ if (cpu_feature & CPUID_PGE) { pgeflag = PG_G; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. 
*/ pdir4mb = 0; #if !defined(DISABLE_PSE) if (cpu_feature & CPUID_PSE) { unsigned ptditmp; /* * Note that we have enabled PSE mode */ pseflag = PG_PS; ptditmp = *((unsigned *)PTmap + i386_btop(KERNBASE)); ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; #if !defined(SMP) /* * Enable the PSE mode. */ load_cr4(rcr4() | CR4_PSE); /* * We can do the mapping here for the single processor * case. We simply ignore the old page table page from * now on. */ /* * For SMP, we still need 4K pages to bootstrap APs, * PSE will be enabled as soon as all APs are up. */ PTD[KPTDI] = (pd_entry_t) ptditmp; kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; invltlb(); #endif } #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic! (non-SMP hardware?)"); /* local apic is mapped on last page */ SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag | (cpu_apic_address & PG_FRAME)); #endif invltlb(); } #ifdef SMP /* * Set 4mb pdir for mp startup */ void pmap_set_opt(void) { if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); if (pdir4mb && PCPU_GET(cpuid) == 0) { /* only on BSP */ kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = (pd_entry_t)pdir4mb; cpu_invltlb(); } } } #endif /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { int i; int initial_pvs; /* * object for kernel page table pages */ kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE); /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. 
*/ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ initial_pvs = vm_page_array_size; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = &pvzone_store; pvinit = (struct pv_entry *) kmem_alloc(kernel_map, initial_pvs * sizeof (struct pv_entry)); zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, vm_page_array_size); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; pv_entry_high_water = 9 * (pv_entry_max / 10); zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. 
*/ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } static PMAP_INLINE void invltlb_1pg(vm_offset_t va) { #ifdef I386_CPU invltlb(); #else invlpg(va); #endif } static __inline void pmap_TLB_invalidate(pmap_t pmap, vm_offset_t va) { #if defined(SMP) if (pmap->pm_active & (1 << PCPU_GET(cpuid))) cpu_invlpg((void *)va); if (pmap->pm_active & PCPU_GET(other_cpus)) smp_invltlb(); #else if (pmap->pm_active) invltlb_1pg(va); #endif } static __inline void pmap_TLB_invalidate_all(pmap_t pmap) { #if defined(SMP) if (pmap->pm_active & (1 << PCPU_GET(cpuid))) cpu_invltlb(); if (pmap->pm_active & PCPU_GET(other_cpus)) smp_invltlb(); #else if (pmap->pm_active) invltlb(); #endif } static unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); #if defined(SMP) /* The page directory is not shared between CPUs */ cpu_invltlb(); #else invltlb(); #endif } return (unsigned *) APTmap; } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. */ static unsigned * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned pde, newpf; if ((pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) != 0) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; unsigned index = i386_btop(va); /* are we current address space or kernel? 
*/ if ((pmap == kernel_pmap) || (frame == (((unsigned) PTDpde) & PG_FRAME))) { return (unsigned *) PTmap + index; } newpf = pde & PG_FRAME; if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; invltlb_1pg((vm_offset_t) PADDR1); } return PADDR1 + ((unsigned) index & (NPTEPG - 1)); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t rtval; vm_offset_t pdirindex; pdirindex = va >> PDRSHIFT; if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) { unsigned *pte; if ((rtval & PG_PS) != 0) { rtval &= ~(NBPDR - 1); rtval |= va & (NBPDR - 1); return rtval; } pte = get_ptbase(pmap) + i386_btop(va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ PMAP_INLINE void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V | pgeflag; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; /*if (opte)*/ invltlb_1pg(va); /* XXX what about SMP? */ } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); /* XXX what about SMP? */ } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. 
Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t *virt; vm_offset_t start; vm_offset_t end; int prot; { vm_offset_t sva = *virt; vm_offset_t va = sva; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; pmap_kenter(tva, VM_PAGE_TO_PHYS(m[i])); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { vm_offset_t end_va; end_va = va + count*PAGE_SIZE; while (va < end_va) { unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; #ifdef SMP cpu_invlpg((void *)va); #else invltlb_1pg(va); #endif va += PAGE_SIZE; } #ifdef SMP smp_invltlb(); #endif } static vm_page_t pmap_page_lookup(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) goto retry; return m; } /* * Create the UPAGES for a new process. * This routine directly affects the fork perf for a process. 
*/ void pmap_new_proc(p) struct proc *p; { #ifdef I386_CPU int updateneeded; #endif int i; vm_object_t upobj; vm_page_t m; struct user *up; unsigned *ptek, oldpte; /* * allocate object for the upages */ if ((upobj = p->p_upages_obj) == NULL) { upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES); p->p_upages_obj = upobj; } /* get a kernel virtual address for the UPAGES for this proc */ if ((up = p->p_addr) == NULL) { up = (struct user *) kmem_alloc_nofault(kernel_map, UPAGES * PAGE_SIZE); if (up == NULL) panic("pmap_new_proc: u_map allocation failed"); p->p_addr = up; } ptek = (unsigned *) vtopte((vm_offset_t) up); #ifdef I386_CPU updateneeded = 0; #endif for(i=0;iwire_count++; cnt.v_wire_count++; oldpte = *(ptek + i); /* * Enter the page into the kernel address space. */ *(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; if (oldpte) { #ifdef I386_CPU updateneeded = 1; #else invlpg((vm_offset_t) up + i * PAGE_SIZE); #endif } vm_page_wakeup(m); vm_page_flag_clear(m, PG_ZERO); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); m->valid = VM_PAGE_BITS_ALL; } #ifdef I386_CPU if (updateneeded) invltlb(); #endif } /* * Dispose the UPAGES for a process that has exited. * This routine directly impacts the exit perf of a process. */ void pmap_dispose_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; unsigned *ptek, oldpte; upobj = p->p_upages_obj; ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr); for(i=0;ip_addr + i * PAGE_SIZE); #endif vm_page_unwire(m, 0); vm_page_free(m); } #ifdef I386_CPU invltlb(); #endif } /* * Allow the UPAGES for a process to be prejudicially paged out. */ void pmap_swapout_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; /* * let the upages be paged */ for(i=0;ip_addr + PAGE_SIZE * i); } } /* * Bring the UPAGES for a specified process back in. 
*/ void pmap_swapin_proc(p) struct proc *p; { int i,rv; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; for(i=0;ip_addr) + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(upobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid); m = vm_page_lookup(upobj, i); m->valid = VM_PAGE_BITS_ALL; } vm_page_wire(m); vm_page_wakeup(m); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); } } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) ; if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); pmap_TLB_invalidate(pmap, pteva); } if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_flash(m); vm_page_busy(m); vm_page_free_zero(m); --cnt.v_wire_count; } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. 
*/ static int pmap_unuse_pt(pmap, va, mpte) pmap_t pmap; vm_offset_t va; vm_page_t mpte; { unsigned ptepindex; if (va >= UPT_MIN_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } } return pmap_unwire_pte_hold(pmap, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); pmap->pm_count = 1; pmap->pm_active = 0; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ ptdpg = vm_page_grab( pmap->pm_pteobj, PTDPTDI, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); ptdpg->wire_count = 1; ++cnt.v_wire_count; vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); /* Wire in kernel global address entries. 
*/ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); #ifdef SMP pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; #endif /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_count = 1; pmap->pm_active = 0; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Wire in kernel global address entries. To avoid a race condition * between pmap initialization and pmap_growkernel, this procedure * should be called after the vmspace is attached to the process * but before this pmap is activated. */ void pmap_pinit2(pmap) struct pmap *pmap; { /* XXX: Remove this stub when no longer called */ } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ if (vm_page_sleep_busy(p, FALSE, "pmaprl")) return 0; vm_page_busy(p); /* * Remove the page table page from the processes address space. */ pde[p->pindex] = 0; pmap->pm_stats.resident_count--; if (p->hold_count) { panic("pmap_release: freeing held page table page"); } /* * Page directory pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); #ifdef SMP pde[MPPTDI] = 0; #endif pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) pmap->pm_ptphint = NULL; p->wire_count--; cnt.v_wire_count--; vm_page_free_zero(p); return 1; } /* * this routine is called if the page table page is not * mapped correctly. 
*/ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_offset_t pteva, ptepa; vm_page_t m; /* * Find or fabricate a new pagetable page */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_ZERO | VM_ALLOC_RETRY); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); if (m->wire_count == 0) cnt.v_wire_count++; m->wire_count++; /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); /* * Set the page table hint */ pmap->pm_ptphint = m; /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. */ if ((m->flags & PG_ZERO) == 0) { if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(ptepa); } } m->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(m, PG_ZERO); vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); return m; } static vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { unsigned ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; invltlb(); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { /* * In order to get the page table page, try the * hint first. 
*/ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = m; } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; int curgeneration; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif ptdpg = NULL; LIST_REMOVE(pmap, pm_list); retry: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } while (1) { if (!pmap_release_free_page(pmap, p) && (object->generation != curgeneration)) goto retry; } } if (ptdpg && !pmap_release_free_page(pmap, ptdpg)) goto retry; } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct pmap *pmap; int s; vm_offset_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; s = 
splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; vm_page_wire(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(ptppaddr); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; LIST_FOREACH(pmap, &allpmaps, pm_list) { *pmap_pde(pmap, kernel_vm_end) = newpdir; } kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); panic("destroying a pmap is not yet implemented"); } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv) pv_entry_t pv; { pv_entry_count--; zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
*/ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return zalloc(pvzone); } /* * This routine is very drastic, but can save the system * in a pinch. */ void pmap_collect() { int i; vm_page_t m; static int warningdone=0; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } for(i = 0; i < vm_page_array_size; i++) { m = &vm_page_array[i]; if (m->wire_count || m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; pmap_remove_all(m); } pmap_pagedaemon_waken = 0; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap, m, va) struct pmap *pmap; vm_page_t m; vm_offset_t va; { pv_entry_t pv; int rtval; int s; s = splvm(); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = TAILQ_NEXT(pv, pv_plist)) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). 
*/ static void pmap_insert_entry(pmap, va, mpte, m) pmap_t pmap; vm_offset_t va; vm_page_t mpte; vm_page_t m; { int s; pv_entry_t pv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; vm_page_t m; oldpte = atomic_readandclear_int(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) invlpg(va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", va, oldpte); } #endif if (pmap_track_modified(va)) vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! */ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); pmap_TLB_invalidate(pmap, va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. 
*/
void
pmap_remove(pmap, sva, eva)
	struct pmap *pmap;
	register vm_offset_t sva;
	register vm_offset_t eva;
{
	register unsigned *ptbase;
	vm_offset_t pdnxt;
	vm_offset_t ptpaddr;
	vm_offset_t sindex, eindex;
	int anyvalid;

	if (pmap == NULL)
		return;

	/* Nothing is mapped in this pmap, so there is nothing to remove. */
	if (pmap->pm_stats.resident_count == 0)
		return;

	/*
	 * Special handling for removing exactly one page: a very
	 * common operation that can bypass the range scan below.
	 * The shortcut is not taken when the page directory entry
	 * has PG_PS set (a large-page mapping).
	 */
	if (((sva + PAGE_SIZE) == eva) && 
		(((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
		pmap_remove_page(pmap, sva);
		return;
	}

	/* Counts removed mappings; nonzero triggers the TLB flush at the end. */
	anyvalid = 0;

	/*
	 * Get a local virtual address for the mappings that are being
	 * worked with.
	 */
	ptbase = get_ptbase(pmap);

	/* From here on the range is expressed as page indices into ptbase. */
	sindex = i386_btop(sva);
	eindex = i386_btop(eva);

	for (; sindex < eindex; sindex = pdnxt) {
		unsigned pdirindex;

		/*
		 * Calculate index for next page table.
		 */
		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
		/* Stop early once the pmap has no resident pages left. */
		if (pmap->pm_stats.resident_count == 0)
			break;

		pdirindex = sindex / NPDEPG;
		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
			/*
			 * PG_PS set: the directory entry itself is the
			 * mapping.  Clear the whole entry and reduce the
			 * resident count by the number of base pages it
			 * covered (NBPDR / PAGE_SIZE).
			 */
			pmap->pm_pdir[pdirindex] = 0;
			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
			anyvalid++;
			continue;
		}

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (pdnxt > eindex) {
			pdnxt = eindex;
		}

		for ( ;sindex != pdnxt; sindex++) {
			vm_offset_t va;
			/* Empty slot: nothing mapped at this index. */
			if (ptbase[sindex] == 0) {
				continue;
			}
			va = i386_ptob(sindex);
			
			anyvalid++;
			/*
			 * A nonzero return ends the scan of this page table
			 * page; the outer loop resumes at pdnxt.
			 * NOTE(review): presumably nonzero means the page
			 * table page itself was released, so its remaining
			 * slots no longer exist -- confirm against
			 * pmap_unuse_pt().
			 */
			if (pmap_remove_pte(pmap,
				ptbase + sindex, va))
				break;
		}
	}

	/* One flush for the whole range instead of one per page. */
	if (anyvalid)
		pmap_TLB_invalidate_all(pmap);
}

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
*/ static void pmap_remove_all(m) vm_page_t m; { register pv_entry_t pv; register unsigned *pte, tpte; int s; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif s = splvm(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pv->pv_pmap->pm_stats.resident_count--; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); tpte = atomic_readandclear_int(pte); if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf( "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); splx(s); } /* * Set the physical protection on the * specified range of this map as requested. 
*/
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	register unsigned *ptbase;
	vm_offset_t pdnxt, ptpaddr;
	vm_pindex_t sindex, eindex;
	int anychanged;

	if (pmap == NULL)
		return;

	/* Taking away read access is handled as removing the mappings. */
	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

	/* Write access is still granted: there is nothing to revoke. */
	if (prot & VM_PROT_WRITE)
		return;

	/* Nonzero once any entry was altered; triggers the TLB flush below. */
	anychanged = 0;

	ptbase = get_ptbase(pmap);

	/* From here on the range is expressed as page indices into ptbase. */
	sindex = i386_btop(sva);
	eindex = i386_btop(eva);

	for (; sindex < eindex; sindex = pdnxt) {

		unsigned pdirindex;

		/* Index of the first page covered by the next page table. */
		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));

		pdirindex = sindex / NPDEPG;
		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
			/*
			 * PG_PS set: the directory entry itself is the
			 * mapping.  Strip write permission (and the
			 * modified bit) in place.  The mapping remains
			 * valid, so the resident count must not change
			 * here; pmap_remove() subtracts it when the entry
			 * is actually torn down.  The entry is updated
			 * through an unsigned pointer, the same idiom
			 * pmap_pinit() uses, rather than through a cast
			 * used as an lvalue.
			 */
			*(unsigned *)&pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
			anychanged++;
			continue;
		}

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/* Do not run past the end of the requested range. */
		if (pdnxt > eindex) {
			pdnxt = eindex;
		}

		for (; sindex != pdnxt; sindex++) {

			unsigned pbits;
			vm_page_t m;

			pbits = ptbase[sindex];

			if (pbits & PG_MANAGED) {
				/*
				 * Hand the hardware accessed/modified bits
				 * over to the vm_page before clearing them
				 * in the pte.  The page is looked up at most
				 * once.
				 */
				m = NULL;
				if (pbits & PG_A) {
					m = PHYS_TO_VM_PAGE(pbits);
					vm_page_flag_set(m, PG_REFERENCED);
					pbits &= ~PG_A;
				}
				if (pbits & PG_M) {
					if (pmap_track_modified(i386_ptob(sindex))) {
						if (m == NULL)
							m = PHYS_TO_VM_PAGE(pbits);
						vm_page_dirty(m);
						pbits &= ~PG_M;
					}
				}
			}

			pbits &= ~PG_RW;

			/* Only write the pte back if something changed. */
			if (pbits != ptbase[sindex]) {
				ptbase[sindex] = pbits;
				anychanged = 1;
			}
		}
	}
	if (anychanged)
		pmap_TLB_invalidate_all(pmap);
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
*/ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_offset_t pa; register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va); if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } if (smp_active) { pdeaddr = (vm_offset_t *) IdlePTDS[PCPU_GET(cpuid)]; if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) { if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr)) printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr); printf("cpuid: %d, pdeaddr: 0x%x\n", PCPU_GET(cpuid), pdeaddr); panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], newpte, origpte, va); } } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n", (void *)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m) & PG_FRAME; origpte = *(vm_offset_t *)pte; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. 
Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { *pte |= PG_RW; #ifdef SMP cpu_invlpg((void *)va); if (pmap->pm_active & PCPU_GET(other_cpus)) smp_invltlb(); #else invltlb_1pg(va); #endif } return; } /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. 
*/ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte | PG_A; /*if (origpte)*/ { #ifdef SMP cpu_invlpg((void *)va); if (pmap->pm_active & PCPU_GET(other_cpus)) smp_invltlb(); #else invltlb_1pg(va); #endif } } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap, va, m, mpte) register pmap_t pmap; vm_offset_t va; vm_page_t m; vm_page_t mpte; { unsigned *pte; vm_offset_t pa; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { unsigned ptepindex; vm_offset_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } if (mpte == NULL) goto retry; mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); return 0; } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. 
*/ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) pmap_insert_entry(pmap, va, mpte, m); /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) *pte = pa | PG_V | PG_U; else *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_offset_t pa, int i) { pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); return ((void *)crashdumpmap); } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; int objpgs; if (pmap == NULL || object == NULL) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. 
*/ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0) ) { int i; vm_page_t m[1]; unsigned int ptepindex; int npdes; vm_offset_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) return; retry: p = vm_page_lookup(object, pindex); if (p && vm_page_sleep_busy(p, FALSE, "init4p")) goto retry; if (p == NULL) { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_free(p); return; } p = vm_page_lookup(object, pindex); vm_page_wakeup(p); } ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i=0;ipm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } vm_page_flag_set(p, PG_MAPPED); invltlb(); return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) { if (object->size < pindex) return; psize = object->size - pindex; } mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->resident_page_count >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), p, mpte); vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. 
*/ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), p, mpte); vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE -4 * PAGE_SIZE, 4 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; vm_object_t object; if (!curproc || (pmap != vmspace_pmap(curproc->p_vmspace))) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = 
vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } vm_page_busy(m); mpte = pmap_enter_quick(pmap, addr, m, mpte); vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
*/ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; unsigned src_frame, dst_frame; vm_page_t m; if (dst_addr != src_addr) return; src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) { return; } dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); #if defined(SMP) /* The page directory is not shared between CPUs */ cpu_invltlb(); #else invltlb(); #endif } for(addr = src_addr; addr < end_addr; addr = pdnxt) { unsigned *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; vm_offset_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. 
*/ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) break; pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); ptepindex = addr >> PDRSHIFT; srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = (unsigned *) vtopte(addr); dst_pte = (unsigned *) avtopte(addr); while (addr < pdnxt) { unsigned ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ m = PHYS_TO_VM_PAGE(ptetemp); *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, m); } else { pmap_unwire_pte_hold(dst_pmap, dstmpte); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; src_pte++; dst_pte++; } } } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. 
*/ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP2 busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M; invltlb_1pg((vm_offset_t)CADDR2); #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR2); else #endif bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(phys, off, size) vm_offset_t phys; int off; int size; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP2 busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M; invltlb_1pg((vm_offset_t)CADDR2); #if defined(I686_CPU) if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE) i686_pagezero(CADDR2); else #endif bzero((char *)CADDR2 + off, size); *(int *) CMAP2 = 0; } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*(int *) CMAP2) panic("pmap_copy_page: CMAP2 busy"); *(int *) CMAP1 = PG_V | (src & PG_FRAME) | PG_A; *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else invlpg((u_int)CADDR1); invlpg((u_int)CADDR2); #endif bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. 
*/ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, m) pmap_t pmap; vm_page_t m; { register pv_entry_t pv; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { unsigned *pte, tpte; pv_entry_t pv, npv; int s; vm_page_t m; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curproc || (pmap != vmspace_pmap(curproc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif s = splvm(); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = (unsigned *)vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } *pte = 0; m = PHYS_TO_VM_PAGE(tpte); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %x", tpte)); pv->pv_pmap->pm_stats.resident_count--; /* * Update the vm_page_t clean and reference bits. 
*/ if (tpte & PG_M) { vm_page_dirty(m); } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_FIRST(&m->md.pv_list) == NULL) { vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); pmap_TLB_invalidate_all(pmap); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static boolean_t pmap_testbit(m, bit) vm_page_t m; int bit; { pv_entry_t pv; unsigned *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; if (TAILQ_FIRST(&m->md.pv_list) == NULL) return FALSE; s = splvm(); for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (bit & (PG_A|PG_M)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (*pte & bit) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static __inline void pmap_changebit(m, bit, setem) vm_page_t m; int bit; boolean_t setem; { register pv_entry_t pv; register unsigned *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return; s = splvm(); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? 
*/ for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (setem) { *(int *)pte |= bit; pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va); } else { vm_offset_t pbits = *(vm_offset_t *)pte; if (pbits & bit) { if (bit == PG_RW) { if (pbits & PG_M) { vm_page_dirty(m); } *(int *)pte = pbits & ~(PG_M|PG_RW); } else { *(int *)pte = pbits & ~bit; } pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va); } } } splx(s); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(m, PG_RW, FALSE); } else { pmap_remove_all(m); } } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { register pv_entry_t pv, pvf, pvn; unsigned *pte; int s; int rtval = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rtval); s = splvm(); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pmap_track_modified(pv->pv_va)) continue; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte && (*pte & PG_A)) { *pte &= ~PG_A; pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va); rtval++; if (rtval > 4) { break; } } } while ((pv = pvn) != NULL && pv != pvf); } splx(s); return (rtval); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. 
*/ boolean_t pmap_is_modified(vm_page_t m) { return pmap_testbit(m, PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_changebit(m, PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pmap_changebit(m, PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
*/ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva, offset; unsigned *pte; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); GIANT_REQUIRED; va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } invltlb(); return ((void *)(va + offset)); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { vm_offset_t base, offset; base = va & PG_FRAME; offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); kmem_free(kernel_map, base, size); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; vm_page_t m; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if ((pte = *ptep) != 0) { vm_offset_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } } return val; } void pmap_activate(struct proc *p) { pmap_t pmap; pmap = vmspace_pmap(p->p_vmspace); #if defined(SMP) pmap->pm_active |= 1 << PCPU_GET(cpuid); #else pmap->pm_active |= 1; #endif #if defined(SWTCH_OPTIM_STATS) tlb_flush_count++; #endif load_cr3(p->p_addr->u_pcb.pcb_cr3 = vtophys(pmap->pm_pdir)); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != 
OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte_quick( pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #ifdef used_to_be printf(" -> pmap %p, va %x, flags %x", (void *)pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); 
pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/amd64/amd64/sys_machdep.c =================================================================== --- head/sys/amd64/amd64/sys_machdep.c (revision 82308) +++ head/sys/amd64/amd64/sys_machdep.c (revision 82309) @@ -1,535 +1,537 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)sys_machdep.c 5.5 (Berkeley) 1/19/91 * $FreeBSD$ * */ +#include "opt_upages.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included by sys/user.h */ #include #include /* for kernel_map */ #define MAX_LD 8192 #define LD_PER_PAGE 512 #define NEW_MAX_LD(num) ((num + LD_PER_PAGE) & ~(LD_PER_PAGE-1)) #define SIZE_FROM_LARGEST_LD(num) (NEW_MAX_LD(num) << 3) static int i386_get_ldt __P((struct proc *, char *)); static int i386_set_ldt __P((struct proc *, char *)); static int i386_get_ioperm __P((struct proc *, char *)); static int i386_set_ioperm __P((struct proc *, char *)); #ifdef SMP static void set_user_ldt_rv __P((struct pcb *)); #endif #ifndef _SYS_SYSPROTO_H_ struct sysarch_args { int op; char *parms; }; #endif int sysarch(p, uap) struct proc *p; register struct sysarch_args *uap; { int error = 0; switch(uap->op) { case I386_GET_LDT: error = i386_get_ldt(p, uap->parms); break; case I386_SET_LDT: error = i386_set_ldt(p, uap->parms); break; case I386_GET_IOPERM: error = i386_get_ioperm(p, uap->parms); break; case I386_SET_IOPERM: error = i386_set_ioperm(p, uap->parms); break; case I386_VM86: error = vm86_sysarch(p, uap->parms); break; default: error = EOPNOTSUPP; break; } return (error); } int i386_extend_pcb(struct proc *p) { int i, offset; u_long *addr; struct pcb_ext *ext; struct soft_segment_descriptor ssd = { 0, /* segment base address 
(overwritten) */ ctob(IOPAGES + 1) - 1, /* length */ SDT_SYS386TSS, /* segment type */ 0, /* priority level */ 1, /* descriptor present */ 0, 0, 0, /* default 32 size */ 0 /* granularity */ }; ext = (struct pcb_ext *)kmem_alloc(kernel_map, ctob(IOPAGES+1)); if (ext == 0) return (ENOMEM); bzero(ext, sizeof(struct pcb_ext)); ext->ext_tss.tss_esp0 = (unsigned)p->p_addr + ctob(UPAGES) - 16; ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); /* * The last byte of the i/o map must be followed by an 0xff byte. * We arbitrarily allocate 16 bytes here, to keep the starting * address on a doubleword boundary. */ offset = PAGE_SIZE - 16; ext->ext_tss.tss_ioopt = (offset - ((unsigned)&ext->ext_tss - (unsigned)ext)) << 16; ext->ext_iomap = (caddr_t)ext + offset; ext->ext_vm86.vm86_intmap = (caddr_t)ext + offset - 32; addr = (u_long *)ext->ext_vm86.vm86_intmap; for (i = 0; i < (ctob(IOPAGES) + 32 + 16) / sizeof(u_long); i++) *addr++ = ~0; ssd.ssd_base = (unsigned)&ext->ext_tss; ssd.ssd_limit -= ((unsigned)&ext->ext_tss - (unsigned)ext); ssdtosd(&ssd, &ext->ext_tssd); KASSERT(p == curproc, ("giving a TSS to non-curproc")); KASSERT(p->p_addr->u_pcb.pcb_ext == 0, ("already have a TSS!")); mtx_lock_spin(&sched_lock); p->p_addr->u_pcb.pcb_ext = ext; /* switch to the new TSS after syscall completes */ p->p_sflag |= PS_NEEDRESCHED; mtx_unlock_spin(&sched_lock); return 0; } static int i386_set_ioperm(p, args) struct proc *p; char *args; { int i, error; struct i386_ioperm_args ua; char *iomap; if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0) return (error); if ((error = suser(p)) != 0) return (error); if (securelevel > 0) return (EPERM); /* * XXX * While this is restricted to root, we should probably figure out * whether any other driver is using this i/o address, as so not to * cause confusion. This probably requires a global 'usage registry'. 
*/ if (p->p_addr->u_pcb.pcb_ext == 0) if ((error = i386_extend_pcb(p)) != 0) return (error); iomap = (char *)p->p_addr->u_pcb.pcb_ext->ext_iomap; if (ua.start + ua.length > IOPAGES * PAGE_SIZE * NBBY) return (EINVAL); for (i = ua.start; i < ua.start + ua.length; i++) { if (ua.enable) iomap[i >> 3] &= ~(1 << (i & 7)); else iomap[i >> 3] |= (1 << (i & 7)); } return (error); } static int i386_get_ioperm(p, args) struct proc *p; char *args; { int i, state, error; struct i386_ioperm_args ua; char *iomap; if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0) return (error); if (ua.start >= IOPAGES * PAGE_SIZE * NBBY) return (EINVAL); if (p->p_addr->u_pcb.pcb_ext == 0) { ua.length = 0; goto done; } iomap = (char *)p->p_addr->u_pcb.pcb_ext->ext_iomap; i = ua.start; state = (iomap[i >> 3] >> (i & 7)) & 1; ua.enable = !state; ua.length = 1; for (i = ua.start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) { if (state != ((iomap[i >> 3] >> (i & 7)) & 1)) break; ua.length++; } done: error = copyout(&ua, args, sizeof(struct i386_ioperm_args)); return (error); } /* * Update the GDT entry pointing to the LDT to point to the LDT of the * current process. * * This must be called with sched_lock held. Unfortunately, we can't use a * mtx_assert() here because cpu_switch() calls this function after changing * curproc but before sched_lock's owner is updated in mi_switch(). */ void set_user_ldt(struct pcb *pcb) { struct pcb_ldt *pcb_ldt; pcb_ldt = pcb->pcb_ldt; #ifdef SMP gdt[PCPU_GET(cpuid) * NGDT + GUSERLDT_SEL].sd = pcb_ldt->ldt_sd; #else gdt[GUSERLDT_SEL].sd = pcb_ldt->ldt_sd; #endif lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL)); } #ifdef SMP static void set_user_ldt_rv(struct pcb *pcb) { if (pcb != PCPU_GET(curpcb)) return; mtx_lock_spin(&sched_lock); set_user_ldt(pcb); mtx_unlock_spin(&sched_lock); } #endif /* * Must be called with either sched_lock free or held but not recursed. 
* If it does not return NULL, it will return with it owned.
 *
 * Allocates a fresh struct pcb_ldt with room for at least "len"
 * descriptors (rounded up by NEW_MAX_LD), builds its segment descriptor,
 * and seeds it from pcb's current LDT if there is one or from the default
 * "ldt" otherwise.  The new LDT is NOT installed in the pcb; the caller
 * does that.  Returns NULL, with sched_lock not held, when the descriptor
 * memory cannot be allocated.
 */
struct pcb_ldt *
user_ldt_alloc(struct pcb *pcb, int len)
{
	struct pcb_ldt *pcb_ldt, *new_ldt;

	/* The allocations below run without sched_lock: drop it if we hold it. */
	if (mtx_owned(&sched_lock))
		mtx_unlock_spin(&sched_lock);
	mtx_assert(&sched_lock, MA_NOTOWNED);
	MALLOC(new_ldt, struct pcb_ldt *, sizeof(struct pcb_ldt),
		M_SUBPROC, M_WAITOK);

	/* From here on "len" is the rounded-up descriptor count. */
	new_ldt->ldt_len = len = NEW_MAX_LD(len);
	new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map,
		len * sizeof(union descriptor));
	if (new_ldt->ldt_base == NULL) {
		FREE(new_ldt, M_SUBPROC);
		return NULL;
	}
	new_ldt->ldt_refcnt = 1;
	new_ldt->ldt_active = 0;

	/* Taken here and deliberately still held on the successful return. */
	mtx_lock_spin(&sched_lock);
	gdt_segs[GUSERLDT_SEL].ssd_base = (unsigned)new_ldt->ldt_base;
	gdt_segs[GUSERLDT_SEL].ssd_limit = len * sizeof(union descriptor) - 1;
	ssdtosd(&gdt_segs[GUSERLDT_SEL], &new_ldt->ldt_sd);

	if ((pcb_ldt = pcb->pcb_ldt)) {
		/* Never copy more than the old table actually holds. */
		if (len > pcb_ldt->ldt_len)
			len = pcb_ldt->ldt_len;
		bcopy(pcb_ldt->ldt_base, new_ldt->ldt_base,
			len * sizeof(union descriptor));
	} else {
		bcopy(ldt, new_ldt->ldt_base, sizeof(ldt));
	}
	return new_ldt;
}

/*
 * Must be called either with sched_lock free or held but not recursed.
 * If pcb->pcb_ldt is not NULL, it will return with sched_lock released.
*/ void user_ldt_free(struct pcb *pcb) { struct pcb_ldt *pcb_ldt = pcb->pcb_ldt; if (pcb_ldt == NULL) return; if (!mtx_owned(&sched_lock)) mtx_lock_spin(&sched_lock); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); if (pcb == PCPU_GET(curpcb)) { lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); } pcb->pcb_ldt = NULL; if (--pcb_ldt->ldt_refcnt == 0) { mtx_unlock_spin(&sched_lock); kmem_free(kernel_map, (vm_offset_t)pcb_ldt->ldt_base, pcb_ldt->ldt_len * sizeof(union descriptor)); FREE(pcb_ldt, M_SUBPROC); } else mtx_unlock_spin(&sched_lock); } static int i386_get_ldt(p, args) struct proc *p; char *args; { int error = 0; struct pcb *pcb = &p->p_addr->u_pcb; struct pcb_ldt *pcb_ldt = pcb->pcb_ldt; int nldt, num; union descriptor *lp; struct i386_ldt_args ua, *uap = &ua; if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) return(error); #ifdef DEBUG printf("i386_get_ldt: start=%d num=%d descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif /* verify range of LDTs exist */ if ((uap->start < 0) || (uap->num <= 0)) return(EINVAL); if (pcb_ldt) { nldt = pcb_ldt->ldt_len; num = min(uap->num, nldt); lp = &((union descriptor *)(pcb_ldt->ldt_base))[uap->start]; } else { nldt = sizeof(ldt)/sizeof(ldt[0]); num = min(uap->num, nldt); lp = &ldt[uap->start]; } if (uap->start > nldt) return(EINVAL); error = copyout(lp, uap->descs, num * sizeof(union descriptor)); if (!error) p->p_retval[0] = num; return(error); } static int i386_set_ldt(p, args) struct proc *p; char *args; { int error = 0, i, n; int largest_ld; struct pcb *pcb = &p->p_addr->u_pcb; struct pcb_ldt *pcb_ldt = pcb->pcb_ldt; struct i386_ldt_args ua, *uap = &ua; caddr_t old_ldt_base; int old_ldt_len; critical_t savecrit; if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) return(error); #ifdef DEBUG printf("i386_set_ldt: start=%d num=%d descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif /* verify range of descriptors to modify */ if ((uap->start < 0) || 
(uap->start >= MAX_LD) || (uap->num < 0) || (uap->num > MAX_LD)) { return(EINVAL); } largest_ld = uap->start + uap->num - 1; if (largest_ld >= MAX_LD) return(EINVAL); /* allocate user ldt */ if (!pcb_ldt || largest_ld >= pcb_ldt->ldt_len) { struct pcb_ldt *new_ldt = user_ldt_alloc(pcb, largest_ld); if (new_ldt == NULL) return ENOMEM; if (pcb_ldt) { old_ldt_base = pcb_ldt->ldt_base; old_ldt_len = pcb_ldt->ldt_len; pcb_ldt->ldt_sd = new_ldt->ldt_sd; pcb_ldt->ldt_base = new_ldt->ldt_base; pcb_ldt->ldt_len = new_ldt->ldt_len; mtx_unlock_spin(&sched_lock); kmem_free(kernel_map, (vm_offset_t)old_ldt_base, old_ldt_len * sizeof(union descriptor)); FREE(new_ldt, M_SUBPROC); #ifndef SMP mtx_lock_spin(&sched_lock); #endif } else { pcb->pcb_ldt = pcb_ldt = new_ldt; #ifdef SMP mtx_unlock_spin(&sched_lock); #endif } #ifdef SMP /* signal other cpus to reload ldt */ smp_rendezvous(NULL, (void (*)(void *))set_user_ldt_rv, NULL, pcb); #else set_user_ldt(pcb); mtx_unlock_spin(&sched_lock); #endif } /* Check descriptors for access violations */ for (i = 0, n = uap->start; i < uap->num; i++, n++) { union descriptor desc, *dp; dp = &uap->descs[i]; error = copyin(dp, &desc, sizeof(union descriptor)); if (error) return(error); switch (desc.sd.sd_type) { case SDT_SYSNULL: /* system null */ desc.sd.sd_p = 0; break; case SDT_SYS286TSS: /* system 286 TSS available */ case SDT_SYSLDT: /* system local descriptor table */ case SDT_SYS286BSY: /* system 286 TSS busy */ case SDT_SYSTASKGT: /* system task gate */ case SDT_SYS286IGT: /* system 286 interrupt gate */ case SDT_SYS286TGT: /* system 286 trap gate */ case SDT_SYSNULL2: /* undefined by Intel */ case SDT_SYS386TSS: /* system 386 TSS available */ case SDT_SYSNULL3: /* undefined by Intel */ case SDT_SYS386BSY: /* system 386 TSS busy */ case SDT_SYSNULL4: /* undefined by Intel */ case SDT_SYS386IGT: /* system 386 interrupt gate */ case SDT_SYS386TGT: /* system 386 trap gate */ case SDT_SYS286CGT: /* system 286 call gate */ case SDT_SYS386CGT: 
/* system 386 call gate */ /* I can't think of any reason to allow a user proc * to create a segment of these types. They are * for OS use only. */ return EACCES; /*NOTREACHED*/ /* memory segment types */ case SDT_MEMEC: /* memory execute only conforming */ case SDT_MEMEAC: /* memory execute only accessed conforming */ case SDT_MEMERC: /* memory execute read conforming */ case SDT_MEMERAC: /* memory execute read accessed conforming */ /* Must be "present" if executable and conforming. */ if (desc.sd.sd_p == 0) return (EACCES); break; case SDT_MEMRO: /* memory read only */ case SDT_MEMROA: /* memory read only accessed */ case SDT_MEMRW: /* memory read write */ case SDT_MEMRWA: /* memory read write accessed */ case SDT_MEMROD: /* memory read only expand dwn limit */ case SDT_MEMRODA: /* memory read only expand dwn lim accessed */ case SDT_MEMRWD: /* memory read write expand dwn limit */ case SDT_MEMRWDA: /* memory read write expand dwn lim acessed */ case SDT_MEME: /* memory execute only */ case SDT_MEMEA: /* memory execute only accessed */ case SDT_MEMER: /* memory execute read */ case SDT_MEMERA: /* memory execute read accessed */ break; default: return(EINVAL); /*NOTREACHED*/ } /* Only user (ring-3) descriptors may be present. */ if ((desc.sd.sd_p != 0) && (desc.sd.sd_dpl != SEL_UPL)) return (EACCES); } /* Fill in range */ savecrit = critical_enter(); error = copyin(uap->descs, &((union descriptor *)(pcb_ldt->ldt_base))[uap->start], uap->num * sizeof(union descriptor)); if (!error) p->p_retval[0] = uap->start; critical_exit(savecrit); return(error); } Index: head/sys/amd64/amd64/vm_machdep.c =================================================================== --- head/sys/amd64/amd64/vm_machdep.c (revision 82308) +++ head/sys/amd64/amd64/vm_machdep.c (revision 82309) @@ -1,587 +1,588 @@ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * All rights reserved. 
* * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ * $FreeBSD$ */ #include "opt_npx.h" #ifdef PC98 #include "opt_pc98.h" #endif #include "opt_reset.h" #include "opt_isa.h" +#include "opt_upages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PC98 #include #else #include #endif static void cpu_reset_real __P((void)); #ifdef SMP static void cpu_reset_proxy __P((void)); static u_int cpu_reset_proxyid; static volatile u_int cpu_reset_proxy_active; #endif extern int _ucodesel, _udatasel; /* * quick version of vm_fault */ int vm_fault_quick(v, prot) caddr_t v; int prot; { int r; if (prot & VM_PROT_WRITE) r = subyte(v, fubyte(v)); else r = fubyte(v); return(r); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child * ready to run and return to user mode. */ void cpu_fork(p1, p2, flags) register struct proc *p1, *p2; int flags; { struct pcb *pcb2; #ifdef DEV_NPX int savecrit; #endif if ((flags & RFPROC) == 0) { if ((flags & RFMEM) == 0) { /* unshare user LDT */ struct pcb *pcb1 = &p1->p_addr->u_pcb; struct pcb_ldt *pcb_ldt = pcb1->pcb_ldt; if (pcb_ldt && pcb_ldt->ldt_refcnt > 1) { pcb_ldt = user_ldt_alloc(pcb1,pcb_ldt->ldt_len); if (pcb_ldt == NULL) panic("could not copy LDT"); pcb1->pcb_ldt = pcb_ldt; set_user_ldt(pcb1); user_ldt_free(pcb1); } } return; } /* Ensure that p1's pcb is up to date. */ #ifdef DEV_NPX if (p1 == curproc) p1->p_addr->u_pcb.pcb_gs = rgs(); savecrit = critical_enter(); if (PCPU_GET(npxproc) == p1) npxsave(&p1->p_addr->u_pcb.pcb_save); critical_exit(savecrit); #endif /* Copy p1's pcb. */ p2->p_addr->u_pcb = p1->p_addr->u_pcb; pcb2 = &p2->p_addr->u_pcb; /* * Create a new fresh stack for the new process. 
* Copy the trap frame for the return to user mode as if from a * syscall. This copies most of the user mode register values. */ p2->p_frame = (struct trapframe *) ((int)p2->p_addr + UPAGES * PAGE_SIZE - 16) - 1; bcopy(p1->p_frame, p2->p_frame, sizeof(struct trapframe)); p2->p_frame->tf_eax = 0; /* Child returns zero */ p2->p_frame->tf_eflags &= ~PSL_C; /* success */ p2->p_frame->tf_edx = 1; /* * Set registers for trampoline to user mode. Leave space for the * return address on stack. These are the kernel mode register values. */ pcb2->pcb_cr3 = vtophys(vmspace_pmap(p2->p_vmspace)->pm_pdir); pcb2->pcb_edi = 0; pcb2->pcb_esi = (int)fork_return; /* fork_trampoline argument */ pcb2->pcb_ebp = 0; pcb2->pcb_esp = (int)p2->p_frame - sizeof(void *); pcb2->pcb_ebx = (int)p2; /* fork_trampoline argument */ pcb2->pcb_eip = (int)fork_trampoline; /*- * pcb2->pcb_dr*: cloned above. * pcb2->pcb_ldt: duplicated below, if necessary. * pcb2->pcb_savefpu: cloned above. * pcb2->pcb_flags: cloned above. * pcb2->pcb_onfault: cloned above (always NULL here?). * pcb2->pcb_gs: cloned above. * pcb2->pcb_ext: cleared below. */ /* * XXX don't copy the i/o pages. this should probably be fixed. */ pcb2->pcb_ext = 0; /* Copy the LDT, if necessary. */ mtx_lock_spin(&sched_lock); if (pcb2->pcb_ldt != 0) { if (flags & RFMEM) { pcb2->pcb_ldt->ldt_refcnt++; } else { pcb2->pcb_ldt = user_ldt_alloc(pcb2, pcb2->pcb_ldt->ldt_len); if (pcb2->pcb_ldt == NULL) panic("could not copy LDT"); } } mtx_unlock_spin(&sched_lock); /* * Now, cpu_switch() can schedule the new process. * pcb_esp is loaded pointing to the cpu_switch() stack frame * containing the return address when exiting cpu_switch. * This will normally be to fork_trampoline(), which will have * %ebx loaded with the new proc's pointer. fork_trampoline() * will set up a stack to call fork_return(p, frame); to complete * the return to user-mode. */ } /* * Intercept the return address from a freshly forked process that has NOT * been scheduled yet. 
*
 * This is needed to make kernel threads stay in kernel mode.
 *
 * p:    the freshly forked process whose saved context is patched.
 * func: function stored in the saved %esi slot of p's pcb (the slot
 *       cpu_fork() fills with fork_return).
 * arg:  value stored in the saved %ebx slot of p's pcb.
 */
void
cpu_set_fork_handler(p, func, arg)
	struct proc *p;
	void (*func) __P((void *));
	void *arg;
{
	/*
	 * Note that the trap frame follows the args, so the function
	 * is really called like this:  func(arg, frame);
	 */
	p->p_addr->u_pcb.pcb_esi = (int) func;	/* function */
	p->p_addr->u_pcb.pcb_ebx = (int) arg;	/* first arg */
}

/*
 * Machine-dependent part of process exit.
 *
 * Releases the per-process CPU state hanging off the pcb (FPU state when
 * DEV_NPX is configured, the pcb extension pages, the user LDT and any
 * hardware debug registers), then marks the process a zombie, wakes its
 * parent and switches away through cpu_throw().  This function does not
 * return; reaching the trailing panic() means cpu_throw() came back.
 */
void
cpu_exit(p)
	register struct proc *p;
{
	struct pcb *pcb = &p->p_addr->u_pcb;

#ifdef DEV_NPX
	npxexit(p);
#endif
	if (pcb->pcb_ext != 0) {
	        /*
		 * XXX do we need to move the TSS off the allocated pages
		 * before freeing them?  (not done here)
		 */
		kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ext,
		    ctob(IOPAGES + 1));
		pcb->pcb_ext = 0;
	}
	if (pcb->pcb_ldt)
		user_ldt_free(pcb);
        if (pcb->pcb_flags & PCB_DBREGS) {
                /*
                 * disable all hardware breakpoints
                 */
                reset_dbregs();
                pcb->pcb_flags &= ~PCB_DBREGS;
        }
	/*
	 * Take the process lock and sched_lock, then drop every recursion
	 * level of Giant (MTX_NOSWITCH is passed so that releasing it does
	 * not switch away here).
	 */
	PROC_LOCK(p);
	mtx_lock_spin(&sched_lock);
	while (mtx_owned(&Giant))
		mtx_unlock_flags(&Giant, MTX_NOSWITCH);

	/*
	 * We have to wait until after releasing all locks before
	 * changing p_stat.  If we block on a mutex then we will be
	 * back at SRUN when we resume and our parent will never
	 * harvest us.
	 */
	p->p_stat = SZOMB;

	wakeup(p->p_pptr);
	PROC_UNLOCK_NOSWITCH(p);

	/* Account for the context switch and leave for good. */
	cnt.v_swtch++;
	cpu_throw();
	panic("cpu_exit");
}

/*
 * Release what remains of an exited process: its per-process pmap
 * resources and its reference on the vmspace.  Giant must be held
 * (asserted by GIANT_REQUIRED).
 */
void
cpu_wait(p)
	struct proc *p;
{
	GIANT_REQUIRED;

	/* drop per-process resources */
	pmap_dispose_proc(p);

	/* and clean-out the vmspace */
	vmspace_free(p->p_vmspace);
}

/*
 * Dump the machine specific header information at the start of a core dump.
*/ int cpu_coredump(p, vp, cred) struct proc *p; struct vnode *vp; struct ucred *cred; { int error; caddr_t tempuser; tempuser = malloc(ctob(UPAGES), M_TEMP, M_WAITOK | M_ZERO); if (!tempuser) return EINVAL; bcopy(p->p_addr, tempuser, sizeof(struct user)); bcopy(p->p_frame, tempuser + ((caddr_t) p->p_frame - (caddr_t) p->p_addr), sizeof(struct trapframe)); error = vn_rdwr(UIO_WRITE, vp, (caddr_t) tempuser, ctob(UPAGES), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p); free(tempuser, M_TEMP); return error; } #ifdef notyet static void setredzone(pte, vaddr) u_short *pte; caddr_t vaddr; { /* eventually do this by setting up an expand-down stack segment for ss0: selector, allowing stack access down to top of u. this means though that protection violations need to be handled thru a double fault exception that must do an integral task switch to a known good context, within which a dump can be taken. a sensible scheme might be to save the initial context used by sched (that has physical memory mapped 1:1 at bottom) and take the dump while still in mapped mode */ } #endif /* * Convert kernel VA to physical address */ u_long kvtop(void *addr) { vm_offset_t va; va = pmap_kextract((vm_offset_t)addr); if (va == 0) panic("kvtop: zero page frame"); return((int)va); } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. */ void vmapbuf(bp) register struct buf *bp; { register caddr_t addr, v, kva; vm_offset_t pa; GIANT_REQUIRED; if ((bp->b_flags & B_PHYS) == 0) panic("vmapbuf"); for (v = bp->b_saveaddr, addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE, v += PAGE_SIZE) { /* * Do the vm_fault if needed; do the copy-on-write thing * when reading stuff off device into memory. 
*/ vm_fault_quick(addr, (bp->b_iocmd == BIO_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ); pa = trunc_page(pmap_kextract((vm_offset_t) addr)); if (pa == 0) panic("vmapbuf: page not present"); vm_page_hold(PHYS_TO_VM_PAGE(pa)); pmap_kenter((vm_offset_t) v, pa); } kva = bp->b_saveaddr; bp->b_saveaddr = bp->b_data; bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(bp) register struct buf *bp; { register caddr_t addr; vm_offset_t pa; GIANT_REQUIRED; if ((bp->b_flags & B_PHYS) == 0) panic("vunmapbuf"); for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE) { pa = trunc_page(pmap_kextract((vm_offset_t) addr)); pmap_kremove((vm_offset_t) addr); vm_page_unhold(PHYS_TO_VM_PAGE(pa)); } bp->b_data = bp->b_saveaddr; } /* * Force reset the processor by invalidating the entire address space! */ #ifdef SMP static void cpu_reset_proxy() { cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) ; /* Wait for other cpu to see that we've started */ stop_cpus((1<" */ invltlb(); /* NOTREACHED */ while(1); } int grow_stack(p, sp) struct proc *p; u_int sp; { int rv; rv = vm_map_growstack (p, sp); if (rv != KERN_SUCCESS) return (0); return (1); } /* * Software interrupt handler for queued VM system processing. */ void swi_vm(void *dummy) { if (busdma_swi_pending != 0) busdma_swi(); } /* * Tell whether this address is in some physical memory region. * Currently used by the kernel coredump code in order to avoid * dumping the ``ISA memory hole'' which could cause indefinite hangs, * or other unpredictable behaviour. */ int is_physical_memory(addr) vm_offset_t addr; { #ifdef DEV_ISA /* The ISA ``memory hole''. */ if (addr >= 0xa0000 && addr < 0x100000) return 0; #endif /* * stuff other tests for known memory-mapped devices (PCI?) 
* here */ return 1; } Index: head/sys/amd64/conf/GENERIC =================================================================== --- head/sys/amd64/conf/GENERIC (revision 82308) +++ head/sys/amd64/conf/GENERIC (revision 82309) @@ -1,235 +1,237 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/i386 # # For more information on this file, please read the handbook section on # Kernel Configuration Files: # # http://www.FreeBSD.org/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the NOTES configuration file. If you are # in doubt as to the purpose or necessity of a line, check first in NOTES. # # $FreeBSD$ machine i386 cpu I486_CPU cpu I586_CPU cpu I686_CPU ident GENERIC maxusers 32 #To statically compile in device wiring instead of /boot/device.hints -#hints "GENERIC.hints" #Default places to look for devices. +hints "GENERIC.hints" #Default places to look for devices. makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols options MATH_EMULATE #Support for x87 emulation options INET #InterNETworking options INET6 #IPv6 communications protocols options FFS #Berkeley Fast Filesystem options SOFTUPDATES #Enable FFS soft updates support options MD_ROOT #MD is a potential root device options NFS #Network Filesystem options NFS_ROOT #NFS usable as root device, NFS required options MSDOSFS #MSDOS Filesystem options CD9660 #ISO 9660 Filesystem options PROCFS #Process filesystem options COMPAT_43 #Compatible with BSD 4.3 [KEEP THIS!] 
options SCSI_DELAY=15000 #Delay (in ms) before probing SCSI options UCONSOLE #Allow users to grab the console #options USERCONFIG #boot -c editor #options VISUAL_USERCONFIG #visual boot -c editor options KTRACE #ktrace(1) support options SYSVSHM #SYSV-style shared memory options SYSVMSG #SYSV-style message queues options SYSVSEM #SYSV-style semaphores options P1003_1B #Posix P1003_1B real-time extensions options _KPOSIX_PRIORITY_SCHEDULING options KBD_INSTALL_CDEV # install a CDEV entry in /dev # Debugging for use in -current options DDB options INVARIANTS options INVARIANT_SUPPORT options WITNESS +options UPAGES=4 +options CPU_ENABLE_SSE # To make an SMP kernel, the next two are needed #options SMP # Symmetric MultiProcessor Kernel #options APIC_IO # Symmetric (APIC) I/O device isa device eisa device pci # Floppy drives device fdc # ATA and ATAPI devices device ata device atadisk # ATA disk drives device atapicd # ATAPI CDROM drives device atapifd # ATAPI floppy drives device atapist # ATAPI tape drives options ATA_STATIC_ID #Static device numbering # SCSI Controllers device ahb # EISA AHA1742 family device ahc # AHA2940 and onboard AIC7xxx devices device amd # AMD 53C974 (Tekram DC-390(T)) device isp # Qlogic family #device ncr # NCR/Symbios Logic device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') device adv # Advansys SCSI adapters device adw # Advansys wide SCSI adapters device aha # Adaptec 154x SCSI adapters device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. device bt # Buslogic/Mylex MultiMaster SCSI adapters device ncv # NCR 53C500 device nsp # Workbit Ninja SCSI-3 device stg # TMC 18C30/18C50 # RAID controllers interfaced to the SCSI subsystem device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID device dpt # DPT Smartcache III, IV - See NOTES for options! 
device mly # Mylex AcceleRAID/eXtremeRAID # SCSI peripherals device scbus # SCSI bus (required) device da # Direct Access (disks) device sa # Sequential Access (tape etc) device cd # CD device pass # Passthrough device (direct SCSI access) # RAID controllers device aac # Adaptec FSA RAID device amr # AMI MegaRAID device ida # Compaq Smart RAID device mlx # Mylex DAC960 family device twe # 3ware ATA RAID # atkbdc0 controls both the keyboard and the PS/2 mouse device atkbdc 1 # At keyboard controller device atkbd # at keyboard device psm # psm mouse device vga # VGA screen # splash screen/screen saver device splash # syscons is the default console driver, resembling an SCO console device sc 1 # Enable this for the pcvt (VT220 compatible) console driver #device vt #options XSERVER # support for X server on a vt console #options FAT_CURSOR # start with block cursor # Floating point support - do not disable. device npx # Power management support (see NOTES for more options) device apm # Add suspend/resume support for the i8254. device pmtimer # PCCARD (PCMCIA) support device card # pccard bus device pcic # PCMCIA bridge # Serial (COM) ports device sio # 8250, 16[45]50 based serial ports # Parallel port device ppc device ppbus # Parallel port bus (required) device lpt # Printer device plip # TCP/IP over parallel device ppi # Parallel port interface device #device vpo # Requires scbus and da # PCI Ethernet NICs. device de # DEC/Intel DC21x4x (``Tulip'') device vx # 3Com 3c590, 3c595 (``Vortex'') device txp # 3Com 3cR990 (``Typhoon'') # PCI Ethernet NICs that use the common MII bus controller code. # NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! 
device miibus # MII bus support device dc # DEC/Intel 21143 and various workalikes device fxp # Intel EtherExpress PRO/100B (82557, 82558) device pcn # AMD Am79C97x PCI 10/100 NICs device rl # RealTek 8129/8139 device sf # Adaptec AIC-6915 (``Starfire'') device sis # Silicon Integrated Systems SiS 900/SiS 7016 device ste # Sundance ST201 (D-Link DFE-550TX) device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') device vr # VIA Rhine, Rhine II device wb # Winbond W89C840F device wx # Intel Gigabit Ethernet Card (``Wiseman'') device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # ISA Ethernet NICs. pccard nics included. device cs # Crystal Semiconductor CS89x0 NIC # 'device ed' requires 'device miibus' device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards device ex # Intel EtherExpress Pro/10 and Pro/10+ device ep # Etherlink III based cards device fe # Fujitsu MB8696x based cards device sn # SMC's 9000 series of ethernet chips device xe # Xircom pccard ethernet # The probe order of these is presently determined by i386/isa/isa_compat.c. #device ie #device le device lnc # Wireless NIC cards device an # Aironet 4500/4800 802.11 wireless NICs. device awi # BayStack 660 and others device wi # WaveLAN/IEEE 802.11 wireless NICs. #device wl # Older non 802.11 Wavelan wireless NIC. # Pseudo devices - the number indicates how many units to allocate. device random # Entropy device device loop # Network loopback device ether # Ethernet support device sl # Kernel SLIP device ppp 1 # Kernel PPP device tun # Packet tunnel. device pty # Pseudo-ttys (telnet etc) device md # Memory "disks" device gif # IPv6 and IPv4 tunneling device faith 1 # IPv6-to-IPv4 relaying (translation) # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! 
device bpf # Berkeley packet filter # USB support device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device usb # USB Bus (required) #device udbp # USB Double Bulk Pipe devices device ugen # Generic device uhid # "Human Interface Devices" device ukbd # Keyboard device ulpt # Printer device umass # Disks/Mass storage - Requires scbus and da device ums # Mouse device urio # Diamond Rio 500 MP3 player device uscanner # Scanners # USB Ethernet, requires mii device aue # ADMtek USB ethernet device cue # CATC USB ethernet device kue # Kawasaki LSI USB ethernet Index: head/sys/amd64/include/mptable.h =================================================================== --- head/sys/amd64/include/mptable.h (revision 82308) +++ head/sys/amd64/include/mptable.h (revision 82309) @@ -1,2440 +1,2442 @@ /* * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_cpu.h" +#include "opt_upages.h" #ifdef SMP #include #else #error #endif #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** TEST_DEFAULT_CONFIG, TEST_TEST1 */ #include #include #include +#include #if defined(APIC_IO) #include /* setidt() */ #include /* IPIs */ #include /* IPIs */ #endif /* APIC_IO */ #if defined(TEST_DEFAULT_CONFIG) #define MPFPS_MPFB1 TEST_DEFAULT_CONFIG #else #define MPFPS_MPFB1 mpfps->mpfb1 #endif /* TEST_DEFAULT_CONFIG */ #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #ifdef PC98 #define BIOS_BASE (0xe8000) #define BIOS_SIZE (0x18000) #else #define BIOS_BASE (0xf0000) #define BIOS_SIZE (0x10000) #endif #define BIOS_COUNT (BIOS_SIZE/4) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) #define PROCENTRY_FLAG_EN 0x01 #define PROCENTRY_FLAG_BP 0x02 #define IOAPICENTRY_FLAG_EN 0x01 /* MP Floating Pointer Structure */ typedef struct MPFPS { char signature[4]; void *pap; u_char length; u_char spec_rev; u_char checksum; u_char mpfb1; u_char mpfb2; u_char mpfb3; u_char mpfb4; u_char mpfb5; } *mpfps_t; /* MP Configuration Table Header */ typedef struct 
MPCTH { char signature[4]; u_short base_table_length; u_char spec_rev; u_char checksum; u_char oem_id[8]; u_char product_id[12]; void *oem_table_pointer; u_short oem_table_size; u_short entry_count; void *apic_address; u_short extended_table_length; u_char extended_table_checksum; u_char reserved; } *mpcth_t; typedef struct PROCENTRY { u_char type; u_char apic_id; u_char apic_version; u_char cpu_flags; u_long cpu_signature; u_long feature_flags; u_long reserved1; u_long reserved2; } *proc_entry_ptr; typedef struct BUSENTRY { u_char type; u_char bus_id; char bus_type[6]; } *bus_entry_ptr; typedef struct IOAPICENTRY { u_char type; u_char apic_id; u_char apic_version; u_char apic_flags; void *apic_address; } *io_apic_entry_ptr; typedef struct INTENTRY { u_char type; u_char int_type; u_short int_flags; u_char src_bus_id; u_char src_bus_irq; u_char dst_apic_id; u_char dst_apic_int; } *int_entry_ptr; /* descriptions of MP basetable entries */ typedef struct BASETABLE_ENTRY { u_char type; u_char length; char name[16]; } basetable_entry; /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) && !defined(PC98) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #endif /* CHECK_POINTS */ /* * Values to send to the POST hardware. 
*/ #define MP_BOOTADDRESS_POST 0x10 #define MP_PROBE_POST 0x11 #define MPTABLE_PASS1_POST 0x12 #define MP_START_POST 0x13 #define MP_ENABLE_POST 0x14 #define MPTABLE_PASS2_POST 0x15 #define START_ALL_APS_POST 0x16 #define INSTALL_AP_TRAMP_POST 0x17 #define START_AP_POST 0x18 #define MP_ANNOUNCE_POST 0x19 /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; /** XXX FIXME: what system files declare these??? */ extern struct region_descriptor r_gdt, r_idt; int bsp_apic_ready = 0; /* flags useability of BSP apic */ int mp_naps; /* # of Applications processors */ int mp_nbusses; /* # of busses */ int mp_napics; /* # of IO APICs */ int boot_cpu_id; /* designated BSP */ vm_offset_t cpu_apic_address; vm_offset_t io_apic_address[NAPICID]; /* NAPICID is more than enough */ extern int nkpt; u_int32_t cpu_apic_versions[MAXCPU]; u_int32_t *io_apic_versions; #ifdef APIC_INTR_REORDER struct { volatile int *location; int bit; } apic_isrbit_location[32]; #endif struct apic_intmapinfo int_to_apicintpin[APIC_INTMAPSIZE]; /* * APIC ID logical/physical mapping structures. * We oversize these to simplify boot-time config. */ int cpu_num_to_apic_id[NAPICID]; int io_num_to_apic_id[NAPICID]; int apic_id_to_logical[NAPICID]; /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; static int bootAP; /* Hotwire a 0->4MB V==P mapping */ extern pt_entry_t *KPTphys; /* SMP page table page */ extern pt_entry_t *SMPpt; struct pcb stoppcbs[MAXCPU]; int invltlb_ok = 0; /* throttle smp_invltlb() till safe */ SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, ""); /* * Local data and functions. */ /* Set to 1 once we're ready to let the APs out of the pen. 
*/ static volatile int aps_ready = 0; static int mp_capable; static u_int boot_address; static u_int base_memory; static int picmode; /* 0: virtual wire mode, 1: PIC mode */ static mpfps_t mpfps; static int search_for_sig(u_int32_t target, int count); static void mp_enable(u_int boot_addr); static void mptable_pass1(void); static int mptable_pass2(void); static void default_mp_table(int type); static void fix_mp_table(void); static void setup_apic_irq_mapping(void); static void init_locks(void); static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); void ap_init(void); static int apic_int_is_bus_type(int intr, int bus_type); static void release_aps(void *dummy); /* * initialize all the SMP locks */ /* critical region around IO APIC, apic_imen */ struct mtx imen_mtx; /* lock region used by kernel profiling */ int mcount_lock; #ifdef USE_COMLOCK /* locks com (tty) data/hardware accesses: a FASTINTR() */ struct mtx com_mtx; #endif /* USE_COMLOCK */ static void init_locks(void) { #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", MTX_SPIN); #endif /* USE_COMLOCK */ } /* * Calculate usable address in base memory for AP trampoline code. */ u_int mp_bootaddress(u_int basemem) { POSTCODE(MP_BOOTADDRESS_POST); base_memory = basemem * 1024; /* convert to bytes */ boot_address = base_memory & ~0xfff; /* round down to 4k boundary */ if ((base_memory - boot_address) < bootMP_size) boot_address -= 4096; /* not enough, lower by 4k */ return boot_address; } /* * Look for an Intel MP spec table (ie, SMP capable hardware). 
*/ void i386_mp_probe(void) { int x; u_long segment; u_int32_t target; POSTCODE(MP_PROBE_POST); /* see if EBDA exists */ if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) { /* search first 1K of EBDA */ target = (u_int32_t) (segment << 4); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } else { /* last 1K of base memory, effective 'top of base' passed in */ target = (u_int32_t) (base_memory - 0x400); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } /* search the BIOS */ target = (u_int32_t) BIOS_BASE; if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) goto found; /* nothing found */ mpfps = (mpfps_t)0; mp_capable = 0; return; found: /* calculate needed resources */ mpfps = (mpfps_t)x; mptable_pass1(); /* flag fact that we are running multiple processors */ mp_capable = 1; } int cpu_mp_probe(void) { /* * Record BSP in CPU map * This is done here so that MBUF init code works correctly. */ all_cpus = 1; return (mp_capable); } /* * Initialize the SMP hardware and the APIC and start up the AP's. */ void cpu_mp_start(void) { POSTCODE(MP_START_POST); /* look for MP capable motherboard */ if (mp_capable) mp_enable(boot_address); else panic("MP hardware not found!"); cpu_setregs(); } /* * Print various information about the SMP system hardware and setup. 
*/
/*
 * One line is printed for the BSP (cpu0), one for each of the mp_naps
 * application processors and, when APIC_IO is configured, one for each of
 * the mp_napics I/O APICs.  Every line shows the APIC id, the recorded
 * version and an address; all CPU lines print the same cpu_apic_address.
 */
void
cpu_mp_announce(void)
{
	int     x;

	POSTCODE(MP_ANNOUNCE_POST);

	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
	printf(", version: 0x%08x", cpu_apic_versions[0]);
	printf(", at 0x%08x\n", cpu_apic_address);
	/* APs are numbered 1..mp_naps, hence the inclusive upper bound. */
	for (x = 1; x <= mp_naps; ++x) {
		printf(" cpu%d (AP): apic id: %2d", x, CPU_TO_ID(x));
		printf(", version: 0x%08x", cpu_apic_versions[x]);
		printf(", at 0x%08x\n", cpu_apic_address);
	}

#if defined(APIC_IO)
	for (x = 0; x < mp_napics; ++x) {
		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
		printf(", version: 0x%08x", io_apic_versions[x]);
		printf(", at 0x%08x\n", io_apic_address[x]);
	}
#else
	printf(" Warning: APIC I/O disabled\n");
#endif	/* APIC_IO */
}

/*
 * AP cpu's call this to sync up protected mode.
 *
 * The CPU being started is identified by `bootAP'.  It gets its own slice
 * of the gdt[] array (NGDT entries starting at myid * NGDT) whose private
 * and TSS segments point into SMP_prvspace[myid]; the GDT, IDT, default
 * LDT and task register are then loaded in that order.
 */
void
init_secondary(void)
{
	int	gsel_tss;
	int	x, myid = bootAP;

	/* Point the per-CPU segments at this CPU's private space. */
	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &SMP_prvspace[myid].globaldata.gd_common_tss;
	SMP_prvspace[myid].globaldata.gd_prvspace =
		&SMP_prvspace[myid].globaldata;

	/* Build this CPU's copy of the GDT from the segment templates. */
	for (x = 0; x < NGDT; x++) {
		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int) &gdt[myid * NGDT];
	lgdt(&r_gdt);			/* does magic intra-segment return */

	lidt(&r_idt);

	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* Set up the TSS fields and descriptor, then load the task register. */
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	ltr(gsel_tss);

	pmap_set_opt();
}


#if defined(APIC_IO)
/*
 * Final configuration of the BSP's local APIC:
 *  - disable 'pic mode'.
 *  - disable 'virtual wire mode'.
 *  - enable NMI.
*/
/*
 * bsp_apic_configure: when 'picmode' is set, writes 0x70 to port 0x22,
 * reads port 0x23, sets bit 0 and writes the byte back.  It then sets
 * APIC_LVT_M in lapic.lvt_lint0 and clears APIC_LVT_M in lapic.lvt_lint1,
 * and dumps the APIC state when 'bootverbose' is set.
 */
void
bsp_apic_configure(void)
{
	u_char		byte;
	u_int32_t	temp;

	/* leave 'pic mode' if necessary */
	if (picmode) {
		outb(0x22, 0x70);	/* select IMCR */
		byte = inb(0x23);	/* current contents */
		byte |= 0x01;		/* mask external INTR */
		outb(0x23, byte);	/* disconnect 8259s/NMI */
	}

	/* mask lint0 (the 8259 'virtual wire' connection) */
	temp = lapic.lvt_lint0;
	temp |= APIC_LVT_M;		/* set the mask */
	lapic.lvt_lint0 = temp;

	/* setup lint1 to handle NMI */
	temp = lapic.lvt_lint1;
	temp &= ~APIC_LVT_M;		/* clear the mask */
	lapic.lvt_lint1 = temp;

	if (bootverbose)
		apic_dump("bsp_apic_configure()");
}
#endif  /* APIC_IO */


/*******************************************************************
 * local functions and data
 */

/*
 * start the SMP system
 *
 * Runs the second MP table pass under a temporary V == P mapping, applies
 * the default-configuration and fix-up steps, programs the IO APICs and
 * IPI vectors (APIC_IO only), initializes the SMP locks and then starts
 * the APs using the trampoline address 'boot_addr'.
 */
static void
mp_enable(u_int boot_addr)
{
	int     x;
#if defined(APIC_IO)
	int     apic;
	u_int   ux;
#endif	/* APIC_IO */

	POSTCODE(MP_ENABLE_POST);

	/* turn on 4MB of V == P addressing so we can get to MP table */
	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
	invltlb();

	/* examine the MP table for needed info, uses physical addresses */
	x = mptable_pass2();

	/* tear the temporary mapping down again */
	*(int *)PTD = 0;
	invltlb();

	/* can't process default configs till the CPU APIC is pmapped */
	if (x)
		default_mp_table(x);

	/* post scan cleanup */
	fix_mp_table();
	setup_apic_irq_mapping();

#if defined(APIC_IO)

	/* fill the LOGICAL io_apic_versions table */
	for (apic = 0; apic < mp_napics; ++apic) {
		ux = io_apic_read(apic, IOAPIC_VER);
		io_apic_versions[apic] = ux;
		io_apic_set_id(apic, IO_TO_ID(apic));
	}

	/* program each IO APIC in the system */
	for (apic = 0; apic < mp_napics; ++apic)
		if (io_apic_setup(apic) < 0)
			panic("IO APIC setup failure");

	/* install a 'Spurious INTerrupt' vector */
	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for TLB invalidation */
	setidt(XINVLTLB_OFFSET, Xinvltlb,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding hardclock() */
setidt(XHARDCLOCK_OFFSET, Xhardclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding statclock() */
	setidt(XSTATCLOCK_OFFSET, Xstatclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for all-CPU rendezvous */
	setidt(XRENDEZVOUS_OFFSET, Xrendezvous,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forcing an additional software trap */
	setidt(XCPUAST_OFFSET, Xcpuast,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for CPU stop/restart */
	setidt(XCPUSTOP_OFFSET, Xcpustop,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

#if defined(TEST_TEST1)
	/* install a "fake hardware INTerrupt" vector */
	setidt(XTEST1_OFFSET, Xtest1,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif  /** TEST_TEST1 */

#endif	/* APIC_IO */

	/* initialize all SMP locks */
	init_locks();

	/* start each Application Processor */
	start_all_aps(boot_addr);
}


/*
 * look for the MP spec signature
 */

/* string defined by the Intel MP Spec as identifying the MP table */
#define MP_SIG		0x5f504d5f	/* _MP_ */
/* advance an index by four u_int32_t elements, i.e. 16 bytes */
#define NEXT(X)		((X) += 4)
/*
 * Scan 'count' u_int32_t words starting at KERNBASE + target, examining
 * every fourth word, for MP_SIG.
 *
 * Returns 'target' plus the byte offset of the matching word, or -1 when
 * no word matched.
 */
static int
search_for_sig(u_int32_t target, int count)
{
	int     x;
	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);

	for (x = 0; x < count; NEXT(x))
		if (addr[x] == MP_SIG)
			/* make array index a byte index */
			return (target + (x * sizeof(u_int32_t)));

	return -1;
}


/*
 * Base table entry descriptions: { type code, entry length in bytes, name }.
 * The table walkers index this array by the entry's type byte to find how
 * far to advance.
 */
static basetable_entry basetable_entry_types[] =
{
	{0, 20, "Processor"},
	{1, 8, "Bus"},
	{2, 8, "I/O APIC"},
	{3, 8, "I/O INT"},
	{4, 8, "Local INT"}
};

/* one bus from the MP table: its ID and decoded type */
typedef struct BUSDATA {
	u_char  bus_id;
	enum busTypes bus_type;
}       bus_datum;

/*
 * one interrupt entry; int_vector holds the assigned IRQ, with 0xff used
 * throughout this file to mean "unassigned"
 */
typedef struct INTDATA {
	u_char  int_type;
	u_short int_flags;
	u_char  src_bus_id;
	u_char  src_bus_irq;
	u_char  dst_apic_id;
	u_char  dst_apic_int;
	u_char	int_vector;
}       io_int, local_int;

/* maps a bus type code to its name string (at most 6 characters + NUL) */
typedef struct BUSTYPENAME {
	u_char  type;
	char    name[7];
}       bus_type_name;

/* bus type names, searched by lookup_bus_type() */
static bus_type_name bus_type_table[] =
{
	{CBUS, "CBUS"},
	{CBUSII, "CBUSII"},
	{EISA, "EISA"},
{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{ISA, "ISA"},
	/* NOTE(review): "MCA" appears twice in this table - confirm intended */
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{PCI, "PCI"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{XPRESS, "XPRESS"},
	{UNKNOWN_BUSTYPE, "---"}
};
/*
 * from MP spec v1.4, table 5-1
 *
 * Indexed by (default configuration type - 1).  Column 0 is the number of
 * busses; rows with one bus carry 255 placeholders in the second bus slot.
 */
static int default_data[7][5] =
{
/*   nbus, id0, type0, id1, type1 */
	{1, 0, ISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, MCA, 255, 255},
	{2, 0, ISA, 1, PCI},
	{2, 0, EISA, 1, PCI},
	{2, 0, MCA, 1, PCI}
};


/* the bus data */
static bus_datum *bus_data;

/* the IO INT data, one entry per possible APIC INTerrupt */
static io_int  *io_apic_ints;

/* number of valid entries in io_apic_ints */
static int nintrs;

/* per-entry handlers used by the second MP table pass */
static int processor_entry	__P((proc_entry_ptr entry, int cpu));
static int bus_entry		__P((bus_entry_ptr entry, int bus));
static int io_apic_entry	__P((io_apic_entry_ptr entry, int apic));
static int int_entry		__P((int_entry_ptr entry, int intr));
static int lookup_bus_type	__P((char *name));


/*
 * 1st pass on motherboard's Intel MP specification table.
*
 * initializes:
 *	mp_ncpus = 1
 *
 * determines:
 *	cpu_apic_address (common to all CPUs)
 *	io_apic_address[N]
 *	mp_naps
 *	mp_nbusses
 *	mp_napics
 *	nintrs
 */
static void
mptable_pass1(void)
{
	int	x;
	mpcth_t	cth;
	int	totalSize;
	void*	position;
	int	count;
	int	type;

	POSTCODE(MPTABLE_PASS1_POST);

	/* clear various tables */
	for (x = 0; x < NAPICID; ++x) {
		io_apic_address[x] = ~0;	/* IO APIC address table */
	}

	/* init everything to empty */
	mp_naps = 0;
	mp_nbusses = 0;
	mp_napics = 0;
	nintrs = 0;

	/* check for use of 'default' configuration */
	if (MPFPS_MPFB1 != 0) {
		/* use default addresses */
		cpu_apic_address = DEFAULT_APIC_BASE;
		io_apic_address[0] = DEFAULT_IO_APIC_BASE;

		/* fill in with defaults */
		mp_naps = 2;		/* includes BSP */
		mp_nbusses = default_data[MPFPS_MPFB1 - 1][0];
#if defined(APIC_IO)
		mp_napics = 1;
		nintrs = 16;
#endif	/* APIC_IO */
	}
	else {
		if ((cth = mpfps->pap) == 0)
			panic("MP Configuration Table Header MISSING!");

		cpu_apic_address = (vm_offset_t) cth->apic_address;

		/* walk the table, recording info of interest */
		totalSize = cth->base_table_length - sizeof(struct MPCTH);
		position = (u_char *) cth + sizeof(struct MPCTH);
		count = cth->entry_count;

		while (count--) {
			switch (type = *(u_char *) position) {
			case 0: /* processor_entry */
				/* only enabled processors are counted */
				if (((proc_entry_ptr)position)->cpu_flags
					& PROCENTRY_FLAG_EN)
					++mp_naps;
				break;
			case 1: /* bus_entry */
				++mp_nbusses;
				break;
			case 2: /* io_apic_entry */
				/* only enabled IO APICs get an address slot */
				if (((io_apic_entry_ptr)position)->apic_flags
					& IOAPICENTRY_FLAG_EN)
					io_apic_address[mp_napics++] =
					    (vm_offset_t)((io_apic_entry_ptr)
						position)->apic_address;
				break;
			case 3: /* int_entry */
				++nintrs;
				break;
			case 4:	/* int_entry */
				break;
			default:
				panic("mpfps Base Table HOSED!");
				/* NOTREACHED */
			}

			totalSize -= basetable_entry_types[type].length;
			/*
			 * Advance to the next entry.  Written as a plain
			 * assignment: using a cast as an lvalue is not
			 * valid C.
			 */
			position = (u_char *)position +
			    basetable_entry_types[type].length;
		}
	}

	/* qualify the numbers */
	if (mp_naps > MAXCPU) {
		printf("Warning: only using %d of %d available CPUs!\n",
			MAXCPU, mp_naps);
		mp_naps = MAXCPU;
	}

	/*
	 * Count the BSP.
* This is also used as a counter while starting the APs.
	 */
	mp_ncpus = 1;

	--mp_naps;	/* subtract the BSP */
}


/*
 * 2nd pass on motherboard's Intel MP specification table.
 *
 * sets:
 *	boot_cpu_id
 *	ID_TO_IO(N), phy APIC ID to log CPU/IO table
 *	CPU_TO_ID(N), logical CPU to APIC ID table
 *	IO_TO_ID(N), logical IO to APIC ID table
 *	bus_data[N]
 *	io_apic_ints[N]
 */
static int
mptable_pass2(void)
{
	int     x;
	mpcth_t cth;
	int     totalSize;
	void*   position;
	int     count;
	int     type;
	int     apic, bus, cpu, intr;
	int	i, j;
	int	pgeflag;

	POSTCODE(MPTABLE_PASS2_POST);

	pgeflag = 0;		/* XXX - Not used under SMP yet.  */

	/*
	 * Size the tables from the counts gathered by the first pass.
	 * io_apic_ints gets one spare entry beyond nintrs.
	 */
	MALLOC(io_apic_versions, u_int32_t *, sizeof(u_int32_t) * mp_napics,
	    M_DEVBUF, M_WAITOK);
	MALLOC(ioapic, volatile ioapic_t **, sizeof(ioapic_t *) * mp_napics,
	    M_DEVBUF, M_WAITOK);
	MALLOC(io_apic_ints, io_int *, sizeof(io_int) * (nintrs + 1),
	    M_DEVBUF, M_WAITOK);
	MALLOC(bus_data, bus_datum *, sizeof(bus_datum) * mp_nbusses,
	    M_DEVBUF, M_WAITOK);

	bzero(ioapic, sizeof(ioapic_t *) * mp_napics);

	/*
	 * For each IO APIC, look through the SMPpt slots counting down from
	 * NPTEPG-2 for one to map it through.
	 */
	for (i = 0; i < mp_napics; i++) {
		for (j = 0; j < mp_napics; j++) {
			/* same page frame as a previous IO apic?
*/
			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) ==
			    (io_apic_address[i] & PG_FRAME)) {
				/* reuse the existing mapping, keep the page offset */
				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
					+ (NPTEPG-2-j) * PAGE_SIZE
					+ (io_apic_address[i] & PAGE_MASK));
				break;
			}
			/* use this slot if available */
			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == 0) {
				SMPpt[NPTEPG-2-j] = (pt_entry_t)(PG_V | PG_RW |
				    pgeflag | (io_apic_address[i] & PG_FRAME));
				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
					+ (NPTEPG-2-j) * PAGE_SIZE
					+ (io_apic_address[i] & PAGE_MASK));
				break;
			}
		}
	}

	/* clear various tables */
	for (x = 0; x < NAPICID; ++x) {
		ID_TO_IO(x) = -1;	/* phy APIC ID to log CPU/IO table */
		CPU_TO_ID(x) = -1;	/* logical CPU to APIC ID table */
		IO_TO_ID(x) = -1;	/* logical IO to APIC ID table */
	}

	/* clear bus data table */
	for (x = 0; x < mp_nbusses; ++x)
		bus_data[x].bus_id = 0xff;

	/* clear IO APIC INT table, including the spare entry at [nintrs] */
	for (x = 0; x < (nintrs + 1); ++x) {
		io_apic_ints[x].int_type = 0xff;
		io_apic_ints[x].int_vector = 0xff;
	}

	/* setup the cpu/apic mapping arrays */
	boot_cpu_id = -1;

	/* record whether PIC or virtual-wire mode */
	picmode = (mpfps->mpfb2 & 0x80) ?
1 : 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) return MPFPS_MPFB1; /* return default configuration type */ if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; apic = bus = intr = 0; cpu = 1; /* pre-count the BSP */ while (count--) { switch (type = *(u_char *) position) { case 0: if (processor_entry(position, cpu)) ++cpu; break; case 1: if (bus_entry(position, bus)) ++bus; break; case 2: if (io_apic_entry(position, apic)) ++apic; break; case 3: if (int_entry(position, intr)) ++intr; break; case 4: /* int_entry(position); */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char *) position += basetable_entry_types[type].length; } if (boot_cpu_id == -1) panic("NO BSP found!"); /* report fact that its NOT a default configuration */ return 0; } void assign_apic_irq(int apic, int intpin, int irq) { int x; if (int_to_apicintpin[irq].ioapic != -1) panic("assign_apic_irq: inconsistent table"); int_to_apicintpin[irq].ioapic = apic; int_to_apicintpin[irq].int_pin = intpin; int_to_apicintpin[irq].apic_address = ioapic[apic]; int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) && io_apic_ints[x].dst_apic_int == intpin) io_apic_ints[x].int_vector = irq; } } void revoke_apic_irq(int irq) { int x; int oldapic; int oldintpin; if (int_to_apicintpin[irq].ioapic == -1) panic("assign_apic_irq: inconsistent table"); oldapic = int_to_apicintpin[irq].ioapic; oldintpin = int_to_apicintpin[irq].int_pin; int_to_apicintpin[irq].ioapic = -1; int_to_apicintpin[irq].int_pin = 0; 
int_to_apicintpin[irq].apic_address = NULL; int_to_apicintpin[irq].redirindex = 0; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(oldapic) && io_apic_ints[x].dst_apic_int == oldintpin) io_apic_ints[x].int_vector = 0xff; } } static void allocate_apic_irq(int intr) { int apic; int intpin; int irq; if (io_apic_ints[intr].int_vector != 0xff) return; /* Interrupt handler already assigned */ if (io_apic_ints[intr].int_type != 0 && (io_apic_ints[intr].int_type != 3 || (io_apic_ints[intr].dst_apic_id == IO_TO_ID(0) && io_apic_ints[intr].dst_apic_int == 0))) return; /* Not INT or ExtInt on != (0, 0) */ irq = 0; while (irq < APIC_INTMAPSIZE && int_to_apicintpin[irq].ioapic != -1) irq++; if (irq >= APIC_INTMAPSIZE) return; /* No free interrupt handlers */ apic = ID_TO_IO(io_apic_ints[intr].dst_apic_id); intpin = io_apic_ints[intr].dst_apic_int; assign_apic_irq(apic, intpin, irq); io_apic_setup_intpin(apic, intpin); } static void swap_apic_id(int apic, int oldid, int newid) { int x; int oapic; if (oldid == newid) return; /* Nothing to do */ printf("Changing APIC ID for IO APIC #%d from %d to %d in MP table\n", apic, oldid, newid); /* Swap physical APIC IDs in interrupt entries */ for (x = 0; x < nintrs; x++) { if (io_apic_ints[x].dst_apic_id == oldid) io_apic_ints[x].dst_apic_id = newid; else if (io_apic_ints[x].dst_apic_id == newid) io_apic_ints[x].dst_apic_id = oldid; } /* Swap physical APIC IDs in IO_TO_ID mappings */ for (oapic = 0; oapic < mp_napics; oapic++) if (IO_TO_ID(oapic) == newid) break; if (oapic < mp_napics) { printf("Changing APIC ID for IO APIC #%d from " "%d to %d in MP table\n", oapic, newid, oldid); IO_TO_ID(oapic) = oldid; } IO_TO_ID(apic) = newid; } static void fix_id_to_io_mapping(void) { int x; for (x = 0; x < NAPICID; x++) ID_TO_IO(x) = -1; for (x = 0; x <= mp_naps; x++) if (CPU_TO_ID(x) < NAPICID) ID_TO_IO(CPU_TO_ID(x)) 
= x; for (x = 0; x < mp_napics; x++) if (IO_TO_ID(x) < NAPICID) ID_TO_IO(IO_TO_ID(x)) = x; } static int first_free_apic_id(void) { int freeid, x; for (freeid = 0; freeid < NAPICID; freeid++) { for (x = 0; x <= mp_naps; x++) if (CPU_TO_ID(x) == freeid) break; if (x <= mp_naps) continue; for (x = 0; x < mp_napics; x++) if (IO_TO_ID(x) == freeid) break; if (x < mp_napics) continue; return freeid; } return freeid; } static int io_apic_id_acceptable(int apic, int id) { int cpu; /* Logical CPU number */ int oapic; /* Logical IO APIC number for other IO APIC */ if (id >= NAPICID) return 0; /* Out of range */ for (cpu = 0; cpu <= mp_naps; cpu++) if (CPU_TO_ID(cpu) == id) return 0; /* Conflict with CPU */ for (oapic = 0; oapic < mp_napics && oapic < apic; oapic++) if (IO_TO_ID(oapic) == id) return 0; /* Conflict with other APIC */ return 1; /* ID is acceptable for IO APIC */ } /* * parse an Intel MP specification table */ static void fix_mp_table(void) { int x; int id; int bus_0 = 0; /* Stop GCC warning */ int bus_pci = 0; /* Stop GCC warning */ int num_pci_bus; int apic; /* IO APIC unit number */ int freeid; /* Free physical APIC ID */ int physid; /* Current physical IO APIC ID */ /* * Fix mis-numbering of the PCI bus and its INT entries if the BIOS * did it wrong. The MP spec says that when more than 1 PCI bus * exists the BIOS must begin with bus entries for the PCI bus and use * actual PCI bus numbering. This implies that when only 1 PCI bus * exists the BIOS can choose to ignore this ordering, and indeed many * MP motherboards do ignore it. This causes a problem when the PCI * sub-system makes requests of the MP sub-system based on PCI bus * numbers. So here we look for the situation and renumber the * busses and associated INTs in an effort to "make it right". 
*/

	/* find bus 0, PCI bus, count the number of PCI busses */
	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
		if (bus_data[x].bus_id == 0) {
			bus_0 = x;
		}
		if (bus_data[x].bus_type == PCI) {
			++num_pci_bus;
			bus_pci = x;
		}
	}
	/*
	 * bus_0 == slot of bus with ID of 0
	 * bus_pci == slot of last PCI bus encountered
	 */

	/* check the 1 PCI bus case for sanity */
	/* if it is number 0 all is well */
	if (num_pci_bus == 1 &&
	    bus_data[bus_pci].bus_id != 0) {

		/* mis-numbered, swap with whichever bus uses slot 0 */

		/* swap the bus entry types */
		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
		bus_data[bus_0].bus_type = PCI;

		/* swap each relevant INTerrupt entry */
		id = bus_data[bus_pci].bus_id;
		for (x = 0; x < nintrs; ++x) {
			if (io_apic_ints[x].src_bus_id == id) {
				io_apic_ints[x].src_bus_id = 0;
			}
			else if (io_apic_ints[x].src_bus_id == 0) {
				io_apic_ints[x].src_bus_id = id;
			}
		}
	}

	/* Assign IO APIC IDs.
	 *
	 * First try the existing ID. If a conflict is detected, try
	 * the ID in the MP table.  If a conflict is still detected, find
	 * a free id.
	 *
	 * We cannot use the ID_TO_IO table before all conflicts have been
	 * resolved and the table has been corrected.
*/
	for (apic = 0; apic < mp_napics; ++apic) { /* For all IO APICs */

		/* First try to use the value set by the BIOS */
		physid = io_apic_get_id(apic);
		if (io_apic_id_acceptable(apic, physid)) {
			/* record the hardware's ID in the table data if they differ */
			if (IO_TO_ID(apic) != physid)
				swap_apic_id(apic, IO_TO_ID(apic), physid);
			continue;
		}

		/* Then check if the value in the MP table is acceptable */
		if (io_apic_id_acceptable(apic, IO_TO_ID(apic)))
			continue;

		/* Last resort, find a free APIC ID and use it */
		freeid = first_free_apic_id();
		if (freeid >= NAPICID)
			panic("No free physical APIC IDs found");

		if (io_apic_id_acceptable(apic, freeid)) {
			swap_apic_id(apic, IO_TO_ID(apic), freeid);
			continue;
		}
		panic("Free physical APIC ID not usable");
	}
	fix_id_to_io_mapping();

	/* detect and fix broken Compaq MP table */
	if (apic_int_type(0, 0) == -1) {
		printf("APIC_IO: MP table broken: 8259->APIC entry missing!\n");
		/* the table is allocated with one spare entry for this case */
		io_apic_ints[nintrs].int_type = 3;	/* ExtInt */
		io_apic_ints[nintrs].int_vector = 0xff;	/* Unassigned */
		/* XXX fixme, set src bus id etc, but it doesn't seem to hurt */
		io_apic_ints[nintrs].dst_apic_id = IO_TO_ID(0);
		io_apic_ints[nintrs].dst_apic_int = 0;	/* Pin 0 */
		nintrs++;
	}
}


/*
 * Assign low level interrupt handlers
 *
 * Resets int_to_apicintpin, binds each ISA/EISA INT entry to the IRQ equal
 * to its source bus IRQ, and binds IRQ 0 to (IO APIC 0, pin 0) for an
 * ExtInt entry when IRQ 0 is still free.
 */
static void
setup_apic_irq_mapping(void)
{
	int	x;
	int	int_vector;

	/* Clear array */
	for (x = 0; x < APIC_INTMAPSIZE; x++) {
		int_to_apicintpin[x].ioapic = -1;
		int_to_apicintpin[x].int_pin = 0;
		int_to_apicintpin[x].apic_address = NULL;
		int_to_apicintpin[x].redirindex = 0;
	}

	/* First assign ISA/EISA interrupts */
	for (x = 0; x < nintrs; x++) {
		int_vector = io_apic_ints[x].src_bus_irq;
		if (int_vector < APIC_INTMAPSIZE &&
		    io_apic_ints[x].int_vector == 0xff &&
		    int_to_apicintpin[int_vector].ioapic == -1 &&
		    (apic_int_is_bus_type(x, ISA) ||
		     apic_int_is_bus_type(x, EISA)) &&
		    io_apic_ints[x].int_type == 0) {
			assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
					io_apic_ints[x].dst_apic_int,
					int_vector);
		}
	}

	/* Assign ExtInt entry if no ISA/EISA interrupt 0 entry */
	for (x = 0; x < nintrs; x++) {
		if
(io_apic_ints[x].dst_apic_int == 0 && io_apic_ints[x].dst_apic_id == IO_TO_ID(0) && io_apic_ints[x].int_vector == 0xff && int_to_apicintpin[0].ioapic == -1 && io_apic_ints[x].int_type == 3) { assign_apic_irq(0, 0, 0); break; } } /* PCI interrupt assignment is deferred */ } static int processor_entry(proc_entry_ptr entry, int cpu) { /* check for usability */ if (!(entry->cpu_flags & PROCENTRY_FLAG_EN)) return 0; if(entry->apic_id >= NAPICID) panic("CPU APIC ID out of range (0..%d)", NAPICID - 1); /* check for BSP flag */ if (entry->cpu_flags & PROCENTRY_FLAG_BP) { boot_cpu_id = entry->apic_id; CPU_TO_ID(0) = entry->apic_id; ID_TO_CPU(entry->apic_id) = 0; return 0; /* its already been counted */ } /* add another AP to list, if less than max number of CPUs */ else if (cpu < MAXCPU) { CPU_TO_ID(cpu) = entry->apic_id; ID_TO_CPU(entry->apic_id) = cpu; return 1; } return 0; } static int bus_entry(bus_entry_ptr entry, int bus) { int x; char c, name[8]; /* encode the name into an index */ for (x = 0; x < 6; ++x) { if ((c = entry->bus_type[x]) == ' ') break; name[x] = c; } name[x] = '\0'; if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE) panic("unknown bus type: '%s'", name); bus_data[bus].bus_id = entry->bus_id; bus_data[bus].bus_type = x; return 1; } static int io_apic_entry(io_apic_entry_ptr entry, int apic) { if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN)) return 0; IO_TO_ID(apic) = entry->apic_id; if (entry->apic_id < NAPICID) ID_TO_IO(entry->apic_id) = apic; return 1; } static int lookup_bus_type(char *name) { int x; for (x = 0; x < MAX_BUSTYPE; ++x) if (strcmp(bus_type_table[x].name, name) == 0) return bus_type_table[x].type; return UNKNOWN_BUSTYPE; } static int int_entry(int_entry_ptr entry, int intr) { int apic; io_apic_ints[intr].int_type = entry->int_type; io_apic_ints[intr].int_flags = entry->int_flags; io_apic_ints[intr].src_bus_id = entry->src_bus_id; io_apic_ints[intr].src_bus_irq = entry->src_bus_irq; if (entry->dst_apic_id == 255) { /* This signal goes to 
all IO APICS. Select an IO APIC with sufficient number of interrupt pins */ for (apic = 0; apic < mp_napics; apic++) if (((io_apic_read(apic, IOAPIC_VER) & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >= entry->dst_apic_int) break; if (apic < mp_napics) io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic); else io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; } else io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; io_apic_ints[intr].dst_apic_int = entry->dst_apic_int; return 1; } static int apic_int_is_bus_type(int intr, int bus_type) { int bus; for (bus = 0; bus < mp_nbusses; ++bus) if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id) && ((int) bus_data[bus].bus_type == bus_type)) return 1; return 0; } /* * Given a traditional ISA INT mask, return an APIC mask. */ u_int isa_apic_mask(u_int isa_mask) { int isa_irq; int apic_pin; #if defined(SKIP_IRQ15_REDIRECT) if (isa_mask == (1 << 15)) { printf("skipping ISA IRQ15 redirect\n"); return isa_mask; } #endif /* SKIP_IRQ15_REDIRECT */ isa_irq = ffs(isa_mask); /* find its bit position */ if (isa_irq == 0) /* doesn't exist */ return 0; --isa_irq; /* make it zero based */ apic_pin = isa_apic_irq(isa_irq); /* look for APIC connection */ if (apic_pin == -1) return 0; return (1 << apic_pin); /* convert pin# to a mask */ } /* * Determine which APIC pin an ISA/EISA INT is attached to. 
*/ #define INTTYPE(I) (io_apic_ints[(I)].int_type) #define INTPIN(I) (io_apic_ints[(I)].dst_apic_int) #define INTIRQ(I) (io_apic_ints[(I)].int_vector) #define INTAPIC(I) (ID_TO_IO(io_apic_ints[(I)].dst_apic_id)) #define SRCBUSIRQ(I) (io_apic_ints[(I)].src_bus_irq) int isa_apic_irq(int isa_irq) { int intr; for (intr = 0; intr < nintrs; ++intr) { /* check each record */ if (INTTYPE(intr) == 0) { /* standard INT */ if (SRCBUSIRQ(intr) == isa_irq) { if (apic_int_is_bus_type(intr, ISA) || apic_int_is_bus_type(intr, EISA)) { if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* found */ } } } } return -1; /* NOT found */ } /* * Determine which APIC pin a PCI INT is attached to. */ #define SRCBUSID(I) (io_apic_ints[(I)].src_bus_id) #define SRCBUSDEVICE(I) ((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f) #define SRCBUSLINE(I) (io_apic_ints[(I)].src_bus_irq & 0x03) int pci_apic_irq(int pciBus, int pciDevice, int pciInt) { int intr; --pciInt; /* zero based */ for (intr = 0; intr < nintrs; ++intr) /* check each record */ if ((INTTYPE(intr) == 0) /* standard INT */ && (SRCBUSID(intr) == pciBus) && (SRCBUSDEVICE(intr) == pciDevice) && (SRCBUSLINE(intr) == pciInt)) /* a candidate IRQ */ if (apic_int_is_bus_type(intr, PCI)) { if (INTIRQ(intr) == 0xff) allocate_apic_irq(intr); if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* exact match */ } return -1; /* NOT found */ } int next_apic_irq(int irq) { int intr, ointr; int bus, bustype; bus = 0; bustype = 0; for (intr = 0; intr < nintrs; intr++) { if (INTIRQ(intr) != irq || INTTYPE(intr) != 0) continue; bus = SRCBUSID(intr); bustype = apic_bus_type(bus); if (bustype != ISA && bustype != EISA && bustype != PCI) continue; break; } if (intr >= nintrs) { return -1; } for (ointr = intr + 1; ointr < nintrs; ointr++) { if (INTTYPE(ointr) != 0) continue; if (bus != SRCBUSID(ointr)) continue; if (bustype == PCI) { if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr)) continue; if (SRCBUSLINE(intr) != 
SRCBUSLINE(ointr))
				continue;
		}
		if (bustype == ISA || bustype == EISA) {
			if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr))
				continue;
		}
		/* an entry on the same pin is not a different connection */
		if (INTPIN(intr) == INTPIN(ointr))
			continue;
		break;
	}
	if (ointr >= nintrs) {
		return -1;
	}
	return INTIRQ(ointr);
}
#undef SRCBUSLINE
#undef SRCBUSDEVICE
#undef SRCBUSID
#undef SRCBUSIRQ

#undef INTPIN
#undef INTIRQ
#undef INTAPIC
#undef INTTYPE


/*
 * Reprogram the MB chipset to NOT redirect an ISA INTerrupt.
 *
 * XXX FIXME:
 *  Exactly what this means is unclear at this point.  It is a solution
 *  for motherboards that redirect the MBIRQ0 pin.  Generically a motherboard
 *  could route any of the ISA INTs to upper (>15) IRQ values.  But most would
 *  NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an
 *  option.
 *
 * As written, only a message is printed (when 'bootverbose' is set); the
 * return value is -1 when READY is defined and 0 otherwise.
 */
int
undirect_isa_irq(int rirq)
{
#if defined(READY)
	if (bootverbose)
	    printf("Freeing redirected ISA irq %d.\n", rirq);
	/** FIXME: tickle the MB redirector chip */
	return -1;
#else
	if (bootverbose)
	    printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq);
	return 0;
#endif  /* READY */
}


/*
 * Reprogram the MB chipset to NOT redirect a PCI INTerrupt
 *
 * As written, only a message is printed (when 'bootverbose' is set); the
 * return value is -1 when READY is defined and 0 otherwise.
 */
int
undirect_pci_irq(int rirq)
{
#if defined(READY)
	if (bootverbose)
		printf("Freeing redirected PCI irq %d.\n", rirq);

	/** FIXME: tickle the MB redirector chip */
	return -1;
#else
	if (bootverbose)
		printf("Freeing (NOT implemented) redirected PCI irq %d.\n",
		       rirq);
	return 0;
#endif  /* READY */
}


/*
 * given a bus ID, return:
 *  the bus type if found
 *  -1 if NOT found
 */
int
apic_bus_type(int id)
{
	int     x;

	/* the first bus_data slot with a matching ID wins */
	for (x = 0; x < mp_nbusses; ++x)
		if (bus_data[x].bus_id == id)
			return bus_data[x].bus_type;

	return -1;
}


/*
 * given a LOGICAL APIC# and pin#, return:
 *  the associated src bus ID if found
 *  -1 if NOT found
 */
int
apic_src_bus_id(int apic, int pin)
{
	int     x;

	/* search each of the possible INTerrupt sources */
	for (x = 0; x < nintrs; ++x)
		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
		    (pin == io_apic_ints[x].dst_apic_int))
			return (io_apic_ints[x].src_bus_id);

	return -1;
/* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated src bus IRQ if found * -1 if NOT found */ int apic_src_bus_irq(int apic, int pin) { int x; for (x = 0; x < nintrs; x++) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].src_bus_irq); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated INTerrupt type if found * -1 if NOT found */ int apic_int_type(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_type); return -1; /* NOT found */ } int apic_irq(int apic, int pin) { int x; int res; for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) { res = io_apic_ints[x].int_vector; if (res == 0xff) return -1; if (apic != int_to_apicintpin[res].ioapic) panic("apic_irq: inconsistent table"); if (pin != int_to_apicintpin[res].int_pin) panic("apic_irq inconsistent table (2)"); return res; } return -1; } /* * given a LOGICAL APIC# and pin#, return: * the associated trigger mode if found * -1 if NOT found */ int apic_trigger(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return ((io_apic_ints[x].int_flags >> 2) & 0x03); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated 'active' level if found * -1 if NOT found */ int apic_polarity(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_flags & 0x03); return -1; /* NOT found */ } /* * set 
data according to MP defaults * FIXME: probably not complete yet... */ static void default_mp_table(int type) { int ap_cpu_id; #if defined(APIC_IO) int io_apic_id; int pin; #endif /* APIC_IO */ #if 0 printf(" MP default config type: %d\n", type); switch (type) { case 1: printf(" bus: ISA, APIC: 82489DX\n"); break; case 2: printf(" bus: EISA, APIC: 82489DX\n"); break; case 3: printf(" bus: EISA, APIC: 82489DX\n"); break; case 4: printf(" bus: MCA, APIC: 82489DX\n"); break; case 5: printf(" bus: ISA+PCI, APIC: Integrated\n"); break; case 6: printf(" bus: EISA+PCI, APIC: Integrated\n"); break; case 7: printf(" bus: MCA+PCI, APIC: Integrated\n"); break; default: printf(" future type\n"); break; /* NOTREACHED */ } #endif /* 0 */ boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24; ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0; /* BSP */ CPU_TO_ID(0) = boot_cpu_id; ID_TO_CPU(boot_cpu_id) = 0; /* one and only AP */ CPU_TO_ID(1) = ap_cpu_id; ID_TO_CPU(ap_cpu_id) = 1; #if defined(APIC_IO) /* one and only IO APIC */ io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24; /* * sanity check, refer to MP spec section 3.6.6, last paragraph * necessary as some hardware isn't properly setting up the IO APIC */ #if defined(REALLY_ANAL_IOAPICID_VALUE) if (io_apic_id != 2) { #else if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) { #endif /* REALLY_ANAL_IOAPICID_VALUE */ io_apic_set_id(0, 2); io_apic_id = 2; } IO_TO_ID(0) = io_apic_id; ID_TO_IO(io_apic_id) = 0; #endif /* APIC_IO */ /* fill out bus entries */ switch (type) { case 1: case 2: case 3: case 4: case 5: case 6: case 7: bus_data[0].bus_id = default_data[type - 1][1]; bus_data[0].bus_type = default_data[type - 1][2]; bus_data[1].bus_id = default_data[type - 1][3]; bus_data[1].bus_type = default_data[type - 1][4]; break; /* case 4: case 7: MCA NOT supported */ default: /* illegal/reserved */ panic("BAD default MP config: %d", type); /* NOTREACHED */ } #if defined(APIC_IO) /* general cases from MP v1.4, table 5-2 */ 
for (pin = 0; pin < 16; ++pin) { io_apic_ints[pin].int_type = 0; io_apic_ints[pin].int_flags = 0x05; /* edge/active-hi */ io_apic_ints[pin].src_bus_id = 0; io_apic_ints[pin].src_bus_irq = pin; /* IRQ2 caught below */ io_apic_ints[pin].dst_apic_id = io_apic_id; io_apic_ints[pin].dst_apic_int = pin; /* 1-to-1 */ } /* special cases from MP v1.4, table 5-2 */ if (type == 2) { io_apic_ints[2].int_type = 0xff; /* N/C */ io_apic_ints[13].int_type = 0xff; /* N/C */ #if !defined(APIC_MIXED_MODE) /** FIXME: ??? */ panic("sorry, can't support type 2 default yet"); #endif /* APIC_MIXED_MODE */ } else io_apic_ints[2].src_bus_irq = 0; /* ISA IRQ0 is on APIC INT 2 */ if (type == 7) io_apic_ints[0].int_type = 0xff; /* N/C */ else io_apic_ints[0].int_type = 3; /* vectored 8259 */ #endif /* APIC_IO */ } /* * start each AP in our list */ static int start_all_aps(u_int boot_addr) { int x, i, pg; u_char mpbiosreason; u_long mpbioswarmvec; struct globaldata *gd; char *stack; uintptr_t kptbase; POSTCODE(START_ALL_APS_POST); mtx_init(&ap_boot_mtx, "ap boot", MTX_SPIN); /* initialize BSP's local APIC */ apic_initialize(); bsp_apic_ready = 1; /* install the AP 1st level boot code */ install_ap_tramp(boot_addr); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_long *) WARMBOOT_OFF); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); #endif /* set up temporary P==V mapping for AP boot */ /* XXX this is a hack, we should boot the AP on its own stack/PTD */ kptbase = (uintptr_t)(void *)KPTphys; for (x = 0; x < NKPT; x++) PTD[x] = (pd_entry_t)(PG_V | PG_RW | ((kptbase + x * PAGE_SIZE) & PG_FRAME)); invltlb(); /* start each AP */ for (x = 1; x <= mp_naps; ++x) { /* This is a bit verbose, it will go away soon. 
*/

		/* first page of AP's private space */
		pg = x * i386_btop(sizeof(struct privatespace));

		/* allocate a new private data page */
		gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE);

		/* wire it into the private page table page */
		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd));

		/* allocate and set up an idle stack data page */
		stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE);
		/* the UPAGES stack pages follow the data page in SMPpt */
		for (i = 0; i < UPAGES; i++)
			SMPpt[pg + 1 + i] = (pt_entry_t)
			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));

		/* prime data page for it to use */
		gd->gd_cpuid = x;
		globaldata_register(gd);

		/* setup a vector to our boot code */
		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
		*((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
#ifndef PC98
		outb(CMOS_REG, BIOS_RESET);
		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
#endif

		/* hand the AP its stack top and CPU number (init_secondary reads bootAP) */
		bootSTK = &SMP_prvspace[x].idlestack[UPAGES*PAGE_SIZE];
		bootAP = x;

		/* attempt to start the Application Processor */
		CHECK_INIT(99);	/* setup checkpoints */
		if (!start_ap(x, boot_addr)) {
			printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x));
			CHECK_PRINT("trace");	/* show checkpoints */
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}
		CHECK_PRINT("trace");		/* show checkpoints */

		/* record its version info */
		/*
		 * NOTE(review): cpu_apic_versions[0] is only assigned from
		 * lapic.version after this loop - confirm it holds the
		 * intended value at this point.
		 */
		cpu_apic_versions[x] = cpu_apic_versions[0];

		all_cpus |= (1 << x);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid)));

	/* fill in our (BSP) APIC version */
	cpu_apic_versions[0] = lapic.version;

	/* restore the warmstart vector */
	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
#ifndef PC98
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);
#endif

	/*
	 * Set up the idle context for the BSP.  Similar to above except
	 * that some was done by locore, some by pmap.c and some is implicit
	 * because the BSP is cpu#0 and the page is initially zero, and also
	 * because we can refer to variables by name on the BSP..
*/ /* Allocate and setup BSP idle stack */ stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE); for (i = 0; i < UPAGES; i++) SMPpt[1 + i] = (pt_entry_t) (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); for (x = 0; x < NKPT; x++) PTD[x] = 0; pmap_set_opt(); /* number of APs actually started */ return mp_ncpus - 1; } /* * load the 1st level AP boot code into base memory. */ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(u_int boot_addr) { int x; int size = *(int *) ((u_long) & bootMP_size); u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) boot_addr + KERNBASE; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; POSTCODE(INSTALL_AP_TRAMP_POST); for (x = 0; x < size; ++x) *dst++ = *src++; /* * modify addresses in code we just moved to basemem. unfortunately we * need fairly detailed info about mpboot.s for this to work. changes * to mpboot.s might require changes here. 
*/ /* boot code is located in KERNEL space */ dst = (u_char *) boot_addr + KERNBASE; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_addr + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); *dst32 = ((u_int) MPentry - KERNBASE); /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_addr & 0xffff; *dst8 = ((u_int) boot_addr >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_addr & 0xffff; *dst8 = ((u_int) boot_addr >> 16) & 0xff; } /* * this function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It ain't pretty, * but it seems to work. */ static int start_ap(int logical_cpu, u_int boot_addr) { int physical_cpu; int vector; int cpus; u_long icr_lo, icr_hi; POSTCODE(START_AP_POST); /* get the PHYSICAL APIC ID# */ physical_cpu = CPU_TO_ID(logical_cpu); /* calculate the vector */ vector = (boot_addr >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_ncpus; /* * first we do an INIT/RESET IPI this INIT IPI might be run, reseting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. 
*/ /* setup the address for the target AP */ icr_hi = lapic.icr_hi & ~APIC_ID_MASK; icr_hi |= (physical_cpu << 24); lapic.icr_hi = icr_hi; /* do an INIT IPI: assert RESET */ icr_lo = lapic.icr_lo & 0xfff00000; lapic.icr_lo = icr_lo | 0x0000c500; /* wait for pending status end */ while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; /* do an INIT IPI: deassert RESET */ lapic.icr_lo = icr_lo | 0x00008500; /* wait for pending status end */ u_sleep(10000); /* wait ~10mS */ while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ /* do a STARTUP IPI */ lapic.icr_lo = icr_lo | 0x00000600 | vector; while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; u_sleep(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic.icr_lo = icr_lo | 0x00000600 | vector; while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; u_sleep(200); /* wait ~200uS */ /* wait for it to start */ set_apic_timer(5000000);/* == 5 seconds */ while (read_apic_timer()) if (mp_ncpus > cpus) return 1; /* return SUCCESS */ return 0; /* return FAILURE */ } /* * Flush the TLB on all other CPU's * * XXX: Needs to handshake and wait for completion before proceding. 
*/ void smp_invltlb(void) { #if defined(APIC_IO) if (smp_started && invltlb_ok) ipi_all_but_self(IPI_INVLTLB); #endif /* APIC_IO */ } void invlpg(u_int addr) { __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); /* send a message to the other CPUs */ smp_invltlb(); } void invltlb(void) { u_long temp; /* * This should be implemented as load_cr3(rcr3()) when load_cr3() is * inlined. */ __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); /* send a message to the other CPUs */ smp_invltlb(); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ extern void enable_sse(void); void ap_init(void) { u_int apic_id; /* spin until all the AP's are ready */ while (!aps_ready) /* spin */ ; /* * Set curproc to our per-cpu idleproc so that mutexes have * something unique to lock with. */ PCPU_SET(curproc, PCPU_GET(idleproc)); PCPU_SET(spinlocks, NULL); /* lock against other AP's that are waking up */ mtx_lock_spin(&ap_boot_mtx); /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); smp_cpus++; #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid))); printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); /* set up CPU registers and state */ cpu_setregs(); /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); /* set up SSE registers */ enable_sse(); /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); printf("SMP: apic_id = %d\n", apic_id); printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]); panic("cpuid mismatch! 
boom!!"); } /* Init local apic for irq's */ apic_initialize(); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); /* * Activate smp_invltlb, although strictly speaking, this isn't * quite correct yet. We should have a bitfield for cpus willing * to accept TLB flush IPI's or something and sync them. */ if (smp_cpus == mp_ncpus) { invltlb_ok = 1; smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } /* let other AP's wake up now */ mtx_unlock_spin(&ap_boot_mtx); /* wait until all the AP's are up */ while (smp_started == 0) ; /* nothing */ microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* ok, now grab sched_lock and enter the scheduler */ enable_intr(); mtx_lock_spin(&sched_lock); cpu_throw(); /* doesn't return */ panic("scheduler returned us to ap_init"); } /* * For statclock, we send an IPI to all CPU's to have them call this * function. */ void forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); statclock_process(curproc, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_statclock(void) { int map; CTR0(KTR_SMP, "forward_statclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_STATCLOCK); } /* * For each hardclock(), we send an IPI to all other CPU's to have them * execute this function. It would be nice to reduce contention on * sched_lock if we could simply peek at the CPU to determine the user/kernel * state and call hardclock_process() on the CPU receiving the clock interrupt * and then just use a simple IPI to handle any ast's if needed. 
*/ void forwarded_hardclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); hardclock_process(curproc, TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_hardclock(void) { u_int map; CTR0(KTR_SMP, "forward_hardclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_HARDCLOCK); } #ifdef APIC_INTR_REORDER /* * Maintain mapping from softintr vector to isr bit in local apic. */ void set_lapic_isrloc(int intr, int vector) { if (intr < 0 || intr > 32) panic("set_apic_isrloc: bad intr argument: %d",intr); if (vector < ICU_OFFSET || vector > 255) panic("set_apic_isrloc: bad vector argument: %d",vector); apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2); apic_isrbit_location[intr].bit = (1<<(vector & 31)); } #endif /* * send an IPI to a set of cpus. */ void ipi_selected(u_int32_t cpus, u_int ipi) { CTR2(KTR_SMP, __func__ ": cpus: %x ipi: %x", cpus, ipi); selected_apic_ipi(cpus, ipi, APIC_DELMODE_FIXED); } /* * send an IPI INTerrupt containing 'vector' to all CPUs, including myself */ void ipi_all(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLISELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLESELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to myself */ void ipi_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_SELF, ipi, APIC_DELMODE_FIXED); } void release_aps(void *dummy __unused) { atomic_store_rel_int(&aps_ready, 1); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: head/sys/amd64/include/pcpu.h =================================================================== --- head/sys/amd64/include/pcpu.h (revision 82308) +++ head/sys/amd64/include/pcpu.h (revision 82309) @@ -1,99 +1,79 @@ /*- * Copyright (c) Peter Wemm * All 
rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_GLOBALDATA_H_ #define _MACHINE_GLOBALDATA_H_ #ifdef _KERNEL #include #include /* XXX */ #ifdef KTR_PERCPU #include #endif /* * This structure maps out the global data that needs to be kept on a * per-cpu basis. genassym uses this to generate offsets for the assembler * code, which also provides external symbols so that C can get at them as * though they were really globals. * * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. 
* The reason for doing it via a struct is so that an array of pointers
 * to each CPU's data can be set up for things like "check curproc on all
 * other processors"
 */
struct globaldata {
	struct	globaldata *gd_prvspace;	/* self-reference */
	struct	proc *gd_curproc;		/* current process */
	struct	proc *gd_idleproc;		/* idle process */
	/* presumably the process owning this CPU's FPU (npx) state -- TODO confirm */
	struct	proc *gd_npxproc;
	struct	pcb *gd_curpcb;			/* current pcb */
	/* stamped with microuptime() by ap_init() before entering the scheduler */
	struct	timeval gd_switchtime;
	struct	i386tss gd_common_tss;
	/* set to the global 'ticks' by ap_init() before entering the scheduler */
	int	gd_switchticks;
	struct	segment_descriptor gd_common_tssd;
	struct	segment_descriptor *gd_tss_gdt;
	int	gd_currentldt;
	u_int	gd_cpuid;			/* this cpu number */
	u_int	gd_other_cpus;			/* all other cpus */
	/* list linkage; presumably the list globaldata_register() maintains -- verify */
	SLIST_ENTRY(globaldata) gd_allcpu;
	/* reset to NULL by ap_init(); presumably the spin locks held by this CPU */
	struct	lock_list_entry *gd_spinlocks;
#ifdef KTR_PERCPU
	volatile int	gd_ktr_idx;		/* Index into trace table */
	char	*gd_ktr_buf;
	char	gd_ktr_buf_data[KTR_SIZE];
#endif
};

-#ifdef SMP
-/*
- * This is the upper (0xff800000) address space layout that is per-cpu.
- * It is setup in locore.s and pmap.c for the BSP and in mp_machdep.c for
- * each AP.  genassym helps export this to the assembler code.
- */
-struct privatespace {
-	/* page 0 - data page */
-	struct	globaldata globaldata;
-	char	__filler0[PAGE_SIZE - sizeof(struct globaldata)];
-
-	/* page 1 - idle stack (UPAGES pages) */
-	char	idlestack[UPAGES * PAGE_SIZE];
-	/* page 1+UPAGES... */
-};
-
-extern struct privatespace SMP_prvspace[];
-
-#endif
-
 #endif	/* _KERNEL */

 #endif	/* ! _MACHINE_GLOBALDATA_H_ */
Index: head/sys/conf/NOTES
===================================================================
--- head/sys/conf/NOTES (revision 82308)
+++ head/sys/conf/NOTES (revision 82309)
@@ -1,2909 +1,2910 @@
 #
 # NOTES -- Lines that can be cut/pasted into kernel and hints configs.
 #
 # Lines that begin with 'device', 'options', 'machine', 'ident', 'maxusers',
 # 'makeoptions', 'hints' etc go into the kernel configuration that you
 # run config(8) with.
 #
 # Lines that begin with 'hints.' are NOT for config(8), they go into your
 #
See /boot/device.hints and/or the 'hints' config(8) directive. # # Please use ``make LINT'' to create an old-style LINT file if you want to # do kernel test-builds. # # $FreeBSD$ # # # This directive is mandatory; it defines the architecture to be # configured for; in this case, the 386 family based IBM-PC and # compatibles. # machine i386 # # This is the ``identification'' of the kernel. Usually this should # be the same as the name of your kernel. # ident LINT # # The `maxusers' parameter controls the static sizing of a number of # internal system tables by a complicated formula defined in param.c. # maxusers 10 # # We want LINT to cover profiling as well profile 2 # # The `makeoptions' parameter allows variables to be passed to the # generated Makefile in the build area. # # CONF_CFLAGS gives some extra compiler flags that are added to ${CFLAGS} # after most other flags. Here we use it to inhibit use of non-optimal # gcc builtin functions (e.g., memcmp). # # DEBUG happens to be magic. # The following is equivalent to 'config -g KERNELNAME' and creates # 'kernel.debug' compiled with -g debugging as well as a normal # 'kernel'. Use 'make install.debug' to install the debug kernel # but that isn't normally necessary as the debug symbols are not loaded # by the kernel and are not useful there anyway. # # KERNEL can be overridden so that you can change the default name of your # kernel. # makeoptions CONF_CFLAGS=-fno-builtin #Don't allow use of memcmp, etc. #makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols #makeoptions KERNEL=foo #Build kernel "foo" and install "/foo" # # Certain applications can grow to be larger than the 512M limit # that FreeBSD initially imposes. Below are some options to # allow that limit to grow to 1GB, and can be increased further # with changing the parameters. MAXDSIZ is the maximum that the # limit can be set to, and the DFLDSIZ is the default value for # the limit. MAXSSIZ is the maximum that the stack limit can be # set to. 
You might want to set the default lower than the max, # and explicitly set the maximum with a shell command for processes # that regularly exceed the limit like INND. # options MAXDSIZ="(1024UL*1024*1024)" options MAXSSIZ="(128UL*1024*1024)" options DFLDSIZ="(1024UL*1024*1024)" # # BLKDEV_IOSIZE sets the default block size used in user block # device I/O. Note that this value will be overridden by the label # when specifying a block device from a label with a non-0 # partition blocksize. The default is PAGE_SIZE. # options BLKDEV_IOSIZE=8192 # Options for the VM subsystem options PQ_CACHESIZE=512 # color for 512k/16k cache +options UPAGES=3 # number of 4k stack pages per process # Deprecated options supported for backwards compatibility #options PQ_NOOPT # No coloring #options PQ_LARGECACHE # color for 512k/16k cache #options PQ_HUGECACHE # color for 1024k/16k cache #options PQ_MEDIUMCACHE # color for 256k/16k cache #options PQ_NORMALCACHE # color for 64k/16k cache # This allows you to actually store this configuration file into # the kernel binary itself, where it may be later read by saying: # strings -n 3 /boot/kernel/kernel | sed -n 's/^___//p' > MYKERNEL # options INCLUDE_CONFIG_FILE # Include this file in kernel # # The root device and filesystem type can be compiled in; # this provides a fallback option if the root device cannot # be correctly guessed by the bootstrap code, or an override if # the RB_DFLTROOT flag (-r) is specified when booting the kernel. # options ROOTDEVNAME=\"ufs:da0s2e\" ##################################################################### # SMP OPTIONS: # # SMP enables building of a Symmetric MultiProcessor Kernel. # APIC_IO enables the use of the IO APIC for Symmetric I/O. # # Notes: # # An SMP kernel will ONLY run on an Intel MP spec. qualified motherboard. # # Be sure to disable 'cpu I386_CPU' && 'cpu I486_CPU' for SMP kernels. # # Check the 'Rogue SMP hardware' section to see if additional options # are required by your hardware. 
# # Mandatory: options SMP # Symmetric MultiProcessor Kernel options APIC_IO # Symmetric (APIC) I/O # # Rogue SMP hardware: # # Bridged PCI cards: # # The MP tables of most of the current generation MP motherboards # do NOT properly support bridged PCI cards. To use one of these # cards you should refer to ??? # SMP Debugging Options: # # MUTEX_DEBUG enables various extra assertions in the mutex code. # WITNESS enables the mutex witness code which detects deadlocks and cycles # during locking operations. # WITNESS_DDB causes the witness code to drop into the kernel debugger if # a lock hierarchy violation occurs or if locks are held when going to # sleep. # WITNESS_SKIPSPIN disables the witness checks on spin mutexes. options MUTEX_DEBUG options WITNESS options WITNESS_DDB options WITNESS_SKIPSPIN ##################################################################### # CPU OPTIONS # # You must specify at least one CPU (the one you intend to run on); # deleting the specification for CPUs you don't need to use may make # parts of the system run faster. # I386_CPU is mutually exclusive with the other CPU types. # #cpu I386_CPU cpu I486_CPU cpu I586_CPU # aka Pentium(tm) cpu I686_CPU # aka Pentium Pro(tm) # # Options for CPU features. # # CPU_BLUELIGHTNING_FPU_OP_CACHE enables FPU operand cache on IBM # BlueLightning CPU. It works only with Cyrix FPU, and this option # should not be used with Intel FPU. # # CPU_BLUELIGHTNING_3X enables triple-clock mode on IBM Blue Lightning # CPU if CPU supports it. The default is double-clock mode on # BlueLightning CPU box. # # CPU_BTB_EN enables branch target buffer on Cyrix 5x86 (NOTE 1). # # CPU_DIRECT_MAPPED_CACHE sets L1 cache of Cyrix 486DLC CPU in direct # mapped mode. Default is 2-way set associative mode. # # CPU_CYRIX_NO_LOCK enables weak locking for the entire address space # of Cyrix 6x86 and 6x86MX CPUs by setting the NO_LOCK bit of CCR1. # Otherwise, the NO_LOCK bit of CCR1 is cleared. 
(NOTE 3) # # CPU_DISABLE_5X86_LSSER disables load store serialize (i.e. enables # reorder). This option should not be used if you use memory mapped # I/O device(s). # # CPU_ENABLE_SSE enables SSE/MMX2 instructions support. # # CPU_FASTER_5X86_FPU enables faster FPU exception handler. # # CPU_I486_ON_386 enables CPU cache on i486 based CPU upgrade products # for i386 machines. # # CPU_IORT defines I/O clock delay time (NOTE 1). Default values of # I/O clock delay time on Cyrix 5x86 and 6x86 are 0 and 7,respectively # (no clock delay). # # CPU_L2_LATENCY specifed the L2 cache latency value. This option is used # only when CPU_PPRO2CELERON is defined and Mendocino Celeron is detected. # The default value is 5. # # CPU_LOOP_EN prevents flushing the prefetch buffer if the destination # of a jump is already present in the prefetch buffer on Cyrix 5x86(NOTE # 1). # # CPU_PPRO2CELERON enables L2 cache of Mendocino Celeron CPUs. This option # is useful when you use Socket 8 to Socket 370 converter, because most Pentium # Pro BIOSs do not enable L2 cache of Mendocino Celeron CPUs. # # CPU_RSTK_EN enables return stack on Cyrix 5x86 (NOTE 1). # # CPU_SUSP_HLT enables suspend on HALT. If this option is set, CPU # enters suspend mode following execution of HALT instruction. # # CPU_WT_ALLOC enables write allocation on Cyrix 6x86/6x86MX and AMD # K5/K6/K6-2 cpus. # # CYRIX_CACHE_WORKS enables CPU cache on Cyrix 486 CPUs with cache # flush at hold state. # # CYRIX_CACHE_REALLY_WORKS enables (1) CPU cache on Cyrix 486 CPUs # without cache flush at hold state, and (2) write-back CPU cache on # Cyrix 6x86 whose revision < 2.7 (NOTE 2). # # NO_F00F_HACK disables the hack that prevents Pentiums (and ONLY # Pentiums) from locking up when a LOCK CMPXCHG8B instruction is # executed. This option is only needed if I586_CPU is also defined, # and should be included for any non-Pentium CPU that defines it. 
# # NO_MEMORY_HOLE is an optimisation for systems with AMD K6 processors # which indicates that the 15-16MB range is *definitely* not being # occupied by an ISA memory hole. # # NOTE 1: The options, CPU_BTB_EN, CPU_LOOP_EN, CPU_IORT, # CPU_LOOP_EN and CPU_RSTK_EN should not be used because of CPU bugs. # These options may crash your system. # # NOTE 2: If CYRIX_CACHE_REALLY_WORKS is not set, CPU cache is enabled # in write-through mode when revision < 2.7. If revision of Cyrix # 6x86 >= 2.7, CPU cache is always enabled in write-back mode. # # NOTE 3: This option may cause failures for software that requires # locked cycles in order to operate correctly. # options CPU_BLUELIGHTNING_FPU_OP_CACHE options CPU_BLUELIGHTNING_3X options CPU_BTB_EN options CPU_DIRECT_MAPPED_CACHE options CPU_DISABLE_5X86_LSSER options CPU_ENABLE_SSE options CPU_FASTER_5X86_FPU options CPU_I486_ON_386 options CPU_IORT options CPU_L2_LATENCY=5 options CPU_LOOP_EN options CPU_PPRO2CELERON options CPU_RSTK_EN options CPU_SUSP_HLT options CPU_WT_ALLOC options CYRIX_CACHE_WORKS options CYRIX_CACHE_REALLY_WORKS #options NO_F00F_HACK # # A math emulator is mandatory if you wish to run on hardware which # does not have a floating-point processor. Pick either the original, # bogus (but freely-distributable) math emulator, or a much more # fully-featured but GPL-licensed emulator taken from Linux. # options MATH_EMULATE #Support for x87 emulation # Don't enable both of these in a real config. options GPL_MATH_EMULATE #Support for x87 emulation via #new math emulator ##################################################################### # COMPATIBILITY OPTIONS # # Implement system calls compatible with 4.3BSD and older versions of # FreeBSD. You probably do NOT want to remove this as much current code # still relies on the 4.3 emulation. 
# options COMPAT_43 # # These three options provide support for System V Interface # Definition-style interprocess communication, in the form of shared # memory, semaphores, and message queues, respectively. # options SYSVSHM options SYSVSEM options SYSVMSG ##################################################################### # DEBUGGING OPTIONS # # Enable the kernel debugger. # options DDB # # Don't drop into DDB for a panic. Intended for unattended operation # where you may want to drop to DDB from the console, but still want # the machine to recover from a panic # options DDB_UNATTENDED # # If using GDB remote mode to debug the kernel, there's a non-standard # extension to the remote protocol that can be used to use the serial # port as both the debugging port and the system console. It's non- # standard and you're on your own if you enable it. See also the # "remotechat" variables in the FreeBSD specific version of gdb. # options GDB_REMOTE_CHAT # # KTRACE enables the system-call tracing facility ktrace(2). # options KTRACE #kernel tracing # # KTR is a kernel tracing mechanism imported from BSD/OS. Currently it # has no userland interface aside from a few sysctl's. It is enabled with # the KTR option. The KTR_EXTEND option causes trace events to be generated # as a string from snprintf rather than as a string and up to 5 argument # pointers. KTR_ENTRIES defines the number of entries in the circular trace # buffer. KTR_COMPILE defines the mask of events to compile into the kernel # as defined by the KTR_* constants in . KTR_MASK defines the # initial value of the ktr_mask variable which determines at runtime what # events to trace. KTR_CPUMASK determines which CPU's log events, with # bit X corresponding to cpu X. KTR_VERBOSE enables dumping of KTR events # to the console by default. This functionality can be toggled via the # debug.ktr_verbose sysctl and defaults to off if KTR_VERBOSE is not defined. 
# options KTR options KTR_EXTEND options KTR_ENTRIES=1024 options KTR_COMPILE="(KTR_INTR|KTR_PROC)" options KTR_MASK=KTR_INTR options KTR_CPUMASK=0x3 options KTR_VERBOSE # # The INVARIANTS option is used in a number of source files to enable # extra sanity checking of internal structures. This support is not # enabled by default because of the extra time it would take to check # for these conditions, which can only occur as a result of # programming errors. # options INVARIANTS # # The INVARIANT_SUPPORT option makes us compile in support for # verifying some of the internal structures. It is a prerequisite for # 'INVARIANTS', as enabling 'INVARIANTS' will make these functions be # called. The intent is that you can set 'INVARIANTS' for single # source files (by changing the source file or specifying it on the # command line) if you have 'INVARIANT_SUPPORT' enabled. Also, if you # wish to build a kernel module with 'INVARIANTS', then adding # 'INVARIANT_SUPPORT' to your kernel will provide all the necessary # infrastructure without the added overhead. # options INVARIANT_SUPPORT # # The DIAGNOSTIC option is used to enable extra debugging information # from some parts of the kernel. As this makes everything more noisy, # it is disabled by default. # options DIAGNOSTIC # # REGRESSION causes optional kernel interfaces necessary only for regression # testing to be enabled. These interfaces may constitute security risks # when enabled, as they permit processes to easily modify aspects of the # run-time environment to reproduce unlikely or unusual (possibly normally # impossible) scenarios. # options REGRESSION # # RESTARTABLE_PANICS allows one to continue from a panic as if it were # a call to the debugger via the Debugger() function instead. It is only # useful if a kernel debugger is present. To restart from a panic, reset # the panicstr variable to NULL and continue execution. 
This option is # for development use only and should NOT be used in production systems # to "workaround" a panic. # options RESTARTABLE_PANICS # # PERFMON causes the driver for Pentium/Pentium Pro performance counters # to be compiled. See perfmon(4) for more information. # options PERFMON # # This option let some drivers co-exist that can't co-exist in a running # system. This is used to be able to compile all kernel code in one go for # quality assurance purposes (like this file, which the option takes it name # from.) # options COMPILING_LINT # XXX - this doesn't belong here. # Allow ordinary users to take the console - this is useful for X. options UCONSOLE # XXX - this doesn't belong here either #options USERCONFIG #boot -c editor #options INTRO_USERCONFIG #imply -c and show intro screen #options VISUAL_USERCONFIG #visual boot -c editor ##################################################################### # NETWORKING OPTIONS # # Protocol families: # Only the INET (Internet) family is officially supported in FreeBSD. # Source code for the NS (Xerox Network Service) is provided for amusement # value. # options INET #Internet communications protocols options INET6 #IPv6 communications protocols options IPSEC #IP security options IPSEC_ESP #IP security (crypto; define w/ IPSEC) options IPSEC_DEBUG #debug for IP security options IPX #IPX/SPX communications protocols options IPXIP #IPX in IP encapsulation (not available) options IPTUNNEL #IP in IPX encapsulation (not available) options NCP #NetWare Core protocol options NETATALK #Appletalk communications protocols options NETATALKDEBUG #Appletalk debugging # These are currently broken but are shipped due to interest. #options NS #Xerox NS protocols #options NSIP #XNS over IP # mchain library. It can be either loaded as KLD or compiled into kernel options LIBMCHAIN # netgraph(4). Enable the base netgraph code with the NETGRAPH option. 
# Individual node types can be enabled with the corresponding option # listed below; however, this is not strictly necessary as netgraph # will automatically load the corresponding KLD module if the node type # is not already compiled into the kernel. Each type below has a # corresponding man page, e.g., ng_async(8). options NETGRAPH #netgraph(4) system options NETGRAPH_ASYNC options NETGRAPH_BPF options NETGRAPH_CISCO options NETGRAPH_ECHO options NETGRAPH_ETHER options NETGRAPH_FRAME_RELAY options NETGRAPH_HOLE options NETGRAPH_IFACE options NETGRAPH_KSOCKET options NETGRAPH_LMI # MPPC compression requires proprietary files (not included) #options NETGRAPH_MPPC_COMPRESSION options NETGRAPH_MPPC_ENCRYPTION options NETGRAPH_ONE2MANY options NETGRAPH_PPP options NETGRAPH_PPPOE options NETGRAPH_PPTPGRE options NETGRAPH_RFC1490 options NETGRAPH_SOCKET options NETGRAPH_SPLIT options NETGRAPH_TEE options NETGRAPH_TTY options NETGRAPH_UI options NETGRAPH_VJC device mn # Munich32x/Falc54 Nx64kbit/sec cards. device lmc # tulip based LanMedia WAN cards device musycc # LMC/SBE LMC1504 quad T1/E1 # # Network interfaces: # The `loop' device is MANDATORY when networking is enabled. # The `ether' device provides generic code to handle # Ethernets; it is MANDATORY when a Ethernet device driver is # configured or token-ring is enabled. # The `fddi' device provides generic code to support FDDI. # The `sppp' device serves a similar role for certain types # of synchronous PPP links (like `cx', `ar'). # The `sl' device implements the Serial Line IP (SLIP) service. # The `ppp' device implements the Point-to-Point Protocol. # The `bpf' device enables the Berkeley Packet Filter. Be # aware of the legal and administrative consequences of enabling this # option. The number of devices determines the maximum number of # simultaneous BPF clients programs runnable. # The `disc' device implements a minimal network interface, # which throws away all packets sent and never receives any. 
It is # included for testing purposes. This shows up as the `ds' interface. # The `tap' device is a pty-like virtual Ethernet interface # The `tun' device implements (user-)ppp and nos-tun # The `gif' device implements IPv6 over IP4 tunneling, # IPv4 over IPv6 tunneling, IPv4 over IPv4 tunneling and # IPv6 over IPv6 tunneling. # The XBONEHACK option allows the same pair of addresses to be configured on # multiple gif interfaces. # The `faith' device captures packets sent to it and diverts them # to the IPv4/IPv6 translation daemon. # The `stf' device implements 6to4 encapsulation. # The `ef' device provides support for multiple ethernet frame types # specified via ETHER_* options. See ef(4) for details. # # The PPP_BSDCOMP option enables support for compress(1) style entire # packet compression, the PPP_DEFLATE is for zlib/gzip style compression. # PPP_FILTER enables code for filtering the ppp data stream and selecting # events for resetting the demand dial activity timer - requires bpf. # See pppd(8) for more details. 
# device ether #Generic Ethernet device vlan 1 #VLAN support device token #Generic TokenRing device fddi #Generic FDDI device sppp #Generic Synchronous PPP device loop 1 #Network loopback device device bpf #Berkeley packet filter device disc #Discard device (ds0, ds1, etc) device tap #Virtual Ethernet driver device tun #Tunnel driver (ppp(8), nos-tun(8)) device sl #Serial Line IP device ppp 2 #Point-to-point protocol options PPP_BSDCOMP #PPP BSD-compress support options PPP_DEFLATE #PPP zlib/deflate/gzip support options PPP_FILTER #enable bpf filtering (needs bpf) device ef # Multiple ethernet frames support options ETHER_II # enable Ethernet_II frame options ETHER_8023 # enable Ethernet_802.3 (Novell) frame options ETHER_8022 # enable Ethernet_802.2 frame options ETHER_SNAP # enable Ethernet_802.2/SNAP frame # for IPv6 device gif #IPv6 and IPv4 tunneling options XBONEHACK device faith 1 #for IPv6 and IPv4 translation device stf #6to4 IPv6 over IPv4 encapsulation # # Internet family options: # # MROUTING enables the kernel multicast packet forwarder, which works # with mrouted(8). # # IPFIREWALL enables support for IP firewall construction, in # conjunction with the `ipfw' program. IPFIREWALL_VERBOSE sends # logged packets to the system logger. IPFIREWALL_VERBOSE_LIMIT # limits the number of times a matching entry can be logged. # # WARNING: IPFIREWALL defaults to a policy of "deny ip from any to any" # and if you do not add other rules during startup to allow access, # YOU WILL LOCK YOURSELF OUT. It is suggested that you set firewall_type=open # in /etc/rc.conf when first enabling this feature, then refining the # firewall rules in /etc/rc.firewall after you've tested that the new kernel # feature works properly. # # IPFIREWALL_DEFAULT_TO_ACCEPT causes the default rule (at boot) to # allow everything. Use with care, if a cracker can crash your # firewall machine, they can get to your protected machines. 
However, # if you are using it as an as-needed filter for specific problems as # they arise, then this may be for you. Changing the default to 'allow' # means that you won't get stuck if the kernel and /sbin/ipfw binary get # out of sync. # # IPDIVERT enables the divert IP sockets, used by ``ipfw divert'' # # IPSTEALTH enables code to support stealth forwarding (i.e., forwarding # packets without touching the ttl). This can be useful to hide firewalls # from traceroute and similar tools. # # TCPDEBUG enables code which keeps traces of the TCP state machine # for sockets with the SO_DEBUG option set, which can then be examined # using the trpt(8) utility. # options MROUTING # Multicast routing options IPFIREWALL #firewall options IPFIREWALL_VERBOSE #print information about # dropped packets options IPFIREWALL_FORWARD #enable transparent proxy support options IPFIREWALL_VERBOSE_LIMIT=100 #limit verbosity options IPFIREWALL_DEFAULT_TO_ACCEPT #allow everything by default options IPV6FIREWALL #firewall for IPv6 options IPV6FIREWALL_VERBOSE options IPV6FIREWALL_VERBOSE_LIMIT=100 options IPV6FIREWALL_DEFAULT_TO_ACCEPT options IPDIVERT #divert sockets options IPFILTER #ipfilter support options IPFILTER_LOG #ipfilter logging options IPFILTER_DEFAULT_BLOCK #block all packets by default options IPSTEALTH #support for stealth forwarding options TCPDEBUG # RANDOM_IP_ID causes the ID field in IP packets to be randomized # instead of incremented by 1 with each packet generated. This # option closes a minor information leak which allows remote # observers to determine the rate of packet generation on the # machine by watching the counter. options RANDOM_IP_ID # Statically Link in accept filters options ACCEPT_FILTER_DATA options ACCEPT_FILTER_HTTP # TCP_DROP_SYNFIN adds support for ignoring TCP packets with SYN+FIN. This # prevents nmap et al. from identifying the TCP/IP stack, but breaks support # for RFC1644 extensions and is not recommended for web servers. 
# options TCP_DROP_SYNFIN #drop TCP packets with SYN+FIN # DUMMYNET enables the "dummynet" bandwidth limiter. You need # IPFIREWALL as well. See the dummynet(4) manpage for more info. # BRIDGE enables bridging between ethernet cards -- see bridge(4). # You can use IPFIREWALL and dummynet together with bridging. options DUMMYNET options BRIDGE # # ATM (HARP version) options # # ATM_CORE includes the base ATM functionality code. This must be included # for ATM support. # # ATM_IP includes support for running IP over ATM. # # At least one (and usually only one) of the following signalling managers # must be included (note that all signalling managers include PVC support): # ATM_SIGPVC includes support for the PVC-only signalling manager `sigpvc'. # ATM_SPANS includes support for the `spans' signalling manager, which runs # the FORE Systems's proprietary SPANS signalling protocol. # ATM_UNI includes support for the `uni30' and `uni31' signalling managers, # which run the ATM Forum UNI 3.x signalling protocols. # # The `hea' driver provides support for the Efficient Networks, Inc. # ENI-155p ATM PCI Adapter. # # The `hfa' driver provides support for the FORE Systems, Inc. # PCA-200E ATM PCI Adapter. # options ATM_CORE #core ATM protocol family options ATM_IP #IP over ATM support options ATM_SIGPVC #SIGPVC signalling manager options ATM_SPANS #SPANS signalling manager options ATM_UNI #UNI signalling manager device hea #Efficient ENI-155p ATM PCI device hfa #FORE PCA-200E ATM PCI ##################################################################### # FILESYSTEM OPTIONS # # Only the root, /usr, and /tmp filesystems need be statically # compiled; everything else will be automatically loaded at mount # time. (Exception: the UFS family--- FFS --- cannot # currently be demand-loaded.) Some people still prefer to statically # compile other filesystems as well. 
# # NB: The NULL, PORTAL, UMAP and UNION filesystems are known to be # buggy, and WILL panic your system if you attempt to do anything with # them. They are included here as an incentive for some enterprising # soul to sit down and fix them. # # One of these is mandatory: options FFS #Fast filesystem options NFS #Network File System # The rest are optional: #options NFS_NOSERVER #Disable the NFS-server code. options CD9660 #ISO 9660 filesystem options FDESCFS #File descriptor filesystem options HPFS #OS/2 File system options MSDOSFS #MS DOS File System (FAT, FAT32) options NTFS #NT File System options NULLFS #NULL filesystem options NWFS #NetWare filesystem options PORTALFS #Portal filesystem options PROCFS #Process filesystem options PSEUDOFS #Pseudo-filesystem framework options UMAPFS #UID map filesystem options UNIONFS #Union filesystem # options NODEVFS #disable devices filesystem # The xFS_ROOT options REQUIRE the associated ``options xFS'' options NFS_ROOT #NFS usable as root device # This code enables IFS, an FFS which exports inodes as the namespace. # You can find details in src/sys/ufs/ifs/README . options IFS # Soft updates is a technique for improving file system speed and # making abrupt shutdown less risky. # options SOFTUPDATES # Extended attributes allow additional data to be associated with files, # and is used for ACLs, Capabilities, and MAC labels. # See src/sys/ufs/ufs/README.extattr for more information. options UFS_EXTATTR options UFS_EXTATTR_AUTOSTART # Access Control List support for UFS filesystems. The current ACL # implementation requires extended attribute support, UFS_EXTATTR, # for the underlying filesystem. # See src/sys/ufs/ufs/README.acls for more information. options UFS_ACL # Directory hashing improves the speed of operations on very large # directories at the expense of some memory. options UFS_DIRHASH # Make space in the kernel for a root filesystem on a md device. 
# Define to the number of kilobytes to reserve for the filesystem. options MD_ROOT_SIZE=10 # Make the md device a potential root device, either with preloaded # images of type mfs_root or md_root. options MD_ROOT # Allow this many swap-devices. # # In order to manage swap, the system must reserve bitmap space that # scales with the largest mounted swap device multiplied by NSWAPDEV, # regardless of whether other swap devices exist or not. So it # is not a good idea to make this value too large. options NSWAPDEV=5 # Disk quotas are supported when this option is enabled. options QUOTA #enable disk quotas # If you are running a machine just as a fileserver for PC and MAC # users, using SAMBA or Netatalk, you may consider setting this option # and keeping all those users' directories on a filesystem that is # mounted with the suiddir option. This gives new files the same # ownership as the directory (similar to group). It's a security hole # if you let these users run programs, so confine it to file-servers # (but it'll save you lots of headaches in those cases). Root owned # directories are exempt and X bits are cleared. The suid bit must be # set on the directory as well; see chmod(1). PC owners can't see/set # ownerships so they keep getting their toes trodden on. This saves # you all the support calls as the filesystem it's used on will act as # they expect: "It's my dir so it must be my file". # options SUIDDIR # NFS options: options NFS_MINATTRTIMO=3 # VREG attrib cache timeout in sec options NFS_MAXATTRTIMO=60 options NFS_MINDIRATTRTIMO=30 # VDIR attrib cache timeout in sec options NFS_MAXDIRATTRTIMO=60 options NFS_GATHERDELAY=10 # Default write gather delay (msec) options NFS_UIDHASHSIZ=29 # Tune the size of nfssvc_sock with this options NFS_WDELAYHASHSIZ=16 # and with this options NFS_MUIDHASHSIZ=63 # Tune the size of nfsmount with this options NFS_DEBUG # Enable NFS Debugging # Coda stuff: options CODA #CODA filesystem.
device vcoda 4 #coda minicache <-> venus comm. # # Add support for the EXT2FS filesystem of Linux fame. Be a bit # careful with this - the ext2fs code has a tendency to lag behind # changes and not be exercised very much, so mounting read/write could # be dangerous (and even mounting read only could result in panics.) # options EXT2FS # Use real implementations of the aio_* system calls. There are numerous # stability issues in the current aio code that make it unsuitable for # inclusion on shell boxes. options VFS_AIO # Enable the code UFS IO optimization through the VM system. This allows # the use of VM operations instead of copying operations when possible. # # Even with this enabled, actual use of the code is still controlled by the # sysctl vfs.ioopt. 0 gives no optimization, 1 gives normal (use VM # operations if a request happens to fit), 2 gives aggressive optimization # (the operations are split to do as much as possible through the VM system.) # # Enabling this will probably not give an overall speedup except for # special workloads. options ENABLE_VFS_IOOPT # Cryptographically secure random number generator; /dev/[u]random device random ##################################################################### # POSIX P1003.1B # Real time extensions added in the 1993 Posix # P1003_1B: Infrastructure # _KPOSIX_PRIORITY_SCHEDULING: Build in _POSIX_PRIORITY_SCHEDULING # _KPOSIX_VERSION: Version kernel is built for options P1003_1B options _KPOSIX_PRIORITY_SCHEDULING options _KPOSIX_VERSION=199309L ##################################################################### # CLOCK OPTIONS # The granularity of operation is controlled by the kernel option HZ whose # default value (100) means a granularity of 10ms. For an accurate simulation # of high data rates it might be necessary to reduce the timer granularity to # 1ms or less. Consider, however, that some interfaces using programmed I/O # may require a considerable time to output packets.
So, reducing the # granularity too much might actually cause ticks to be missed thus reducing # the accuracy of operation. options HZ=100 # Other clock options options CLK_CALIBRATION_LOOP options CLK_USE_I8254_CALIBRATION options CLK_USE_TSC_CALIBRATION ##################################################################### # SCSI DEVICES # SCSI DEVICE CONFIGURATION # The SCSI subsystem consists of the `base' SCSI code, a number of # high-level SCSI device `type' drivers, and the low-level host-adapter # device drivers. The host adapters are listed in the ISA and PCI # device configuration sections below. # # Beginning with FreeBSD 2.0.5 you can wire down your SCSI devices so # that a given bus, target, and LUN always come on line as the same # device unit. In earlier versions the unit numbers were assigned # in the order that the devices were probed on the SCSI bus. This # means that if you removed a disk drive, you may have had to rewrite # your /etc/fstab file, and also that you had to be careful when adding # a new disk as it may have been probed earlier and moved your device # configuration around. # This old behavior is maintained as the default behavior. The unit # assignment begins with the first non-wired down unit for a device # type. For example, if you wire a disk as "da3" then the first # non-wired disk will be assigned da4. # The syntax for wiring down devices is: hint.scbus.0.at="ahc0" hint.scbus.1.at="ahc1" hint.scbus.1.bus="0" hint.scbus.3.at="ahc2" hint.scbus.3.bus="0" hint.scbus.2.at="ahc2" hint.scbus.2.bus="1" hint.da.0.at="scbus0" hint.da.0.target="0" hint.da.0.unit="0" hint.da.1.at="scbus3" hint.da.1.target="1" hint.da.2.at="scbus2" hint.da.2.target="3" hint.sa.1.at="scbus1" hint.sa.1.target="6" # "units" (SCSI logical unit number) that are not specified are # treated as if specified as LUN 0. # All SCSI devices allocate as many units as are required. # The ch driver drives SCSI Media Changer ("jukebox") devices. 
# # The da driver drives SCSI Direct Access ("disk") and Optical Media # ("WORM") devices. # # The sa driver drives SCSI Sequential Access ("tape") devices. # # The cd driver drives SCSI Read Only Direct Access ("cd") devices. # # The ses driver drives SCSI Environmental Services ("ses") and # SAF-TE ("SCSI Accessed Fault-Tolerant Enclosure") devices. # # The pt driver drives SCSI Processor devices. # # # Target Mode support is provided here but also requires that a SIM # (SCSI Host Adapter Driver) provide support as well. # # The targ driver provides target mode support as a Processor type device. # It exists to give the minimal context necessary to respond to Inquiry # commands. There is a sample user application that shows how the rest # of the command support might be done in /usr/share/examples/scsi_target. # # The targbh driver provides target mode support and exists to respond # to incoming commands that do not otherwise have a logical unit assigned # to them. # # The "unknown" device (uk? in pre-2.0.5) is now part of the base SCSI # configuration as the "pass" driver. device scbus #base SCSI code device ch #SCSI media changers device da #SCSI direct access devices (aka disks) device sa #SCSI tapes device cd #SCSI CD-ROMs device ses #SCSI Environmental Services (and SAF-TE) device pt #SCSI processor device targ #SCSI Target Mode Code device targbh #SCSI Target Mode Blackhole Device device pass #CAM passthrough driver # CAM OPTIONS: # debugging options: # -- NOTE -- If you specify one of the bus/target/lun options, you must # specify them all! # CAMDEBUG: When defined enables debugging macros # CAM_DEBUG_BUS: Debug the given bus. Use -1 to debug all busses. # CAM_DEBUG_TARGET: Debug the given target. Use -1 to debug all targets. # CAM_DEBUG_LUN: Debug the given lun. Use -1 to debug all luns.
# CAM_DEBUG_FLAGS: OR together CAM_DEBUG_INFO, CAM_DEBUG_TRACE, # CAM_DEBUG_SUBTRACE, and CAM_DEBUG_CDB # # CAM_MAX_HIGHPOWER: Maximum number of concurrent high power (start unit) cmds # CAM_NEW_TRAN_CODE: this is the new transport layer code that will be switched # to soon # SCSI_NO_SENSE_STRINGS: When defined disables sense descriptions # SCSI_NO_OP_STRINGS: When defined disables opcode descriptions # SCSI_DELAY: The number of MILLISECONDS to freeze the SIM (scsi adapter) # queue after a bus reset, and the number of milliseconds to # freeze the device queue after a bus device reset. options CAMDEBUG options CAM_DEBUG_BUS=-1 options CAM_DEBUG_TARGET=-1 options CAM_DEBUG_LUN=-1 options CAM_DEBUG_FLAGS="CAM_DEBUG_INFO|CAM_DEBUG_TRACE|CAM_DEBUG_CDB" options CAM_MAX_HIGHPOWER=4 options SCSI_NO_SENSE_STRINGS options SCSI_NO_OP_STRINGS options SCSI_DELAY=8000 # Be pessimistic about Joe SCSI device # Options for the CAM CDROM driver: # CHANGER_MIN_BUSY_SECONDS: Guaranteed minimum time quantum for a changer LUN # CHANGER_MAX_BUSY_SECONDS: Maximum time quantum per changer LUN, only # enforced if there is I/O waiting for another LUN # The compiled in defaults for these variables are 2 and 10 seconds, # respectively. # # These can also be changed on the fly with the following sysctl variables: # kern.cam.cd.changer.min_busy_seconds # kern.cam.cd.changer.max_busy_seconds # options CHANGER_MIN_BUSY_SECONDS=2 options CHANGER_MAX_BUSY_SECONDS=10 # Options for the CAM sequential access driver: # SA_IO_TIMEOUT: Timeout for read/write/wfm operations, in minutes # SA_SPACE_TIMEOUT: Timeout for space operations, in minutes # SA_REWIND_TIMEOUT: Timeout for rewind operations, in minutes # SA_ERASE_TIMEOUT: Timeout for erase operations, in minutes # SA_1FM_AT_EOD: Default to model which only has a default one filemark at EOT. 
options SA_IO_TIMEOUT="(4)" options SA_SPACE_TIMEOUT="(60)" options SA_REWIND_TIMEOUT="(2*60)" options SA_ERASE_TIMEOUT="(4*60)" options SA_1FM_AT_EOD # Optional timeout for the CAM processor target (pt) device # This is specified in seconds. The default is 60 seconds. options SCSI_PT_DEFAULT_TIMEOUT="60" # Optional enable of doing SES passthrough on other devices (e.g., disks) # # Normally disabled because a lot of newer SCSI disks report themselves # as having SES capabilities, but this can then clot up attempts to build # a topology with the SES device that's on the box these drives # are in.... options SES_ENABLE_PASSTHROUGH ##################################################################### # MISCELLANEOUS DEVICES AND OPTIONS # The `pty' device usually turns out to be ``effectively mandatory'', # as it is required for `telnetd', `rlogind', `screen', `emacs', and # `xterm', among others. device pty #Pseudo ttys device speaker #Play IBM BASIC-style noises out your speaker device gzip #Exec gzipped a.out's device md #Memory/malloc disk device snp #Snoop device - to look at pty/vty/etc.. device ccd 4 #Concatenated disk driver # Configuring Vinum into the kernel is not necessary, since the kld # module gets started automatically when vinum(8) starts. This # device is also untested. Use at your own risk. # # The option VINUMDEBUG must match the value set in CFLAGS # in src/sbin/vinum/Makefile. Failure to do so will result in # the following message from vinum(8): # # Can't get vinum config: Invalid argument # # see vinum(4) for more reasons not to use these options. device vinum #Vinum concat/mirror/raid driver options VINUMDEBUG #enable Vinum debugging hooks # Kernel side iconv library options LIBICONV # Size of the kernel message buffer. Should be N * pagesize.
options MSGBUF_SIZE=40960 ##################################################################### # HARDWARE BUS CONFIGURATION # ISA, EISA, MCA and PCI bus: # # Mandatory ISA devices: isa, npx # device isa # # Options for `isa': # # AUTO_EOI_1 enables the `automatic EOI' feature for the master 8259A # interrupt controller. This saves about 0.7-1.25 usec for each interrupt. # This option breaks suspend/resume on some portables. # # AUTO_EOI_2 enables the `automatic EOI' feature for the slave 8259A # interrupt controller. This saves about 0.7-1.25 usec for each interrupt. # Automatic EOI is documented not to work for the slave with the # original i8259A, but it works for some clones and some integrated # versions. # # MAXMEM specifies the amount of RAM on the machine; if this is not # specified, FreeBSD will first read the amount of memory from the CMOS # RAM, so the amount of memory will initially be limited to 64MB or 16MB # depending on the BIOS. If the BIOS reports 64MB, a memory probe will # then attempt to detect the installed amount of RAM. If this probe # fails to detect >64MB RAM you will have to use the MAXMEM option. # The amount is in kilobytes, so for a machine with 128MB of RAM, it would # be 131072 (128 * 1024). # # BROKEN_KEYBOARD_RESET disables the use of the keyboard controller to # reset the CPU for reboot. This is needed on some systems with broken # keyboard controllers. options COMPAT_OLDISA #Use ISA shims and glue for old drivers options AUTO_EOI_1 #options AUTO_EOI_2 options MAXMEM="(128*1024)" #options BROKEN_KEYBOARD_RESET # Enable support for the kernel PLL to use an external PPS signal, # under supervision of [x]ntpd(8) # More info in ntpd documentation: http://www.eecis.udel.edu/~ntp options PPS_SYNC # If you see the "calcru: negative time of %ld usec for pid %d (%s)\n" # message you probably have some broken sw/hw which disables interrupts # for too long.
You can make the system more resistant to this by # choosing a high value for NTIMECOUNTER. The default is 5, there # is no upper limit but more than a couple of hundred are not productive. # A better strategy may be to sysctl -w kern.timecounter.method=1 options NTIMECOUNTER=20 # # EISA bus # # The EISA bus device is `eisa'. It provides auto-detection and # configuration support for all devices on the EISA bus. device eisa # By default, only 10 EISA slots are probed, since the slot numbers # above clash with the configuration address space of the PCI subsystem, # and the EISA probe is not very smart about this. This is sufficient # for most machines, but in particular the HP NetServer LC series comes # with an onboard AIC7770 dual-channel SCSI controller on EISA slot #11, # thus you need to bump this figure to 12 for them. options EISA_SLOTS=12 # # MCA bus: # # The MCA bus device is `mca'. It provides auto-detection and # configuration support for all devices on the MCA bus. # No hints are required for MCA. device mca # # PCI bus & PCI options: # # The main PCI bus device is `pci'. It provides auto-detection and # configuration support for all devices on the PCI bus, using either # configuration mode defined in the PCI specification. device pci # # AGP GART support device agp # PCI options # #options PCI_QUIET #quiets PCI code on chipset settings ##################################################################### # HARDWARE DEVICE CONFIGURATION # EISA support is available for some device, so they can be auto-probed. # MicroChannel (MCA) support is available for some devices. # For ISA the required hints are listed. # EISA, MCA, PCI and pccard are self identifying buses, so no hints # are needed. # # Mandatory devices: # # The keyboard controller; it controls the keyboard and the PS/2 mouse. 
device atkbdc 1 hint.atkbdc.0.at="isa" hint.atkbdc.0.port="0x060" # The AT keyboard device atkbd hint.atkbd.0.at="atkbdc" hint.atkbd.0.irq="1" # Options for atkbd: options ATKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions ATKBD_DFLT_KEYMAP="jp.106" # These options are valid for other keyboard drivers as well. options KBD_DISABLE_KEYMAP_LOAD # refuse to load a keymap options KBD_INSTALL_CDEV # install a CDEV entry in /dev # `flags' for atkbd: # 0x01 Force detection of keyboard, else we always assume a keyboard # 0x02 Don't reset keyboard, useful for some newer ThinkPads # 0x04 Old-style (XT) keyboard support, useful for older ThinkPads # PS/2 mouse device psm hint.psm.0.at="atkbdc" hint.psm.0.irq="12" # Options for psm: options PSM_HOOKRESUME #hook the system resume event, useful #for some laptops options PSM_RESETAFTERSUSPEND #reset the device at the resume event # The video card driver. device vga hint.vga.0.at="isa" # Options for vga: # Try the following option if the mouse pointer is not drawn correctly # or font does not seem to be loaded properly. May cause flicker on # some systems. options VGA_ALT_SEQACCESS # If you can dispense with some vga driver features, you may want to # use the following options to save some memory. #options VGA_NO_FONT_LOADING # don't save/load font #options VGA_NO_MODE_CHANGE # don't change video modes # Older video cards may require this option for proper operation. options VGA_SLOW_IOACCESS # do byte-wide i/o's to TS and GDC regs # The following option probably won't work with the LCD displays. options VGA_WIDTH90 # support 90 column modes # To include support for VESA video modes options VESA options FB_DEBUG # Frame buffer debugging options FB_INSTALL_CDEV # install a CDEV entry in /dev # Splash screen at start up! Screen savers require this too. device splash # Various screen savers. 
device apm_saver # Requires APM device blank_saver device daemon_saver device fade_saver device fire_saver device green_saver device logo_saver device rain_saver device star_saver device warp_saver # The pcvt console driver (vt220 compatible). device vt hint.vt.0.at="isa" options XSERVER # support for running an X server on vt options FAT_CURSOR # start with block cursor # This PCVT option is for keyboards such as those used on really old ThinkPads options PCVT_SCANSET=2 # Other PCVT options are documented in pcvt(4). options PCVT_24LINESDEF options PCVT_CTRL_ALT_DEL options PCVT_META_ESC options PCVT_NSCREENS=9 options PCVT_PRETTYSCRNS options PCVT_SCREENSAVER options PCVT_USEKBDSEC options PCVT_VT220KEYB options PCVT_GREENSAVER # The syscons console driver (sco color console compatible). device sc 1 hint.sc.0.at="isa" options MAXCONS=16 # number of virtual consoles options SC_ALT_MOUSE_IMAGE # simplified mouse cursor in text mode options SC_DFLT_FONT # compile font in makeoptions SC_DFLT_FONT=cp850 options SC_DISABLE_DDBKEY # disable `debug' key options SC_DISABLE_REBOOT # disable reboot key sequence options SC_HISTORY_SIZE=200 # number of history buffer lines options SC_MOUSE_CHAR=0x3 # char code for text mode mouse cursor options SC_PIXEL_MODE # add support for the raster text mode # The following options will let you change the default colors of syscons. options SC_NORM_ATTR="(FG_GREEN|BG_BLACK)" options SC_NORM_REV_ATTR="(FG_YELLOW|BG_GREEN)" options SC_KERNEL_CONS_ATTR="(FG_RED|BG_BLACK)" options SC_KERNEL_CONS_REV_ATTR="(FG_BLACK|BG_RED)" # If you have a two button mouse, you may want to add the following option # to use the right button of the mouse to paste text. options SC_TWOBUTTON_MOUSE # You can selectively disable features in syscons. 
options SC_NO_CUTPASTE options SC_NO_FONT_LOADING options SC_NO_HISTORY options SC_NO_SYSMOUSE # `flags' for sc # 0x80 Put the video card in the VESA 800x600 dots, 16 color mode # 0x100 Probe for a keyboard device periodically if one is not present # 3Dfx Voodoo Graphics, Voodoo II /dev/3dfx CDEV support. This will create # the /dev/3dfx0 device to work with glide implementations. This should get # linked to /dev/3dfx and /dev/voodoo. Note that this is not the same as # the tdfx DRI module from XFree86 and is completely unrelated. # # To enable Linuxulator support, one must also include COMPAT_LINUX in the # config as well, or you will not have the dependencies. The other option # is to load both as modules. device tdfx # Enable 3Dfx Voodoo support options TDFX_LINUX # Enable Linuxulator support # # The Numeric Processing eXtension driver. In addition to this, you # may configure a math emulator (see above). If your machine has a # hardware FPU and the kernel configuration includes the npx device # *and* a math emulator compiled into the kernel, the hardware FPU # will be used, unless it is found to be broken or unless "flags" to # npx0 includes "0x08", which requests preference for the emulator. device npx hint.npx.0.at="nexus" hint.npx.0.port="0x0F0" hint.npx.0.flags="0x0" hint.npx.0.irq="13" # # `flags' for npx0: # 0x01 don't use the npx registers to optimize bcopy. # 0x02 don't use the npx registers to optimize bzero. # 0x04 don't use the npx registers to optimize copyin or copyout. # 0x08 use emulator even if hardware FPU is available. # The npx registers are normally used to optimize copying and zeroing when # all of the following conditions are satisfied: # I586_CPU is an option # the cpu is an i586 (perhaps not a Pentium) # the probe for npx0 succeeds # INT 16 exception handling works. # Then copying and zeroing using the npx registers is normally 30-100% faster. # The flags can be used to control cases where it doesn't work or is slower. 
# Setting them at boot time using userconfig works right (the optimizations # are not used until later in the bootstrap when npx0 is attached). # Flag 0x08 automatically disables the i586 optimized routines. # # # ACPI support using the Intel ACPI Component Architecture reference # implementation. # # ACPI_DEBUG enables the use of the debug.acpi.level and debug.acpi.layer # kernel environment variables to select initial debugging levels for the # Intel ACPICA code. (Note that the Intel code must also have USE_DEBUGGER # defined when it is built). # device acpica options ACPI_DEBUG # # Optional devices: # # # SCSI host adapters: # # adv: All Narrow SCSI bus AdvanSys controllers. # adw: Second Generation AdvanSys controllers including the ADV940UW. # aha: Adaptec 154x/1535/1640 # ahb: Adaptec 174x EISA controllers # ahc: Adaptec 274x/284x/2910/293x/294x/394x/3950x/3960x/398X/4944/ # 19160x/29160x, aic7770/aic78xx # aic: Adaptec 6260/6360, APA-1460 (PC Card), NEC PC9801-100 (C-BUS) # amd: Support for the AMD 53C974 SCSI host adapter chip as found on devices # such as the Tekram DC-390(T). # bt: Most Buslogic controllers: including BT-445, BT-54x, BT-64x, BT-74x, # BT-75x, BT-946, BT-948, BT-956, BT-958, SDC3211B, SDC3211F, SDC3222F # isp: Qlogic ISP 1020, 1040 and 1040B PCI SCSI host adapters, # ISP 1240 Dual Ultra SCSI, ISP 1080 and 1280 (Dual) Ultra2, # ISP 12160 Ultra3 SCSI, # Qlogic ISP 2100 and ISP 2200 Fibre Channel host adapters. # ispfw: Firmware module for Qlogic host adapters # ncr: NCR 53C810, 53C825 self-contained SCSI host adapters. # ncv: NCR 53C500 based SCSI host adapters. # nsp: Workbit Ninja SCSI-3 based PC Card SCSI host adapters. # sym: Symbios/Logic 53C8XX family of PCI-SCSI I/O processors: # 53C810, 53C810A, 53C815, 53C825, 53C825A, 53C860, 53C875, # 53C876, 53C885, 53C895, 53C895A, 53C896, 53C897, 53C1510D, # 53C1010-33, 53C1010-66. # stg: TMC 18C30, 18C50 based SCSI host adapters. 
# wds: WD7000 # # Note that the order is important in order for Buslogic ISA/EISA cards to be # probed correctly. # device bt hint.bt.0.at="isa" hint.bt.0.port="0x330" device adv hint.adv.0.at="isa" device adw device aha hint.aha.0.at="isa" device aic hint.aic.0.at="isa" device ahb device ahc device amd device isp hint.isp.0.disable="1" hint.isp.0.role="3" hint.isp.0.prefer_iomap="1" hint.isp.0.prefer_memmap="1" hint.isp.0.fwload_disable="1" hint.isp.0.ignore_nvram="1" hint.isp.0.fullduplex="1" hint.isp.0.topology="lport" hint.isp.0.topology="nport" hint.isp.0.topology="lport-only" hint.isp.0.topology="nport-only" # we can't get u_int64_t types, nor can we get strings if it's got # a leading 0x, hence this silly dodge. hint.isp.0.portwnn="w50000000aaaa0000" hint.isp.0.nodewnn="w50000000aaaa0001" device ispfw device ncr device ncv device nsp device sym device stg hint.stg.0.at="isa" hint.stg.0.port="0x140" hint.stg.0.irq="11" device wds hint.wds.0.at="isa" hint.wds.0.port="0x350" hint.wds.0.irq="11" hint.wds.0.drq="6" # The aic7xxx driver will attempt to use memory mapped I/O for all PCI # controllers that have it configured only if this option is set. Unfortunately, # this doesn't work on some motherboards, which prevents it from being the # default. options AHC_ALLOW_MEMIO # Enable diagnostic sequencer code. options AHC_DEBUG_SEQUENCER # Dump the contents of the ahc controller configuration PROM. options AHC_DUMP_EEPROM # Bitmap of units to enable targetmode operations. options AHC_TMODE_ENABLE # The adw driver will attempt to use memory mapped I/O for all PCI # controllers that have it configured only if this option is set. options ADW_ALLOW_MEMIO # Options used in dev/isp/ (Qlogic SCSI/FC driver). # # ISP_TARGET_MODE - enable target mode operation # #options ISP_TARGET_MODE=1 # Options used in dev/sym/ (Symbios SCSI driver).
#options SYM_SETUP_LP_PROBE_MAP #-Low Priority Probe Map (bits) # Allows the ncr to take precedence # 1 (1<<0) -> 810a, 860 # 2 (1<<1) -> 825a, 875, 885, 895 # 4 (1<<2) -> 895a, 896, 1510d #options SYM_SETUP_SCSI_DIFF #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 #options SYM_SETUP_PCI_PARITY #-PCI parity checking # disabled:0, enabled:1 (default) #options SYM_SETUP_MAX_LUN #-Number of LUNs supported # default:8, range:[1..64] # The 'asr' driver provides support for current DPT/Adaptec SCSI RAID # controllers (SmartRAID V and VI and later). # These controllers require the CAM infrastructure. # device asr # The 'dpt' driver provides support for old DPT controllers (http://www.dpt.com/). # These have hardware RAID-{0,1,5} support, and do multi-initiator I/O. # The DPT controllers are commonly re-licensed under other brand-names - # some controllers by Olivetti, Dec, HP, AT&T, SNI, AST, Alphatronic, NEC and # Compaq are actually DPT controllers. # # See src/sys/dev/dpt for debugging and other subtle options. # DPT_MEASURE_PERFORMANCE Enables a set of (semi)invasive metrics. Various # instruments are enabled. The tools in # /usr/sbin/dpt_* assume these to be enabled. # DPT_HANDLE_TIMEOUTS Normally device timeouts are handled by the DPT. # If you want the driver to handle timeouts, enable # this option. If your system is very busy, this # option will create more trouble than it solves. # DPT_TIMEOUT_FACTOR Used to compute the excessive amount of time to # wait when timing out with the above option. # DPT_DEBUG_xxxx These are controllable from sys/dev/dpt/dpt.h # DPT_LOST_IRQ When enabled, will try, once per second, to catch # any interrupt that got lost. Seems to help in some # DPT-firmware/Motherboard combinations. Minimal # cost, great benefit. # DPT_RESET_HBA Make "reset" actually reset the controller # instead of fudging it. Only enable this if you # are 100% certain you need it. 
device dpt # DPT options #!CAM# options DPT_MEASURE_PERFORMANCE #!CAM# options DPT_HANDLE_TIMEOUTS options DPT_TIMEOUT_FACTOR=4 options DPT_LOST_IRQ options DPT_RESET_HBA options DPT_ALLOW_MEMIO # # Mylex AcceleRAID and eXtremeRAID controllers with v6 and later # firmware. These controllers have a SCSI-like interface, and require # the CAM infrastructure. # device mly # # Adaptec FSA RAID controllers, including integrated DELL controllers, # the Dell PERC 2/QC and the HP NetRAID-4M # # AAC_COMPAT_LINUX Include code to support Linux-binary management # utilities (requires Linux compatibility # support). # device aac # # Compaq Smart RAID, Mylex DAC960 and AMI MegaRAID controllers. Only # one entry is needed; the code will find and configure all supported # controllers. # device ida # Compaq Smart RAID device mlx # Mylex DAC960 device amr # AMI MegaRAID # # 3ware ATA RAID # device twe # 3ware ATA RAID # # The 'ATA' driver supports all ATA and ATAPI devices, including PC Card # devices. You only need one "device ata" for it to find all # PCI and PC Card ATA/ATAPI devices on modern machines. device ata device atadisk # ATA disk drives device atapicd # ATAPI CDROM drives device atapifd # ATAPI floppy drives device atapist # ATAPI tape drives # # For older non-PCI, non-PnPBIOS systems, these are the hints lines to add: hint.ata.0.at="isa" hint.ata.0.port="0x1f0" hint.ata.0.irq="14" hint.ata.1.at="isa" hint.ata.1.port="0x170" hint.ata.1.irq="15" # # The following options are valid on the ATA driver: # # ATA_STATIC_ID: controller numbering is static ie depends on location # else the device numbers are dynamically allocated. options ATA_STATIC_ID # # Standard floppy disk controllers and floppy tapes, supports # the Y-E DATA External FDD (PC Card) # device fdc hint.fdc.0.at="isa" hint.fdc.0.port="0x3F0" hint.fdc.0.irq="6" hint.fdc.0.drq="2" # # FDC_DEBUG enables floppy debugging. 
Since the debug output is huge, you # gotta turn it actually on by setting the variable fd_debug with DDB, # however. options FDC_DEBUG # # Activate this line if you happen to have an Insight floppy tape. # Probing them proved to be dangerous for people with floppy disks only, # so it's "hidden" behind a flag: #hint.fdc.0.flags="1" # Specify floppy devices hint.fd.0.at="fdc0" hint.fd.0.drive="0" hint.fd.1.at="fdc0" hint.fd.1.drive="1" # M-systems DiskOnchip products see src/sys/contrib/dev/fla/README device fla hint.fla.0.at="isa" # # Other standard PC hardware: # # mse: Logitech and ATI InPort bus mouse ports # sio: serial ports (see sio(4)), including support for various # PC Card devices, such as Modem and NICs (see etc/defaults/pccard.conf) device mse hint.mse.0.at="isa" hint.mse.0.port="0x23c" hint.mse.0.irq="5" device sio hint.sio.0.at="isa" hint.sio.0.port="0x3F8" hint.sio.0.flags="0x10" hint.sio.0.irq="4" # # `flags' for serial drivers that support consoles (only for sio now): # 0x10 enable console support for this unit. The other console flags # are ignored unless this is set. Enabling console support does # not make the unit the preferred console - boot with -h or set # the 0x20 flag for that. Currently, at most one unit can have # console support; the first one (in config file order) with # this flag set is preferred. Setting this flag for sio0 gives # the old behaviour. # 0x20 force this unit to be the console (unless there is another # higher priority console). This replaces the COMCONSOLE option. # 0x40 reserve this unit for low level console operations. Do not # access the device in any normal way. # 0x80 use this port for serial line gdb support in ddb. # # PnP `flags' (set via userconfig using pnp x flags y) # 0x1 disable probing of this device. Used to prevent your modem # from being attached as a PnP modem. 
# # Options for serial drivers that support consoles (only for sio now): options BREAK_TO_DEBUGGER #a BREAK on a comconsole goes to #DDB, if available. options CONSPEED=115200 # speed for serial console # (default 9600) # Solaris implements a new BREAK which is initiated by a character # sequence CR ~ ^b which is similar to a familiar pattern used on # Sun servers by the Remote Console. options ALT_BREAK_TO_DEBUGGER # Options for sio: options COM_ESP #code for Hayes ESP options COM_MULTIPORT #code for some cards with shared IRQs # Other flags for sio that aren't documented in the man page. # 0x20000 enable hardware RTS/CTS and larger FIFOs. Only works for # ST16650A-compatible UARTs. # # Network interfaces: # # MII bus support is required for some PCI 10/100 ethernet NICs, # namely those which use MII-compliant transceivers or implement # transceiver control interfaces that operate like an MII. Adding # "device miibus" to the kernel config pulls in support for # the generic miibus API and all of the PHY drivers, including a # generic one for PHYs that aren't specifically handled by an # individual driver. device miibus # an: Aironet 4500/4800 802.11 wireless adapters. Supports the PCMCIA, # PCI and ISA varieties. # ar: Arnet SYNC/570i hdlc sync 2/4 port V.35/X.21 serial driver # (requires sppp) # awi: Support for IEEE 802.11 PC Card devices using the AMD Am79C930 and # Harris (Intersil) Chipset with PCnetMobile firmware by AMD. # cnw: Xircom CNW/Netware Airsurfer PC Card adapter # cs: IBM Etherjet and other Crystal Semi CS89x0-based adapters # cx: Cronyx/Sigma multiport sync/async (with Cisco or PPP framing) # dc: Support for PCI fast ethernet adapters based on the DEC/Intel 21143 # and various workalikes including: # the ADMtek AL981 Comet and AN985 Centaur, the ASIX Electronics # AX88140A and AX88141, the Davicom DM9100 and DM9102, the Lite-On # 82c168 and 82c169 PNIC, the Lite-On/Macronix LC82C115 PNIC II # and the Macronix 98713/98713A/98715/98715A/98725 PMAC. 
This driver # replaces the old al, ax, dm, pn and mx drivers. List of brands: # Digital DE500-BA, Kingston KNE100TX, D-Link DFE-570TX, SOHOware SFA110, # SVEC PN102-TX, CNet Pro110B, 120A, and 120B, Compex RL100-TX, # LinkSys LNE100TX, LNE100TX V2.0, Jaton XpressNet, Alfa Inc GFC2204, # KNE110TX. # de: Digital Equipment DC21040 # ed: Western Digital and SMC 80xx; Novell NE1000 and NE2000; 3Com 3C503 # HP PC Lan+, various PC Card devices (refer to etc/defaults/pccard.conf) # el: 3Com 3C501 (slow!) # ep: 3Com 3C509, 3C529, 3C556, 3C562D, 3C563D, 3C572, 3C574X, 3C579, 3C589 # and PC Card devices using these chipsets. # ex: Intel EtherExpress Pro/10 and other i82595-based adapters, # Olicom Ethernet PC Card devices. # fe: Fujitsu MB86960A/MB86965A Ethernet # fea: DEC DEFEA EISA FDDI adapter # fpa: Support for the Digital DEFPA PCI FDDI. `device fddi' is also needed. # fxp: Intel EtherExpress Pro/100B # (hint of prefer_iomap can be done to prefer I/O instead of Mem mapping) # ie: AT&T StarLAN 10 and EN100; 3Com 3C507; unknown NI5210; # Intel EtherExpress # le: Digital Equipment EtherWorks 2 and EtherWorks 3 (DEPCA, DE100, # DE101, DE200, DE201, DE202, DE203, DE204, DE205, DE422) # lnc: Lance/PCnet cards (Isolan, Novell NE2100, NE32-VL, AMD Am7990 and # Am79C960) # lge: Support for PCI gigabit ethernet adapters based on the Level 1 # LXT1001 NetCellerator chipset. This includes the D-Link DGE-500SX, # SMC TigerCard 1000 (SMC9462SX), and some Addtron cards. # nge: Support for PCI gigabit ethernet adapters based on the National # Semiconductor DP83820 and DP83821 chipset. This includes the # SMC EZ Card 1000 (SMC9462TX), D-Link DGE-500T, Asante FriendlyNet # GigaNIX 1000TA and 1000TPC, the Addtron AEG320T, the LinkSys # EG1032 and EG1064, the Surecom EP-320G-TX and the Netgear GA622T. # oltr: Olicom ISA token-ring adapters OC-3115, OC-3117, OC-3118 and OC-3133 # (no hints needed). 
# Olicom PCI token-ring adapters OC-3136, OC-3137, OC-3139, OC-3140, # OC-3141, OC-3540, OC-3250 # rdp: RealTek RTL 8002-based pocket ethernet adapters # pcn: Support for PCI fast ethernet adapters based on the AMD Am79c97x # chipsets, including the PCnet/FAST, PCnet/FAST+, PCnet/PRO and # PCnet/Home. These were previously handled by the lnc driver (and # still will be if you leave this driver out of the kernel). # rl: Support for PCI fast ethernet adapters based on the RealTek 8129/8139 # chipset. Note that the RealTek driver defaults to using programmed # I/O to do register accesses because memory mapped mode seems to cause # severe lockups on SMP hardware. This driver also supports the # Accton EN1207D `Cheetah' adapter, which uses a chip called # the MPX 5030/5038, which is either a RealTek in disguise or a # RealTek workalike. Note that the D-Link DFE-530TX+ uses the RealTek # chipset and is supported by this driver, not the 'vr' driver. # sf: Support for Adaptec Duralink PCI fast ethernet adapters based on the # Adaptec AIC-6915 "starfire" controller. # This includes dual and quad port cards, as well as one 100baseFX card. # Most of these are 64-bit PCI devices, except for one single port # card which is 32-bit. # sis: Support for NICs based on the Silicon Integrated Systems SiS 900, # SiS 7016 and NS DP83815 PCI fast ethernet controller chips. # sk: Support for the SysKonnect SK-984x series PCI gigabit ethernet NICs. # This includes the SK-9841 and SK-9842 single port cards (single mode # and multimode fiber) and the SK-9843 and SK-9844 dual port cards # (also single mode and multimode). # The driver will autodetect the number of ports on the card and # attach each one as a separate network interface. # sn: Support for ISA and PC Card Ethernet devices using the # SMC91C90/92/94/95 chips. # sr: RISCom/N2 hdlc sync 1/2 port V.35/X.21 serial driver (requires sppp) # ste: Sundance Technologies ST201 PCI fast ethernet controller, includes # the D-Link DFE-550TX. 
# ti: Support for PCI gigabit ethernet NICs based on the Alteon Networks # Tigon 1 and Tigon 2 chipsets. This includes the Alteon AceNIC, the # 3Com 3c985, the Netgear GA620 and various others. Note that you will # probably want to bump up NMBCLUSTERS a lot to use this driver. # tl: Support for the Texas Instruments TNETE100 series 'ThunderLAN' # cards and integrated ethernet controllers. This includes several # Compaq Netelligent 10/100 cards and the built-in ethernet controllers # in several Compaq Prosignia, Proliant and Deskpro systems. It also # supports several Olicom 10Mbps and 10/100 boards. # tx: SMC 9432 TX, BTX and FTX cards. (SMC EtherPower II serie) # txp: Support for 3Com 3cR990 cards with the "Typhoon" chipset # vr: Support for various fast ethernet adapters based on the VIA # Technologies VT3043 `Rhine I' and VT86C100A `Rhine II' chips, # including the D-Link DFE530TX (see 'rl' for DFE530TX+), the Hawking # Technologies PN102TX, and the AOpen/Acer ALN-320. # vx: 3Com 3C590 and 3C595 # wb: Support for fast ethernet adapters based on the Winbond W89C840F chip. # Note: this is not the same as the Winbond W89C940F, which is a # NE2000 clone. # wl: Lucent Wavelan (ISA card only). # wi: Lucent WaveLAN/IEEE 802.11 PCMCIA adapters. Note: this supports both # the PCMCIA and ISA cards: the ISA card is really a PCMCIA to ISA # bridge with a PCMCIA adapter plugged into it. # wx: Intel Gigabit Ethernet PCI card (`Wiseman') # xe: Xircom/Intel EtherExpress Pro100/16 PC Card ethernet controller, # Accton Fast EtherCard-16, Compaq Netelligent 10/100 PC Card, # Toshiba 10/100 Ethernet PC Card, Xircom 16-bit Ethernet + Modem 56 # xl: Support for the 3Com 3c900, 3c905, 3c905B and 3c905C (Fast) # Etherlink XL cards and integrated controllers. This includes the # integrated 3c905B-TX chips in certain Dell Optiplex and Dell # Precision desktop machines and the integrated 3c905-TX chips # in Dell Latitude laptop docking stations. 
# Also supported: 3Com 3c980(C)-TX, 3Com 3cSOHO100-TX, 3Com 3c450-TX # Order for ISA/EISA devices is important here device ar 1 hint.ar.0.at="isa" hint.ar.0.port="0x300" hint.ar.0.irq="10" hint.ar.0.maddr="0xd0000" device cs hint.cs.0.at="isa" hint.cs.0.port="0x300" device cx 1 hint.cx.0.at="isa" hint.cx.0.port="0x240" hint.cx.0.irq="15" hint.cx.0.drq="7" device ed hint.ed.0.at="isa" hint.ed.0.port="0x280" hint.ed.0.irq="5" hint.ed.0.maddr="0xd8000" device el 1 hint.el.0.at="isa" hint.el.0.port="0x300" hint.el.0.irq="9" device ep device ex device fe 1 options FE_8BIT_SUPPORT # LAC-98 support hint.fe.0.at="isa" hint.fe.0.port="0x300" device fea device ie 2 hint.ie.0.at="isa" hint.ie.0.port="0x300" hint.ie.0.irq="5" hint.ie.0.maddr="0xd0000" hint.ie.1.at="isa" hint.ie.1.port="0x360" hint.ie.1.irq="7" hint.ie.1.maddr="0xd0000" device le 1 hint.le.0.at="isa" hint.le.0.port="0x300" hint.le.0.irq="5" hint.le.0.maddr="0xd0000" device lnc 1 hint.lnc.0.at="isa" hint.lnc.0.port="0x280" hint.lnc.0.irq="10" hint.lnc.0.drq="0" device rdp 1 hint.rdp.0.at="isa" hint.rdp.0.port="0x378" hint.rdp.0.irq="7" hint.rdp.0.flags="2" device sr 1 hint.sr.0.at="isa" hint.sr.0.port="0x300" hint.sr.0.irq="5" hint.sr.0.maddr="0xd0000" device sn hint.sn.0.at="isa" hint.sn.0.port="0x300" hint.sn.0.irq="10" device an device awi device cnw device wi options WLCACHE # enables the signal-strength cache options WLDEBUG # enables verbose debugging output device wl 1 hint.wl.0.at="isa" hint.wl.0.port="0x300" device xe device oltr options OLTR_NO_BULLSEYE_MAC options OLTR_NO_HAWKEYE_MAC options OLTR_NO_TMS_MAC hint.oltr.0.at="isa" # PCI Ethernet NICs that use the common MII bus controller code. 
device dc # DEC/Intel 21143 and various workalikes device fxp # Intel EtherExpress PRO/100B (82557, 82558) hint.fxp.0.prefer_iomap="0" device rl # RealTek 8129/8139 device pcn # AMD Am79C97x PCI 10/100 NICs device sf # Adaptec AIC-6915 (``Starfire'') device sis # Silicon Integrated Systems SiS 900/SiS 7016 device ste # Sundance ST201 (D-Link DFE-550TX) device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') device vr # VIA Rhine, Rhine II device wb # Winbond W89C840F device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # PCI Ethernet NICs. device de # DEC/Intel DC21x4x (``Tulip'') device vx # 3Com 3c590, 3c595 (``Vortex'') # PCI Gigabit & FDDI NICs. device lge device nge device sk device ti device wx device fpa 1 # # ATM related options (Cranor version) # (note: this driver cannot be used with the HARP ATM stack) # # The `en' device provides support for Efficient Networks (ENI) # ENI-155 PCI midway cards, and the Adaptec 155Mbps PCI ATM cards (ANA-59x0). # # atm device provides generic atm functions and is required for # atm devices. # NATM enables the netnatm protocol family that can be used to # bypass TCP/IP. # # the current driver supports only PVC operations (no atm-arp, no multicast). # for more details, please read the original documents at # http://www.ccrc.wustl.edu/pub/chuck/tech/bsdatm/bsdatm.html # device atm device en options NATM #native ATM # # Audio drivers: `pcm', `sbc', `gusc', `pca' # # pcm: PCM audio through various sound cards. # # This has support for a large number of new audio cards, based on # CS423x, OPTi931, Yamaha OPL-SAx, and also for SB16, GusPnP. # For more information about this driver and supported cards, # see the pcm.4 man page. # # The flags of the device tells the device a bit more info about the # device that normally is obtained through the PnP interface. 
# bit 2..0 secondary DMA channel; # bit 4 set if the board uses two dma channels; # bit 15..8 board type, overrides autodetection; leave it # zero if you don't know what to put in (and you don't, # since this is unsupported at the moment...). # # This driver will use the new PnP code if it's available. # # pca: PCM audio through your PC speaker # # Supported cards include: # Creative SoundBlaster ISA PnP/non-PnP # Supports ESS and Avance ISA chips as well. # Gravis UltraSound ISA PnP/non-PnP # Crystal Semiconductor CS461x/428x PCI # Neomagic 256AV (ac97) # Most of the more common ISA/PnP sb/mss/ess compatible cards. device pcm # For non-pnp sound cards with no bridge drivers only: hint.pcm.0.at="isa" hint.pcm.0.irq="10" hint.pcm.0.drq="1" hint.pcm.0.flags="0x0" # For PnP/PCI sound cards, no hints are required. # # midi: MIDI interfaces and synthesizers # device midi # For non-pnp sound cards with no bridge drivers: hint.midi.0.at="isa" hint.midi.0.irq="5" hint.midi.0.flags="0x0" # For serial ports (this example configures port 2): # TODO: implement generic tty-midi interface so that we can use # other uarts. hint.midi.0.at="isa" hint.midi.0.port="0x2F8" hint.midi.0.irq="3" # # seq: MIDI sequencer # device seq # The bridge drivers for sound cards. These can be separately configured # for providing services to the likes of new-midi. # When used with 'device pcm' they also provide pcm sound services. # # sbc: Creative SoundBlaster ISA PnP/non-PnP # Supports ESS and Avance ISA chips as well. 
# gusc: Gravis UltraSound ISA PnP/non-PnP # csa: Crystal Semiconductor CS461x/428x PCI # For non-PnP cards: device sbc hint.sbc.0.at="isa" hint.sbc.0.port="0x220" hint.sbc.0.irq="5" hint.sbc.0.drq="1" hint.sbc.0.flags="0x15" device gusc hint.gusc.0.at="isa" hint.gusc.0.port="0x220" hint.gusc.0.irq="5" hint.gusc.0.drq="1" hint.gusc.0.flags="0x13" device pca hint.pca.0.at="isa" hint.pca.0.port="0x040" # # Miscellaneous hardware: # # mcd: Mitsumi CD-ROM using proprietary (non-ATAPI) interface # scd: Sony CD-ROM using proprietary (non-ATAPI) interface # matcd: Matsushita/Panasonic CD-ROM using proprietary (non-ATAPI) interface # wt: Wangtek and Archive QIC-02/QIC-36 tape drives # ctx: Cortex-I frame grabber # apm: Laptop Advanced Power Management (experimental) # pmtimer: Timer device driver for power management events (APM or ACPI) # spigot: The Creative Labs Video Spigot video-acquisition board # meteor: Matrox Meteor video capture board # bktr: Brooktree bt848/848a/849a/878/879 video capture and TV Tuner board # cy: Cyclades serial driver # dgb: Digiboard PC/Xi and PC/Xe series driver (ALPHA QUALITY!) # digi: Digiboard driver # gp: National Instruments AT-GPIB and AT-GPIB/TNT board, PCMCIA-GPIB # asc: GI1904-based hand scanners, e.g. the Trust Amiscan Grey # gsc: Genius GS-4500 hand scanner. # joy: joystick (including IO DATA PCJOY PC Card joystick) # The LOUTB option specifies a slower outb() for debugging purposes. # rc: RISCom/8 multiport card # rp: Comtrol Rocketport(ISA) - single card # tw: TW-523 power line interface for use with X-10 home control products # si: Specialix SI/XIO 4-32 port terminal multiplexor # spic: Sony Programmable I/O controller (VAIO notebooks) # stl: Stallion EasyIO and EasyConnection 8/32 (cd1400 based) # stli: Stallion EasyConnection 8/64, ONboard, Brumby (intelligent) # Notes on APM # The flags takes the following meaning for apm0: # 0x0020 Statclock is broken. 
# If apm is omitted, some systems require sysctl -w kern.timecounter.method=1 # for correct timekeeping. # Notes on the spigot: # The video spigot is at 0xad6. This port address can not be changed. # The irq values may only be 10, 11, or 15 # I/O memory is an 8kb region. Possible values are: # 0a0000, 0a2000, ..., 0fffff, f00000, f02000, ..., ffffff # The start address must be on an even boundary. # Add the following option if you want to allow non-root users to be able # to access the spigot. This option is not secure because it allows users # direct access to the I/O page. # options SPIGOT_UNSECURE # Notes on the Comtrol Rocketport driver: # # The exact values used for rp0 depend on how many boards you have # in the system. The manufacturer's sample configs are listed as: # # device rp # core driver support # # Comtrol Rocketport ISA single card # hint.rp.0.at="isa" # hint.rp.0.port="0x280" # # If instead you have two ISA cards, one installed at 0x100 and the # second installed at 0x180, then you should add the following to # your kernel probe hints: # hint.rp.0.at="isa" # hint.rp.0.port="0x100" # hint.rp.1.at="isa" # hint.rp.1.port="0x180" # # For 4 ISA cards, it might be something like this: # hint.rp.0.at="isa" # hint.rp.0.port="0x180" # hint.rp.1.at="isa" # hint.rp.1.port="0x100" # hint.rp.2.at="isa" # hint.rp.2.port="0x340" # hint.rp.3.at="isa" # hint.rp.3.port="0x240" # # And for PCI cards, you need no hints. # Notes on the Digiboard driver: # # The following flag values have special meanings in dgb: # 0x01 - alternate layout of pins # 0x02 - use the windowed PC/Xe in 64K mode # Notes on the Specialix SI/XIO driver: # The host card is memory, not IO mapped. # The Rev 1 host cards use a 64K chunk, on a 32K boundary. # The Rev 2 host cards use a 32K chunk, on a 32K boundary. # The cards can use an IRQ of 11, 12 or 15. 
# Notes on the Sony Programmable I/O controller # This is a temporary driver that should someday be replaced by something # that hooks into the ACPI layer. The device is hooked to the PIIX4's # General Device 10 decoder, which means you have to fiddle with PCI # registers to map it in, even though it is otherwise treated here as # an ISA device. At the moment, the driver polls, although the device # is capable of generating interrupts. It is largely undocumented. # The port location in the hint is where you WANT the device to be # mapped. 0x10a0 seems to be traditional. At the moment the jogdial # is the only thing truly supported, but apparently a fair percentage # of the Vaio extra features are controlled by this device. # Notes on the Stallion stl and stli drivers: # See src/i386/isa/README.stl for complete instructions. # This is version 0.0.5alpha, unsupported by Stallion. # The stl driver has a secondary IO port hard coded at 0x280. You need # to change src/i386/isa/stallion.c if you reconfigure this on the boards. 
# The "flags" and "msize" settings on the stli driver depend on the board: # EasyConnection 8/64 ISA: flags 23 msize 0x1000 # EasyConnection 8/64 EISA: flags 24 msize 0x10000 # EasyConnection 8/64 MCA: flags 25 msize 0x1000 # ONboard ISA: flags 4 msize 0x10000 # ONboard EISA: flags 7 msize 0x10000 # ONboard MCA: flags 3 msize 0x10000 # Brumby: flags 2 msize 0x4000 # Stallion: flags 1 msize 0x10000 device mcd 1 hint.mcd.0.at="isa" hint.mcd.0.port="0x300" hint.mcd.0.irq="10" # for the Sony CDU31/33A CDROM device scd 1 hint.scd.0.at="isa" hint.scd.0.port="0x230" # for the SoundBlaster 16 multicd - up to 4 devices device matcd 1 hint.matcd.0.at="isa" hint.matcd.0.port="0x230" device wt 1 hint.wt.0.at="isa" hint.wt.0.port="0x300" hint.wt.0.irq="5" hint.wt.0.drq="1" device ctx 1 hint.ctx.0.at="isa" hint.ctx.0.port="0x230" hint.ctx.0.maddr="0xd0000" device spigot 1 hint.spigot.0.at="isa" hint.spigot.0.port="0xad6" hint.spigot.0.irq="15" hint.spigot.0.maddr="0xee000" device apm hint.apm.0.flags="0x20" device pmtimer # Adjust system timer at wakeup time hint.pmtimer.0.at="isa" device gp hint.gp.0.at="isa" hint.gp.0.port="0x2c0" device gsc 1 hint.gsc.0.at="isa" hint.gsc.0.port="0x270" hint.gsc.0.drq="3" device joy # PnP aware, hints for nonpnp only hint.joy.0.at="isa" hint.joy.0.port="0x201" device cy 1 options CY_PCI_FASTINTR # Use with cy_pci unless irq is shared hint.cy.0.at="isa" hint.cy.0.irq="10" hint.cy.0.maddr="0xd4000" hint.cy.0.msize="0x2000" device dgb 1 options NDGBPORTS=16 # Defaults to 16*NDGB hint.dgb.0.at="isa" hint.dgb.0.port="0x220" hint.dgb.0.maddr="0xfc000" device digi hint.digi.0.at="isa" hint.digi.0.port="0x104" hint.digi.0.maddr="0xd0000" # BIOS & FEP/OS components of device digi. 
Normally left as modules device digi_CX device digi_CX_PCI device digi_EPCX device digi_EPCX_PCI device digi_Xe device digi_Xem device digi_Xr device rc 1 hint.rc.0.at="isa" hint.rc.0.port="0x220" hint.rc.0.irq="12" device rp hint.rp.0.at="isa" hint.rp.0.port="0x280" # the port and irq for tw0 are fictitious device tw 1 hint.tw.0.at="isa" hint.tw.0.port="0x380" hint.tw.0.irq="11" device si options SI_DEBUG hint.si.0.at="isa" hint.si.0.maddr="0xd0000" hint.si.0.irq="12" device asc 1 hint.asc.0.at="isa" hint.asc.0.port="0x3EB" hint.asc.0.drq="3" hint.asc.0.irq="10" device spic hint.spic.0.at="isa" hint.spic.0.port="0x10a0" device stl hint.stl.0.at="isa" hint.stl.0.port="0x2a0" hint.stl.0.irq="10" device stli hint.stli.0.at="isa" hint.stli.0.port="0x2a0" hint.stli.0.maddr="0xcc000" hint.stli.0.flags="23" hint.stli.0.msize="0x1000" # You are unlikely to have the hardware for loran device loran hint.loran.0.at="isa" hint.loran.0.irq="5" # HOT1 Xilinx 6200 card (http://www.vcc.com/) device xrpu # # The `meteor' device is a PCI video capture board. It can also have the # following options: # options METEOR_ALLOC_PAGES=xxx preallocate kernel pages for data entry # figure (ROWS*COLUMN*BYTES_PER_PIXEL*FRAME+PAGE_SIZE-1)/PAGE_SIZE # options METEOR_DEALLOC_PAGES remove all allocated pages on close(2) # options METEOR_DEALLOC_ABOVE=xxx remove all allocated pages above the # specified amount. If this value is below the allocated amount no action # taken # options METEOR_SYSTEM_DEFAULT={METEOR_PAL|METEOR_NTSC|METEOR_SECAM}, used # for initialization of fps routine when a signal is not present. # # The 'bktr' device is a PCI video capture device using the Brooktree # bt848/bt848a/bt849a/bt878/bt879 chipset. When used with a TV Tuner it forms a # TV card, eg Miro PC/TV, Hauppauge WinCast/TV WinTV, VideoLogic Captivator, # Intel Smart Video III, AverMedia, IMS Turbo, FlyVideo. 
# # options OVERRIDE_CARD=xxx # options OVERRIDE_TUNER=xxx # options OVERRIDE_MSP=1 # options OVERRIDE_DBX=1 # These options can be used to override the auto detection # The current values for xxx are found in src/sys/dev/bktr/bktr_card.h # Using sysctl(8) run-time overrides on a per-card basis can be made # # options BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_PAL # or # options BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_NTSC # Specifies the default video capture mode. # This is required for Dual Crystal (28&35Mhz) boards where PAL is used # to prevent hangs during initialisation. eg VideoLogic Captivator PCI. # # options BKTR_USE_PLL # PAL or SECAM users who have a 28Mhz crystal (and no 35Mhz crystal) # must enable PLL mode with this option. eg some new Bt878 cards. # # options BKTR_GPIO_ACCESS # This enables IOCTLs which give user level access to the GPIO port. # # options BKTR_NO_MSP_RESET # Prevents the MSP34xx reset. Good if you initialise the MSP in another OS first # # options BKTR_430_FX_MODE # Switch Bt878/879 cards into Intel 430FX chipset compatibility mode. # # options BKTR_SIS_VIA_MODE # Switch Bt878/879 cards into SIS/VIA chipset compatibility mode which is # needed for some old SiS and VIA chipset motherboards. # This also allows Bt878/879 chips to work on old OPTi (<1997) chipset # motherboards and motherboards with bad or incomplete PCI 2.1 support. # As a rough guess, old = before 1998 # device meteor 1 # Brooktree driver has been ported to the new I2C framework. Thus, # you'll need to have the following 3 lines in the kernel config. # device smbus # device iicbus # device iicbb # The iic and smb devices are only needed if you want to control other # I2C slaves connected to the external connector of some cards. # device bktr 1 # # PC Card/PCMCIA # (OLDCARD) # # card: pccard slots # pcic: isa/pccard bridge device pcic hint.pcic.0.at="isa" hint.pcic.1.at="isa" device card # # PC Card/PCMCIA and Cardbus # (NEWCARD) # # Note that NEWCARD and OLDCARD are incompatible. 
Do not use both at the same # time. # # pccbb: isa/pccard and pci/cardbus bridge # pccard: pccard slots # cardbus: cardbus slots #device pccbb #device pccard #device cardbus # You may need to reset all pccards after resuming options PCIC_RESUME_RESET # reset after resume # # Laptop/Notebook options: # # See also: # apm under `Miscellaneous hardware' # above. # For older notebooks that signal a powerfail condition (external # power supply dropped, or battery state low) by issuing an NMI: options POWERFAIL_NMI # make it beep instead of panicking # # SMB bus # # System Management Bus support is provided by the 'smbus' device. # Access to the SMBus device is via the 'smb' device (/dev/smb*), # which is a child of the 'smbus' device. # # Supported devices: # smb standard io through /dev/smb* # # Supported SMB interfaces: # iicsmb I2C to SMB bridge with any iicbus interface # bktr brooktree848 I2C hardware interface # intpm Intel PIIX4 Power Management Unit # alpm Acer Aladdin-IV/V/Pro2 Power Management Unit # ichsmb Intel ICH SMBus controller chips (82801AA, 82801AB, 82801BA) # device smbus # Bus support, required for smb below. device intpm device alpm device ichsmb device smb # # I2C Bus # # Philips i2c bus support is provided by the `iicbus' device. # # Supported devices: # ic i2c network interface # iic i2c standard io # iicsmb i2c to smb bridge. Allow i2c i/o with smb commands. # # Supported interfaces: # pcf Philips PCF8584 ISA-bus controller # bktr brooktree848 I2C software interface # # Other: # iicbb generic I2C bit-banging code (needed by lpbb, bktr) # device iicbus # Bus support, required for ic/iic/iicsmb below. device iicbb device ic device iic device iicsmb # smb over i2c bridge device pcf hint.pcf.0.at="isa" hint.pcf.0.port="0x320" hint.pcf.0.irq="5" #--------------------------------------------------------------------------- # ISDN4BSD # # See /usr/share/examples/isdn/ROADMAP for an introduction to isdn4bsd. 
# # i4b passive ISDN cards support contains the following hardware drivers: # # isic - Siemens/Infineon ISDN ISAC/HSCX/IPAC chipset driver # iwic - Winbond W6692 PCI bus ISDN S/T interface controller # ifpi - AVM Fritz!Card PCI driver # ihfc - Cologne Chip HFC ISA/ISA-PnP chipset driver # ifpnp - AVM Fritz!Card PnP driver # itjc - Siemens ISAC / TJNet Tiger300/320 chipset # # i4b active ISDN cards support contains the following hardware drivers: # # iavc - AVM B1 PCI, AVM B1 ISA, AVM T1 # # Note that the ``options'' (if given) and ``device'' lines must BOTH # be uncommented to enable support for a given card ! # # In addition to a hardware driver (and probably an option) the mandatory # ISDN protocol stack devices and the mandatory support device must be # enabled as well as one or more devices from the optional devices section. # #--------------------------------------------------------------------------- # isic driver (Siemens/Infineon chipsets) # device isic # # ISA bus non-PnP Cards: # ---------------------- # # Teles S0/8 or Niccy 1008 options TEL_S0_8 hint.isic.0.at="isa" hint.isic.0.maddr="0xd0000" hint.isic.0.irq="5" hint.isic.0.flags="1" # # Teles S0/16 or Creatix ISDN-S0 or Niccy 1016 options TEL_S0_16 hint.isic.0.at="isa" hint.isic.0.port="0xd80" hint.isic.0.maddr="0xd0000" hint.isic.0.irq="5" hint.isic.0.flags="2" # # Teles S0/16.3 options TEL_S0_16_3 hint.isic.0.at="isa" hint.isic.0.port="0xd80" hint.isic.0.irq="5" hint.isic.0.flags="3" # # AVM A1 or AVM Fritz!Card options AVM_A1 hint.isic.0.at="isa" hint.isic.0.port="0x340" hint.isic.0.irq="5" hint.isic.0.flags="4" # # USRobotics Sportster ISDN TA intern options USR_STI hint.isic.0.at="isa" hint.isic.0.port="0x268" hint.isic.0.irq="5" hint.isic.0.flags="7" # # ITK ix1 Micro ( < V.3, non-PnP version ) options ITKIX1 hint.isic.0.at="isa" hint.isic.0.port="0x398" hint.isic.0.irq="10" hint.isic.0.flags="18" # # ELSA PCC-16 options ELSA_PCC16 hint.isic.0.at="isa" hint.isic.0.port="0x360" 
hint.isic.0.irq="10" hint.isic.0.flags="20" # # ISA bus PnP Cards: # ------------------ # # Teles S0/16.3 PnP options TEL_S0_16_3_P # # Creatix ISDN-S0 P&P options CRTX_S0_P # # Dr. Neuhaus Niccy Go@ options DRN_NGO # # Sedlbauer Win Speed options SEDLBAUER # # Dynalink IS64PH options DYNALINK # # ELSA QuickStep 1000pro ISA options ELSA_QS1ISA # # Siemens I-Surf 2.0 options SIEMENS_ISURF2 # # Asuscom ISDNlink 128K ISA options ASUSCOM_IPAC # # Eicon Diehl DIVA 2.0 and 2.02 options EICON_DIVA # # PCI bus Cards: # -------------- # # ELSA MicroLink ISDN/PCI (same as ELSA QuickStep 1000pro PCI) options ELSA_QS1PCI # # #--------------------------------------------------------------------------- # ifpnp driver for AVM Fritz!Card PnP # # AVM Fritz!Card PnP device ifpnp # #--------------------------------------------------------------------------- # ihfc driver for Cologne Chip ISA chipsets (experimental!) # # Teles 16.3c ISA PnP # AcerISDN P10 ISA PnP # TELEINT ISDN SPEED No.1 device ihfc # #--------------------------------------------------------------------------- # ifpi driver for AVM Fritz!Card PCI # # AVM Fritz!Card PCI device ifpi # #--------------------------------------------------------------------------- # iwic driver for Winbond W6692 chipset # # ASUSCOM P-IN100-ST-D (and other Winbond W6692 based cards) device iwic # #--------------------------------------------------------------------------- # itjc driver for Siemens ISAC / TJNet Tiger300/320 chipset # # Traverse Technologies NETjet-S # Teles PCI-TJ device itjc # #--------------------------------------------------------------------------- # iavc driver (AVM active cards, needs i4bcapi driver!) # device iavc # # AVM B1 ISA bus (PnP mode not supported!)
# ---------------------------------------- hint.iavc.0.at="isa" hint.iavc.0.port="0x150" hint.iavc.0.irq="5" # #--------------------------------------------------------------------------- # ISDN Protocol Stack - mandatory for all hardware drivers # # Q.921 / layer 2 - i4b passive cards D channel handling device "i4bq921" # # Q.931 / layer 3 - i4b passive cards D channel handling device "i4bq931" # # layer 4 - i4b common passive and active card handling device "i4b" # #--------------------------------------------------------------------------- # ISDN devices - mandatory for all hardware drivers # # userland driver to do ISDN tracing (for passive cards only) device "i4btrc" 4 # # userland driver to control the whole thing device "i4bctl" # #--------------------------------------------------------------------------- # ISDN devices - optional # # userland driver for access to raw B channel device "i4brbch" 4 # # userland driver for telephony device "i4btel" 2 # # network driver for IP over raw HDLC ISDN device "i4bipr" 4 # enable VJ header compression detection for ipr i/f options IPR_VJ # enable logging of the first n IP packets to isdnd (n=32 here) options IPR_LOG=32 # # network driver for sync PPP over ISDN; requires an equivalent # number of sppp device to be configured device "i4bisppp" 4 # # B-channel interface to the netgraph subsystem device "i4bing" 2 # # CAPI driver needed for active ISDN cards (see iavc driver above) device "i4bcapi" # #--------------------------------------------------------------------------- # Parallel-Port Bus # # Parallel port bus support is provided by the `ppbus' device. # Multiple devices may be attached to the parallel port, devices # are automatically probed and attached when found. # # Supported devices: # vpo Iomega Zip Drive # Requires SCSI disk support ('scbus' and 'da'), best # performance is achieved with ports in EPP 1.9 mode. 
# lpt Parallel Printer # plip Parallel network interface # ppi General-purpose I/O ("Geek Port") + IEEE1284 I/O # pps Pulse per second Timing Interface # lpbb Philips official parallel port I2C bit-banging interface # # Supported interfaces: # ppc ISA-bus parallel port interfaces. # options PPC_PROBE_CHIPSET # Enable chipset specific detection # (see flags in ppc(4)) options DEBUG_1284 # IEEE1284 signaling protocol debug options PERIPH_1284 # Makes your computer act as a IEEE1284 # compliant peripheral options DONTPROBE_1284 # Avoid boot detection of PnP parallel devices options VP0_DEBUG # ZIP/ZIP+ debug options LPT_DEBUG # Printer driver debug options PPC_DEBUG # Parallel chipset level debug options PLIP_DEBUG # Parallel network IP interface debug options PCFCLOCK_VERBOSE # Verbose pcfclock driver options PCFCLOCK_MAX_RETRIES=5 # Maximum read tries (default 10) device ppc hint.ppc.0.at="isa" hint.ppc.0.irq="7" device ppbus device vpo device lpt device plip device ppi device pps device lpbb device pcfclock # Kernel BOOTP support options BOOTP # Use BOOTP to obtain IP address/hostname options BOOTP_NFSROOT # NFS mount root filesystem using BOOTP info options BOOTP_NFSV3 # Use NFS v3 to NFS mount root options BOOTP_COMPAT # Workaround for broken bootp daemons. options BOOTP_WIRED_TO=fxp0 # Use interface fxp0 for BOOTP # # Add tie-ins for a hardware watchdog. This only enables the hooks; # the user must still supply the actual driver. # options HW_WDOG # # Set the number of PV entries per process. Increasing this can # stop panics related to heavy use of shared memory. However, that can # (combined with large amounts of physical memory) cause panics at # boot time due to the kernel running out of VM space. # # If you're tweaking this, you might also want to increase the sysctls # "vm.v_free_min", "vm.v_free_reserved", and "vm.v_free_target". # # The value below is one more than the default. # options PMAP_SHPGPERPROC=201 # # Disable swapping.
This option removes all code which actually performs # swapping, so it's not possible to turn it back on at run-time. # # This is sometimes usable for systems which don't have any swap space # (see also sysctls "vm.defer_swapspace_pageouts" and # "vm.disable_swapspace_pageouts") # #options NO_SWAPPING # Set the number of sf_bufs to allocate. sf_bufs are virtual buffers # for sendfile(2) that are used to map file VM pages, and normally # default to a quantity that is roughly 16*MAXUSERS+512. You would # typically want about 4 of these for each simultaneous file send. # options NSFBUFS=1024 # # Enable extra debugging code for locks. This stores the filename and # line of whatever acquired the lock in the lock itself, and change a # number of function calls to pass around the relevant data. This is # not at all useful unless you are debugging lock code. Also note # that it is likely to break e.g. fstat(1) unless you recompile your # userland with -DDEBUG_LOCKS as well. # options DEBUG_LOCKS ##################################################################### # ABI Emulation # Enable iBCS2 runtime support for SCO and ISC binaries options IBCS2 # Emulate spx device for client side of SVR3 local X interface options SPX_HACK # Enable Linux ABI emulation options COMPAT_LINUX # Enable the linux-like proc filesystem support (requires COMPAT_LINUX # and PSEUDOFS) options LINPROCFS # Linux debugging options DEBUG_LINUX # # SysVR4 ABI emulation # # The svr4 ABI emulator can be statically compiled into the kernel or loaded as # a KLD module. # The STREAMS network emulation code can also be compiled statically or as a # module. If loaded as a module, it must be loaded before the svr4 module # (the /usr/sbin/svr4 script does this for you). If compiling statically, # the `streams' device must be configured into any kernel which also # specifies COMPAT_SVR4. 
It is possible to have a statically-configured # STREAMS device and a dynamically loadable svr4 emulator; the /usr/sbin/svr4 # script understands that it doesn't need to load the `streams' module under # those circumstances. # Caveat: At this time, `options KTRACE' is required for the svr4 emulator # (whether static or dynamic). # options COMPAT_SVR4 # build emulator statically options DEBUG_SVR4 # enable verbose debugging device streams # STREAMS network driver (required for svr4). ##################################################################### # USB support # UHCI controller device uhci # OHCI controller device ohci # General USB code (mandatory for USB) device usb # # USB Double Bulk Pipe devices device udbp # Generic USB device driver device ugen # Human Interface Device (anything with buttons and dials) device uhid # USB keyboard device ukbd # USB printer device ulpt # USB Iomega Zip 100 Drive (Requires scbus and da) device umass # USB modem support device umodem # USB mouse device ums # Diamond Rio 500 Mp3 player device urio # USB scanners device uscanner # # ADMtek USB ethernet. Supports the LinkSys USB100TX, # the Billionton USB100, the Melco LU-ATX, the D-Link DSB-650TX # and the SMC 2202USB. Also works with the ADMtek AN986 Pegasus # eval board. device aue # # CATC USB-EL1201A USB ethernet. Supports the CATC Netmate # and Netmate II, and the Belkin F5U111. device cue # # Kawasaki LSI ethernet. Supports the LinkSys USB10T, # Entrega USB-NET-E45, Peracom Ethernet Adapter, the # 3Com 3c19250, the ADS Technologies USB-10BT, the ATen UC10T, # the Netgear EA101, the D-Link DSB-650, the SMC 2102USB # and 2104USB, and the Corega USB-T. 
device kue # debugging options for the USB subsystem # options UHCI_DEBUG options OHCI_DEBUG options USB_DEBUG options UGEN_DEBUG options UHID_DEBUG options UHUB_DEBUG options UKBD_DEBUG options ULPT_DEBUG options UMASS_DEBUG options UMS_DEBUG options URIO_DEBUG # options for ukbd: options UKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions UKBD_DFLT_KEYMAP=it.iso # # Embedded system options: # # An embedded system might want to run something other than init. options INIT_PATH="/sbin/init:/stand/sysinstall" # Debug options options BUS_DEBUG # enable newbus debugging options DEBUG_VFS_LOCKS # enable vfs lock debugging options NPX_DEBUG # enable npx debugging (FPU/math emu) ##################################################################### # SYSV IPC KERNEL PARAMETERS # # Maximum number of entries in a semaphore map. options SEMMAP=31 # Maximum number of System V semaphores that can be used on the system at # one time. options SEMMNI=11 # Total number of semaphores system wide options SEMMNS=61 # Total number of undo structures in system options SEMMNU=31 # Maximum number of System V semaphores that can be used by a single process # at one time. options SEMMSL=61 # Maximum number of operations that can be outstanding on a single System V # semaphore at one time. options SEMOPM=101 # Maximum number of undo operations that can be outstanding on a single # System V semaphore at one time. options SEMUME=11 # Maximum number of shared memory pages system wide. options SHMALL=1025 # Maximum size, in bytes, of a single System V shared memory region. options SHMMAX="(SHMMAXPGS*PAGE_SIZE+1)" options SHMMAXPGS=1025 # Minimum size, in bytes, of a single System V shared memory region. options SHMMIN=2 # Maximum number of shared memory regions that can be used on the system # at one time. options SHMMNI=33 # Maximum number of System V shared memory regions that can be attached to # a single process at one time. 
options SHMSEG=9 # Set the amount of time (in seconds) the system will wait before # rebooting automatically when a kernel panic occurs. If set to (-1), # the system will wait indefinitely until a key is pressed on the # console. options PANIC_REBOOT_WAIT_TIME=16 ##################################################################### # More undocumented options for linting. # Note that documenting these are not considered an affront. options CAM_DEBUG_DELAY # VFS cluster debugging. options CLUSTERDEBUG # Eliminate unneeded cache flush instruction(s). options CPU_UPGRADE_HW_CACHE options DEBUG # PECOFF module (Win32 Execution Format) options PECOFF_SUPPORT options PECOFF_DEBUG # Disable the 4 MByte PSE CPU feature. #options DISABLE_PSE options ENABLE_ALART options I4B_SMP_WORKAROUND options I586_PMC_GUPROF=0x70000 options KBDIO_DEBUG=2 options KBD_MAXRETRY=4 options KBD_MAXWAIT=6 options KBD_RESETDELAY=201 # Enable the PF_KEY Key Management API. options KEY # Kernel filelock debugging. options LOCKF_DEBUG # System V compatible message queues # Please note that the values provided here are used to test kernel # building. The defaults in the sources provide almost the same numbers. # MSGSSZ must be a power of 2 between 8 and 1024. 
options MSGMNB=2049 # Max number of chars in queue options MSGMNI=41 # Max number of message queue identifiers options MSGSEG=2049 # Max number of message segments options MSGSSZ=16 # Size of a message segment options MSGTQL=41 # Max number of messages in system options NBUF=512 # Number of buffer headers options NMBCLUSTERS=1024 # Number of mbuf clusters options PSM_DEBUG=1 options SCSI_NCR_DEBUG options SCSI_NCR_MAX_SYNC=10000 options SCSI_NCR_MAX_WIDE=1 options SCSI_NCR_MYADDR=7 options SC_DEBUG_LEVEL=5 # Syscons debug level options SC_RENDER_DEBUG # syscons rendering debugging options SHOW_BUSYBUFS # List buffers that prevent root unmount options SIMPLELOCK_DEBUG options SLIP_IFF_OPTS options TIMER_FREQ="((14318182+6)/12)" options VFS_BIO_DEBUG # VFS buffer I/O debugging options VM_KMEM_SIZE options VM_KMEM_SIZE_MAX options VM_KMEM_SIZE_SCALE Index: head/sys/conf/options.i386 =================================================================== --- head/sys/conf/options.i386 (revision 82308) +++ head/sys/conf/options.i386 (revision 82309) @@ -1,213 +1,214 @@ # $FreeBSD$ # Options specific to the i386 platform kernels DISABLE_PSE IDE_DELAY MATH_EMULATE opt_math_emulate.h GPL_MATH_EMULATE opt_math_emulate.h PMAP_SHPGPERPROC opt_pmap.h PPC_PROBE_CHIPSET opt_ppc.h PPC_DEBUG opt_ppc.h SHOW_BUSYBUFS PANIC_REBOOT_WAIT_TIME opt_panic.h MAXMEM PERFMON opt_perfmon.h POWERFAIL_NMI opt_trap.h AUTO_EOI_1 opt_auto_eoi.h AUTO_EOI_2 opt_auto_eoi.h CONSPEED opt_comconsole.h I586_PMC_GUPROF opt_i586_guprof.h WLCACHE opt_wavelan.h WLDEBUG opt_wavelan.h COMPAT_OLDISA BROKEN_KEYBOARD_RESET opt_reset.h +UPAGES # Options for emulators. These should only be used at config time, so # they are handled like options for static file systems # (see src/sys/conf/options), except for broken debugging options. 
IBCS2 opt_dontuse.h COMPAT_LINUX opt_dontuse.h DEBUG_LINUX opt_linux.h COMPAT_SVR4 opt_dontuse.h DEBUG_SVR4 opt_svr4.h PECOFF_SUPPORT opt_dontuse.h PECOFF_DEBUG opt_pecoff.h # i386 SMP options APIC_IO opt_global.h CLK_CALIBRATION_LOOP opt_clock.h CLK_USE_I8254_CALIBRATION opt_clock.h CLK_USE_TSC_CALIBRATION opt_clock.h TIMER_FREQ opt_clock.h NO_F00F_HACK opt_cpu.h CPU_BLUELIGHTNING_FPU_OP_CACHE opt_cpu.h CPU_BLUELIGHTNING_3X opt_cpu.h CPU_BTB_EN opt_cpu.h CPU_CYRIX_NO_LOCK opt_cpu.h CPU_DIRECT_MAPPED_CACHE opt_cpu.h CPU_DISABLE_5X86_LSSER opt_cpu.h CPU_FASTER_5X86_FPU opt_cpu.h CPU_I486_ON_386 opt_cpu.h CPU_IORT opt_cpu.h CPU_L2_LATENCY opt_cpu.h CPU_LOOP_EN opt_cpu.h CPU_PPRO2CELERON opt_cpu.h CPU_RSTK_EN opt_cpu.h CPU_SUSP_HLT opt_cpu.h CPU_UPGRADE_HW_CACHE opt_cpu.h CPU_WT_ALLOC opt_cpu.h CYRIX_CACHE_WORKS opt_cpu.h CYRIX_CACHE_REALLY_WORKS opt_cpu.h NO_MEMORY_HOLE opt_cpu.h CPU_ENABLE_SSE opt_cpu.h # The CPU type affects the endian conversion functions all over the kernel. I386_CPU opt_global.h I486_CPU opt_global.h I586_CPU opt_global.h I686_CPU opt_global.h MAXCONS opt_syscons.h SC_ALT_MOUSE_IMAGE opt_syscons.h SC_DEBUG_LEVEL opt_syscons.h SC_DFLT_FONT opt_syscons.h SC_DISABLE_DDBKEY opt_syscons.h SC_DISABLE_REBOOT opt_syscons.h SC_HISTORY_SIZE opt_syscons.h SC_KERNEL_CONS_ATTR opt_syscons.h SC_KERNEL_CONS_REV_ATTR opt_syscons.h SC_MOUSE_CHAR opt_syscons.h SC_NO_CUTPASTE opt_syscons.h SC_NO_FONT_LOADING opt_syscons.h SC_NO_HISTORY opt_syscons.h SC_NO_SYSMOUSE opt_syscons.h SC_NORM_ATTR opt_syscons.h SC_NORM_REV_ATTR opt_syscons.h SC_PIXEL_MODE opt_syscons.h SC_RENDER_DEBUG opt_syscons.h SC_TWOBUTTON_MOUSE opt_syscons.h VGA_ALT_SEQACCESS opt_vga.h VGA_DEBUG opt_vga.h VGA_NO_FONT_LOADING opt_vga.h VGA_NO_MODE_CHANGE opt_vga.h VGA_SLOW_IOACCESS opt_vga.h VGA_WIDTH90 opt_vga.h VESA opt_vesa.h VESA_DEBUG opt_vesa.h PSM_HOOKRESUME opt_psm.h PSM_RESETAFTERSUSPEND opt_psm.h PSM_DEBUG opt_psm.h PCIC_RESUME_RESET opt_pcic.h ATKBD_DFLT_KEYMAP opt_atkbd.h 
KBD_DISABLE_KEYMAP_LOAD opt_kbd.h KBD_INSTALL_CDEV opt_kbd.h KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h KBD_RESETDELAY opt_kbd.h KBDIO_DEBUG opt_kbd.h #USERCONFIG opt_userconfig.h #VISUAL_USERCONFIG opt_userconfig.h #INTRO_USERCONFIG opt_userconfig.h #DEV_EISA opt_userconfig.h EISA_SLOTS opt_eisa.h FE_8BIT_SUPPORT opt_fe.h # pcvt(4) has a bunch of options FAT_CURSOR opt_pcvt.h XSERVER opt_pcvt.h PCVT_24LINESDEF opt_pcvt.h PCVT_CTRL_ALT_DEL opt_pcvt.h PCVT_META_ESC opt_pcvt.h PCVT_NSCREENS opt_pcvt.h PCVT_PRETTYSCRNS opt_pcvt.h PCVT_SCANSET opt_pcvt.h PCVT_SCREENSAVER opt_pcvt.h PCVT_USEKBDSEC opt_pcvt.h PCVT_VT220KEYB opt_pcvt.h PCVT_GREENSAVER opt_pcvt.h # voxware options GUS_DMA2 opt_sound.h GUS_DMA opt_sound.h GUS_IRQ opt_sound.h # Video spigot SPIGOT_UNSECURE opt_spigot.h # ------------------------------- # isdn4bsd: passive ISA cards # ------------------------------- TEL_S0_8 opt_i4b.h TEL_S0_16 opt_i4b.h TEL_S0_16_3 opt_i4b.h AVM_A1 opt_i4b.h USR_STI opt_i4b.h ITKIX1 opt_i4b.h ELSA_PCC16 opt_i4b.h # ------------------------------- # isdn4bsd: passive ISA PnP cards # ------------------------------- CRTX_S0_P opt_i4b.h DRN_NGO opt_i4b.h TEL_S0_16_3_P opt_i4b.h SEDLBAUER opt_i4b.h DYNALINK opt_i4b.h ASUSCOM_IPAC opt_i4b.h ELSA_QS1ISA opt_i4b.h SIEMENS_ISURF2 opt_i4b.h EICON_DIVA opt_i4b.h # ------------------------------- # isdn4bsd: passive PCI cards # ------------------------------- ELSA_QS1PCI opt_i4b.h AVM_A1_PCI opt_i4b.h # ------------------------------- # isdn4bsd: passive PCMCIA cards # ------------------------------- #AVM_A1_PCMCIA opt_i4b.h # ------------------------------- # isdn4bsd: misc options # ------------------------------- # temporary workaround for SMP machines I4B_SMP_WORKAROUND opt_i4b.h # enable VJ compression code for ipr i/f IPR_VJ opt_i4b.h IPR_LOG opt_i4b.h # ------------------------------- # oltr: build options # ------------------------------- # Exclude microcode options OLTR_NO_TMS_MAC opt_oltr.h OLTR_NO_HAWKEYE_MAC opt_oltr.h 
OLTR_NO_BULLSEYE_MAC opt_oltr.h # Total number of ports controlled by the dgb(4) driver. # Defaults to NDGB*16. NDGBPORTS opt_dgb.h # Device options DEV_NPX opt_npx.h DEV_APM opt_apm.h DEV_SPLASH opt_splash.h # SMB/CIFS requester NETSMB opt_netsmb.h NETSMBCRYPTO opt_netsmb.h # SMB/CIFS filesystem SMBFS # ------------------------------- # EOF # ------------------------------- Index: head/sys/conf/options.pc98 =================================================================== --- head/sys/conf/options.pc98 (revision 82308) +++ head/sys/conf/options.pc98 (revision 82309) @@ -1,220 +1,221 @@ # $FreeBSD$ # Options specific to the pc98 platform kernels DISABLE_PSE IDE_DELAY MATH_EMULATE opt_math_emulate.h GPL_MATH_EMULATE opt_math_emulate.h PMAP_SHPGPERPROC opt_pmap.h PPC_PROBE_CHIPSET opt_ppc.h PPC_DEBUG opt_ppc.h SHOW_BUSYBUFS PANIC_REBOOT_WAIT_TIME opt_panic.h MAXMEM PERFMON opt_perfmon.h POWERFAIL_NMI opt_trap.h AUTO_EOI_1 opt_auto_eoi.h AUTO_EOI_2 opt_auto_eoi.h CONSPEED opt_comconsole.h I586_PMC_GUPROF opt_i586_guprof.h WLCACHE opt_wavelan.h WLDEBUG opt_wavelan.h COMPAT_OLDISA BROKEN_KEYBOARD_RESET opt_reset.h +UPAGES # Options for emulators. These should only be used at config time, so # they are handled like options for static file systems # (see src/sys/conf/options), except for broken debugging options. 
IBCS2 opt_dontuse.h COMPAT_LINUX opt_dontuse.h DEBUG_LINUX opt_linux.h COMPAT_SVR4 opt_dontuse.h DEBUG_SVR4 opt_svr4.h PECOFF_SUPPORT opt_dontuse.h PECOFF_DEBUG opt_pecoff.h # i386 SMP options APIC_IO opt_global.h CLK_CALIBRATION_LOOP opt_clock.h CLK_USE_I8254_CALIBRATION opt_clock.h CLK_USE_TSC_CALIBRATION opt_clock.h TIMER_FREQ opt_clock.h NO_F00F_HACK opt_cpu.h CPU_BLUELIGHTNING_FPU_OP_CACHE opt_cpu.h CPU_BLUELIGHTNING_3X opt_cpu.h CPU_BTB_EN opt_cpu.h CPU_CYRIX_NO_LOCK opt_cpu.h CPU_DIRECT_MAPPED_CACHE opt_cpu.h CPU_DISABLE_5X86_LSSER opt_cpu.h CPU_FASTER_5X86_FPU opt_cpu.h CPU_I486_ON_386 opt_cpu.h CPU_IORT opt_cpu.h CPU_L2_LATENCY opt_cpu.h CPU_LOOP_EN opt_cpu.h CPU_PPRO2CELERON opt_cpu.h CPU_RSTK_EN opt_cpu.h CPU_SUSP_HLT opt_cpu.h CPU_UPGRADE_HW_CACHE opt_cpu.h CPU_WT_ALLOC opt_cpu.h CYRIX_CACHE_WORKS opt_cpu.h CYRIX_CACHE_REALLY_WORKS opt_cpu.h NO_MEMORY_HOLE opt_cpu.h CPU_ENABLE_SSE opt_cpu.h # The CPU type affects the endian conversion functions all over the kernel. I386_CPU opt_global.h I486_CPU opt_global.h I586_CPU opt_global.h I686_CPU opt_global.h MAXCONS opt_syscons.h SC_ALT_MOUSE_IMAGE opt_syscons.h SC_DEBUG_LEVEL opt_syscons.h SC_DFLT_FONT opt_syscons.h SC_DISABLE_DDBKEY opt_syscons.h SC_DISABLE_REBOOT opt_syscons.h SC_HISTORY_SIZE opt_syscons.h SC_KERNEL_CONS_ATTR opt_syscons.h SC_KERNEL_CONS_REV_ATTR opt_syscons.h SC_MOUSE_CHAR opt_syscons.h SC_NO_CUTPASTE opt_syscons.h SC_NO_FONT_LOADING opt_syscons.h SC_NO_HISTORY opt_syscons.h SC_NO_SYSMOUSE opt_syscons.h SC_NORM_ATTR opt_syscons.h SC_NORM_REV_ATTR opt_syscons.h SC_PIXEL_MODE opt_syscons.h SC_RENDER_DEBUG opt_syscons.h SC_TWOBUTTON_MOUSE opt_syscons.h GDC opt_gdc.h PSM_HOOKRESUME opt_psm.h PSM_RESETAFTERSUSPEND opt_psm.h PSM_DEBUG opt_psm.h PCIC_RESUME_RESET opt_pcic.h KBD_DISABLE_KEYMAP_LOAD opt_kbd.h KBD_INSTALL_CDEV opt_kbd.h KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h KBD_RESETDELAY opt_kbd.h KBDIO_DEBUG opt_kbd.h #USERCONFIG opt_userconfig.h #VISUAL_USERCONFIG opt_userconfig.h 
#INTRO_USERCONFIG opt_userconfig.h #DEV_EISA opt_userconfig.h EISA_SLOTS opt_eisa.h FE_8BIT_SUPPORT opt_fe.h # pcvt(4) has a bunch of options FAT_CURSOR opt_pcvt.h XSERVER opt_pcvt.h PCVT_24LINESDEF opt_pcvt.h PCVT_CTRL_ALT_DEL opt_pcvt.h PCVT_META_ESC opt_pcvt.h PCVT_NSCREENS opt_pcvt.h PCVT_PRETTYSCRNS opt_pcvt.h PCVT_SCANSET opt_pcvt.h PCVT_SCREENSAVER opt_pcvt.h PCVT_USEKBDSEC opt_pcvt.h PCVT_VT220KEYB opt_pcvt.h PCVT_GREENSAVER opt_pcvt.h # voxware options GUS_DMA2 opt_sound.h GUS_DMA opt_sound.h GUS_IRQ opt_sound.h # Video spigot SPIGOT_UNSECURE opt_spigot.h # ------------------------------- # isdn4bsd: passive ISA cards # ------------------------------- TEL_S0_8 opt_i4b.h TEL_S0_16 opt_i4b.h TEL_S0_16_3 opt_i4b.h AVM_A1 opt_i4b.h USR_STI opt_i4b.h ITKIX1 opt_i4b.h ELSA_PCC16 opt_i4b.h # ------------------------------- # isdn4bsd: passive ISA PnP cards # ------------------------------- CRTX_S0_P opt_i4b.h DRN_NGO opt_i4b.h TEL_S0_16_3_P opt_i4b.h SEDLBAUER opt_i4b.h DYNALINK opt_i4b.h ASUSCOM_IPAC opt_i4b.h ELSA_QS1ISA opt_i4b.h SIEMENS_ISURF2 opt_i4b.h EICON_DIVA opt_i4b.h # ------------------------------- # isdn4bsd: passive PCI cards # ------------------------------- ELSA_QS1PCI opt_i4b.h AVM_A1_PCI opt_i4b.h # ------------------------------- # isdn4bsd: passive PCMCIA cards # ------------------------------- #AVM_A1_PCMCIA opt_i4b.h # ------------------------------- # isdn4bsd: misc options # ------------------------------- # temporary workaround for SMP machines I4B_SMP_WORKAROUND opt_i4b.h # enable VJ compression code for ipr i/f IPR_VJ opt_i4b.h IPR_LOG opt_i4b.h # ------------------------------- # oltr: build options # ------------------------------- # Exclude microcode options OLTR_NO_TMS_MAC opt_oltr.h OLTR_NO_HAWKEYE_MAC opt_oltr.h OLTR_NO_BULLSEYE_MAC opt_oltr.h # Total number of ports controlled by the dgb(4) driver. # Defaults to NDGB*16. 
NDGBPORTS opt_dgb.h # bs driver options SCSI_BOUNCE_SIZE opt_bs.h BS_TARG_SAFEMODE opt_bs.h # ct driver options CT_USE_RELOCATE_OFFSET opt_ct.h CT_BUS_WEIGHT opt_ct.h # npx options FPU_ERROR_BROKEN opt_npx.h # PC98 options PC98 opt_global.h EPSON_BOUNCEDMA opt_pc98.h EPSON_MEMWIN opt_pc98.h LINE30 opt_syscons.h # Device options DEV_NPX opt_npx.h DEV_APM opt_apm.h DEV_SPLASH opt_splash.h # SMB/CIFS requester NETSMB opt_netsmb.h NETSMBCRYPTO opt_netsmb.h # SMB/CIFS filesystem SMBFS # ------------------------------- # EOF # ------------------------------- Index: head/sys/i386/conf/GENERIC =================================================================== --- head/sys/i386/conf/GENERIC (revision 82308) +++ head/sys/i386/conf/GENERIC (revision 82309) @@ -1,235 +1,237 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/i386 # # For more information on this file, please read the handbook section on # Kernel Configuration Files: # # http://www.FreeBSD.org/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the NOTES configuration file. If you are # in doubt as to the purpose or necessity of a line, check first in NOTES. # # $FreeBSD$ machine i386 cpu I486_CPU cpu I586_CPU cpu I686_CPU ident GENERIC maxusers 32 #To statically compile in device wiring instead of /boot/device.hints -#hints "GENERIC.hints" #Default places to look for devices. +hints "GENERIC.hints" #Default places to look for devices. 
makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols options MATH_EMULATE #Support for x87 emulation options INET #InterNETworking options INET6 #IPv6 communications protocols options FFS #Berkeley Fast Filesystem options SOFTUPDATES #Enable FFS soft updates support options MD_ROOT #MD is a potential root device options NFS #Network Filesystem options NFS_ROOT #NFS usable as root device, NFS required options MSDOSFS #MSDOS Filesystem options CD9660 #ISO 9660 Filesystem options PROCFS #Process filesystem options COMPAT_43 #Compatible with BSD 4.3 [KEEP THIS!] options SCSI_DELAY=15000 #Delay (in ms) before probing SCSI options UCONSOLE #Allow users to grab the console #options USERCONFIG #boot -c editor #options VISUAL_USERCONFIG #visual boot -c editor options KTRACE #ktrace(1) support options SYSVSHM #SYSV-style shared memory options SYSVMSG #SYSV-style message queues options SYSVSEM #SYSV-style semaphores options P1003_1B #Posix P1003_1B real-time extensions options _KPOSIX_PRIORITY_SCHEDULING options KBD_INSTALL_CDEV # install a CDEV entry in /dev # Debugging for use in -current options DDB options INVARIANTS options INVARIANT_SUPPORT options WITNESS +options UPAGES=4 +options CPU_ENABLE_SSE # To make an SMP kernel, the next two are needed #options SMP # Symmetric MultiProcessor Kernel #options APIC_IO # Symmetric (APIC) I/O device isa device eisa device pci # Floppy drives device fdc # ATA and ATAPI devices device ata device atadisk # ATA disk drives device atapicd # ATAPI CDROM drives device atapifd # ATAPI floppy drives device atapist # ATAPI tape drives options ATA_STATIC_ID #Static device numbering # SCSI Controllers device ahb # EISA AHA1742 family device ahc # AHA2940 and onboard AIC7xxx devices device amd # AMD 53C974 (Tekram DC-390(T)) device isp # Qlogic family #device ncr # NCR/Symbios Logic device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') device adv # Advansys SCSI adapters device adw # Advansys wide SCSI adapters device aha # 
Adaptec 154x SCSI adapters device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. device bt # Buslogic/Mylex MultiMaster SCSI adapters device ncv # NCR 53C500 device nsp # Workbit Ninja SCSI-3 device stg # TMC 18C30/18C50 # RAID controllers interfaced to the SCSI subsystem device asr # DPT SmartRAID V, VI and Adaptec SCSI RAID device dpt # DPT Smartcache III, IV - See NOTES for options! device mly # Mylex AcceleRAID/eXtremeRAID # SCSI peripherals device scbus # SCSI bus (required) device da # Direct Access (disks) device sa # Sequential Access (tape etc) device cd # CD device pass # Passthrough device (direct SCSI access) # RAID controllers device aac # Adaptec FSA RAID device amr # AMI MegaRAID device ida # Compaq Smart RAID device mlx # Mylex DAC960 family device twe # 3ware ATA RAID # atkbdc0 controls both the keyboard and the PS/2 mouse device atkbdc 1 # At keyboard controller device atkbd # at keyboard device psm # psm mouse device vga # VGA screen # splash screen/screen saver device splash # syscons is the default console driver, resembling an SCO console device sc 1 # Enable this for the pcvt (VT220 compatible) console driver #device vt #options XSERVER # support for X server on a vt console #options FAT_CURSOR # start with block cursor # Floating point support - do not disable. device npx # Power management support (see NOTES for more options) device apm # Add suspend/resume support for the i8254. device pmtimer # PCCARD (PCMCIA) support device card # pccard bus device pcic # PCMCIA bridge # Serial (COM) ports device sio # 8250, 16[45]50 based serial ports # Parallel port device ppc device ppbus # Parallel port bus (required) device lpt # Printer device plip # TCP/IP over parallel device ppi # Parallel port interface device #device vpo # Requires scbus and da # PCI Ethernet NICs. 
device de # DEC/Intel DC21x4x (``Tulip'') device vx # 3Com 3c590, 3c595 (``Vortex'') device txp # 3Com 3cR990 (``Typhoon'') # PCI Ethernet NICs that use the common MII bus controller code. # NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! device miibus # MII bus support device dc # DEC/Intel 21143 and various workalikes device fxp # Intel EtherExpress PRO/100B (82557, 82558) device pcn # AMD Am79C97x PCI 10/100 NICs device rl # RealTek 8129/8139 device sf # Adaptec AIC-6915 (``Starfire'') device sis # Silicon Integrated Systems SiS 900/SiS 7016 device ste # Sundance ST201 (D-Link DFE-550TX) device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') device vr # VIA Rhine, Rhine II device wb # Winbond W89C840F device wx # Intel Gigabit Ethernet Card (``Wiseman'') device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # ISA Ethernet NICs. pccard nics included. device cs # Crystal Semiconductor CS89x0 NIC # 'device ed' requires 'device miibus' device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards device ex # Intel EtherExpress Pro/10 and Pro/10+ device ep # Etherlink III based cards device fe # Fujitsu MB8696x based cards device sn # SMC's 9000 series of ethernet chips device xe # Xircom pccard ethernet # The probe order of these is presently determined by i386/isa/isa_compat.c. #device ie #device le device lnc # Wireless NIC cards device an # Aironet 4500/4800 802.11 wireless NICs. device awi # BayStack 660 and others device wi # WaveLAN/IEEE 802.11 wireless NICs. #device wl # Older non 802.11 Wavelan wireless NIC. # Pseudo devices - the number indicates how many units to allocate. device random # Entropy device device loop # Network loopback device ether # Ethernet support device sl # Kernel SLIP device ppp 1 # Kernel PPP device tun # Packet tunnel. 
device pty # Pseudo-ttys (telnet etc) device md # Memory "disks" device gif # IPv6 and IPv4 tunneling device faith 1 # IPv6-to-IPv4 relaying (translation) # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! device bpf # Berkeley packet filter # USB support device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device usb # USB Bus (required) #device udbp # USB Double Bulk Pipe devices device ugen # Generic device uhid # "Human Interface Devices" device ukbd # Keyboard device ulpt # Printer device umass # Disks/Mass storage - Requires scbus and da device ums # Mouse device urio # Diamond Rio 500 MP3 player device uscanner # Scanners # USB Ethernet, requires mii device aue # ADMtek USB ethernet device cue # CATC USB ethernet device kue # Kawasaki LSI USB ethernet Index: head/sys/i386/conf/NOTES =================================================================== --- head/sys/i386/conf/NOTES (revision 82308) +++ head/sys/i386/conf/NOTES (revision 82309) @@ -1,2909 +1,2910 @@ # # NOTES -- Lines that can be cut/pasted into kernel and hints configs. # # Lines that begin with 'device', 'options', 'machine', 'ident', 'maxusers', # 'makeoptions', 'hints' etc go into the kernel configuration that you # run config(8) with. # # Lines that begin with 'hints.' are NOT for config(8), they go into your # hints file. See /boot/device.hints and/or the 'hints' config(8) directive. # # Please use ``make LINT'' to create an old-style LINT file if you want to # do kernel test-builds. # # $FreeBSD$ # # # This directive is mandatory; it defines the architecture to be # configured for; in this case, the 386 family based IBM-PC and # compatibles. # machine i386 # # This is the ``identification'' of the kernel. Usually this should # be the same as the name of your kernel. 
# ident LINT # # The `maxusers' parameter controls the static sizing of a number of # internal system tables by a complicated formula defined in param.c. # maxusers 10 # # We want LINT to cover profiling as well profile 2 # # The `makeoptions' parameter allows variables to be passed to the # generated Makefile in the build area. # # CONF_CFLAGS gives some extra compiler flags that are added to ${CFLAGS} # after most other flags. Here we use it to inhibit use of non-optimal # gcc builtin functions (e.g., memcmp). # # DEBUG happens to be magic. # The following is equivalent to 'config -g KERNELNAME' and creates # 'kernel.debug' compiled with -g debugging as well as a normal # 'kernel'. Use 'make install.debug' to install the debug kernel # but that isn't normally necessary as the debug symbols are not loaded # by the kernel and are not useful there anyway. # # KERNEL can be overridden so that you can change the default name of your # kernel. # makeoptions CONF_CFLAGS=-fno-builtin #Don't allow use of memcmp, etc. #makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols #makeoptions KERNEL=foo #Build kernel "foo" and install "/foo" # # Certain applications can grow to be larger than the 512M limit # that FreeBSD initially imposes. Below are some options to # allow that limit to grow to 1GB, and can be increased further # with changing the parameters. MAXDSIZ is the maximum that the # limit can be set to, and the DFLDSIZ is the default value for # the limit. MAXSSIZ is the maximum that the stack limit can be # set to. You might want to set the default lower than the max, # and explicitly set the maximum with a shell command for processes # that regularly exceed the limit like INND. # options MAXDSIZ="(1024UL*1024*1024)" options MAXSSIZ="(128UL*1024*1024)" options DFLDSIZ="(1024UL*1024*1024)" # # BLKDEV_IOSIZE sets the default block size used in user block # device I/O. 
Note that this value will be overridden by the label # when specifying a block device from a label with a non-0 # partition blocksize. The default is PAGE_SIZE. # options BLKDEV_IOSIZE=8192 # Options for the VM subsystem options PQ_CACHESIZE=512 # color for 512k/16k cache +options UPAGES=3 # number of 4k stack pages per process # Deprecated options supported for backwards compatibility #options PQ_NOOPT # No coloring #options PQ_LARGECACHE # color for 512k/16k cache #options PQ_HUGECACHE # color for 1024k/16k cache #options PQ_MEDIUMCACHE # color for 256k/16k cache #options PQ_NORMALCACHE # color for 64k/16k cache # This allows you to actually store this configuration file into # the kernel binary itself, where it may be later read by saying: # strings -n 3 /boot/kernel/kernel | sed -n 's/^___//p' > MYKERNEL # options INCLUDE_CONFIG_FILE # Include this file in kernel # # The root device and filesystem type can be compiled in; # this provides a fallback option if the root device cannot # be correctly guessed by the bootstrap code, or an override if # the RB_DFLTROOT flag (-r) is specified when booting the kernel. # options ROOTDEVNAME=\"ufs:da0s2e\" ##################################################################### # SMP OPTIONS: # # SMP enables building of a Symmetric MultiProcessor Kernel. # APIC_IO enables the use of the IO APIC for Symmetric I/O. # # Notes: # # An SMP kernel will ONLY run on an Intel MP spec. qualified motherboard. # # Be sure to disable 'cpu I386_CPU' && 'cpu I486_CPU' for SMP kernels. # # Check the 'Rogue SMP hardware' section to see if additional options # are required by your hardware. # # Mandatory: options SMP # Symmetric MultiProcessor Kernel options APIC_IO # Symmetric (APIC) I/O # # Rogue SMP hardware: # # Bridged PCI cards: # # The MP tables of most of the current generation MP motherboards # do NOT properly support bridged PCI cards. To use one of these # cards you should refer to ???
# SMP Debugging Options: # # MUTEX_DEBUG enables various extra assertions in the mutex code. # WITNESS enables the mutex witness code which detects deadlocks and cycles # during locking operations. # WITNESS_DDB causes the witness code to drop into the kernel debugger if # a lock hierarchy violation occurs or if locks are held when going to # sleep. # WITNESS_SKIPSPIN disables the witness checks on spin mutexes. options MUTEX_DEBUG options WITNESS options WITNESS_DDB options WITNESS_SKIPSPIN ##################################################################### # CPU OPTIONS # # You must specify at least one CPU (the one you intend to run on); # deleting the specification for CPUs you don't need to use may make # parts of the system run faster. # I386_CPU is mutually exclusive with the other CPU types. # #cpu I386_CPU cpu I486_CPU cpu I586_CPU # aka Pentium(tm) cpu I686_CPU # aka Pentium Pro(tm) # # Options for CPU features. # # CPU_BLUELIGHTNING_FPU_OP_CACHE enables FPU operand cache on IBM # BlueLightning CPU. It works only with Cyrix FPU, and this option # should not be used with Intel FPU. # # CPU_BLUELIGHTNING_3X enables triple-clock mode on IBM Blue Lightning # CPU if CPU supports it. The default is double-clock mode on # BlueLightning CPU box. # # CPU_BTB_EN enables branch target buffer on Cyrix 5x86 (NOTE 1). # # CPU_DIRECT_MAPPED_CACHE sets L1 cache of Cyrix 486DLC CPU in direct # mapped mode. Default is 2-way set associative mode. # # CPU_CYRIX_NO_LOCK enables weak locking for the entire address space # of Cyrix 6x86 and 6x86MX CPUs by setting the NO_LOCK bit of CCR1. # Otherwise, the NO_LOCK bit of CCR1 is cleared. (NOTE 3) # # CPU_DISABLE_5X86_LSSER disables load store serialize (i.e. enables # reorder). This option should not be used if you use memory mapped # I/O device(s). # # CPU_ENABLE_SSE enables SSE/MMX2 instructions support. # # CPU_FASTER_5X86_FPU enables faster FPU exception handler.
# # CPU_I486_ON_386 enables CPU cache on i486 based CPU upgrade products # for i386 machines. # # CPU_IORT defines I/O clock delay time (NOTE 1). Default values of # I/O clock delay time on Cyrix 5x86 and 6x86 are 0 and 7, respectively # (no clock delay). # # CPU_L2_LATENCY specifies the L2 cache latency value. This option is used # only when CPU_PPRO2CELERON is defined and Mendocino Celeron is detected. # The default value is 5. # # CPU_LOOP_EN prevents flushing the prefetch buffer if the destination # of a jump is already present in the prefetch buffer on Cyrix 5x86 (NOTE # 1). # # CPU_PPRO2CELERON enables L2 cache of Mendocino Celeron CPUs. This option # is useful when you use Socket 8 to Socket 370 converter, because most Pentium # Pro BIOSs do not enable L2 cache of Mendocino Celeron CPUs. # # CPU_RSTK_EN enables return stack on Cyrix 5x86 (NOTE 1). # # CPU_SUSP_HLT enables suspend on HALT. If this option is set, CPU # enters suspend mode following execution of HALT instruction. # # CPU_WT_ALLOC enables write allocation on Cyrix 6x86/6x86MX and AMD # K5/K6/K6-2 cpus. # # CYRIX_CACHE_WORKS enables CPU cache on Cyrix 486 CPUs with cache # flush at hold state. # # CYRIX_CACHE_REALLY_WORKS enables (1) CPU cache on Cyrix 486 CPUs # without cache flush at hold state, and (2) write-back CPU cache on # Cyrix 6x86 whose revision < 2.7 (NOTE 2). # # NO_F00F_HACK disables the hack that prevents Pentiums (and ONLY # Pentiums) from locking up when a LOCK CMPXCHG8B instruction is # executed. This option is only needed if I586_CPU is also defined, # and should be included for any non-Pentium CPU that defines it. # # NO_MEMORY_HOLE is an optimisation for systems with AMD K6 processors # which indicates that the 15-16MB range is *definitely* not being # occupied by an ISA memory hole. # # NOTE 1: The options CPU_BTB_EN, CPU_LOOP_EN, CPU_IORT # and CPU_RSTK_EN should not be used because of CPU bugs. # These options may crash your system.
# # NOTE 2: If CYRIX_CACHE_REALLY_WORKS is not set, CPU cache is enabled # in write-through mode when revision < 2.7. If revision of Cyrix # 6x86 >= 2.7, CPU cache is always enabled in write-back mode. # # NOTE 3: This option may cause failures for software that requires # locked cycles in order to operate correctly. # options CPU_BLUELIGHTNING_FPU_OP_CACHE options CPU_BLUELIGHTNING_3X options CPU_BTB_EN options CPU_DIRECT_MAPPED_CACHE options CPU_DISABLE_5X86_LSSER options CPU_ENABLE_SSE options CPU_FASTER_5X86_FPU options CPU_I486_ON_386 options CPU_IORT options CPU_L2_LATENCY=5 options CPU_LOOP_EN options CPU_PPRO2CELERON options CPU_RSTK_EN options CPU_SUSP_HLT options CPU_WT_ALLOC options CYRIX_CACHE_WORKS options CYRIX_CACHE_REALLY_WORKS #options NO_F00F_HACK # # A math emulator is mandatory if you wish to run on hardware which # does not have a floating-point processor. Pick either the original, # bogus (but freely-distributable) math emulator, or a much more # fully-featured but GPL-licensed emulator taken from Linux. # options MATH_EMULATE #Support for x87 emulation # Don't enable both of these in a real config. options GPL_MATH_EMULATE #Support for x87 emulation via #new math emulator ##################################################################### # COMPATIBILITY OPTIONS # # Implement system calls compatible with 4.3BSD and older versions of # FreeBSD. You probably do NOT want to remove this as much current code # still relies on the 4.3 emulation. # options COMPAT_43 # # These three options provide support for System V Interface # Definition-style interprocess communication, in the form of shared # memory, semaphores, and message queues, respectively. # options SYSVSHM options SYSVSEM options SYSVMSG ##################################################################### # DEBUGGING OPTIONS # # Enable the kernel debugger. # options DDB # # Don't drop into DDB for a panic. 
Intended for unattended operation # where you may want to drop to DDB from the console, but still want # the machine to recover from a panic # options DDB_UNATTENDED # # If using GDB remote mode to debug the kernel, there's a non-standard # extension to the remote protocol that can be used to use the serial # port as both the debugging port and the system console. It's non- # standard and you're on your own if you enable it. See also the # "remotechat" variables in the FreeBSD specific version of gdb. # options GDB_REMOTE_CHAT # # KTRACE enables the system-call tracing facility ktrace(2). # options KTRACE #kernel tracing # # KTR is a kernel tracing mechanism imported from BSD/OS. Currently it # has no userland interface aside from a few sysctl's. It is enabled with # the KTR option. The KTR_EXTEND option causes trace events to be generated # as a string from snprintf rather than as a string and up to 5 argument # pointers. KTR_ENTRIES defines the number of entries in the circular trace # buffer. KTR_COMPILE defines the mask of events to compile into the kernel # as defined by the KTR_* constants in . KTR_MASK defines the # initial value of the ktr_mask variable which determines at runtime what # events to trace. KTR_CPUMASK determines which CPU's log events, with # bit X corresponding to cpu X. KTR_VERBOSE enables dumping of KTR events # to the console by default. This functionality can be toggled via the # debug.ktr_verbose sysctl and defaults to off if KTR_VERBOSE is not defined. # options KTR options KTR_EXTEND options KTR_ENTRIES=1024 options KTR_COMPILE="(KTR_INTR|KTR_PROC)" options KTR_MASK=KTR_INTR options KTR_CPUMASK=0x3 options KTR_VERBOSE # # The INVARIANTS option is used in a number of source files to enable # extra sanity checking of internal structures. This support is not # enabled by default because of the extra time it would take to check # for these conditions, which can only occur as a result of # programming errors. 
# options INVARIANTS # # The INVARIANT_SUPPORT option makes us compile in support for # verifying some of the internal structures. It is a prerequisite for # 'INVARIANTS', as enabling 'INVARIANTS' will make these functions be # called. The intent is that you can set 'INVARIANTS' for single # source files (by changing the source file or specifying it on the # command line) if you have 'INVARIANT_SUPPORT' enabled. Also, if you # wish to build a kernel module with 'INVARIANTS', then adding # 'INVARIANT_SUPPORT' to your kernel will provide all the necessary # infrastructure without the added overhead. # options INVARIANT_SUPPORT # # The DIAGNOSTIC option is used to enable extra debugging information # from some parts of the kernel. As this makes everything more noisy, # it is disabled by default. # options DIAGNOSTIC # # REGRESSION causes optional kernel interfaces necessary only for regression # testing to be enabled. These interfaces may constitute security risks # when enabled, as they permit processes to easily modify aspects of the # run-time environment to reproduce unlikely or unusual (possibly normally # impossible) scenarios. # options REGRESSION # # RESTARTABLE_PANICS allows one to continue from a panic as if it were # a call to the debugger via the Debugger() function instead. It is only # useful if a kernel debugger is present. To restart from a panic, reset # the panicstr variable to NULL and continue execution. This option is # for development use only and should NOT be used in production systems # to "workaround" a panic. # options RESTARTABLE_PANICS # # PERFMON causes the driver for Pentium/Pentium Pro performance counters # to be compiled. See perfmon(4) for more information. # options PERFMON # # This option lets some drivers co-exist that can't co-exist in a running # system. This is used to be able to compile all kernel code in one go for # quality assurance purposes (like this file, which the option takes its name # from.)
# options COMPILING_LINT # XXX - this doesn't belong here. # Allow ordinary users to take the console - this is useful for X. options UCONSOLE # XXX - this doesn't belong here either #options USERCONFIG #boot -c editor #options INTRO_USERCONFIG #imply -c and show intro screen #options VISUAL_USERCONFIG #visual boot -c editor ##################################################################### # NETWORKING OPTIONS # # Protocol families: # Only the INET (Internet) family is officially supported in FreeBSD. # Source code for the NS (Xerox Network Service) is provided for amusement # value. # options INET #Internet communications protocols options INET6 #IPv6 communications protocols options IPSEC #IP security options IPSEC_ESP #IP security (crypto; define w/ IPSEC) options IPSEC_DEBUG #debug for IP security options IPX #IPX/SPX communications protocols options IPXIP #IPX in IP encapsulation (not available) options IPTUNNEL #IP in IPX encapsulation (not available) options NCP #NetWare Core protocol options NETATALK #Appletalk communications protocols options NETATALKDEBUG #Appletalk debugging # These are currently broken but are shipped due to interest. #options NS #Xerox NS protocols #options NSIP #XNS over IP # mchain library. It can be either loaded as KLD or compiled into kernel options LIBMCHAIN # netgraph(4). Enable the base netgraph code with the NETGRAPH option. # Individual node types can be enabled with the corresponding option # listed below; however, this is not strictly necessary as netgraph # will automatically load the corresponding KLD module if the node type # is not already compiled into the kernel. Each type below has a # corresponding man page, e.g., ng_async(8). 
options NETGRAPH #netgraph(4) system options NETGRAPH_ASYNC options NETGRAPH_BPF options NETGRAPH_CISCO options NETGRAPH_ECHO options NETGRAPH_ETHER options NETGRAPH_FRAME_RELAY options NETGRAPH_HOLE options NETGRAPH_IFACE options NETGRAPH_KSOCKET options NETGRAPH_LMI # MPPC compression requires proprietary files (not included) #options NETGRAPH_MPPC_COMPRESSION options NETGRAPH_MPPC_ENCRYPTION options NETGRAPH_ONE2MANY options NETGRAPH_PPP options NETGRAPH_PPPOE options NETGRAPH_PPTPGRE options NETGRAPH_RFC1490 options NETGRAPH_SOCKET options NETGRAPH_SPLIT options NETGRAPH_TEE options NETGRAPH_TTY options NETGRAPH_UI options NETGRAPH_VJC device mn # Munich32x/Falc54 Nx64kbit/sec cards. device lmc # tulip based LanMedia WAN cards device musycc # LMC/SBE LMC1504 quad T1/E1 # # Network interfaces: # The `loop' device is MANDATORY when networking is enabled. # The `ether' device provides generic code to handle # Ethernets; it is MANDATORY when a Ethernet device driver is # configured or token-ring is enabled. # The `fddi' device provides generic code to support FDDI. # The `sppp' device serves a similar role for certain types # of synchronous PPP links (like `cx', `ar'). # The `sl' device implements the Serial Line IP (SLIP) service. # The `ppp' device implements the Point-to-Point Protocol. # The `bpf' device enables the Berkeley Packet Filter. Be # aware of the legal and administrative consequences of enabling this # option. The number of devices determines the maximum number of # simultaneous BPF clients programs runnable. # The `disc' device implements a minimal network interface, # which throws away all packets sent and never receives any. It is # included for testing purposes. This shows up as the `ds' interface. # The `tap' device is a pty-like virtual Ethernet interface # The `tun' device implements (user-)ppp and nos-tun # The `gif' device implements IPv6 over IP4 tunneling, # IPv4 over IPv6 tunneling, IPv4 over IPv4 tunneling and # IPv6 over IPv6 tunneling. 
# The XBONEHACK option allows the same pair of addresses to be configured on # multiple gif interfaces. # The `faith' device captures packets sent to it and diverts them # to the IPv4/IPv6 translation daemon. # The `stf' device implements 6to4 encapsulation. # The `ef' device provides support for multiple ethernet frame types # specified via ETHER_* options. See ef(4) for details. # # The PPP_BSDCOMP option enables support for compress(1) style entire # packet compression, the PPP_DEFLATE is for zlib/gzip style compression. # PPP_FILTER enables code for filtering the ppp data stream and selecting # events for resetting the demand dial activity timer - requires bpf. # See pppd(8) for more details. # device ether #Generic Ethernet device vlan 1 #VLAN support device token #Generic TokenRing device fddi #Generic FDDI device sppp #Generic Synchronous PPP device loop 1 #Network loopback device device bpf #Berkeley packet filter device disc #Discard device (ds0, ds1, etc) device tap #Virtual Ethernet driver device tun #Tunnel driver (ppp(8), nos-tun(8)) device sl #Serial Line IP device ppp 2 #Point-to-point protocol options PPP_BSDCOMP #PPP BSD-compress support options PPP_DEFLATE #PPP zlib/deflate/gzip support options PPP_FILTER #enable bpf filtering (needs bpf) device ef # Multiple ethernet frames support options ETHER_II # enable Ethernet_II frame options ETHER_8023 # enable Ethernet_802.3 (Novell) frame options ETHER_8022 # enable Ethernet_802.2 frame options ETHER_SNAP # enable Ethernet_802.2/SNAP frame # for IPv6 device gif #IPv6 and IPv4 tunneling options XBONEHACK device faith 1 #for IPv6 and IPv4 translation device stf #6to4 IPv6 over IPv4 encapsulation # # Internet family options: # # MROUTING enables the kernel multicast packet forwarder, which works # with mrouted(8). # # IPFIREWALL enables support for IP firewall construction, in # conjunction with the `ipfw' program. IPFIREWALL_VERBOSE sends # logged packets to the system logger. 
IPFIREWALL_VERBOSE_LIMIT # limits the number of times a matching entry can be logged. # # WARNING: IPFIREWALL defaults to a policy of "deny ip from any to any" # and if you do not add other rules during startup to allow access, # YOU WILL LOCK YOURSELF OUT. It is suggested that you set firewall_type=open # in /etc/rc.conf when first enabling this feature, then refining the # firewall rules in /etc/rc.firewall after you've tested that the new kernel # feature works properly. # # IPFIREWALL_DEFAULT_TO_ACCEPT causes the default rule (at boot) to # allow everything. Use with care, if a cracker can crash your # firewall machine, they can get to your protected machines. However, # if you are using it as an as-needed filter for specific problems as # they arise, then this may be for you. Changing the default to 'allow' # means that you won't get stuck if the kernel and /sbin/ipfw binary get # out of sync. # # IPDIVERT enables the divert IP sockets, used by ``ipfw divert'' # # IPSTEALTH enables code to support stealth forwarding (i.e., forwarding # packets without touching the ttl). This can be useful to hide firewalls # from traceroute and similar tools. # # TCPDEBUG enables code which keeps traces of the TCP state machine # for sockets with the SO_DEBUG option set, which can then be examined # using the trpt(8) utility. 
# options MROUTING # Multicast routing options IPFIREWALL #firewall options IPFIREWALL_VERBOSE #print information about # dropped packets options IPFIREWALL_FORWARD #enable transparent proxy support options IPFIREWALL_VERBOSE_LIMIT=100 #limit verbosity options IPFIREWALL_DEFAULT_TO_ACCEPT #allow everything by default options IPV6FIREWALL #firewall for IPv6 options IPV6FIREWALL_VERBOSE options IPV6FIREWALL_VERBOSE_LIMIT=100 options IPV6FIREWALL_DEFAULT_TO_ACCEPT options IPDIVERT #divert sockets options IPFILTER #ipfilter support options IPFILTER_LOG #ipfilter logging options IPFILTER_DEFAULT_BLOCK #block all packets by default options IPSTEALTH #support for stealth forwarding options TCPDEBUG # RANDOM_IP_ID causes the ID field in IP packets to be randomized # instead of incremented by 1 with each packet generated. This # option closes a minor information leak which allows remote # observers to determine the rate of packet generation on the # machine by watching the counter. options RANDOM_IP_ID # Statically Link in accept filters options ACCEPT_FILTER_DATA options ACCEPT_FILTER_HTTP # TCP_DROP_SYNFIN adds support for ignoring TCP packets with SYN+FIN. This # prevents nmap et al. from identifying the TCP/IP stack, but breaks support # for RFC1644 extensions and is not recommended for web servers. # options TCP_DROP_SYNFIN #drop TCP packets with SYN+FIN # DUMMYNET enables the "dummynet" bandwidth limiter. You need # IPFIREWALL as well. See the dummynet(4) manpage for more info. # BRIDGE enables bridging between ethernet cards -- see bridge(4). # You can use IPFIREWALL and dummynet together with bridging. options DUMMYNET options BRIDGE # # ATM (HARP version) options # # ATM_CORE includes the base ATM functionality code. This must be included # for ATM support. # # ATM_IP includes support for running IP over ATM. 
# # At least one (and usually only one) of the following signalling managers # must be included (note that all signalling managers include PVC support): # ATM_SIGPVC includes support for the PVC-only signalling manager `sigpvc'. # ATM_SPANS includes support for the `spans' signalling manager, which runs # the FORE Systems's proprietary SPANS signalling protocol. # ATM_UNI includes support for the `uni30' and `uni31' signalling managers, # which run the ATM Forum UNI 3.x signalling protocols. # # The `hea' driver provides support for the Efficient Networks, Inc. # ENI-155p ATM PCI Adapter. # # The `hfa' driver provides support for the FORE Systems, Inc. # PCA-200E ATM PCI Adapter. # options ATM_CORE #core ATM protocol family options ATM_IP #IP over ATM support options ATM_SIGPVC #SIGPVC signalling manager options ATM_SPANS #SPANS signalling manager options ATM_UNI #UNI signalling manager device hea #Efficient ENI-155p ATM PCI device hfa #FORE PCA-200E ATM PCI ##################################################################### # FILESYSTEM OPTIONS # # Only the root, /usr, and /tmp filesystems need be statically # compiled; everything else will be automatically loaded at mount # time. (Exception: the UFS family--- FFS --- cannot # currently be demand-loaded.) Some people still prefer to statically # compile other filesystems as well. # # NB: The NULL, PORTAL, UMAP and UNION filesystems are known to be # buggy, and WILL panic your system if you attempt to do anything with # them. They are included here as an incentive for some enterprising # soul to sit down and fix them. # # One of these is mandatory: options FFS #Fast filesystem options NFS #Network File System # The rest are optional: #options NFS_NOSERVER #Disable the NFS-server code. 
options CD9660 #ISO 9660 filesystem options FDESCFS #File descriptor filesystem options HPFS #OS/2 File system options MSDOSFS #MS DOS File System (FAT, FAT32) options NTFS #NT File System options NULLFS #NULL filesystem options NWFS #NetWare filesystem options PORTALFS #Portal filesystem options PROCFS #Process filesystem options PSEUDOFS #Pseudo-filesystem framework options UMAPFS #UID map filesystem options UNIONFS #Union filesystem # options NODEVFS #disable devices filesystem # The xFS_ROOT options REQUIRE the associated ``options xFS'' options NFS_ROOT #NFS usable as root device # This code enables IFS, an FFS which exports inodes as the namespace. # You can find details in src/sys/ufs/ifs/README . options IFS # Soft updates is a technique for improving file system speed and # making abrupt shutdown less risky. # options SOFTUPDATES # Extended attributes allow additional data to be associated with files, # and is used for ACLs, Capabilities, and MAC labels. # See src/sys/ufs/ufs/README.extattr for more information. options UFS_EXTATTR options UFS_EXTATTR_AUTOSTART # Access Control List support for UFS filesystems. The current ACL # implementation requires extended attribute support, UFS_EXTATTR, # for the underlying filesystem. # See src/sys/ufs/ufs/README.acls for more information. options UFS_ACL # Directory hashing improves the speed of operations on very large # directories at the expense of some memory. options UFS_DIRHASH # Make space in the kernel for a root filesystem on a md device. # Define to the number of kilobytes to reserve for the filesystem. options MD_ROOT_SIZE=10 # Make the md device a potential root device, either with preloaded # images of type mfs_root or md_root. options MD_ROOT # Allow this many swap-devices. # # In order to manage swap, the system must reserve bitmap space that # scales with the largest mounted swap device multiplied by NSWAPDEV, # irregardless of whether other swap devices exist or not. 
So it # is not a good idea to make this value too large. options NSWAPDEV=5 # Disk quotas are supported when this option is enabled. options QUOTA #enable disk quotas # If you are running a machine just as a fileserver for PC and MAC # users, using SAMBA or Netatalk, you may consider setting this option # and keeping all those users' directories on a filesystem that is # mounted with the suiddir option. This gives new files the same # ownership as the directory (similar to group). It's a security hole # if you let these users run programs, so confine it to file-servers # (but it'll save you lots of headaches in those cases). Root owned # directories are exempt and X bits are cleared. The suid bit must be # set on the directory as well; see chmod(1) PC owners can't see/set # ownerships so they keep getting their toes trodden on. This saves # you all the support calls as the filesystem it's used on will act as # they expect: "It's my dir so it must be my file". # options SUIDDIR # NFS options: options NFS_MINATTRTIMO=3 # VREG attrib cache timeout in sec options NFS_MAXATTRTIMO=60 options NFS_MINDIRATTRTIMO=30 # VDIR attrib cache timeout in sec options NFS_MAXDIRATTRTIMO=60 options NFS_GATHERDELAY=10 # Default write gather delay (msec) options NFS_UIDHASHSIZ=29 # Tune the size of nfssvc_sock with this options NFS_WDELAYHASHSIZ=16 # and with this options NFS_MUIDHASHSIZ=63 # Tune the size of nfsmount with this options NFS_DEBUG # Enable NFS Debugging # Coda stuff: options CODA #CODA filesystem. device vcoda 4 #coda minicache <-> venus comm. # # Add support for the EXT2FS filesystem of Linux fame. Be a bit # careful with this - the ext2fs code has a tendency to lag behind # changes and not be exercised very much, so mounting read/write could # be dangerous (and even mounting read only could result in panics.) # options EXT2FS # Use real implementations of the aio_* system calls. 
There are numerous # stability issues in the current aio code that make it unsuitable for # inclusion on shell boxes. options VFS_AIO # Enable the code UFS IO optimization through the VM system. This allows # the use of VM operations instead of copying operations when possible. # # Even with this enabled, actual use of the code is still controlled by the # sysctl vfs.ioopt. 0 gives no optimization, 1 gives normal (use VM # operations if a request happens to fit), 2 gives aggressive optimization # (the operations are split to do as much as possible through the VM system.) # # Enabling this will probably not give an overall speedup except for # special workloads. options ENABLE_VFS_IOOPT # Cryptographically secure random number generator; /dev/[u]random device random ##################################################################### # POSIX P1003.1B # Real time extensions added in the 1993 Posix # P1003_1B: Infrastructure # _KPOSIX_PRIORITY_SCHEDULING: Build in _POSIX_PRIORITY_SCHEDULING # _KPOSIX_VERSION: Version kernel is built for options P1003_1B options _KPOSIX_PRIORITY_SCHEDULING options _KPOSIX_VERSION=199309L ##################################################################### # CLOCK OPTIONS # The granularity of operation is controlled by the kernel option HZ whose # default value (100) means a granularity of 10ms. For an accurate simulation # of high data rates it might be necessary to reduce the timer granularity to # 1ms or less. Consider, however, that some interfaces using programmed I/O # may require a considerable time to output packets. So, reducing the # granularity too much might actually cause ticks to be missed thus reducing # the accuracy of operation.
options HZ=100 # Other clock options options CLK_CALIBRATION_LOOP options CLK_USE_I8254_CALIBRATION options CLK_USE_TSC_CALIBRATION ##################################################################### # SCSI DEVICES # SCSI DEVICE CONFIGURATION # The SCSI subsystem consists of the `base' SCSI code, a number of # high-level SCSI device `type' drivers, and the low-level host-adapter # device drivers. The host adapters are listed in the ISA and PCI # device configuration sections below. # # Beginning with FreeBSD 2.0.5 you can wire down your SCSI devices so # that a given bus, target, and LUN always come on line as the same # device unit. In earlier versions the unit numbers were assigned # in the order that the devices were probed on the SCSI bus. This # means that if you removed a disk drive, you may have had to rewrite # your /etc/fstab file, and also that you had to be careful when adding # a new disk as it may have been probed earlier and moved your device # configuration around. # This old behavior is maintained as the default behavior. The unit # assignment begins with the first non-wired down unit for a device # type. For example, if you wire a disk as "da3" then the first # non-wired disk will be assigned da4. # The syntax for wiring down devices is: hint.scbus.0.at="ahc0" hint.scbus.1.at="ahc1" hint.scbus.1.bus="0" hint.scbus.3.at="ahc2" hint.scbus.3.bus="0" hint.scbus.2.at="ahc2" hint.scbus.2.bus="1" hint.da.0.at="scbus0" hint.da.0.target="0" hint.da.0.unit="0" hint.da.1.at="scbus3" hint.da.1.target="1" hint.da.2.at="scbus2" hint.da.2.target="3" hint.sa.1.at="scbus1" hint.sa.1.target="6" # "units" (SCSI logical unit number) that are not specified are # treated as if specified as LUN 0. # All SCSI devices allocate as many units as are required. # The ch driver drives SCSI Media Changer ("jukebox") devices. # # The da driver drives SCSI Direct Access ("disk") and Optical Media # ("WORM") devices. 
# # The sa driver drives SCSI Sequential Access ("tape") devices. # # The cd driver drives SCSI Read Only Direct Access ("cd") devices. # # The ses driver drives SCSI Environment Services ("ses") and # SAF-TE ("SCSI Accessed Fault-Tolerant Enclosure") devices. # # The pt driver drives SCSI Processor devices. # # # Target Mode support is provided here but also requires that a SIM # (SCSI Host Adapter Driver) provide support as well. # # The targ driver provides target mode support as a Processor type device. # It exists to give the minimal context necessary to respond to Inquiry # commands. There is a sample user application that shows how the rest # of the command support might be done in /usr/share/examples/scsi_target. # # The targbh driver provides target mode support and exists to respond # to incoming commands that do not otherwise have a logical unit assigned # to them. # # The "unknown" device (uk? in pre-2.0.5) is now part of the base SCSI # configuration as the "pass" driver. device scbus #base SCSI code device ch #SCSI media changers device da #SCSI direct access devices (aka disks) device sa #SCSI tapes device cd #SCSI CD-ROMs device ses #SCSI Environmental Services (and SAF-TE) device pt #SCSI processor device targ #SCSI Target Mode Code device targbh #SCSI Target Mode Blackhole Device device pass #CAM passthrough driver # CAM OPTIONS: # debugging options: # -- NOTE -- If you specify one of the bus/target/lun options, you must # specify them all! # CAMDEBUG: When defined enables debugging macros # CAM_DEBUG_BUS: Debug the given bus. Use -1 to debug all busses. # CAM_DEBUG_TARGET: Debug the given target. Use -1 to debug all targets. # CAM_DEBUG_LUN: Debug the given lun. Use -1 to debug all luns.
# CAM_DEBUG_FLAGS: OR together CAM_DEBUG_INFO, CAM_DEBUG_TRACE, # CAM_DEBUG_SUBTRACE, and CAM_DEBUG_CDB # # CAM_MAX_HIGHPOWER: Maximum number of concurrent high power (start unit) cmds # CAM_NEW_TRAN_CODE: this is the new transport layer code that will be switched # to soon # SCSI_NO_SENSE_STRINGS: When defined disables sense descriptions # SCSI_NO_OP_STRINGS: When defined disables opcode descriptions # SCSI_DELAY: The number of MILLISECONDS to freeze the SIM (scsi adapter) # queue after a bus reset, and the number of milliseconds to # freeze the device queue after a bus device reset. options CAMDEBUG options CAM_DEBUG_BUS=-1 options CAM_DEBUG_TARGET=-1 options CAM_DEBUG_LUN=-1 options CAM_DEBUG_FLAGS="CAM_DEBUG_INFO|CAM_DEBUG_TRACE|CAM_DEBUG_CDB" options CAM_MAX_HIGHPOWER=4 options SCSI_NO_SENSE_STRINGS options SCSI_NO_OP_STRINGS options SCSI_DELAY=8000 # Be pessimistic about Joe SCSI device # Options for the CAM CDROM driver: # CHANGER_MIN_BUSY_SECONDS: Guaranteed minimum time quantum for a changer LUN # CHANGER_MAX_BUSY_SECONDS: Maximum time quantum per changer LUN, only # enforced if there is I/O waiting for another LUN # The compiled in defaults for these variables are 2 and 10 seconds, # respectively. # # These can also be changed on the fly with the following sysctl variables: # kern.cam.cd.changer.min_busy_seconds # kern.cam.cd.changer.max_busy_seconds # options CHANGER_MIN_BUSY_SECONDS=2 options CHANGER_MAX_BUSY_SECONDS=10 # Options for the CAM sequential access driver: # SA_IO_TIMEOUT: Timeout for read/write/wfm operations, in minutes # SA_SPACE_TIMEOUT: Timeout for space operations, in minutes # SA_REWIND_TIMEOUT: Timeout for rewind operations, in minutes # SA_ERASE_TIMEOUT: Timeout for erase operations, in minutes # SA_1FM_AT_EOD: Default to model which only has a default one filemark at EOT. 
options SA_IO_TIMEOUT="(4)" options SA_SPACE_TIMEOUT="(60)" options SA_REWIND_TIMEOUT="(2*60)" options SA_ERASE_TIMEOUT="(4*60)" options SA_1FM_AT_EOD # Optional timeout for the CAM processor target (pt) device # This is specified in seconds. The default is 60 seconds. options SCSI_PT_DEFAULT_TIMEOUT="60" # Optional enable of doing SES passthrough on other devices (e.g., disks) # # Normally disabled because a lot of newer SCSI disks report themselves # as having SES capabilities, but this can then clot up attempts to build # a topology with the SES device that's on the box these drives # are in.... options SES_ENABLE_PASSTHROUGH ##################################################################### # MISCELLANEOUS DEVICES AND OPTIONS # The `pty' device usually turns out to be ``effectively mandatory'', # as it is required for `telnetd', `rlogind', `screen', `emacs', and # `xterm', among others. device pty #Pseudo ttys device speaker #Play IBM BASIC-style noises out your speaker device gzip #Exec gzipped a.out's device md #Memory/malloc disk device snp #Snoop device - to look at pty/vty/etc.. device ccd 4 #Concatenated disk driver # Configuring Vinum into the kernel is not necessary, since the kld # module gets started automatically when vinum(8) starts. This # device is also untested. Use at your own risk. # # The option VINUMDEBUG must match the value set in CFLAGS # in src/sbin/vinum/Makefile. Failure to do so will result in # the following message from vinum(8): # # Can't get vinum config: Invalid argument # # see vinum(4) for more reasons not to use these options. device vinum #Vinum concat/mirror/raid driver options VINUMDEBUG #enable Vinum debugging hooks # Kernel side iconv library options LIBICONV # Size of the kernel message buffer. Should be N * pagesize.
options MSGBUF_SIZE=40960 ##################################################################### # HARDWARE BUS CONFIGURATION # ISA, EISA, MCA and PCI bus: # # Mandatory ISA devices: isa, npx # device isa # # Options for `isa': # # AUTO_EOI_1 enables the `automatic EOI' feature for the master 8259A # interrupt controller. This saves about 0.7-1.25 usec for each interrupt. # This option breaks suspend/resume on some portables. # # AUTO_EOI_2 enables the `automatic EOI' feature for the slave 8259A # interrupt controller. This saves about 0.7-1.25 usec for each interrupt. # Automatic EOI is documented not to work for the slave with the # original i8259A, but it works for some clones and some integrated # versions. # # MAXMEM specifies the amount of RAM on the machine; if this is not # specified, FreeBSD will first read the amount of memory from the CMOS # RAM, so the amount of memory will initially be limited to 64MB or 16MB # depending on the BIOS. If the BIOS reports 64MB, a memory probe will # then attempt to detect the installed amount of RAM. If this probe # fails to detect >64MB RAM you will have to use the MAXMEM option. # The amount is in kilobytes, so for a machine with 128MB of RAM, it would # be 131072 (128 * 1024). # # BROKEN_KEYBOARD_RESET disables the use of the keyboard controller to # reset the CPU for reboot. This is needed on some systems with broken # keyboard controllers. options COMPAT_OLDISA #Use ISA shims and glue for old drivers options AUTO_EOI_1 #options AUTO_EOI_2 options MAXMEM="(128*1024)" #options BROKEN_KEYBOARD_RESET # Enable support for the kernel PLL to use an external PPS signal, # under supervision of [x]ntpd(8) # More info in ntpd documentation: http://www.eecis.udel.edu/~ntp options PPS_SYNC # If you see the "calcru: negative time of %ld usec for pid %d (%s)\n" # message you probably have some broken sw/hw which disables interrupts # for too long.
You can make the system more resistant to this by # choosing a high value for NTIMECOUNTER. The default is 5, there # is no upper limit but more than a couple of hundred are not productive. # A better strategy may be to sysctl -w kern.timecounter.method=1 options NTIMECOUNTER=20 # # EISA bus # # The EISA bus device is `eisa'. It provides auto-detection and # configuration support for all devices on the EISA bus. device eisa # By default, only 10 EISA slots are probed, since the slot numbers # above clash with the configuration address space of the PCI subsystem, # and the EISA probe is not very smart about this. This is sufficient # for most machines, but in particular the HP NetServer LC series comes # with an onboard AIC7770 dual-channel SCSI controller on EISA slot #11, # thus you need to bump this figure to 12 for them. options EISA_SLOTS=12 # # MCA bus: # # The MCA bus device is `mca'. It provides auto-detection and # configuration support for all devices on the MCA bus. # No hints are required for MCA. device mca # # PCI bus & PCI options: # # The main PCI bus device is `pci'. It provides auto-detection and # configuration support for all devices on the PCI bus, using either # configuration mode defined in the PCI specification. device pci # # AGP GART support device agp # PCI options # #options PCI_QUIET #quiets PCI code on chipset settings ##################################################################### # HARDWARE DEVICE CONFIGURATION # EISA support is available for some devices, so they can be auto-probed. # MicroChannel (MCA) support is available for some devices. # For ISA the required hints are listed. # EISA, MCA, PCI and pccard are self identifying buses, so no hints # are needed. # # Mandatory devices: # # The keyboard controller; it controls the keyboard and the PS/2 mouse.
device atkbdc 1 hint.atkbdc.0.at="isa" hint.atkbdc.0.port="0x060" # The AT keyboard device atkbd hint.atkbd.0.at="atkbdc" hint.atkbd.0.irq="1" # Options for atkbd: options ATKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions ATKBD_DFLT_KEYMAP="jp.106" # These options are valid for other keyboard drivers as well. options KBD_DISABLE_KEYMAP_LOAD # refuse to load a keymap options KBD_INSTALL_CDEV # install a CDEV entry in /dev # `flags' for atkbd: # 0x01 Force detection of keyboard, else we always assume a keyboard # 0x02 Don't reset keyboard, useful for some newer ThinkPads # 0x04 Old-style (XT) keyboard support, useful for older ThinkPads # PS/2 mouse device psm hint.psm.0.at="atkbdc" hint.psm.0.irq="12" # Options for psm: options PSM_HOOKRESUME #hook the system resume event, useful #for some laptops options PSM_RESETAFTERSUSPEND #reset the device at the resume event # The video card driver. device vga hint.vga.0.at="isa" # Options for vga: # Try the following option if the mouse pointer is not drawn correctly # or font does not seem to be loaded properly. May cause flicker on # some systems. options VGA_ALT_SEQACCESS # If you can dispense with some vga driver features, you may want to # use the following options to save some memory. #options VGA_NO_FONT_LOADING # don't save/load font #options VGA_NO_MODE_CHANGE # don't change video modes # Older video cards may require this option for proper operation. options VGA_SLOW_IOACCESS # do byte-wide i/o's to TS and GDC regs # The following option probably won't work with the LCD displays. options VGA_WIDTH90 # support 90 column modes # To include support for VESA video modes options VESA options FB_DEBUG # Frame buffer debugging options FB_INSTALL_CDEV # install a CDEV entry in /dev # Splash screen at start up! Screen savers require this too. device splash # Various screen savers. 
device apm_saver # Requires APM device blank_saver device daemon_saver device fade_saver device fire_saver device green_saver device logo_saver device rain_saver device star_saver device warp_saver # The pcvt console driver (vt220 compatible). device vt hint.vt.0.at="isa" options XSERVER # support for running an X server on vt options FAT_CURSOR # start with block cursor # This PCVT option is for keyboards such as those used on really old ThinkPads options PCVT_SCANSET=2 # Other PCVT options are documented in pcvt(4). options PCVT_24LINESDEF options PCVT_CTRL_ALT_DEL options PCVT_META_ESC options PCVT_NSCREENS=9 options PCVT_PRETTYSCRNS options PCVT_SCREENSAVER options PCVT_USEKBDSEC options PCVT_VT220KEYB options PCVT_GREENSAVER # The syscons console driver (sco color console compatible). device sc 1 hint.sc.0.at="isa" options MAXCONS=16 # number of virtual consoles options SC_ALT_MOUSE_IMAGE # simplified mouse cursor in text mode options SC_DFLT_FONT # compile font in makeoptions SC_DFLT_FONT=cp850 options SC_DISABLE_DDBKEY # disable `debug' key options SC_DISABLE_REBOOT # disable reboot key sequence options SC_HISTORY_SIZE=200 # number of history buffer lines options SC_MOUSE_CHAR=0x3 # char code for text mode mouse cursor options SC_PIXEL_MODE # add support for the raster text mode # The following options will let you change the default colors of syscons. options SC_NORM_ATTR="(FG_GREEN|BG_BLACK)" options SC_NORM_REV_ATTR="(FG_YELLOW|BG_GREEN)" options SC_KERNEL_CONS_ATTR="(FG_RED|BG_BLACK)" options SC_KERNEL_CONS_REV_ATTR="(FG_BLACK|BG_RED)" # If you have a two button mouse, you may want to add the following option # to use the right button of the mouse to paste text. options SC_TWOBUTTON_MOUSE # You can selectively disable features in syscons. 
options SC_NO_CUTPASTE options SC_NO_FONT_LOADING options SC_NO_HISTORY options SC_NO_SYSMOUSE # `flags' for sc # 0x80 Put the video card in the VESA 800x600 dots, 16 color mode # 0x100 Probe for a keyboard device periodically if one is not present # 3Dfx Voodoo Graphics, Voodoo II /dev/3dfx CDEV support. This will create # the /dev/3dfx0 device to work with glide implementations. This should get # linked to /dev/3dfx and /dev/voodoo. Note that this is not the same as # the tdfx DRI module from XFree86 and is completely unrelated. # # To enable Linuxulator support, one must also include COMPAT_LINUX in the # config as well, or you will not have the dependencies. The other option # is to load both as modules. device tdfx # Enable 3Dfx Voodoo support options TDFX_LINUX # Enable Linuxulator support # # The Numeric Processing eXtension driver. In addition to this, you # may configure a math emulator (see above). If your machine has a # hardware FPU and the kernel configuration includes the npx device # *and* a math emulator compiled into the kernel, the hardware FPU # will be used, unless it is found to be broken or unless "flags" to # npx0 includes "0x08", which requests preference for the emulator. device npx hint.npx.0.at="nexus" hint.npx.0.port="0x0F0" hint.npx.0.flags="0x0" hint.npx.0.irq="13" # # `flags' for npx0: # 0x01 don't use the npx registers to optimize bcopy. # 0x02 don't use the npx registers to optimize bzero. # 0x04 don't use the npx registers to optimize copyin or copyout. # 0x08 use emulator even if hardware FPU is available. # The npx registers are normally used to optimize copying and zeroing when # all of the following conditions are satisfied: # I586_CPU is an option # the cpu is an i586 (perhaps not a Pentium) # the probe for npx0 succeeds # INT 16 exception handling works. # Then copying and zeroing using the npx registers is normally 30-100% faster. # The flags can be used to control cases where it doesn't work or is slower. 
# Setting them at boot time using userconfig works right (the optimizations # are not used until later in the bootstrap when npx0 is attached). # Flag 0x08 automatically disables the i586 optimized routines. # # # ACPI support using the Intel ACPI Component Architecture reference # implementation. # # ACPI_DEBUG enables the use of the debug.acpi.level and debug.acpi.layer # kernel environment variables to select initial debugging levels for the # Intel ACPICA code. (Note that the Intel code must also have USE_DEBUGGER # defined when it is built). # device acpica options ACPI_DEBUG # # Optional devices: # # # SCSI host adapters: # # adv: All Narrow SCSI bus AdvanSys controllers. # adw: Second Generation AdvanSys controllers including the ADV940UW. # aha: Adaptec 154x/1535/1640 # ahb: Adaptec 174x EISA controllers # ahc: Adaptec 274x/284x/2910/293x/294x/394x/3950x/3960x/398X/4944/ # 19160x/29160x, aic7770/aic78xx # aic: Adaptec 6260/6360, APA-1460 (PC Card), NEC PC9801-100 (C-BUS) # amd: Support for the AMD 53C974 SCSI host adapter chip as found on devices # such as the Tekram DC-390(T). # bt: Most Buslogic controllers: including BT-445, BT-54x, BT-64x, BT-74x, # BT-75x, BT-946, BT-948, BT-956, BT-958, SDC3211B, SDC3211F, SDC3222F # isp: Qlogic ISP 1020, 1040 and 1040B PCI SCSI host adapters, # ISP 1240 Dual Ultra SCSI, ISP 1080 and 1280 (Dual) Ultra2, # ISP 12160 Ultra3 SCSI, # Qlogic ISP 2100 and ISP 2200 Fibre Channel host adapters. # ispfw: Firmware module for Qlogic host adapters # ncr: NCR 53C810, 53C825 self-contained SCSI host adapters. # ncv: NCR 53C500 based SCSI host adapters. # nsp: Workbit Ninja SCSI-3 based PC Card SCSI host adapters. # sym: Symbios/Logic 53C8XX family of PCI-SCSI I/O processors: # 53C810, 53C810A, 53C815, 53C825, 53C825A, 53C860, 53C875, # 53C876, 53C885, 53C895, 53C895A, 53C896, 53C897, 53C1510D, # 53C1010-33, 53C1010-66. # stg: TMC 18C30, 18C50 based SCSI host adapters. 
# wds: WD7000 # # Note that the order is important in order for Buslogic ISA/EISA cards to be # probed correctly. # device bt hint.bt.0.at="isa" hint.bt.0.port="0x330" device adv hint.adv.0.at="isa" device adw device aha hint.aha.0.at="isa" device aic hint.aic.0.at="isa" device ahb device ahc device amd device isp hint.isp.0.disable="1" hint.isp.0.role="3" hint.isp.0.prefer_iomap="1" hint.isp.0.prefer_memmap="1" hint.isp.0.fwload_disable="1" hint.isp.0.ignore_nvram="1" hint.isp.0.fullduplex="1" hint.isp.0.topology="lport" hint.isp.0.topology="nport" hint.isp.0.topology="lport-only" hint.isp.0.topology="nport-only" # we can't get u_int64_t types, nor can we get strings if it's got # a leading 0x, hence this silly dodge. hint.isp.0.portwnn="w50000000aaaa0000" hint.isp.0.nodewnn="w50000000aaaa0001" device ispfw device ncr device ncv device nsp device sym device stg hint.stg.0.at="isa" hint.stg.0.port="0x140" hint.stg.0.irq="11" device wds hint.wds.0.at="isa" hint.wds.0.port="0x350" hint.wds.0.irq="11" hint.wds.0.drq="6" # The aic7xxx driver will attempt to use memory mapped I/O for all PCI # controllers that have it configured only if this option is set. Unfortunately, # this doesn't work on some motherboards, which prevents it from being the # default. options AHC_ALLOW_MEMIO # Enable diagnostic sequencer code. options AHC_DEBUG_SEQUENCER # Dump the contents of the ahc controller configuration PROM. options AHC_DUMP_EEPROM # Bitmap of units to enable targetmode operations. options AHC_TMODE_ENABLE # The adw driver will attempt to use memory mapped I/O for all PCI # controllers that have it configured only if this option is set. options ADW_ALLOW_MEMIO # Options used in dev/isp/ (Qlogic SCSI/FC driver). # # ISP_TARGET_MODE - enable target mode operation # #options ISP_TARGET_MODE=1 # Options used in dev/sym/ (Symbios SCSI driver).
#options SYM_SETUP_LP_PROBE_MAP #-Low Priority Probe Map (bits) # Allows the ncr to take precedence # 1 (1<<0) -> 810a, 860 # 2 (1<<1) -> 825a, 875, 885, 895 # 4 (1<<2) -> 895a, 896, 1510d #options SYM_SETUP_SCSI_DIFF #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 #options SYM_SETUP_PCI_PARITY #-PCI parity checking # disabled:0, enabled:1 (default) #options SYM_SETUP_MAX_LUN #-Number of LUNs supported # default:8, range:[1..64] # The 'asr' driver provides support for current DPT/Adaptec SCSI RAID # controllers (SmartRAID V and VI and later). # These controllers require the CAM infrastructure. # device asr # The 'dpt' driver provides support for old DPT controllers (http://www.dpt.com/). # These have hardware RAID-{0,1,5} support, and do multi-initiator I/O. # The DPT controllers are commonly re-licensed under other brand-names - # some controllers by Olivetti, Dec, HP, AT&T, SNI, AST, Alphatronic, NEC and # Compaq are actually DPT controllers. # # See src/sys/dev/dpt for debugging and other subtle options. # DPT_MEASURE_PERFORMANCE Enables a set of (semi)invasive metrics. Various # instruments are enabled. The tools in # /usr/sbin/dpt_* assume these to be enabled. # DPT_HANDLE_TIMEOUTS Normally device timeouts are handled by the DPT. # If you want the driver to handle timeouts, enable # this option. If your system is very busy, this # option will create more trouble than it solves. # DPT_TIMEOUT_FACTOR Used to compute the excessive amount of time to # wait when timing out with the above option. # DPT_DEBUG_xxxx These are controllable from sys/dev/dpt/dpt.h # DPT_LOST_IRQ When enabled, will try, once per second, to catch # any interrupt that got lost. Seems to help in some # DPT-firmware/Motherboard combinations. Minimal # cost, great benefit. # DPT_RESET_HBA Make "reset" actually reset the controller # instead of fudging it. Only enable this if you # are 100% certain you need it.
device dpt # DPT options #!CAM# options DPT_MEASURE_PERFORMANCE #!CAM# options DPT_HANDLE_TIMEOUTS options DPT_TIMEOUT_FACTOR=4 options DPT_LOST_IRQ options DPT_RESET_HBA options DPT_ALLOW_MEMIO # # Mylex AcceleRAID and eXtremeRAID controllers with v6 and later # firmware. These controllers have a SCSI-like interface, and require # the CAM infrastructure. # device mly # # Adaptec FSA RAID controllers, including integrated DELL controllers, # the Dell PERC 2/QC and the HP NetRAID-4M # # AAC_COMPAT_LINUX Include code to support Linux-binary management # utilities (requires Linux compatibility # support). # device aac # # Compaq Smart RAID, Mylex DAC960 and AMI MegaRAID controllers. Only # one entry is needed; the code will find and configure all supported # controllers. # device ida # Compaq Smart RAID device mlx # Mylex DAC960 device amr # AMI MegaRAID # # 3ware ATA RAID # device twe # 3ware ATA RAID # # The 'ATA' driver supports all ATA and ATAPI devices, including PC Card # devices. You only need one "device ata" for it to find all # PCI and PC Card ATA/ATAPI devices on modern machines. device ata device atadisk # ATA disk drives device atapicd # ATAPI CDROM drives device atapifd # ATAPI floppy drives device atapist # ATAPI tape drives # # For older non-PCI, non-PnPBIOS systems, these are the hints lines to add: hint.ata.0.at="isa" hint.ata.0.port="0x1f0" hint.ata.0.irq="14" hint.ata.1.at="isa" hint.ata.1.port="0x170" hint.ata.1.irq="15" # # The following options are valid on the ATA driver: # # ATA_STATIC_ID: controller numbering is static ie depends on location # else the device numbers are dynamically allocated. options ATA_STATIC_ID # # Standard floppy disk controllers and floppy tapes, supports # the Y-E DATA External FDD (PC Card) # device fdc hint.fdc.0.at="isa" hint.fdc.0.port="0x3F0" hint.fdc.0.irq="6" hint.fdc.0.drq="2" # # FDC_DEBUG enables floppy debugging. 
Since the debug output is huge, you # gotta turn it actually on by setting the variable fd_debug with DDB, # however. options FDC_DEBUG # # Activate this line if you happen to have an Insight floppy tape. # Probing them proved to be dangerous for people with floppy disks only, # so it's "hidden" behind a flag: #hint.fdc.0.flags="1" # Specify floppy devices hint.fd.0.at="fdc0" hint.fd.0.drive="0" hint.fd.1.at="fdc0" hint.fd.1.drive="1" # M-systems DiskOnchip products see src/sys/contrib/dev/fla/README device fla hint.fla.0.at="isa" # # Other standard PC hardware: # # mse: Logitech and ATI InPort bus mouse ports # sio: serial ports (see sio(4)), including support for various # PC Card devices, such as Modem and NICs (see etc/defaults/pccard.conf) device mse hint.mse.0.at="isa" hint.mse.0.port="0x23c" hint.mse.0.irq="5" device sio hint.sio.0.at="isa" hint.sio.0.port="0x3F8" hint.sio.0.flags="0x10" hint.sio.0.irq="4" # # `flags' for serial drivers that support consoles (only for sio now): # 0x10 enable console support for this unit. The other console flags # are ignored unless this is set. Enabling console support does # not make the unit the preferred console - boot with -h or set # the 0x20 flag for that. Currently, at most one unit can have # console support; the first one (in config file order) with # this flag set is preferred. Setting this flag for sio0 gives # the old behaviour. # 0x20 force this unit to be the console (unless there is another # higher priority console). This replaces the COMCONSOLE option. # 0x40 reserve this unit for low level console operations. Do not # access the device in any normal way. # 0x80 use this port for serial line gdb support in ddb. # # PnP `flags' (set via userconfig using pnp x flags y) # 0x1 disable probing of this device. Used to prevent your modem # from being attached as a PnP modem. 
# # Options for serial drivers that support consoles (only for sio now): options BREAK_TO_DEBUGGER #a BREAK on a comconsole goes to #DDB, if available. options CONSPEED=115200 # speed for serial console # (default 9600) # Solaris implements a new BREAK which is initiated by a character # sequence CR ~ ^b which is similar to a familiar pattern used on # Sun servers by the Remote Console. options ALT_BREAK_TO_DEBUGGER # Options for sio: options COM_ESP #code for Hayes ESP options COM_MULTIPORT #code for some cards with shared IRQs # Other flags for sio that aren't documented in the man page. # 0x20000 enable hardware RTS/CTS and larger FIFOs. Only works for # ST16650A-compatible UARTs. # # Network interfaces: # # MII bus support is required for some PCI 10/100 ethernet NICs, # namely those which use MII-compliant transceivers or implement # transceiver control interfaces that operate like an MII. Adding # "device miibus0" to the kernel config pulls in support for # the generic miibus API and all of the PHY drivers, including a # generic one for PHYs that aren't specifically handled by an # individual driver. device miibus # an: Aironet 4500/4800 802.11 wireless adapters. Supports the PCMCIA, # PCI and ISA varieties. # ar: Arnet SYNC/570i hdlc sync 2/4 port V.35/X.21 serial driver # (requires sppp) # awi: Support for IEEE 802.11 PC Card devices using the AMD Am79C930 and # Harris (Intersil) Chipset with PCnetMobile firmware by AMD. # cnw: Xircom CNW/Netware Airsurfer PC Card adapter # cs: IBM Etherjet and other Crystal Semi CS89x0-based adapters # cx: Cronyx/Sigma multiport sync/async (with Cisco or PPP framing) # dc: Support for PCI fast ethernet adapters based on the DEC/Intel 21143 # and various workalikes including: # the ADMtek AL981 Comet and AN985 Centaur, the ASIX Electronics # AX88140A and AX88141, the Davicom DM9100 and DM9102, the Lite-On # 82c168 and 82c169 PNIC, the Lite-On/Macronix LC82C115 PNIC II # and the Macronix 98713/98713A/98715/98715A/98725 PMAC.
This driver # replaces the old al, ax, dm, pn and mx drivers. List of brands: # Digital DE500-BA, Kingston KNE100TX, D-Link DFE-570TX, SOHOware SFA110, # SVEC PN102-TX, CNet Pro110B, 120A, and 120B, Compex RL100-TX, # LinkSys LNE100TX, LNE100TX V2.0, Jaton XpressNet, Alfa Inc GFC2204, # KNE110TX. # de: Digital Equipment DC21040 # ed: Western Digital and SMC 80xx; Novell NE1000 and NE2000; 3Com 3C503 # HP PC Lan+, various PC Card devices (refer to etc/defaults/pccard.conf) # el: 3Com 3C501 (slow!) # ep: 3Com 3C509, 3C529, 3C556, 3C562D, 3C563D, 3C572, 3C574X, 3C579, 3C589 # and PC Card devices using these chipsets. # ex: Intel EtherExpress Pro/10 and other i82595-based adapters, # Olicom Ethernet PC Card devices. # fe: Fujitsu MB86960A/MB86965A Ethernet # fea: DEC DEFEA EISA FDDI adapter # fpa: Support for the Digital DEFPA PCI FDDI. `device fddi' is also needed. # fxp: Intel EtherExpress Pro/100B # (hint of prefer_iomap can be done to prefer I/O instead of Mem mapping) # ie: AT&T StarLAN 10 and EN100; 3Com 3C507; unknown NI5210; # Intel EtherExpress # le: Digital Equipment EtherWorks 2 and EtherWorks 3 (DEPCA, DE100, # DE101, DE200, DE201, DE202, DE203, DE204, DE205, DE422) # lnc: Lance/PCnet cards (Isolan, Novell NE2100, NE32-VL, AMD Am7990 and # Am79C960) # lge: Support for PCI gigabit ethernet adapters based on the Level 1 # LXT1001 NetCellerator chipset. This includes the D-Link DGE-500SX, # SMC TigerCard 1000 (SMC9462SX), and some Addtron cards. # nge: Support for PCI gigabit ethernet adapters based on the National # Semiconductor DP83820 and DP83821 chipset. This includes the # SMC EZ Card 1000 (SMC9462TX), D-Link DGE-500T, Asante FriendlyNet # GigaNIX 1000TA and 1000TPC, the Addtron AEG320T, the LinkSys # EG1032 and EG1064, the Surecom EP-320G-TX and the Netgear GA622T. # oltr: Olicom ISA token-ring adapters OC-3115, OC-3117, OC-3118 and OC-3133 # (no hints needed).
# Olicom PCI token-ring adapters OC-3136, OC-3137, OC-3139, OC-3140, # OC-3141, OC-3540, OC-3250 # rdp: RealTek RTL 8002-based pocket ethernet adapters # pcn: Support for PCI fast ethernet adapters based on the AMD Am79c97x # chipsets, including the PCnet/FAST, PCnet/FAST+, PCnet/PRO and # PCnet/Home. These were previously handled by the lnc driver (and # still will be if you leave this driver out of the kernel). # rl: Support for PCI fast ethernet adapters based on the RealTek 8129/8139 # chipset. Note that the RealTek driver defaults to using programmed # I/O to do register accesses because memory mapped mode seems to cause # severe lockups on SMP hardware. This driver also supports the # Accton EN1207D `Cheetah' adapter, which uses a chip called # the MPX 5030/5038, which is either a RealTek in disguise or a # RealTek workalike. Note that the D-Link DFE-530TX+ uses the RealTek # chipset and is supported by this driver, not the 'vr' driver. # sf: Support for Adaptec Duralink PCI fast ethernet adapters based on the # Adaptec AIC-6915 "starfire" controller. # This includes dual and quad port cards, as well as one 100baseFX card. # Most of these are 64-bit PCI devices, except for one single port # card which is 32-bit. # sis: Support for NICs based on the Silicon Integrated Systems SiS 900, # SiS 7016 and NS DP83815 PCI fast ethernet controller chips. # sk: Support for the SysKonnect SK-984x series PCI gigabit ethernet NICs. # This includes the SK-9841 and SK-9842 single port cards (single mode # and multimode fiber) and the SK-9843 and SK-9844 dual port cards # (also single mode and multimode). # The driver will autodetect the number of ports on the card and # attach each one as a separate network interface. # sn: Support for ISA and PC Card Ethernet devices using the # SMC91C90/92/94/95 chips. # sr: RISCom/N2 hdlc sync 1/2 port V.35/X.21 serial driver (requires sppp) # ste: Sundance Technologies ST201 PCI fast ethernet controller, includes # the D-Link DFE-550TX. 
# ti: Support for PCI gigabit ethernet NICs based on the Alteon Networks # Tigon 1 and Tigon 2 chipsets. This includes the Alteon AceNIC, the # 3Com 3c985, the Netgear GA620 and various others. Note that you will # probably want to bump up NMBCLUSTERS a lot to use this driver. # tl: Support for the Texas Instruments TNETE100 series 'ThunderLAN' # cards and integrated ethernet controllers. This includes several # Compaq Netelligent 10/100 cards and the built-in ethernet controllers # in several Compaq Prosignia, Proliant and Deskpro systems. It also # supports several Olicom 10Mbps and 10/100 boards. # tx: SMC 9432 TX, BTX and FTX cards. (SMC EtherPower II series) # txp: Support for 3Com 3cR990 cards with the "Typhoon" chipset # vr: Support for various fast ethernet adapters based on the VIA # Technologies VT3043 `Rhine I' and VT86C100A `Rhine II' chips, # including the D-Link DFE530TX (see 'rl' for DFE530TX+), the Hawking # Technologies PN102TX, and the AOpen/Acer ALN-320. # vx: 3Com 3C590 and 3C595 # wb: Support for fast ethernet adapters based on the Winbond W89C840F chip. # Note: this is not the same as the Winbond W89C940F, which is a # NE2000 clone. # wl: Lucent Wavelan (ISA card only). # wi: Lucent WaveLAN/IEEE 802.11 PCMCIA adapters. Note: this supports both # the PCMCIA and ISA cards: the ISA card is really a PCMCIA to ISA # bridge with a PCMCIA adapter plugged into it. # wx: Intel Gigabit Ethernet PCI card (`Wiseman') # xe: Xircom/Intel EtherExpress Pro100/16 PC Card ethernet controller, # Accton Fast EtherCard-16, Compaq Netelligent 10/100 PC Card, # Toshiba 10/100 Ethernet PC Card, Xircom 16-bit Ethernet + Modem 56 # xl: Support for the 3Com 3c900, 3c905, 3c905B and 3c905C (Fast) # Etherlink XL cards and integrated controllers. This includes the # integrated 3c905B-TX chips in certain Dell Optiplex and Dell # Precision desktop machines and the integrated 3c905-TX chips # in Dell Latitude laptop docking stations.
# Also supported: 3Com 3c980(C)-TX, 3Com 3cSOHO100-TX, 3Com 3c450-TX # Order for ISA/EISA devices is important here device ar 1 hint.ar.0.at="isa" hint.ar.0.port="0x300" hint.ar.0.irq="10" hint.ar.0.maddr="0xd0000" device cs hint.cs.0.at="isa" hint.cs.0.port="0x300" device cx 1 hint.cx.0.at="isa" hint.cx.0.port="0x240" hint.cx.0.irq="15" hint.cx.0.drq="7" device ed hint.ed.0.at="isa" hint.ed.0.port="0x280" hint.ed.0.irq="5" hint.ed.0.maddr="0xd8000" device el 1 hint.el.0.at="isa" hint.el.0.port="0x300" hint.el.0.irq="9" device ep device ex device fe 1 options FE_8BIT_SUPPORT # LAC-98 support hint.fe.0.at="isa" hint.fe.0.port="0x300" device fea device ie 2 hint.ie.0.at="isa" hint.ie.0.port="0x300" hint.ie.0.irq="5" hint.ie.0.maddr="0xd0000" hint.ie.1.at="isa" hint.ie.1.port="0x360" hint.ie.1.irq="7" hint.ie.1.maddr="0xd0000" device le 1 hint.le.0.at="isa" hint.le.0.port="0x300" hint.le.0.irq="5" hint.le.0.maddr="0xd0000" device lnc 1 hint.lnc.0.at="isa" hint.lnc.0.port="0x280" hint.lnc.0.irq="10" hint.lnc.0.drq="0" device rdp 1 hint.rdp.0.at="isa" hint.rdp.0.port="0x378" hint.rdp.0.irq="7" hint.rdp.0.flags="2" device sr 1 hint.sr.0.at="isa" hint.sr.0.port="0x300" hint.sr.0.irq="5" hint.sr.0.maddr="0xd0000" device sn hint.sn.0.at="isa" hint.sn.0.port="0x300" hint.sn.0.irq="10" device an device awi device cnw device wi options WLCACHE # enables the signal-strength cache options WLDEBUG # enables verbose debugging output device wl 1 hint.wl.0.at="isa" hint.wl.0.port="0x300" device xe device oltr options OLTR_NO_BULLSEYE_MAC options OLTR_NO_HAWKEYE_MAC options OLTR_NO_TMS_MAC hint.oltr.0.at="isa" # PCI Ethernet NICs that use the common MII bus controller code. 
device dc # DEC/Intel 21143 and various workalikes device fxp # Intel EtherExpress PRO/100B (82557, 82558) hint.fxp.0.prefer_iomap="0" device rl # RealTek 8129/8139 device pcn # AMD Am79C97x PCI 10/100 NICs device sf # Adaptec AIC-6915 (``Starfire'') device sis # Silicon Integrated Systems SiS 900/SiS 7016 device ste # Sundance ST201 (D-Link DFE-550TX) device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') device vr # VIA Rhine, Rhine II device wb # Winbond W89C840F device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # PCI Ethernet NICs. device de # DEC/Intel DC21x4x (``Tulip'') device vx # 3Com 3c590, 3c595 (``Vortex'') # PCI Gigabit & FDDI NICs. device lge device nge device sk device ti device wx device fpa 1 # # ATM related options (Cranor version) # (note: this driver cannot be used with the HARP ATM stack) # # The `en' device provides support for Efficient Networks (ENI) # ENI-155 PCI midway cards, and the Adaptec 155Mbps PCI ATM cards (ANA-59x0). # # atm device provides generic atm functions and is required for # atm devices. # NATM enables the netnatm protocol family that can be used to # bypass TCP/IP. # # the current driver supports only PVC operations (no atm-arp, no multicast). # for more details, please read the original documents at # http://www.ccrc.wustl.edu/pub/chuck/tech/bsdatm/bsdatm.html # device atm device en options NATM #native ATM # # Audio drivers: `pcm', `sbc', `gusc', `pca' # # pcm: PCM audio through various sound cards. # # This has support for a large number of new audio cards, based on # CS423x, OPTi931, Yamaha OPL-SAx, and also for SB16, GusPnP. # For more information about this driver and supported cards, # see the pcm.4 man page. # # The flags of the device tells the device a bit more info about the # device that normally is obtained through the PnP interface. 
# bit 2..0 secondary DMA channel; # bit 4 set if the board uses two dma channels; # bit 15..8 board type, overrides autodetection; leave it # zero if you don't know what to put in (and you don't, # since this is unsupported at the moment...). # # This driver will use the new PnP code if it's available. # # pca: PCM audio through your PC speaker # # Supported cards include: # Creative SoundBlaster ISA PnP/non-PnP # Supports ESS and Avance ISA chips as well. # Gravis UltraSound ISA PnP/non-PnP # Crystal Semiconductor CS461x/428x PCI # Neomagic 256AV (ac97) # Most of the more common ISA/PnP sb/mss/ess compatible cards. device pcm # For non-pnp sound cards with no bridge drivers only: hint.pcm.0.at="isa" hint.pcm.0.irq="10" hint.pcm.0.drq="1" hint.pcm.0.flags="0x0" # For PnP/PCI sound cards, no hints are required. # # midi: MIDI interfaces and synthesizers # device midi # For non-pnp sound cards with no bridge drivers: hint.midi.0.at="isa" hint.midi.0.irq="5" hint.midi.0.flags="0x0" # For serial ports (this example configures port 2): # TODO: implement generic tty-midi interface so that we can use # other uarts. hint.midi.0.at="isa" hint.midi.0.port="0x2F8" hint.midi.0.irq="3" # # seq: MIDI sequencer # device seq # The bridge drivers for sound cards. These can be separately configured # for providing services to the likes of new-midi. # When used with 'device pcm' they also provide pcm sound services. # # sbc: Creative SoundBlaster ISA PnP/non-PnP # Supports ESS and Avance ISA chips as well.
# gusc: Gravis UltraSound ISA PnP/non-PnP # csa: Crystal Semiconductor CS461x/428x PCI # For non-PnP cards: device sbc hint.sbc.0.at="isa" hint.sbc.0.port="0x220" hint.sbc.0.irq="5" hint.sbc.0.drq="1" hint.sbc.0.flags="0x15" device gusc hint.gusc.0.at="isa" hint.gusc.0.port="0x220" hint.gusc.0.irq="5" hint.gusc.0.drq="1" hint.gusc.0.flags="0x13" device pca hint.pca.0.at="isa" hint.pca.0.port="0x040" # # Miscellaneous hardware: # # mcd: Mitsumi CD-ROM using proprietary (non-ATAPI) interface # scd: Sony CD-ROM using proprietary (non-ATAPI) interface # matcd: Matsushita/Panasonic CD-ROM using proprietary (non-ATAPI) interface # wt: Wangtek and Archive QIC-02/QIC-36 tape drives # ctx: Cortex-I frame grabber # apm: Laptop Advanced Power Management (experimental) # pmtimer: Timer device driver for power management events (APM or ACPI) # spigot: The Creative Labs Video Spigot video-acquisition board # meteor: Matrox Meteor video capture board # bktr: Brooktree bt848/848a/849a/878/879 video capture and TV Tuner board # cy: Cyclades serial driver # dgb: Digiboard PC/Xi and PC/Xe series driver (ALPHA QUALITY!) # digi: Digiboard driver # gp: National Instruments AT-GPIB and AT-GPIB/TNT board, PCMCIA-GPIB # asc: GI1904-based hand scanners, e.g. the Trust Amiscan Grey # gsc: Genius GS-4500 hand scanner. # joy: joystick (including IO DATA PCJOY PC Card joystick) # The LOUTB option specifies a slower outb() for debugging purposes. # rc: RISCom/8 multiport card # rp: Comtrol Rocketport(ISA) - single card # tw: TW-523 power line interface for use with X-10 home control products # si: Specialix SI/XIO 4-32 port terminal multiplexor # spic: Sony Programmable I/O controller (VAIO notebooks) # stl: Stallion EasyIO and EasyConnection 8/32 (cd1400 based) # stli: Stallion EasyConnection 8/64, ONboard, Brumby (intelligent) # Notes on APM # The flags takes the following meaning for apm0: # 0x0020 Statclock is broken. 
# If apm is omitted, some systems require sysctl -w kern.timecounter.method=1 # for correct timekeeping. # Notes on the spigot: # The video spigot is at 0xad6. This port address can not be changed. # The irq values may only be 10, 11, or 15 # I/O memory is an 8kb region. Possible values are: # 0a0000, 0a2000, ..., 0fffff, f00000, f02000, ..., ffffff # The start address must be on an even boundary. # Add the following option if you want to allow non-root users to be able # to access the spigot. This option is not secure because it allows users # direct access to the I/O page. # options SPIGOT_UNSECURE # Notes on the Comtrol Rocketport driver: # # The exact values used for rp0 depend on how many boards you have # in the system. The manufacturer's sample configs are listed as: # # device rp # core driver support # # Comtrol Rocketport ISA single card # hint.rp.0.at="isa" # hint.rp.0.port="0x280" # # If instead you have two ISA cards, one installed at 0x100 and the # second installed at 0x180, then you should add the following to # your kernel probe hints: # hint.rp.0.at="isa" # hint.rp.0.port="0x100" # hint.rp.1.at="isa" # hint.rp.1.port="0x180" # # For 4 ISA cards, it might be something like this: # hint.rp.0.at="isa" # hint.rp.0.port="0x180" # hint.rp.1.at="isa" # hint.rp.1.port="0x100" # hint.rp.2.at="isa" # hint.rp.2.port="0x340" # hint.rp.3.at="isa" # hint.rp.3.port="0x240" # # And for PCI cards, you need no hints. # Notes on the Digiboard driver: # # The following flag values have special meanings in dgb: # 0x01 - alternate layout of pins # 0x02 - use the windowed PC/Xe in 64K mode # Notes on the Specialix SI/XIO driver: # The host card is memory, not IO mapped. # The Rev 1 host cards use a 64K chunk, on a 32K boundary. # The Rev 2 host cards use a 32K chunk, on a 32K boundary. # The cards can use an IRQ of 11, 12 or 15.
# Notes on the Sony Programmable I/O controller # This is a temporary driver that should someday be replaced by something # that hooks into the ACPI layer. The device is hooked to the PIIX4's # General Device 10 decoder, which means you have to fiddle with PCI # registers to map it in, even though it is otherwise treated here as # an ISA device. At the moment, the driver polls, although the device # is capable of generating interrupts. It is largely undocumented. # The port location in the hint is where you WANT the device to be # mapped. 0x10a0 seems to be traditional. At the moment the jogdial # is the only thing truly supported, but apparently a fair percentage # of the Vaio extra features are controlled by this device. # Notes on the Stallion stl and stli drivers: # See src/i386/isa/README.stl for complete instructions. # This is version 0.0.5alpha, unsupported by Stallion. # The stl driver has a secondary IO port hard coded at 0x280. You need # to change src/i386/isa/stallion.c if you reconfigure this on the boards.
# The "flags" and "msize" settings on the stli driver depend on the board: # EasyConnection 8/64 ISA: flags 23 msize 0x1000 # EasyConnection 8/64 EISA: flags 24 msize 0x10000 # EasyConnection 8/64 MCA: flags 25 msize 0x1000 # ONboard ISA: flags 4 msize 0x10000 # ONboard EISA: flags 7 msize 0x10000 # ONboard MCA: flags 3 msize 0x10000 # Brumby: flags 2 msize 0x4000 # Stallion: flags 1 msize 0x10000 device mcd 1 hint.mcd.0.at="isa" hint.mcd.0.port="0x300" hint.mcd.0.irq="10" # for the Sony CDU31/33A CDROM device scd 1 hint.scd.0.at="isa" hint.scd.0.port="0x230" # for the SoundBlaster 16 multicd - up to 4 devices device matcd 1 hint.matcd.0.at="isa" hint.matcd.0.port="0x230" device wt 1 hint.wt.0.at="isa" hint.wt.0.port="0x300" hint.wt.0.irq="5" hint.wt.0.drq="1" device ctx 1 hint.ctx.0.at="isa" hint.ctx.0.port="0x230" hint.ctx.0.maddr="0xd0000" device spigot 1 hint.spigot.0.at="isa" hint.spigot.0.port="0xad6" hint.spigot.0.irq="15" hint.spigot.0.maddr="0xee000" device apm hint.apm.0.flags="0x20" device pmtimer # Adjust system timer at wakeup time hint.pmtimer.0.at="isa" device gp hint.gp.0.at="isa" hint.gp.0.port="0x2c0" device gsc 1 hint.gsc.0.at="isa" hint.gsc.0.port="0x270" hint.gsc.0.drq="3" device joy # PnP aware, hints for nonpnp only hint.joy.0.at="isa" hint.joy.0.port="0x201" device cy 1 options CY_PCI_FASTINTR # Use with cy_pci unless irq is shared hint.cy.0.at="isa" hint.cy.0.irq="10" hint.cy.0.maddr="0xd4000" hint.cy.0.msize="0x2000" device dgb 1 options NDGBPORTS=16 # Defaults to 16*NDGB hint.dgb.0.at="isa" hint.dgb.0.port="0x220" hint.dgb.0.maddr="0xfc000" device digi hint.digi.0.at="isa" hint.digi.0.port="0x104" hint.digi.0.maddr="0xd0000" # BIOS & FEP/OS components of device digi. 
Normally left as modules device digi_CX device digi_CX_PCI device digi_EPCX device digi_EPCX_PCI device digi_Xe device digi_Xem device digi_Xr device rc 1 hint.rc.0.at="isa" hint.rc.0.port="0x220" hint.rc.0.irq="12" device rp hint.rp.0.at="isa" hint.rp.0.port="0x280" # the port and irq for tw0 are fictitious device tw 1 hint.tw.0.at="isa" hint.tw.0.port="0x380" hint.tw.0.irq="11" device si options SI_DEBUG hint.si.0.at="isa" hint.si.0.maddr="0xd0000" hint.si.0.irq="12" device asc 1 hint.asc.0.at="isa" hint.asc.0.port="0x3EB" hint.asc.0.drq="3" hint.asc.0.irq="10" device spic hint.spic.0.at="isa" hint.spic.0.port="0x10a0" device stl hint.stl.0.at="isa" hint.stl.0.port="0x2a0" hint.stl.0.irq="10" device stli hint.stli.0.at="isa" hint.stli.0.port="0x2a0" hint.stli.0.maddr="0xcc000" hint.stli.0.flags="23" hint.stli.0.msize="0x1000" # You are unlikely to have the hardware for loran device loran hint.loran.0.at="isa" hint.loran.0.irq="5" # HOT1 Xilinx 6200 card (http://www.vcc.com/) device xrpu # # The `meteor' device is a PCI video capture board. It can also have the # following options: # options METEOR_ALLOC_PAGES=xxx preallocate kernel pages for data entry # figure (ROWS*COLUMN*BYTES_PER_PIXEL*FRAME+PAGE_SIZE-1)/PAGE_SIZE # options METEOR_DEALLOC_PAGES remove all allocated pages on close(2) # options METEOR_DEALLOC_ABOVE=xxx remove all allocated pages above the # specified amount. If this value is below the allocated amount no action # taken # options METEOR_SYSTEM_DEFAULT={METEOR_PAL|METEOR_NTSC|METEOR_SECAM}, used # for initialization of fps routine when a signal is not present. # # The 'bktr' device is a PCI video capture device using the Brooktree # bt848/bt848a/bt849a/bt878/bt879 chipset. When used with a TV Tuner it forms a # TV card, eg Miro PC/TV, Hauppauge WinCast/TV WinTV, VideoLogic Captivator, # Intel Smart Video III, AverMedia, IMS Turbo, FlyVideo. 
# # options OVERRIDE_CARD=xxx # options OVERRIDE_TUNER=xxx # options OVERRIDE_MSP=1 # options OVERRIDE_DBX=1 # These options can be used to override the auto detection # The current values for xxx are found in src/sys/dev/bktr/bktr_card.h # Using sysctl(8) run-time overrides on a per-card basis can be made # # options BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_PAL # or # options BROOKTREE_SYSTEM_DEFAULT=BROOKTREE_NTSC # Specifies the default video capture mode. # This is required for Dual Crystal (28&35Mhz) boards where PAL is used # to prevent hangs during initialisation. eg VideoLogic Captivator PCI. # # options BKTR_USE_PLL # PAL or SECAM users who have a 28Mhz crystal (and no 35Mhz crystal) # must enable PLL mode with this option. eg some new Bt878 cards. # # options BKTR_GPIO_ACCESS # This enables IOCTLs which give user level access to the GPIO port. # # options BKTR_NO_MSP_RESET # Prevents the MSP34xx reset. Good if you initialise the MSP in another OS first # # options BKTR_430_FX_MODE # Switch Bt878/879 cards into Intel 430FX chipset compatibility mode. # # options BKTR_SIS_VIA_MODE # Switch Bt878/879 cards into SIS/VIA chipset compatibility mode which is # needed for some old SiS and VIA chipset motherboards. # This also allows Bt878/879 chips to work on old OPTi (<1997) chipset # motherboards and motherboards with bad or incomplete PCI 2.1 support. # As a rough guess, old = before 1998 # device meteor 1 # Brooktree driver has been ported to the new I2C framework. Thus, # you'll need to have the following 3 lines in the kernel config. # device smbus # device iicbus # device iicbb # The iic and smb devices are only needed if you want to control other # I2C slaves connected to the external connector of some cards. # device bktr 1 # # PC Card/PCMCIA # (OLDCARD) # # card: pccard slots # pcic: isa/pccard bridge device pcic hint.pcic.0.at="isa" hint.pcic.1.at="isa" device card # # PC Card/PCMCIA and Cardbus # (NEWCARD) # # Note that NEWCARD and OLDCARD are incompatible.
Do not use both at the same # time. # # pccbb: isa/pccard and pci/cardbus bridge # pccard: pccard slots # cardbus: cardbus slots #device pccbb #device pccard #device cardbus # You may need to reset all pccards after resuming options PCIC_RESUME_RESET # reset after resume # # Laptop/Notebook options: # # See also: # apm under `Miscellaneous hardware' # above. # For older notebooks that signal a powerfail condition (external # power supply dropped, or battery state low) by issuing an NMI: options POWERFAIL_NMI # make it beep instead of panicking # # SMB bus # # System Management Bus support is provided by the 'smbus' device. # Access to the SMBus device is via the 'smb' device (/dev/smb*), # which is a child of the 'smbus' device. # # Supported devices: # smb standard io through /dev/smb* # # Supported SMB interfaces: # iicsmb I2C to SMB bridge with any iicbus interface # bktr brooktree848 I2C hardware interface # intpm Intel PIIX4 Power Management Unit # alpm Acer Aladdin-IV/V/Pro2 Power Management Unit # ichsmb Intel ICH SMBus controller chips (82801AA, 82801AB, 82801BA) # device smbus # Bus support, required for smb below. device intpm device alpm device ichsmb device smb # # I2C Bus # # Philips i2c bus support is provided by the `iicbus' device. # # Supported devices: # ic i2c network interface # iic i2c standard io # iicsmb i2c to smb bridge. Allow i2c i/o with smb commands. # # Supported interfaces: # pcf Philips PCF8584 ISA-bus controller # bktr brooktree848 I2C software interface # # Other: # iicbb generic I2C bit-banging code (needed by lpbb, bktr) # device iicbus # Bus support, required for ic/iic/iicsmb below. device iicbb device ic device iic device iicsmb # smb over i2c bridge device pcf hint.pcf.0.at="isa" hint.pcf.0.port="0x320" hint.pcf.0.irq="5" #--------------------------------------------------------------------------- # ISDN4BSD # # See /usr/share/examples/isdn/ROADMAP for an introduction to isdn4bsd.
# # i4b passive ISDN cards support contains the following hardware drivers: # # isic - Siemens/Infineon ISDN ISAC/HSCX/IPAC chipset driver # iwic - Winbond W6692 PCI bus ISDN S/T interface controller # ifpi - AVM Fritz!Card PCI driver # ihfc - Cologne Chip HFC ISA/ISA-PnP chipset driver # ifpnp - AVM Fritz!Card PnP driver # itjc - Siemens ISAC / TJNet Tiger300/320 chipset # # i4b active ISDN cards support contains the following hardware drivers: # # iavc - AVM B1 PCI, AVM B1 ISA, AVM T1 # # Note that the ``options'' (if given) and ``device'' lines must BOTH # be uncommented to enable support for a given card ! # # In addition to a hardware driver (and probably an option) the mandatory # ISDN protocol stack devices and the mandatory support device must be # enabled as well as one or more devices from the optional devices section. # #--------------------------------------------------------------------------- # isic driver (Siemens/Infineon chipsets) # device isic # # ISA bus non-PnP Cards: # ---------------------- # # Teles S0/8 or Niccy 1008 options TEL_S0_8 hint.isic.0.at="isa" hint.isic.0.maddr="0xd0000" hint.isic.0.irq="5" hint.isic.0.flags="1" # # Teles S0/16 or Creatix ISDN-S0 or Niccy 1016 options TEL_S0_16 hint.isic.0.at="isa" hint.isic.0.port="0xd80" hint.isic.0.maddr="0xd0000" hint.isic.0.irq="5" hint.isic.0.flags="2" # # Teles S0/16.3 options TEL_S0_16_3 hint.isic.0.at="isa" hint.isic.0.port="0xd80" hint.isic.0.irq="5" hint.isic.0.flags="3" # # AVM A1 or AVM Fritz!Card options AVM_A1 hint.isic.0.at="isa" hint.isic.0.port="0x340" hint.isic.0.irq="5" hint.isic.0.flags="4" # # USRobotics Sportster ISDN TA intern options USR_STI hint.isic.0.at="isa" hint.isic.0.port="0x268" hint.isic.0.irq="5" hint.isic.0.flags="7" # # ITK ix1 Micro ( < V.3, non-PnP version ) options ITKIX1 hint.isic.0.at="isa" hint.isic.0.port="0x398" hint.isic.0.irq="10" hint.isic.0.flags="18" # # ELSA PCC-16 options ELSA_PCC16 hint.isic.0.at="isa" hint.isic.0.port="0x360" 
hint.isic.0.irq="10" hint.isic.0.flags="20" # # ISA bus PnP Cards: # ------------------ # # Teles S0/16.3 PnP options TEL_S0_16_3_P # # Creatix ISDN-S0 P&P options CRTX_S0_P # # Dr. Neuhaus Niccy Go@ options DRN_NGO # # Sedlbauer Win Speed options SEDLBAUER # # Dynalink IS64PH options DYNALINK # # ELSA QuickStep 1000pro ISA options ELSA_QS1ISA # # Siemens I-Surf 2.0 options SIEMENS_ISURF2 # # Asuscom ISDNlink 128K ISA options ASUSCOM_IPAC # # Eicon Diehl DIVA 2.0 and 2.02 options EICON_DIVA # # PCI bus Cards: # -------------- # # ELSA MicroLink ISDN/PCI (same as ELSA QuickStep 1000pro PCI) options ELSA_QS1PCI # # #--------------------------------------------------------------------------- # ifpnp driver for AVM Fritz!Card PnP # # AVM Fritz!Card PnP device ifpnp # #--------------------------------------------------------------------------- # ihfc driver for Cologne Chip ISA chipsets (experimental!) # # Teles 16.3c ISA PnP # AcerISDN P10 ISA PnP # TELEINT ISDN SPEED No.1 device ihfc # #--------------------------------------------------------------------------- # ifpi driver for AVM Fritz!Card PCI # # AVM Fritz!Card PCI device ifpi # #--------------------------------------------------------------------------- # iwic driver for Winbond W6692 chipset # # ASUSCOM P-IN100-ST-D (and other Winbond W6692 based cards) device iwic # #--------------------------------------------------------------------------- # itjc driver for Siemens ISAC / TJNet Tiger300/320 chipset # # Traverse Technologies NETjet-S # Teles PCI-TJ device itjc # #--------------------------------------------------------------------------- # iavc driver (AVM active cards, needs i4bcapi driver!) # device iavc # # AVM B1 ISA bus (PnP mode not supported!)
# ---------------------------------------- hint.iavc.0.at="isa" hint.iavc.0.port="0x150" hint.iavc.0.irq="5" # #--------------------------------------------------------------------------- # ISDN Protocol Stack - mandatory for all hardware drivers # # Q.921 / layer 2 - i4b passive cards D channel handling device "i4bq921" # # Q.931 / layer 3 - i4b passive cards D channel handling device "i4bq931" # # layer 4 - i4b common passive and active card handling device "i4b" # #--------------------------------------------------------------------------- # ISDN devices - mandatory for all hardware drivers # # userland driver to do ISDN tracing (for passive cards only) device "i4btrc" 4 # # userland driver to control the whole thing device "i4bctl" # #--------------------------------------------------------------------------- # ISDN devices - optional # # userland driver for access to raw B channel device "i4brbch" 4 # # userland driver for telephony device "i4btel" 2 # # network driver for IP over raw HDLC ISDN device "i4bipr" 4 # enable VJ header compression detection for ipr i/f options IPR_VJ # enable logging of the first n IP packets to isdnd (n=32 here) options IPR_LOG=32 # # network driver for sync PPP over ISDN; requires an equivalent # number of sppp device to be configured device "i4bisppp" 4 # # B-channel interface to the netgraph subsystem device "i4bing" 2 # # CAPI driver needed for active ISDN cards (see iavc driver above) device "i4bcapi" # #--------------------------------------------------------------------------- # Parallel-Port Bus # # Parallel port bus support is provided by the `ppbus' device. # Multiple devices may be attached to the parallel port, devices # are automatically probed and attached when found. # # Supported devices: # vpo Iomega Zip Drive # Requires SCSI disk support ('scbus' and 'da'), best # performance is achieved with ports in EPP 1.9 mode. 
# lpt Parallel Printer # plip Parallel network interface # ppi General-purpose I/O ("Geek Port") + IEEE1284 I/O # pps Pulse per second Timing Interface # lpbb Philips official parallel port I2C bit-banging interface # # Supported interfaces: # ppc ISA-bus parallel port interfaces. # options PPC_PROBE_CHIPSET # Enable chipset specific detection # (see flags in ppc(4)) options DEBUG_1284 # IEEE1284 signaling protocol debug options PERIPH_1284 # Makes your computer act as an IEEE1284 # compliant peripheral options DONTPROBE_1284 # Avoid boot detection of PnP parallel devices options VP0_DEBUG # ZIP/ZIP+ debug options LPT_DEBUG # Printer driver debug options PPC_DEBUG # Parallel chipset level debug options PLIP_DEBUG # Parallel network IP interface debug options PCFCLOCK_VERBOSE # Verbose pcfclock driver options PCFCLOCK_MAX_RETRIES=5 # Maximum read tries (default 10) device ppc hint.ppc.0.at="isa" hint.ppc.0.irq="7" device ppbus device vpo device lpt device plip device ppi device pps device lpbb device pcfclock # Kernel BOOTP support options BOOTP # Use BOOTP to obtain IP address/hostname options BOOTP_NFSROOT # NFS mount root filesystem using BOOTP info options BOOTP_NFSV3 # Use NFS v3 to NFS mount root options BOOTP_COMPAT # Workaround for broken bootp daemons. options BOOTP_WIRED_TO=fxp0 # Use interface fxp0 for BOOTP # # Add tie-ins for a hardware watchdog. This only enables the hooks; # the user must still supply the actual driver. # options HW_WDOG # # Set the number of PV entries per process. Increasing this can # stop panics related to heavy use of shared memory. However, that can # (combined with large amounts of physical memory) cause panics at # boot time due to the kernel running out of VM space. # # If you're tweaking this, you might also want to increase the sysctls # "vm.v_free_min", "vm.v_free_reserved", and "vm.v_free_target". # # The value below is one more than the default. # options PMAP_SHPGPERPROC=201 # # Disable swapping.
This option removes all code which actually performs # swapping, so it's not possible to turn it back on at run-time. # # This is sometimes usable for systems which don't have any swap space # (see also sysctls "vm.defer_swapspace_pageouts" and # "vm.disable_swapspace_pageouts") # #options NO_SWAPPING # Set the number of sf_bufs to allocate. sf_bufs are virtual buffers # for sendfile(2) that are used to map file VM pages, and normally # default to a quantity that is roughly 16*MAXUSERS+512. You would # typically want about 4 of these for each simultaneous file send. # options NSFBUFS=1024 # # Enable extra debugging code for locks. This stores the filename and # line of whatever acquired the lock in the lock itself, and change a # number of function calls to pass around the relevant data. This is # not at all useful unless you are debugging lock code. Also note # that it is likely to break e.g. fstat(1) unless you recompile your # userland with -DDEBUG_LOCKS as well. # options DEBUG_LOCKS ##################################################################### # ABI Emulation # Enable iBCS2 runtime support for SCO and ISC binaries options IBCS2 # Emulate spx device for client side of SVR3 local X interface options SPX_HACK # Enable Linux ABI emulation options COMPAT_LINUX # Enable the linux-like proc filesystem support (requires COMPAT_LINUX # and PSEUDOFS) options LINPROCFS # Linux debugging options DEBUG_LINUX # # SysVR4 ABI emulation # # The svr4 ABI emulator can be statically compiled into the kernel or loaded as # a KLD module. # The STREAMS network emulation code can also be compiled statically or as a # module. If loaded as a module, it must be loaded before the svr4 module # (the /usr/sbin/svr4 script does this for you). If compiling statically, # the `streams' device must be configured into any kernel which also # specifies COMPAT_SVR4. 
It is possible to have a statically-configured # STREAMS device and a dynamically loadable svr4 emulator; the /usr/sbin/svr4 # script understands that it doesn't need to load the `streams' module under # those circumstances. # Caveat: At this time, `options KTRACE' is required for the svr4 emulator # (whether static or dynamic). # options COMPAT_SVR4 # build emulator statically options DEBUG_SVR4 # enable verbose debugging device streams # STREAMS network driver (required for svr4). ##################################################################### # USB support # UHCI controller device uhci # OHCI controller device ohci # General USB code (mandatory for USB) device usb # # USB Double Bulk Pipe devices device udbp # Generic USB device driver device ugen # Human Interface Device (anything with buttons and dials) device uhid # USB keyboard device ukbd # USB printer device ulpt # USB Iomega Zip 100 Drive (Requires scbus and da) device umass # USB modem support device umodem # USB mouse device ums # Diamond Rio 500 Mp3 player device urio # USB scanners device uscanner # # ADMtek USB ethernet. Supports the LinkSys USB100TX, # the Billionton USB100, the Melco LU-ATX, the D-Link DSB-650TX # and the SMC 2202USB. Also works with the ADMtek AN986 Pegasus # eval board. device aue # # CATC USB-EL1201A USB ethernet. Supports the CATC Netmate # and Netmate II, and the Belkin F5U111. device cue # # Kawasaki LSI ethernet. Supports the LinkSys USB10T, # Entrega USB-NET-E45, Peracom Ethernet Adapter, the # 3Com 3c19250, the ADS Technologies USB-10BT, the ATen UC10T, # the Netgear EA101, the D-Link DSB-650, the SMC 2102USB # and 2104USB, and the Corega USB-T. 
device kue # debugging options for the USB subsystem # options UHCI_DEBUG options OHCI_DEBUG options USB_DEBUG options UGEN_DEBUG options UHID_DEBUG options UHUB_DEBUG options UKBD_DEBUG options ULPT_DEBUG options UMASS_DEBUG options UMS_DEBUG options URIO_DEBUG # options for ukbd: options UKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions UKBD_DFLT_KEYMAP=it.iso # # Embedded system options: # # An embedded system might want to run something other than init. options INIT_PATH="/sbin/init:/stand/sysinstall" # Debug options options BUS_DEBUG # enable newbus debugging options DEBUG_VFS_LOCKS # enable vfs lock debugging options NPX_DEBUG # enable npx debugging (FPU/math emu) ##################################################################### # SYSV IPC KERNEL PARAMETERS # # Maximum number of entries in a semaphore map. options SEMMAP=31 # Maximum number of System V semaphores that can be used on the system at # one time. options SEMMNI=11 # Total number of semaphores system wide options SEMMNS=61 # Total number of undo structures in system options SEMMNU=31 # Maximum number of System V semaphores that can be used by a single process # at one time. options SEMMSL=61 # Maximum number of operations that can be outstanding on a single System V # semaphore at one time. options SEMOPM=101 # Maximum number of undo operations that can be outstanding on a single # System V semaphore at one time. options SEMUME=11 # Maximum number of shared memory pages system wide. options SHMALL=1025 # Maximum size, in bytes, of a single System V shared memory region. options SHMMAX="(SHMMAXPGS*PAGE_SIZE+1)" options SHMMAXPGS=1025 # Minimum size, in bytes, of a single System V shared memory region. options SHMMIN=2 # Maximum number of shared memory regions that can be used on the system # at one time. options SHMMNI=33 # Maximum number of System V shared memory regions that can be attached to # a single process at one time. 
options SHMSEG=9 # Set the amount of time (in seconds) the system will wait before # rebooting automatically when a kernel panic occurs. If set to (-1), # the system will wait indefinitely until a key is pressed on the # console. options PANIC_REBOOT_WAIT_TIME=16 ##################################################################### # More undocumented options for linting. # Note that documenting these are not considered an affront. options CAM_DEBUG_DELAY # VFS cluster debugging. options CLUSTERDEBUG # Eliminate unneeded cache flush instruction(s). options CPU_UPGRADE_HW_CACHE options DEBUG # PECOFF module (Win32 Execution Format) options PECOFF_SUPPORT options PECOFF_DEBUG # Disable the 4 MByte PSE CPU feature. #options DISABLE_PSE options ENABLE_ALART options I4B_SMP_WORKAROUND options I586_PMC_GUPROF=0x70000 options KBDIO_DEBUG=2 options KBD_MAXRETRY=4 options KBD_MAXWAIT=6 options KBD_RESETDELAY=201 # Enable the PF_KEY Key Management API. options KEY # Kernel filelock debugging. options LOCKF_DEBUG # System V compatible message queues # Please note that the values provided here are used to test kernel # building. The defaults in the sources provide almost the same numbers. # MSGSSZ must be a power of 2 between 8 and 1024. 
options MSGMNB=2049 # Max number of chars in queue options MSGMNI=41 # Max number of message queue identifiers options MSGSEG=2049 # Max number of message segments options MSGSSZ=16 # Size of a message segment options MSGTQL=41 # Max number of messages in system options NBUF=512 # Number of buffer headers options NMBCLUSTERS=1024 # Number of mbuf clusters options PSM_DEBUG=1 options SCSI_NCR_DEBUG options SCSI_NCR_MAX_SYNC=10000 options SCSI_NCR_MAX_WIDE=1 options SCSI_NCR_MYADDR=7 options SC_DEBUG_LEVEL=5 # Syscons debug level options SC_RENDER_DEBUG # syscons rendering debugging options SHOW_BUSYBUFS # List buffers that prevent root unmount options SIMPLELOCK_DEBUG options SLIP_IFF_OPTS options TIMER_FREQ="((14318182+6)/12)" options VFS_BIO_DEBUG # VFS buffer I/O debugging options VM_KMEM_SIZE options VM_KMEM_SIZE_MAX options VM_KMEM_SIZE_SCALE Index: head/sys/i386/i386/genassym.c =================================================================== --- head/sys/i386/i386/genassym.c (revision 82308) +++ head/sys/i386/i386/genassym.c (revision 82309) @@ -1,206 +1,208 @@ /*- * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD$ */ +#include "opt_upages.h" + #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #ifdef KTR_PERCPU #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_ADDR, offsetof(struct proc, p_addr)); ASSYM(P_INTR_NESTING_LEVEL, offsetof(struct proc, p_intr_nesting_level)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); ASSYM(P_STAT, offsetof(struct proc, p_stat)); ASSYM(P_WCHAN, offsetof(struct proc, p_wchan)); ASSYM(PS_ASTPENDING, PS_ASTPENDING); ASSYM(PS_NEEDRESCHED, PS_NEEDRESCHED); ASSYM(SSLEEP, SSLEEP); ASSYM(SRUN, SRUN); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, 
v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); ASSYM(UPAGES, UPAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(PDESIZE, PDESIZE); ASSYM(PTESIZE, PTESIZE); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(MCLBYTES, MCLBYTES); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); ASSYM(PCB_USERLDT, offsetof(struct pcb, pcb_ldt)); ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu)); ASSYM(PCB_SAVE87_SIZE, sizeof(struct save87)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); #ifdef SMP ASSYM(PCB_SIZE, sizeof(struct pcb)); #endif ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); ASSYM(SIGF_UC, offsetof(struct 
sigframe, sf_uc)); ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(GD_SIZEOF, sizeof(struct globaldata)); ASSYM(GD_PRVSPACE, offsetof(struct globaldata, gd_prvspace)); ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); ASSYM(GD_NPXPROC, offsetof(struct globaldata, gd_npxproc)); ASSYM(GD_IDLEPROC, offsetof(struct globaldata, gd_idleproc)); ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); ASSYM(GD_COMMON_TSS, offsetof(struct globaldata, gd_common_tss)); ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); ASSYM(GD_SWITCHTICKS, offsetof(struct globaldata, gd_switchticks)); ASSYM(GD_COMMON_TSSD, offsetof(struct globaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct globaldata, gd_tss_gdt)); ASSYM(GD_CURRENTLDT, offsetof(struct globaldata, gd_currentldt)); /* XXX */ #ifdef KTR_PERCPU ASSYM(GD_KTR_IDX, offsetof(struct globaldata, gd_ktr_idx)); ASSYM(GD_KTR_BUF, offsetof(struct globaldata, gd_ktr_buf)); ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data)); 
#endif ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid)); #ifdef SMP ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); ASSYM(MTX_RECURSECNT, offsetof(struct mtx, mtx_recurse)); ASSYM(MTX_SAVECRIT, offsetof(struct mtx, mtx_savecrit)); Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 82308) +++ head/sys/i386/i386/machdep.c (revision 82309) @@ -1,2530 +1,2534 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD$ */ #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" +#include "opt_upages.h" /* #include "opt_userconfig.h" */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #include #ifdef PERFMON #include +#endif +#ifdef SMP +#include #endif #include #include #include #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void 
printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup __P((void *)); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm __P((struct save87 *, struct savexmm *)); static void fill_fpregs_xmm __P((struct savexmm *, struct save87 *)); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int physmem = 0; int cold = 1; #ifdef COMPAT_43 static void osendsig __P((sig_t catcher, int sig, sigset_t *mask, u_long code)); #endif static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "IU", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "IU", ""); static int sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); int Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of 
chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; #ifndef SMP static struct globaldata __globaldata; #endif struct mtx sched_lock; struct mtx Giant; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. */ earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { unsigned int size1; size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %u bytes (%u pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); #if 0 /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. 
*/ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); /* * Discount the physical memory larger than the size of kernel_map * to avoid eating up all of KVA space. */ if (kernel_map->first_free == NULL) { printf("Warning: no free entries in kernel_map.\n"); physmem_est = physmem; } else { physmem_est = min(physmem, btoc(kernel_map->max_offset - kernel_map->min_offset)); } /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / PAGE_SIZE; nbuf = 50; if (physmem_est > 1024) nbuf += min((physmem_est - 1024) / factor, 16384 / factor); if (physmem_est > 16384) nbuf += (physmem_est - 16384) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; } /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. 
*/ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } nswbuf = max(min(nbuf/4, 256), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); buffer_map->system_map = 1; pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * XXX: Mbuf system machine-specific initializations should * go here, if anywhere. */ /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i], 0); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } mtx_init(&callout_lock, "callout", MTX_SPIN | MTX_RECURSE); #endif #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. 
*/ bufinit(); vm_pager_bufferinit(); globaldata_register(GLOBALDATA); #ifndef SMP /* For SMP, we delay the cpu_setregs() until after SMP startup. */ cpu_setregs(); #endif } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct osigframe sf; struct osigframe *fp; struct proc *p; struct sigacts *psp; struct trapframe *regs; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; regs = p->p_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *fp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)fp) == 0 || !useracc((caddr_t)fp, sizeof(*fp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); PROC_UNLOCK(p); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. 
*/ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. 
*/ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ PROC_LOCK(p); sigexit(p, SIGILL); /* NOTREACHED */ } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } #endif void sendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct sigframe sf; struct proc *p; struct sigacts *psp; struct trapframe *regs; struct sigframe *sfp; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { PROC_UNLOCK(p); osendsig(catcher, sig, mask, code); return; } #endif regs = p->p_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = p->p_sigstk; sf.sf_uc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate and validate space for the signal handler context. 
*/ if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct sigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *sfp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)sfp) == 0 || !useracc((caddr_t)sfp, sizeof(*sfp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG printf("process %d has trashed its stack\n", p->p_pid); #endif PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); PROC_UNLOCK(p); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill siginfo structure. */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = code; sf.sf_si.si_addr = (void *)regs->tf_err; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. 
*/ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. * * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ PROC_LOCK(p); sigexit(p, SIGILL); /* NOTREACHED */ } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. 
*/
#ifdef COMPAT_43
/*
 * Old-style (COMPAT_43) signal return.  Reads the osigcontext that
 * uap->sigcntxp points at and loads its register values, on-stack flag
 * and signal mask back into the process.
 *
 * Returns EFAULT if useracc() rejects the context, EINVAL if the vm86
 * area is not set up or the requested eflags/%cs fail the checks below,
 * and EJUSTRETURN once the trap frame has been rewritten.
 */
int
osigreturn(p, uap)
	struct proc *p;
	struct osigreturn_args /* {
		struct osigcontext *sigcntxp;
	} */ *uap;
{
	struct trapframe *regs;
	struct osigcontext *scp;
	int eflags;

	regs = p->p_frame;
	scp = uap->sigcntxp;
	if (!useracc((caddr_t)scp, sizeof(*scp), VM_PROT_READ))
		return (EFAULT);
	eflags = scp->sc_ps;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (p->p_addr->u_pcb.pcb_ext == 0)
			return (EINVAL);
		vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		/* NOTE(review): there is no return here; execution continues. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(p, SIGBUS, 0);

		/*
		 * Only the bits named by the *_USERCHANGE mask are taken
		 * from the user-supplied eflags; all other bits come from
		 * the current trap frame, and PSL_VM is forced on.
		 */
		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		tf->tf_vm86_ds = scp->sc_ds;
		tf->tf_vm86_es = scp->sc_es;
		tf->tf_vm86_fs = scp->sc_fs;
		tf->tf_vm86_gs = scp->sc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		if (!CS_SECURE(scp->sc_cs)) {
			trapsignal(p, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}
		regs->tf_ds = scp->sc_ds;
		regs->tf_es = scp->sc_es;
		regs->tf_fs = scp->sc_fs;
	}

	/* Restore remaining registers. */
	regs->tf_eax = scp->sc_eax;
	regs->tf_ebx = scp->sc_ebx;
	regs->tf_ecx = scp->sc_ecx;
	regs->tf_edx = scp->sc_edx;
	regs->tf_esi = scp->sc_esi;
	regs->tf_edi = scp->sc_edi;
	regs->tf_cs = scp->sc_cs;
	regs->tf_ss = scp->sc_ss;
	regs->tf_isp = scp->sc_isp;

	/* The signal-stack flag and signal mask are updated under the proc lock. */
	PROC_LOCK(p);
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	if (scp->sc_onstack & 1)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	SIGSETOLD(p->p_sigmask, scp->sc_mask);
	SIG_CANTMASK(p->p_sigmask);
	PROC_UNLOCK(p);
	regs->tf_ebp = scp->sc_fp;
	regs->tf_esp = scp->sc_sp;
	regs->tf_eip = scp->sc_pc;
	regs->tf_eflags = eflags;
	return (EJUSTRETURN);
}
#endif

/*
 * Signal return taking a ucontext_t.  Copies the saved machine context
 * from uap->sigcntxp back into the trap frame and restores the on-stack
 * flag and signal mask.
 *
 * With COMPAT_43 the argument is first probed as an osigcontext and
 * handed to osigreturn() when its sc_trapno field holds 0x01d516
 * (NOTE(review): presumably a marker identifying old-style frames;
 * confirm where it is stored).
 *
 * Returns EFAULT if useracc() rejects the context, EINVAL if the vm86
 * area is not set up or the requested eflags/%cs fail the checks below,
 * and EJUSTRETURN on success.
 */
int
sigreturn(p, uap)
	struct proc *p;
	struct sigreturn_args /* {
		ucontext_t *sigcntxp;
	} */ *uap;
{
	struct trapframe *regs;
	ucontext_t *ucp;
	int cs, eflags;

	ucp = uap->sigcntxp;
#ifdef COMPAT_43
	if (!useracc((caddr_t)ucp, sizeof(struct osigcontext), VM_PROT_READ))
		return (EFAULT);
	if (((struct osigcontext *)ucp)->sc_trapno == 0x01d516)
		return (osigreturn(p, (struct osigreturn_args *)uap));

	/*
	 * Since ucp is not an osigcontext but a ucontext_t, we have to
	 * check again if all of it is accessible.  A ucontext_t is
	 * much larger, so instead of just checking for the pointer
	 * being valid for the size of an osigcontext, now check for
	 * it being valid for a whole, new-style ucontext_t.
	 */
#endif
	if (!useracc((caddr_t)ucp, sizeof(*ucp), VM_PROT_READ))
		return (EFAULT);

	regs = p->p_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (p->p_addr->u_pcb.pcb_ext == 0)
			return (EINVAL);
		vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		/* NOTE(review): there is no return here; execution continues. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(p, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}

		/*
		 * Copy the whole saved frame first, then overwrite eflags
		 * with the filtered value and move the copied segment
		 * values into the vm86 slots.
		 */
		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			printf("sigreturn: eflags = 0x%x\n", eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			printf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(p, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}

		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

	/* The signal-stack flag and signal mask are updated under the proc lock. */
	PROC_LOCK(p);
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	if (ucp->uc_mcontext.mc_onstack & 1)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	p->p_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(p->p_sigmask);
	PROC_UNLOCK(p);
	return (EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	/* Never returns: re-executes hlt each time the CPU resumes. */
	for (;;)
		__asm__ ("hlt");
}

/*
 * Hook to idle the CPU when possible.   This currently only works in
 * the !SMP case, as there is no clean way to ensure that a CPU will be
 * woken when there is work available for it.
 */
/* Exported read/write as the machdep.cpu_idle_hlt sysctl; initially 1. */
static int	cpu_idle_hlt = 1;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");

/*
 * Note that we have to be careful here to avoid a race between checking
 * procrunnable() and actually halting.  If we don't do this, we may waste
 * the time between calling hlt and the next interrupt even though there
 * is a runnable process.
*/ void cpu_idle(void) { #ifndef SMP if (cpu_idle_hlt) { disable_intr(); if (procrunnable()) enable_intr(); else { enable_intr(); __asm __volatile("hlt"); } } #endif } /* * Clear registers on exec */ void setregs(p, entry, stack, ps_strings) struct proc *p; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = p->p_frame; struct pcb *pcb = &p->p_addr->u_pcb; if (pcb->pcb_ldt) user_ldt_free(pcb); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* reset %gs as well */ if (pcb == PCPU_GET(curpcb)) load_gs(_udatasel); else pcb->pcb_gs = _udatasel; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). 
This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #ifdef DEV_NPX /* Initialize the npx (if any) for the current process. */ npxinit(__INITIAL_NPXCW__); #endif /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ p->p_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); cr0 |= CR0_NE; /* Done by npxinit() */ cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ #ifndef I386_CPU cr0 |= CR0_WP | CR0_AM; #endif load_cr0(cr0); load_gs(_udatasel); } static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int private_tss; /* flag indicating private tss */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; 
static char dblfault_stack[PAGE_SIZE];

extern  struct user *proc0paddr;


/*
 * software prototypes -- in more palatable form
 *
 * Each initializer below lists, in order:
 *	segment base address,
 *	segment limit ("length"),
 *	segment type,
 *	descriptor privilege level,
 *	segment descriptor present,
 *	two filler fields (always 0 here),
 *	default 32 vs 16 bit size,
 *	limit granularity (byte/page units).
 * Several limits and bases are placeholders that are overwritten at
 * run time before the table is converted with ssdtosd().
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GPRIV_SEL	3 SMP Per-Processor Private Data Descriptor */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	4 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct i386tss)-1,/* limit: size of one TSS, in bytes */
	SDT_SYS386TSS,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GLDT_SEL	5 LDT Descriptor */
{	(int) ldt,		/* segment base address  */
	sizeof(ldt)-1,		/* limit: size of the ldt[] array, in bytes */
	SDT_SYSLDT,		/* segment type */
	SEL_UPL,		/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GUSERLDT_SEL	6 User LDT Descriptor per process */
{	(int) ldt,		/* segment base address  */
	(512 * sizeof(union descriptor)-1),	/* limit: 512 descriptors */
	SDT_SYSLDT,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GTGATE_SEL	7 Null Descriptor - Placeholder */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{	0x400,			/* segment base address */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GPANIC_SEL	9 Panic Tss Descriptor */
{	(int) &dblfault_tss,	/* segment base address  */
	sizeof(struct i386tss)-1,/* limit: size of one TSS, in bytes */
	SDT_SYS386TSS,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
};

/*
 * Prototype entries for the default LDT, in the same field order as
 * gdt_segs[] above.  The null slots are later overwritten with call
 * gates.
 */
static struct soft_segment_descriptor ldt_segs[] = {
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* descriptor privilege level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* descriptor privilege level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
};

/*
 * Fill in entry idx of the interrupt descriptor table pointed to by
 * the global idt.
 *
 * idx		index of the IDT slot to write
 * func		handler entry point; stored in gd_looffset, with the
 *		value shifted right by 16 stored in gd_hioffset
 *		(presumably the low and high 16 bits of the address;
 *		the field widths are not visible here)
 * typ		gate type stored in gd_type
 * dpl		privilege level stored in gd_dpl
 * selec	code segment selector stored in gd_selector
 *
 * The entry is always marked present (gd_p = 1) with gd_stkcpy and
 * gd_xx cleared.
 */
void
setidt(idx, func, typ, dpl, selec)
	int idx;
	inthand_t *func;
	int typ;
	int dpl;
	int selec;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16 ;
}

/* IDTVEC(name) names the handler symbol for a vector: X<name>. */
#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);

/*
 * Convert a hardware segment descriptor (sd) into its software form
 * (ssd): the base is reassembled as sd_hibase << 24 | sd_lobase, the
 * limit as sd_hilimit << 16 | sd_lolimit, and the type, dpl, present,
 * def32 and granularity fields are copied across unchanged.
 */
void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
#define PHYSMAP_SIZE (2 * 8) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. */ static void getmemsize(int first) { int i, physmap_idx, pa_indx; u_int basemem, extmem; struct vm86frame vmf; struct vm86context vmc; vm_offset_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t pte; const char *cp; struct bios_smap *smap; bzero(&vmf, sizeof(struct vm86frame)); bzero(physmap, sizeof(physmap)); /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. 
*/ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); goto next_run; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; goto next_run; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); if (physmap[1] != 0) goto 
physmap_done; /* * If we failed above, try memory map with INT 15:E801 */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory. */ extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ i386_mp_probe(); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif /* * hw.physmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. This overrides MAXMEM. 
*/ if ((cp = getenv("hw.physmem")) != NULL) { u_int64_t AllowMem, sanity; char *ep; sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Ignoring invalid memory size of '%s'\n", cp); else Maxmem = atop(AllowMem); } if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %uK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa(Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. 
*/ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, off, x; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* Init basic tunables, hz etc */ init_param(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. 
*/
	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
	/*
	 * Point the per-CPU private segment and the proc0 TSS descriptor
	 * at this CPU's private data (SMP_prvspace[0] on SMP, the single
	 * __globaldata otherwise).
	 */
#ifdef SMP
	gdt_segs[GPRIV_SEL].ssd_limit =
		atop(sizeof(struct privatespace) - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0];
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &SMP_prvspace[0].globaldata.gd_common_tss;
	SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0].globaldata;
#else
	gdt_segs[GPRIV_SEL].ssd_limit =
		atop(sizeof(struct globaldata) - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int) &__globaldata;
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &__globaldata.gd_common_tss;
	__globaldata.gd_prvspace = &__globaldata;
#endif

	/* Convert the software prototypes into the real GDT and load it. */
	for (x = 0; x < NGDT; x++) {
#ifdef BDE_DEBUGGER
		/* avoid overwriting db entries with APM ones */
		if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL)
			continue;
#endif
		ssdtosd(&gdt_segs[x], &gdt[x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base =  (int) gdt;
	lgdt(&r_gdt);

	/* setup curproc so that mutexes work */
	PCPU_SET(curproc, &proc0);
	PCPU_SET(spinlocks, NULL);

	LIST_INIT(&proc0.p_contested);

	/*
	 * Initialize mutexes.
	 */
	mtx_init(&Giant, "Giant", MTX_DEF | MTX_RECURSE);
	mtx_init(&sched_lock, "sched lock", MTX_SPIN | MTX_RECURSE);
	mtx_init(&proc0.p_mtx, "process lock", MTX_DEF);
	mtx_init(&clock_lock, "clk", MTX_SPIN | MTX_RECURSE);
#ifdef SMP
	mtx_init(&imen_mtx, "imen", MTX_SPIN);
#endif
	mtx_lock(&Giant);

	/* make ldt memory segments */
	/*
	 * XXX - VM_MAXUSER_ADDRESS is an end address, not a max.  And it
	 * should be spelled ...MAX_USER...
	 */
	ldt_segs[LUCODE_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1);
	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/*
	 * exceptions
	 *
	 * Every vector is first pointed at the "rsvd" handler; the
	 * architecturally defined vectors and the int 0x80 system call
	 * gate are then filled in individually.  Vectors 3, 4 and 0x80
	 * use SEL_UPL; the rest use SEL_KPL.  Vector 8 is a task gate
	 * onto the GPANIC_SEL TSS.
	 */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
	setidt(0, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(1, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(2, &IDTVEC(nmi),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(3, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(4, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(5, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(6, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(7, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL
	    , GSEL(GCODE_SEL, SEL_KPL));
	setidt(8, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
	setidt(9, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(10, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(11, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(12, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(14, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(15, &IDTVEC(rsvd),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(16, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(18, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(0x80, &IDTVEC(int0x80_syscall),
			SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

#ifdef DEV_ISA
	isa_defaultirq();
#endif

#ifdef DDB
	kdb_init();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#endif

	finishidentcpu();	/* Final stage of CPU initialization */
	/*
	 * Vectors 6 and 13 are installed a second time here, after
	 * finishidentcpu().  NOTE(review): presumably the CPU
	 * identification code temporarily replaces them -- confirm.
	 */
	setidt(6, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	initializecpu();	/* Initialize CPU registers */

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	PCPU_SET(common_tss.tss_esp0,
	    (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16);
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	private_tss = 0;
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	ltr(gsel_tss);

	/*
	 * Set up the double fault TSS: it runs dblfault_handler on its
	 * own stack (dblfault_stack) with kernel selectors and the
	 * IdlePTD page directory.
	 */
	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_cr3 = (int)IdlePTD;
	dblfault_tss.tss_eip = (int)dblfault_handler;
	dblfault_tss.tss_eflags = PSL_KERNEL;
	dblfault_tss.tss_ds = dblfault_tss.tss_es =
	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);

	vm86_initialize();
	getmemsize(first);

	/* now running on new page tables, configured,and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* make a call gate to reenter kernel with */
	gdp = &ldt[LSYS5CALLS_SEL].gd;

	x = (int) &IDTVEC(lcall_syscall);
	gdp->gd_looffset = x;
	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
	gdp->gd_stkcpy = 1;
	gdp->gd_type = SDT_SYS386CGT;
	gdp->gd_dpl = SEL_UPL;
	gdp->gd_p = 1;
	gdp->gd_hioffset = x >> 16;

	/* XXX does this work? */
	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];

	/* transfer to user mode */

	_ucodesel = LSEL(LUCODE_SEL, SEL_UPL);
	_udatasel = LSEL(LUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	proc0.p_addr->u_pcb.pcb_flags = 0;
	proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD;
	proc0.p_addr->u_pcb.pcb_ext = 0;
	proc0.p_frame = &proc0_tf;
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Workaround for the Pentium F00F bug, run from SYSINIT when
 * has_f00f_bug is set.  Two pages are allocated and the IDT is copied
 * so that its first seven 8-byte entries end exactly at the boundary
 * between them; the new table is loaded with lidt() and the lower
 * page is then made read-only.  Panics if the allocation or the
 * protection change fails.  The unused argument is ignored.
 */
static void
f00f_hack(void *unused) {
	struct gate_descriptor *new_idt;
#ifndef SMP
	struct region_descriptor r_idt;
#endif
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	GIANT_REQUIRED;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	r_idt.rd_limit = sizeof(idt0) - 1;

	tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
	if (tmp == 0)
		panic("kmem_alloc returned 0");
	if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0)
		panic("kmem_alloc returned non-page-aligned memory");
	/* Put the first seven entries in the lower page */
	new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (int)new_idt;
	lidt(&r_idt);
	idt = new_idt;
	if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
			   VM_PROT_READ, FALSE) != KERN_SUCCESS)
		panic("vm_map_protect failed");
	return;
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */

/*
 * Set the saved user program counter (tf_eip) of process p to addr.
 * Always returns 0.
 */
int
ptrace_set_pc(p, addr)
	struct proc *p;
	unsigned long addr;
{
	p->p_frame->tf_eip = addr;
	return (0);
}

/*
 * Arrange for process p to single-step by setting the trace flag
 * (PSL_T) in its saved eflags.  Always returns 0.
 */
int
ptrace_single_step(p)
	struct
proc *p;
{
	p->p_frame->tf_eflags |= PSL_T;
	return (0);
}

/*
 * Copy the saved register state of process p into *regs.
 * All registers except %gs are read from the trap frame (p->p_frame);
 * %gs is read from the PCB (pcb_gs).  Always returns 0.
 */
int
fill_regs(p, regs)
	struct proc *p;
	struct reg *regs;
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = p->p_frame;
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	/* %gs is not part of the trap frame; it is kept in the PCB. */
	pcb = &p->p_addr->u_pcb;
	regs->r_gs = pcb->pcb_gs;
	return (0);
}

/*
 * Install *regs as the saved register state of process p (the inverse of
 * fill_regs()).  Returns EINVAL, without modifying anything, when the new
 * eflags or %cs value fails the EFL_SECURE()/CS_SECURE() checks; returns 0
 * otherwise.
 */
int
set_regs(p, regs)
	struct proc *p;
	struct reg *regs;
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = p->p_frame;
	/* Validate before touching the trap frame so a rejection is atomic. */
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	pcb = &p->p_addr->u_pcb;
	pcb->pcb_gs = regs->r_gs;
	return (0);
}

#ifdef CPU_ENABLE_SSE
/*
 * Convert an FPU save area in the savexmm layout (sv_xmm) into the
 * save87 layout (sv_87): the environment fields, the eight accumulator
 * registers and sv_ex_sw are copied field by field.
 */
static void
fill_fpregs_xmm(sv_xmm, sv_87)
	struct savexmm *sv_xmm;
	struct save87 *sv_87;
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;

	sv_87->sv_ex_sw = sv_xmm->sv_ex_sw;
}

/*
 * Inverse of fill_fpregs_xmm(): copy the environment fields, the eight
 * accumulator registers and sv_ex_sw from the save87 layout (sv_87) into
 * the savexmm layout (sv_xmm).
 */
static void
set_fpregs_xmm(sv_87, sv_xmm)
	struct save87 *sv_87;
	struct savexmm *sv_xmm;
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];

	sv_xmm->sv_ex_sw = sv_87->sv_ex_sw;
}
#endif /* CPU_ENABLE_SSE */

/*
 * Copy the saved FPU state of process p into *fpregs.
 * When built with CPU_ENABLE_SSE and cpu_fxsr is set, the PCB holds the
 * state in savexmm form and it is converted to the save87 layout;
 * otherwise the save87 area is copied verbatim.  Always returns 0.
 */
int
fill_fpregs(p, fpregs)
	struct proc *p;
	struct fpreg *fpregs;
{
#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr) {
		fill_fpregs_xmm(&p->p_addr->u_pcb.pcb_save.sv_xmm,
		    (struct save87 *)fpregs);
		return (0);
	}
#endif /* CPU_ENABLE_SSE */
	bcopy(&p->p_addr->u_pcb.pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}

/*
 * Install *fpregs as the saved FPU state of process p (the inverse of
 * fill_fpregs()).  Always returns 0.
 */
int
set_fpregs(p, fpregs)
	struct proc *p;
	struct fpreg *fpregs;
{
#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
		    &p->p_addr->u_pcb.pcb_save.sv_xmm);
		return (0);
	}
#endif /* CPU_ENABLE_SSE */
	bcopy(fpregs, &p->p_addr->u_pcb.pcb_save.sv_87, sizeof *fpregs);
	return (0);
}

/*
 * Fetch debug registers into *dbregs.
 * With p == NULL the values are read from the hardware (rdr0()..rdr7());
 * otherwise they come from the copy saved in p's PCB, with dr4 and dr5
 * (which the PCB does not store) reported as 0.  Always returns 0.
 */
int
fill_dbregs(p, dbregs)
	struct proc *p;
	struct dbreg *dbregs;
{
	struct pcb *pcb;

	if (p == NULL) {
		dbregs->dr0 = rdr0();
		dbregs->dr1 = rdr1();
		dbregs->dr2 = rdr2();
		dbregs->dr3 = rdr3();
		dbregs->dr4 = rdr4();
		dbregs->dr5 = rdr5();
		dbregs->dr6 = rdr6();
		dbregs->dr7 = rdr7();
	} else {
		pcb = &p->p_addr->u_pcb;
		dbregs->dr0 = pcb->pcb_dr0;
		dbregs->dr1 = pcb->pcb_dr1;
		dbregs->dr2 = pcb->pcb_dr2;
		dbregs->dr3 = pcb->pcb_dr3;
		dbregs->dr4 = 0;
		dbregs->dr5 = 0;
		dbregs->dr6 = pcb->pcb_dr6;
		dbregs->dr7 = pcb->pcb_dr7;
	}
	return (0);
}

/*
 * Store debug registers from *dbregs.
 * With p == NULL the values are loaded straight into the hardware
 * registers.  Otherwise they are validated and saved in p's PCB, and
 * PCB_DBREGS is set in pcb_flags.  Returns EINVAL if dr7 contains an
 * undefined 2-bit field, or if suser(p) fails and an enabled breakpoint
 * address is at or above VM_MAXUSER_ADDRESS; returns 0 on success.
 */
int
set_dbregs(p, dbregs)
	struct proc *p;
	struct dbreg *dbregs;
{
	struct pcb *pcb;
	int i;
	u_int32_t mask1, mask2;

	if (p == NULL) {
		load_dr0(dbregs->dr0);
		load_dr1(dbregs->dr1);
		load_dr2(dbregs->dr2);
		load_dr3(dbregs->dr3);
		load_dr4(dbregs->dr4);
		load_dr5(dbregs->dr5);
		load_dr6(dbregs->dr6);
		load_dr7(dbregs->dr7);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * result in undefined behaviour and can lead to an unexpected
		 * TRCTRAP.
		 *
		 * The loop walks the eight 2-bit fields in bits 16..31 of
		 * dr7 and rejects any field whose value is binary 10.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8;
		     i++, mask1 <<= 2, mask2 <<= 2)
			if ((dbregs->dr7 & mask1) == mask2)
				return (EINVAL);

		pcb = &p->p_addr->u_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space, unless, perhaps, we were called by
		 * uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (suser(p) != 0) {
			if (dbregs->dr7 & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr0 >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr7 & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr1 >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr7 & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr2 >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr7 & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr3 >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}
		}

		/* dr4 and dr5 are not saved in the PCB. */
		pcb->pcb_dr0 = dbregs->dr0;
		pcb->pcb_dr1 = dbregs->dr1;
		pcb->pcb_dr2 = dbregs->dr2;
		pcb->pcb_dr3 = dbregs->dr3;
		pcb->pcb_dr6 = dbregs->dr6;
		pcb->pcb_dr7 = dbregs->dr7;

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space. Return 0, otherwise.
*/ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i=0; i /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct bio *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->bio_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->bio_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->bio_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->bio_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->bio_blkno + p->p_offset <= DOSBBSECTOR && (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #endif /* beyond partition? 
*/ if (bp->bio_blkno < 0 || bp->bio_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->bio_blkno == maxsz) { bp->bio_resid = bp->bio_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->bio_blkno; if (sz <= 0) { bp->bio_error = EINVAL; goto bad; } bp->bio_bcount = sz << DEV_BSHIFT; } bp->bio_pblkno = bp->bio_blkno + p->p_offset; return(1); bad: bp->bio_flags |= BIO_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: head/sys/i386/i386/mp_machdep.c =================================================================== --- head/sys/i386/i386/mp_machdep.c (revision 82308) +++ head/sys/i386/i386/mp_machdep.c (revision 82309) @@ -1,2440 +1,2442 @@ /* * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "opt_cpu.h" +#include "opt_upages.h" #ifdef SMP #include #else #error #endif #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** TEST_DEFAULT_CONFIG, TEST_TEST1 */ #include #include #include +#include #if defined(APIC_IO) #include /* setidt() */ #include /* IPIs */ #include /* IPIs */ #endif /* APIC_IO */ #if defined(TEST_DEFAULT_CONFIG) #define MPFPS_MPFB1 TEST_DEFAULT_CONFIG #else #define MPFPS_MPFB1 mpfps->mpfb1 #endif /* TEST_DEFAULT_CONFIG */ #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #ifdef PC98 #define BIOS_BASE (0xe8000) #define BIOS_SIZE (0x18000) #else #define BIOS_BASE (0xf0000) #define BIOS_SIZE (0x10000) #endif #define BIOS_COUNT (BIOS_SIZE/4) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) #define PROCENTRY_FLAG_EN 0x01 #define PROCENTRY_FLAG_BP 0x02 #define IOAPICENTRY_FLAG_EN 0x01 /* MP Floating Pointer Structure */ typedef struct MPFPS { char signature[4]; void *pap; u_char length; u_char spec_rev; u_char checksum; u_char mpfb1; u_char mpfb2; u_char mpfb3; u_char mpfb4; u_char mpfb5; } *mpfps_t; /* MP Configuration Table Header */ typedef struct MPCTH { char signature[4]; u_short base_table_length; u_char spec_rev; u_char checksum; u_char oem_id[8]; u_char product_id[12]; void *oem_table_pointer; u_short oem_table_size; u_short entry_count; void *apic_address; u_short extended_table_length; u_char extended_table_checksum; u_char reserved; } *mpcth_t; typedef struct PROCENTRY { u_char type; u_char apic_id; u_char apic_version; u_char cpu_flags; u_long cpu_signature; u_long feature_flags; u_long reserved1; u_long reserved2; } *proc_entry_ptr; typedef struct 
BUSENTRY { u_char type; u_char bus_id; char bus_type[6]; } *bus_entry_ptr; typedef struct IOAPICENTRY { u_char type; u_char apic_id; u_char apic_version; u_char apic_flags; void *apic_address; } *io_apic_entry_ptr; typedef struct INTENTRY { u_char type; u_char int_type; u_short int_flags; u_char src_bus_id; u_char src_bus_irq; u_char dst_apic_id; u_char dst_apic_int; } *int_entry_ptr; /* descriptions of MP basetable entries */ typedef struct BASETABLE_ENTRY { u_char type; u_char length; char name[16]; } basetable_entry; /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) && !defined(PC98) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #endif /* CHECK_POINTS */ /* * Values to send to the POST hardware. */ #define MP_BOOTADDRESS_POST 0x10 #define MP_PROBE_POST 0x11 #define MPTABLE_PASS1_POST 0x12 #define MP_START_POST 0x13 #define MP_ENABLE_POST 0x14 #define MPTABLE_PASS2_POST 0x15 #define START_ALL_APS_POST 0x16 #define INSTALL_AP_TRAMP_POST 0x17 #define START_AP_POST 0x18 #define MP_ANNOUNCE_POST 0x19 /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; /** XXX FIXME: what system files declare these??? 
*/ extern struct region_descriptor r_gdt, r_idt; int bsp_apic_ready = 0; /* flags useability of BSP apic */ int mp_naps; /* # of Applications processors */ int mp_nbusses; /* # of busses */ int mp_napics; /* # of IO APICs */ int boot_cpu_id; /* designated BSP */ vm_offset_t cpu_apic_address; vm_offset_t io_apic_address[NAPICID]; /* NAPICID is more than enough */ extern int nkpt; u_int32_t cpu_apic_versions[MAXCPU]; u_int32_t *io_apic_versions; #ifdef APIC_INTR_REORDER struct { volatile int *location; int bit; } apic_isrbit_location[32]; #endif struct apic_intmapinfo int_to_apicintpin[APIC_INTMAPSIZE]; /* * APIC ID logical/physical mapping structures. * We oversize these to simplify boot-time config. */ int cpu_num_to_apic_id[NAPICID]; int io_num_to_apic_id[NAPICID]; int apic_id_to_logical[NAPICID]; /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; static int bootAP; /* Hotwire a 0->4MB V==P mapping */ extern pt_entry_t *KPTphys; /* SMP page table page */ extern pt_entry_t *SMPpt; struct pcb stoppcbs[MAXCPU]; int invltlb_ok = 0; /* throttle smp_invltlb() till safe */ SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, ""); /* * Local data and functions. */ /* Set to 1 once we're ready to let the APs out of the pen. 
*/
static volatile int aps_ready = 0;

static int	mp_capable;	/* set by i386_mp_probe(): MP table found */
static u_int	boot_address;	/* computed by mp_bootaddress() */
static u_int	base_memory;	/* base memory size in bytes */

static int	picmode;		/* 0: virtual wire mode, 1: PIC mode */
static mpfps_t	mpfps;		/* MP floating pointer structure, or 0 */
static int	search_for_sig(u_int32_t target, int count);
static void	mp_enable(u_int boot_addr);

static void	mptable_pass1(void);
static int	mptable_pass2(void);
static void	default_mp_table(int type);
static void	fix_mp_table(void);
static void	setup_apic_irq_mapping(void);
static void	init_locks(void);
static int	start_all_aps(u_int boot_addr);
static void	install_ap_tramp(u_int boot_addr);
static int	start_ap(int logicalCpu, u_int boot_addr);
void		ap_init(void);
static int	apic_int_is_bus_type(int intr, int bus_type);
static void	release_aps(void *dummy);

/*
 * initialize all the SMP locks
 */

/* critical region around IO APIC, apic_imen */
struct mtx		imen_mtx;

/* lock region used by kernel profiling */
int			mcount_lock;

#ifdef USE_COMLOCK
/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct mtx		com_mtx;
#endif /* USE_COMLOCK */

/*
 * Initialize the SMP spin mutexes.  Only com_mtx is set up here, and
 * only when USE_COMLOCK is defined; otherwise this is a no-op.
 */
static void
init_locks(void)
{
#ifdef USE_COMLOCK
	mtx_init(&com_mtx, "com", MTX_SPIN);
#endif /* USE_COMLOCK */
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 *
 * basemem is the base memory size in kilobytes.  The size in bytes is
 * recorded in base_memory, and the returned address (also stored in
 * boot_address) is the highest 4k-aligned address below it that leaves
 * at least bootMP_size bytes before the top of base memory.
 */
u_int
mp_bootaddress(u_int basemem)
{
	POSTCODE(MP_BOOTADDRESS_POST);

	base_memory = basemem * 1024;	/* convert to bytes */

	boot_address = base_memory & ~0xfff;	/* round down to 4k boundary */
	if ((base_memory - boot_address) < bootMP_size)
		boot_address -= 4096;	/* not enough, lower by 4k */

	return boot_address;
}

/*
 * Look for an Intel MP spec table (ie, SMP capable hardware).
*/
void
i386_mp_probe(void)
{
	int     x;
	u_long  segment;
	u_int32_t target;

	POSTCODE(MP_PROBE_POST);

	/*
	 * Search order: the first 1K of the EBDA if the BIOS data area
	 * word at 0x40e gives a segment for it, otherwise the last 1K of
	 * base memory; then, in either case, the BIOS ROM region.
	 */

	/* see if EBDA exists */
	if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) {
		/* search first 1K of EBDA */
		target = (u_int32_t) (segment << 4);
		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
			goto found;
	} else {
		/* last 1K of base memory, effective 'top of base' passed in */
		target = (u_int32_t) (base_memory - 0x400);
		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
			goto found;
	}

	/* search the BIOS */
	target = (u_int32_t) BIOS_BASE;
	if ((x = search_for_sig(target, BIOS_COUNT)) >= 0)
		goto found;

	/* nothing found */
	mpfps = (mpfps_t)0;
	mp_capable = 0;
	return;

found:
	/* calculate needed resources */
	mpfps = (mpfps_t)x;
	mptable_pass1();

	/* flag fact that we are running multiple processors */
	mp_capable = 1;
}

/*
 * Report whether MP hardware was found by i386_mp_probe().
 * Also sets all_cpus to 1, i.e. only the BSP is recorded so far.
 */
int
cpu_mp_probe(void)
{
	/*
	 * Record BSP in CPU map
	 * This is done here so that MBUF init code works correctly.
	 */
	all_cpus = 1;

	return (mp_capable);
}

/*
 * Initialize the SMP hardware and the APIC and start up the AP's.
 * Panics if no MP hardware was detected.
 */
void
cpu_mp_start(void)
{
	POSTCODE(MP_START_POST);

	/* look for MP capable motherboard */
	if (mp_capable)
		mp_enable(boot_address);
	else
		panic("MP hardware not found!");

	cpu_setregs();
}

/*
 * Print various information about the SMP system hardware and setup.
*/
void
cpu_mp_announce(void)
{
	int     x;

	POSTCODE(MP_ANNOUNCE_POST);

	/* The BSP is logical cpu 0; the APs are logical cpus 1..mp_naps. */
	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
	printf(", version: 0x%08x", cpu_apic_versions[0]);
	printf(", at 0x%08x\n", cpu_apic_address);
	for (x = 1; x <= mp_naps; ++x) {
		printf(" cpu%d (AP): apic id: %2d", x, CPU_TO_ID(x));
		printf(", version: 0x%08x", cpu_apic_versions[x]);
		printf(", at 0x%08x\n", cpu_apic_address);
	}

#if defined(APIC_IO)
	for (x = 0; x < mp_napics; ++x) {
		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
		printf(", version: 0x%08x", io_apic_versions[x]);
		printf(", at 0x%08x\n", io_apic_address[x]);
	}
#else
	printf(" Warning: APIC I/O disabled\n");
#endif	/* APIC_IO */
}

/*
 * AP cpu's call this to sync up protected mode.
 *
 * The AP's index is taken from bootAP.  The function builds this CPU's
 * slice of the GDT (NGDT descriptors starting at gdt[myid * NGDT]) with
 * the private-space and TSS segments based in SMP_prvspace[myid], loads
 * the GDT, IDT and default LDT, fills in the per-CPU TSS fields, loads
 * the task register, and finally calls pmap_set_opt().
 */
void
init_secondary(void)
{
	int	gsel_tss;
	int	x, myid = bootAP;

	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &SMP_prvspace[myid].globaldata.gd_common_tss;
	SMP_prvspace[myid].globaldata.gd_prvspace =
		&SMP_prvspace[myid].globaldata;

	for (x = 0; x < NGDT; x++) {
		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int) &gdt[myid * NGDT];
	lgdt(&r_gdt);			/* does magic intra-segment return */

	lidt(&r_idt);

	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	ltr(gsel_tss);

	pmap_set_opt();
}

#if defined(APIC_IO)
/*
 * Final configuration of the BSP's local APIC:
 *  - disable 'pic mode'.
 *  - disable 'virtual wire mode'.
 *  - enable NMI.
*/
void
bsp_apic_configure(void)
{
	u_char		byte;
	u_int32_t	temp;

	/* leave 'pic mode' if necessary */
	if (picmode) {
		outb(0x22, 0x70);	/* select IMCR */
		byte = inb(0x23);	/* current contents */
		byte |= 0x01;		/* mask external INTR */
		outb(0x23, byte);	/* disconnect 8259s/NMI */
	}

	/* mask lint0 (the 8259 'virtual wire' connection) */
	temp = lapic.lvt_lint0;
	temp |= APIC_LVT_M;		/* set the mask */
	lapic.lvt_lint0 = temp;

	/* setup lint1 to handle NMI */
	temp = lapic.lvt_lint1;
	temp &= ~APIC_LVT_M;		/* clear the mask */
	lapic.lvt_lint1 = temp;

	if (bootverbose)
		apic_dump("bsp_apic_configure()");
}
#endif  /* APIC_IO */

/*******************************************************************
 * local functions and data
 */

/*
 * start the SMP system
 *
 * Runs the second MP table pass under a temporary V == P mapping, applies
 * the post-scan fixups, (with APIC_IO) programs the IO APICs and installs
 * the IPI vectors, initializes the SMP locks and finally starts the APs
 * using the trampoline address boot_addr.
 */
static void
mp_enable(u_int boot_addr)
{
	int     x;
#if defined(APIC_IO)
	int     apic;
	u_int   ux;
#endif	/* APIC_IO */

	POSTCODE(MP_ENABLE_POST);

	/* turn on 4MB of V == P addressing so we can get to MP table */
	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
	invltlb();

	/* examine the MP table for needed info, uses physical addresses */
	x = mptable_pass2();

	/* remove the temporary mapping again */
	*(int *)PTD = 0;
	invltlb();

	/* can't process default configs till the CPU APIC is pmapped */
	if (x)
		default_mp_table(x);

	/* post scan cleanup */
	fix_mp_table();
	setup_apic_irq_mapping();

#if defined(APIC_IO)

	/* fill the LOGICAL io_apic_versions table */
	for (apic = 0; apic < mp_napics; ++apic) {
		ux = io_apic_read(apic, IOAPIC_VER);
		io_apic_versions[apic] = ux;
		io_apic_set_id(apic, IO_TO_ID(apic));
	}

	/* program each IO APIC in the system */
	for (apic = 0; apic < mp_napics; ++apic)
		if (io_apic_setup(apic) < 0)
			panic("IO APIC setup failure");

	/* install a 'Spurious INTerrupt' vector */
	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for TLB invalidation */
	setidt(XINVLTLB_OFFSET, Xinvltlb,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding hardclock() */
	setidt(XHARDCLOCK_OFFSET, Xhardclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding statclock() */
	setidt(XSTATCLOCK_OFFSET, Xstatclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for all-CPU rendezvous */
	setidt(XRENDEZVOUS_OFFSET, Xrendezvous,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forcing an additional software trap */
	setidt(XCPUAST_OFFSET, Xcpuast,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for CPU stop/restart */
	setidt(XCPUSTOP_OFFSET, Xcpustop,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

#if defined(TEST_TEST1)
	/* install a "fake hardware INTerrupt" vector */
	setidt(XTEST1_OFFSET, Xtest1,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif  /** TEST_TEST1 */

#endif	/* APIC_IO */

	/* initialize all SMP locks */
	init_locks();

	/* start each Application Processor */
	start_all_aps(boot_addr);
}

/*
 * look for the MP spec signature
 */

/* string defined by the Intel MP Spec as identifying the MP table */
#define MP_SIG		0x5f504d5f	/* _MP_ */
#define NEXT(X)		((X) += 4)

/*
 * Scan `count' 32-bit words starting at physical address `target'
 * (accessed through the KERNBASE mapping) for MP_SIG.  Only every fourth
 * word is examined, i.e. the scan advances in 16-byte steps.  Returns the
 * physical address of the signature, or -1 if it is not found.
 */
static int
search_for_sig(u_int32_t target, int count)
{
	int     x;
	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);

	for (x = 0; x < count; NEXT(x))
		if (addr[x] == MP_SIG)
			/* make array index a byte index */
			return (target + (x * sizeof(u_int32_t)));

	return -1;
}

/* base table entry types, indexed by type: {type, length in bytes, name} */
static basetable_entry basetable_entry_types[] =
{
	{0, 20, "Processor"},
	{1, 8, "Bus"},
	{2, 8, "I/O APIC"},
	{3, 8, "I/O INT"},
	{4, 8, "Local INT"}
};

typedef struct BUSDATA {
	u_char  bus_id;
	enum busTypes bus_type;
}       bus_datum;

typedef struct INTDATA {
	u_char  int_type;
	u_short int_flags;
	u_char  src_bus_id;
	u_char  src_bus_irq;
	u_char  dst_apic_id;
	u_char  dst_apic_int;
	u_char	int_vector;	/* 0xff while no IRQ has been assigned */
}       io_int, local_int;

typedef struct BUSTYPENAME {
	u_char  type;
	char    name[7];
}       bus_type_name;

/* bus type names as they appear in MP table bus entries */
static bus_type_name bus_type_table[] =
{
	{CBUS, "CBUS"},
	{CBUSII, "CBUSII"},
	{EISA, "EISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{ISA, "ISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{PCI, "PCI"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{XPRESS, "XPRESS"},
	{UNKNOWN_BUSTYPE, "---"}
};
/* from MP spec v1.4, table 5-1 */
static int default_data[7][5] =
{
/*   nbus, id0, type0, id1, type1 */
	{1, 0, ISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, MCA, 255, 255},
	{2, 0, ISA, 1, PCI},
	{2, 0, EISA, 1, PCI},
	{2, 0, MCA, 1, PCI}
};


/* the bus data */
static bus_datum *bus_data;

/* the IO INT data, one entry per possible APIC INTerrupt */
static io_int  *io_apic_ints;

static int nintrs;

static int processor_entry	__P((proc_entry_ptr entry, int cpu));
static int bus_entry		__P((bus_entry_ptr entry, int bus));
static int io_apic_entry	__P((io_apic_entry_ptr entry, int apic));
static int int_entry		__P((int_entry_ptr entry, int intr));
static int lookup_bus_type	__P((char *name));


/*
 * 1st pass on motherboard's Intel MP specification table.
* * initializes: * mp_ncpus = 1 * * determines: * cpu_apic_address (common to all CPUs) * io_apic_address[N] * mp_naps * mp_nbusses * mp_napics * nintrs */ static void mptable_pass1(void) { int x; mpcth_t cth; int totalSize; void* position; int count; int type; POSTCODE(MPTABLE_PASS1_POST); /* clear various tables */ for (x = 0; x < NAPICID; ++x) { io_apic_address[x] = ~0; /* IO APIC address table */ } /* init everything to empty */ mp_naps = 0; mp_nbusses = 0; mp_napics = 0; nintrs = 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) { /* use default addresses */ cpu_apic_address = DEFAULT_APIC_BASE; io_apic_address[0] = DEFAULT_IO_APIC_BASE; /* fill in with defaults */ mp_naps = 2; /* includes BSP */ mp_nbusses = default_data[MPFPS_MPFB1 - 1][0]; #if defined(APIC_IO) mp_napics = 1; nintrs = 16; #endif /* APIC_IO */ } else { if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); cpu_apic_address = (vm_offset_t) cth->apic_address; /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; while (count--) { switch (type = *(u_char *) position) { case 0: /* processor_entry */ if (((proc_entry_ptr)position)->cpu_flags & PROCENTRY_FLAG_EN) ++mp_naps; break; case 1: /* bus_entry */ ++mp_nbusses; break; case 2: /* io_apic_entry */ if (((io_apic_entry_ptr)position)->apic_flags & IOAPICENTRY_FLAG_EN) io_apic_address[mp_napics++] = (vm_offset_t)((io_apic_entry_ptr) position)->apic_address; break; case 3: /* int_entry */ ++nintrs; break; case 4: /* int_entry */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char*)position += basetable_entry_types[type].length; } } /* qualify the numbers */ if (mp_naps > MAXCPU) { printf("Warning: only using %d of %d available CPUs!\n", MAXCPU, mp_naps); mp_naps = MAXCPU; } /* * Count the BSP. 
* This is also used as a counter while starting the APs. */ mp_ncpus = 1; --mp_naps; /* subtract the BSP */ } /* * 2nd pass on motherboard's Intel MP specification table. * * sets: * boot_cpu_id * ID_TO_IO(N), phy APIC ID to log CPU/IO table * CPU_TO_ID(N), logical CPU to APIC ID table * IO_TO_ID(N), logical IO to APIC ID table * bus_data[N] * io_apic_ints[N] */ static int mptable_pass2(void) { int x; mpcth_t cth; int totalSize; void* position; int count; int type; int apic, bus, cpu, intr; int i, j; int pgeflag; POSTCODE(MPTABLE_PASS2_POST); pgeflag = 0; /* XXX - Not used under SMP yet. */ MALLOC(io_apic_versions, u_int32_t *, sizeof(u_int32_t) * mp_napics, M_DEVBUF, M_WAITOK); MALLOC(ioapic, volatile ioapic_t **, sizeof(ioapic_t *) * mp_napics, M_DEVBUF, M_WAITOK); MALLOC(io_apic_ints, io_int *, sizeof(io_int) * (nintrs + 1), M_DEVBUF, M_WAITOK); MALLOC(bus_data, bus_datum *, sizeof(bus_datum) * mp_nbusses, M_DEVBUF, M_WAITOK); bzero(ioapic, sizeof(ioapic_t *) * mp_napics); for (i = 0; i < mp_napics; i++) { for (j = 0; j < mp_napics; j++) { /* same page frame as a previous IO apic? 
*/ if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == (io_apic_address[i] & PG_FRAME)) { ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace + (NPTEPG-2-j) * PAGE_SIZE + (io_apic_address[i] & PAGE_MASK)); break; } /* use this slot if available */ if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == 0) { SMPpt[NPTEPG-2-j] = (pt_entry_t)(PG_V | PG_RW | pgeflag | (io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace + (NPTEPG-2-j) * PAGE_SIZE + (io_apic_address[i] & PAGE_MASK)); break; } } } /* clear various tables */ for (x = 0; x < NAPICID; ++x) { ID_TO_IO(x) = -1; /* phy APIC ID to log CPU/IO table */ CPU_TO_ID(x) = -1; /* logical CPU to APIC ID table */ IO_TO_ID(x) = -1; /* logical IO to APIC ID table */ } /* clear bus data table */ for (x = 0; x < mp_nbusses; ++x) bus_data[x].bus_id = 0xff; /* clear IO APIC INT table */ for (x = 0; x < (nintrs + 1); ++x) { io_apic_ints[x].int_type = 0xff; io_apic_ints[x].int_vector = 0xff; } /* setup the cpu/apic mapping arrays */ boot_cpu_id = -1; /* record whether PIC or virtual-wire mode */ picmode = (mpfps->mpfb2 & 0x80) ? 
1 : 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) return MPFPS_MPFB1; /* return default configuration type */ if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; apic = bus = intr = 0; cpu = 1; /* pre-count the BSP */ while (count--) { switch (type = *(u_char *) position) { case 0: if (processor_entry(position, cpu)) ++cpu; break; case 1: if (bus_entry(position, bus)) ++bus; break; case 2: if (io_apic_entry(position, apic)) ++apic; break; case 3: if (int_entry(position, intr)) ++intr; break; case 4: /* int_entry(position); */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char *) position += basetable_entry_types[type].length; } if (boot_cpu_id == -1) panic("NO BSP found!"); /* report fact that its NOT a default configuration */ return 0; } void assign_apic_irq(int apic, int intpin, int irq) { int x; if (int_to_apicintpin[irq].ioapic != -1) panic("assign_apic_irq: inconsistent table"); int_to_apicintpin[irq].ioapic = apic; int_to_apicintpin[irq].int_pin = intpin; int_to_apicintpin[irq].apic_address = ioapic[apic]; int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) && io_apic_ints[x].dst_apic_int == intpin) io_apic_ints[x].int_vector = irq; } } void revoke_apic_irq(int irq) { int x; int oldapic; int oldintpin; if (int_to_apicintpin[irq].ioapic == -1) panic("assign_apic_irq: inconsistent table"); oldapic = int_to_apicintpin[irq].ioapic; oldintpin = int_to_apicintpin[irq].int_pin; int_to_apicintpin[irq].ioapic = -1; int_to_apicintpin[irq].int_pin = 0; 
int_to_apicintpin[irq].apic_address = NULL; int_to_apicintpin[irq].redirindex = 0; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(oldapic) && io_apic_ints[x].dst_apic_int == oldintpin) io_apic_ints[x].int_vector = 0xff; } } static void allocate_apic_irq(int intr) { int apic; int intpin; int irq; if (io_apic_ints[intr].int_vector != 0xff) return; /* Interrupt handler already assigned */ if (io_apic_ints[intr].int_type != 0 && (io_apic_ints[intr].int_type != 3 || (io_apic_ints[intr].dst_apic_id == IO_TO_ID(0) && io_apic_ints[intr].dst_apic_int == 0))) return; /* Not INT or ExtInt on != (0, 0) */ irq = 0; while (irq < APIC_INTMAPSIZE && int_to_apicintpin[irq].ioapic != -1) irq++; if (irq >= APIC_INTMAPSIZE) return; /* No free interrupt handlers */ apic = ID_TO_IO(io_apic_ints[intr].dst_apic_id); intpin = io_apic_ints[intr].dst_apic_int; assign_apic_irq(apic, intpin, irq); io_apic_setup_intpin(apic, intpin); } static void swap_apic_id(int apic, int oldid, int newid) { int x; int oapic; if (oldid == newid) return; /* Nothing to do */ printf("Changing APIC ID for IO APIC #%d from %d to %d in MP table\n", apic, oldid, newid); /* Swap physical APIC IDs in interrupt entries */ for (x = 0; x < nintrs; x++) { if (io_apic_ints[x].dst_apic_id == oldid) io_apic_ints[x].dst_apic_id = newid; else if (io_apic_ints[x].dst_apic_id == newid) io_apic_ints[x].dst_apic_id = oldid; } /* Swap physical APIC IDs in IO_TO_ID mappings */ for (oapic = 0; oapic < mp_napics; oapic++) if (IO_TO_ID(oapic) == newid) break; if (oapic < mp_napics) { printf("Changing APIC ID for IO APIC #%d from " "%d to %d in MP table\n", oapic, newid, oldid); IO_TO_ID(oapic) = oldid; } IO_TO_ID(apic) = newid; } static void fix_id_to_io_mapping(void) { int x; for (x = 0; x < NAPICID; x++) ID_TO_IO(x) = -1; for (x = 0; x <= mp_naps; x++) if (CPU_TO_ID(x) < NAPICID) ID_TO_IO(CPU_TO_ID(x)) 
= x; for (x = 0; x < mp_napics; x++) if (IO_TO_ID(x) < NAPICID) ID_TO_IO(IO_TO_ID(x)) = x; } static int first_free_apic_id(void) { int freeid, x; for (freeid = 0; freeid < NAPICID; freeid++) { for (x = 0; x <= mp_naps; x++) if (CPU_TO_ID(x) == freeid) break; if (x <= mp_naps) continue; for (x = 0; x < mp_napics; x++) if (IO_TO_ID(x) == freeid) break; if (x < mp_napics) continue; return freeid; } return freeid; } static int io_apic_id_acceptable(int apic, int id) { int cpu; /* Logical CPU number */ int oapic; /* Logical IO APIC number for other IO APIC */ if (id >= NAPICID) return 0; /* Out of range */ for (cpu = 0; cpu <= mp_naps; cpu++) if (CPU_TO_ID(cpu) == id) return 0; /* Conflict with CPU */ for (oapic = 0; oapic < mp_napics && oapic < apic; oapic++) if (IO_TO_ID(oapic) == id) return 0; /* Conflict with other APIC */ return 1; /* ID is acceptable for IO APIC */ } /* * parse an Intel MP specification table */ static void fix_mp_table(void) { int x; int id; int bus_0 = 0; /* Stop GCC warning */ int bus_pci = 0; /* Stop GCC warning */ int num_pci_bus; int apic; /* IO APIC unit number */ int freeid; /* Free physical APIC ID */ int physid; /* Current physical IO APIC ID */ /* * Fix mis-numbering of the PCI bus and its INT entries if the BIOS * did it wrong. The MP spec says that when more than 1 PCI bus * exists the BIOS must begin with bus entries for the PCI bus and use * actual PCI bus numbering. This implies that when only 1 PCI bus * exists the BIOS can choose to ignore this ordering, and indeed many * MP motherboards do ignore it. This causes a problem when the PCI * sub-system makes requests of the MP sub-system based on PCI bus * numbers. So here we look for the situation and renumber the * busses and associated INTs in an effort to "make it right". 
*/

	/* find bus 0, PCI bus, count the number of PCI busses */
	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
		if (bus_data[x].bus_id == 0) {
			bus_0 = x;
		}
		if (bus_data[x].bus_type == PCI) {
			++num_pci_bus;
			bus_pci = x;
		}
	}
	/*
	 * bus_0 == slot of bus with ID of 0
	 * bus_pci == slot of last PCI bus encountered
	 */

	/* check the 1 PCI bus case for sanity */
	/* if it is number 0 all is well */
	if (num_pci_bus == 1 &&
	    bus_data[bus_pci].bus_id != 0) {

		/* mis-numbered, swap with whichever bus uses slot 0 */

		/* swap the bus entry types */
		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
		bus_data[bus_0].bus_type = PCI;

		/* swap each relevant INTerrupt entry */
		id = bus_data[bus_pci].bus_id;
		for (x = 0; x < nintrs; ++x) {
			if (io_apic_ints[x].src_bus_id == id) {
				io_apic_ints[x].src_bus_id = 0;
			}
			else if (io_apic_ints[x].src_bus_id == 0) {
				io_apic_ints[x].src_bus_id = id;
			}
		}
	}

	/* Assign IO APIC IDs.
	 *
	 * First try the existing ID. If a conflict is detected, try
	 * the ID in the MP table.  If a conflict is still detected, find
	 * a free id.
	 *
	 * We cannot use the ID_TO_IO table before all conflicts have been
	 * resolved and the table has been corrected.
	 */
	for (apic = 0; apic < mp_napics; ++apic) { /* For all IO APICs */

		/* First try to use the value set by the BIOS */
		physid = io_apic_get_id(apic);
		if (io_apic_id_acceptable(apic, physid)) {
			if (IO_TO_ID(apic) != physid)
				swap_apic_id(apic, IO_TO_ID(apic), physid);
			continue;
		}

		/* Then check if the value in the MP table is acceptable */
		if (io_apic_id_acceptable(apic, IO_TO_ID(apic)))
			continue;

		/* Last resort, find a free APIC ID and use it */
		freeid = first_free_apic_id();
		if (freeid >= NAPICID)
			panic("No free physical APIC IDs found");

		if (io_apic_id_acceptable(apic, freeid)) {
			swap_apic_id(apic, IO_TO_ID(apic), freeid);
			continue;
		}
		panic("Free physical APIC ID not usable");
	}
	fix_id_to_io_mapping();

	/* detect and fix broken Compaq MP table */
	if (apic_int_type(0, 0) == -1) {
		printf("APIC_IO: MP table broken: 8259->APIC entry missing!\n");
		/* append a synthetic ExtInt entry at index nintrs */
		io_apic_ints[nintrs].int_type = 3;	/* ExtInt */
		io_apic_ints[nintrs].int_vector = 0xff;	/* Unassigned */
		/* XXX fixme, set src bus id etc, but it doesn't seem to hurt */
		io_apic_ints[nintrs].dst_apic_id = IO_TO_ID(0);
		io_apic_ints[nintrs].dst_apic_int = 0;	/* Pin 0 */
		nintrs++;
	}
}


/*
 * Assign low level interrupt handlers
 *
 * Resets int_to_apicintpin[], then binds each ISA/EISA INT entry to the
 * logical irq equal to its source bus irq, and finally binds irq 0 to
 * pin 0 of IO APIC 0 for an ExtInt entry if irq 0 is still unbound.
 */
static void
setup_apic_irq_mapping(void)
{
	int	x;
	int	int_vector;

	/* Clear array */
	for (x = 0; x < APIC_INTMAPSIZE; x++) {
		int_to_apicintpin[x].ioapic = -1;
		int_to_apicintpin[x].int_pin = 0;
		int_to_apicintpin[x].apic_address = NULL;
		int_to_apicintpin[x].redirindex = 0;
	}

	/* First assign ISA/EISA interrupts */
	for (x = 0; x < nintrs; x++) {
		int_vector = io_apic_ints[x].src_bus_irq;
		if (int_vector < APIC_INTMAPSIZE &&
		    io_apic_ints[x].int_vector == 0xff &&
		    int_to_apicintpin[int_vector].ioapic == -1 &&
		    (apic_int_is_bus_type(x, ISA) ||
		     apic_int_is_bus_type(x, EISA)) &&
		    io_apic_ints[x].int_type == 0) {
			assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
					io_apic_ints[x].dst_apic_int,
					int_vector);
		}
	}

	/* Assign ExtInt entry if no ISA/EISA interrupt 0 entry */
	for (x = 0; x < nintrs; x++) {
		if (io_apic_ints[x].dst_apic_int == 0 &&
		    io_apic_ints[x].dst_apic_id == IO_TO_ID(0) &&
		    io_apic_ints[x].int_vector == 0xff &&
		    int_to_apicintpin[0].ioapic == -1 &&
		    io_apic_ints[x].int_type == 3) {
			assign_apic_irq(0, 0, 0);
			break;
		}
	}
	/* PCI interrupt assignment is deferred */
}


/*
 * Record one MP table processor entry.
 * Disabled entries are ignored.  The entry flagged as BSP becomes logical
 * cpu 0 and sets boot_cpu_id; any other entry becomes logical cpu `cpu'
 * provided cpu < MAXCPU.  Returns 1 only when an AP was recorded, so the
 * caller can advance its cpu counter; panics on an APIC ID >= NAPICID.
 */
static int
processor_entry(proc_entry_ptr entry, int cpu)
{
	/* check for usability */
	if (!(entry->cpu_flags & PROCENTRY_FLAG_EN))
		return 0;

	if(entry->apic_id >= NAPICID)
		panic("CPU APIC ID out of range (0..%d)", NAPICID - 1);
	/* check for BSP flag */
	if (entry->cpu_flags & PROCENTRY_FLAG_BP) {
		boot_cpu_id = entry->apic_id;
		CPU_TO_ID(0) = entry->apic_id;
		ID_TO_CPU(entry->apic_id) = 0;
		return 0;	/* its already been counted */
	}

	/* add another AP to list, if less than max number of CPUs */
	else if (cpu < MAXCPU) {
		CPU_TO_ID(cpu) = entry->apic_id;
		ID_TO_CPU(entry->apic_id) = cpu;
		return 1;
	}

	return 0;
}


/*
 * Record one MP table bus entry in bus_data[bus].
 * The bus type string (at most 6 characters, ended early by a space) is
 * translated with lookup_bus_type(); an unknown name panics.
 * Always returns 1.
 */
static int
bus_entry(bus_entry_ptr entry, int bus)
{
	int     x;
	char    c, name[8];

	/* encode the name into an index */
	for (x = 0; x < 6; ++x) {
		if ((c = entry->bus_type[x]) == ' ')
			break;
		name[x] = c;
	}
	name[x] = '\0';

	if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE)
		panic("unknown bus type: '%s'", name);

	bus_data[bus].bus_id = entry->bus_id;
	bus_data[bus].bus_type = x;

	return 1;
}


/*
 * Record one MP table IO APIC entry as logical IO APIC `apic'.
 * Returns 0 for a disabled entry and 1 otherwise.  The reverse mapping
 * ID_TO_IO is only updated when the physical ID is below NAPICID.
 */
static int
io_apic_entry(io_apic_entry_ptr entry, int apic)
{
	if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN))
		return 0;

	IO_TO_ID(apic) = entry->apic_id;
	if (entry->apic_id < NAPICID)
		ID_TO_IO(entry->apic_id) = apic;

	return 1;
}


/*
 * Translate a bus type name into its bus type code by searching the
 * first MAX_BUSTYPE entries of bus_type_table.  Returns UNKNOWN_BUSTYPE
 * if the name is not found.
 */
static int
lookup_bus_type(char *name)
{
	int     x;

	for (x = 0; x < MAX_BUSTYPE; ++x)
		if (strcmp(bus_type_table[x].name, name) == 0)
			return bus_type_table[x].type;

	return UNKNOWN_BUSTYPE;
}


static int
int_entry(int_entry_ptr entry, int intr)
{
	int apic;

	io_apic_ints[intr].int_type = entry->int_type;
	io_apic_ints[intr].int_flags = entry->int_flags;
	io_apic_ints[intr].src_bus_id = entry->src_bus_id;
	io_apic_ints[intr].src_bus_irq = entry->src_bus_irq;
	if (entry->dst_apic_id == 255) {
		/* This signal goes to
all IO APICS. Select an IO APIC with sufficient number of interrupt pins */ for (apic = 0; apic < mp_napics; apic++) if (((io_apic_read(apic, IOAPIC_VER) & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >= entry->dst_apic_int) break; if (apic < mp_napics) io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic); else io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; } else io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; io_apic_ints[intr].dst_apic_int = entry->dst_apic_int; return 1; } static int apic_int_is_bus_type(int intr, int bus_type) { int bus; for (bus = 0; bus < mp_nbusses; ++bus) if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id) && ((int) bus_data[bus].bus_type == bus_type)) return 1; return 0; } /* * Given a traditional ISA INT mask, return an APIC mask. */ u_int isa_apic_mask(u_int isa_mask) { int isa_irq; int apic_pin; #if defined(SKIP_IRQ15_REDIRECT) if (isa_mask == (1 << 15)) { printf("skipping ISA IRQ15 redirect\n"); return isa_mask; } #endif /* SKIP_IRQ15_REDIRECT */ isa_irq = ffs(isa_mask); /* find its bit position */ if (isa_irq == 0) /* doesn't exist */ return 0; --isa_irq; /* make it zero based */ apic_pin = isa_apic_irq(isa_irq); /* look for APIC connection */ if (apic_pin == -1) return 0; return (1 << apic_pin); /* convert pin# to a mask */ } /* * Determine which APIC pin an ISA/EISA INT is attached to. 
*/ #define INTTYPE(I) (io_apic_ints[(I)].int_type) #define INTPIN(I) (io_apic_ints[(I)].dst_apic_int) #define INTIRQ(I) (io_apic_ints[(I)].int_vector) #define INTAPIC(I) (ID_TO_IO(io_apic_ints[(I)].dst_apic_id)) #define SRCBUSIRQ(I) (io_apic_ints[(I)].src_bus_irq) int isa_apic_irq(int isa_irq) { int intr; for (intr = 0; intr < nintrs; ++intr) { /* check each record */ if (INTTYPE(intr) == 0) { /* standard INT */ if (SRCBUSIRQ(intr) == isa_irq) { if (apic_int_is_bus_type(intr, ISA) || apic_int_is_bus_type(intr, EISA)) { if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* found */ } } } } return -1; /* NOT found */ } /* * Determine which APIC pin a PCI INT is attached to. */ #define SRCBUSID(I) (io_apic_ints[(I)].src_bus_id) #define SRCBUSDEVICE(I) ((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f) #define SRCBUSLINE(I) (io_apic_ints[(I)].src_bus_irq & 0x03) int pci_apic_irq(int pciBus, int pciDevice, int pciInt) { int intr; --pciInt; /* zero based */ for (intr = 0; intr < nintrs; ++intr) /* check each record */ if ((INTTYPE(intr) == 0) /* standard INT */ && (SRCBUSID(intr) == pciBus) && (SRCBUSDEVICE(intr) == pciDevice) && (SRCBUSLINE(intr) == pciInt)) /* a candidate IRQ */ if (apic_int_is_bus_type(intr, PCI)) { if (INTIRQ(intr) == 0xff) allocate_apic_irq(intr); if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* exact match */ } return -1; /* NOT found */ } int next_apic_irq(int irq) { int intr, ointr; int bus, bustype; bus = 0; bustype = 0; for (intr = 0; intr < nintrs; intr++) { if (INTIRQ(intr) != irq || INTTYPE(intr) != 0) continue; bus = SRCBUSID(intr); bustype = apic_bus_type(bus); if (bustype != ISA && bustype != EISA && bustype != PCI) continue; break; } if (intr >= nintrs) { return -1; } for (ointr = intr + 1; ointr < nintrs; ointr++) { if (INTTYPE(ointr) != 0) continue; if (bus != SRCBUSID(ointr)) continue; if (bustype == PCI) { if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr)) continue; if (SRCBUSLINE(intr) != 
SRCBUSLINE(ointr)) continue; } if (bustype == ISA || bustype == EISA) { if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr)) continue; } if (INTPIN(intr) == INTPIN(ointr)) continue; break; } if (ointr >= nintrs) { return -1; } return INTIRQ(ointr); } #undef SRCBUSLINE #undef SRCBUSDEVICE #undef SRCBUSID #undef SRCBUSIRQ #undef INTPIN #undef INTIRQ #undef INTAPIC #undef INTTYPE /* * Reprogram the MB chipset to NOT redirect an ISA INTerrupt. * * XXX FIXME: * Exactly what this means is unclear at this point. It is a solution * for motherboards that redirect the MBIRQ0 pin. Generically a motherboard * could route any of the ISA INTs to upper (>15) IRQ values. But most would * NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an * option. */ int undirect_isa_irq(int rirq) { #if defined(READY) if (bootverbose) printf("Freeing redirected ISA irq %d.\n", rirq); /** FIXME: tickle the MB redirector chip */ return -1; #else if (bootverbose) printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq); return 0; #endif /* READY */ } /* * Reprogram the MB chipset to NOT redirect a PCI INTerrupt */ int undirect_pci_irq(int rirq) { #if defined(READY) if (bootverbose) printf("Freeing redirected PCI irq %d.\n", rirq); /** FIXME: tickle the MB redirector chip */ return -1; #else if (bootverbose) printf("Freeing (NOT implemented) redirected PCI irq %d.\n", rirq); return 0; #endif /* READY */ } /* * given a bus ID, return: * the bus type if found * -1 if NOT found */ int apic_bus_type(int id) { int x; for (x = 0; x < mp_nbusses; ++x) if (bus_data[x].bus_id == id) return bus_data[x].bus_type; return -1; } /* * given a LOGICAL APIC# and pin#, return: * the associated src bus ID if found * -1 if NOT found */ int apic_src_bus_id(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].src_bus_id); return -1; 
/* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated src bus IRQ if found * -1 if NOT found */ int apic_src_bus_irq(int apic, int pin) { int x; for (x = 0; x < nintrs; x++) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].src_bus_irq); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated INTerrupt type if found * -1 if NOT found */ int apic_int_type(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_type); return -1; /* NOT found */ } int apic_irq(int apic, int pin) { int x; int res; for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) { res = io_apic_ints[x].int_vector; if (res == 0xff) return -1; if (apic != int_to_apicintpin[res].ioapic) panic("apic_irq: inconsistent table"); if (pin != int_to_apicintpin[res].int_pin) panic("apic_irq inconsistent table (2)"); return res; } return -1; } /* * given a LOGICAL APIC# and pin#, return: * the associated trigger mode if found * -1 if NOT found */ int apic_trigger(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return ((io_apic_ints[x].int_flags >> 2) & 0x03); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated 'active' level if found * -1 if NOT found */ int apic_polarity(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_flags & 0x03); return -1; /* NOT found */ } /* * set 
data according to MP defaults * FIXME: probably not complete yet... */ static void default_mp_table(int type) { int ap_cpu_id; #if defined(APIC_IO) int io_apic_id; int pin; #endif /* APIC_IO */ #if 0 printf(" MP default config type: %d\n", type); switch (type) { case 1: printf(" bus: ISA, APIC: 82489DX\n"); break; case 2: printf(" bus: EISA, APIC: 82489DX\n"); break; case 3: printf(" bus: EISA, APIC: 82489DX\n"); break; case 4: printf(" bus: MCA, APIC: 82489DX\n"); break; case 5: printf(" bus: ISA+PCI, APIC: Integrated\n"); break; case 6: printf(" bus: EISA+PCI, APIC: Integrated\n"); break; case 7: printf(" bus: MCA+PCI, APIC: Integrated\n"); break; default: printf(" future type\n"); break; /* NOTREACHED */ } #endif /* 0 */ boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24; ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0; /* BSP */ CPU_TO_ID(0) = boot_cpu_id; ID_TO_CPU(boot_cpu_id) = 0; /* one and only AP */ CPU_TO_ID(1) = ap_cpu_id; ID_TO_CPU(ap_cpu_id) = 1; #if defined(APIC_IO) /* one and only IO APIC */ io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24; /* * sanity check, refer to MP spec section 3.6.6, last paragraph * necessary as some hardware isn't properly setting up the IO APIC */ #if defined(REALLY_ANAL_IOAPICID_VALUE) if (io_apic_id != 2) { #else if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) { #endif /* REALLY_ANAL_IOAPICID_VALUE */ io_apic_set_id(0, 2); io_apic_id = 2; } IO_TO_ID(0) = io_apic_id; ID_TO_IO(io_apic_id) = 0; #endif /* APIC_IO */ /* fill out bus entries */ switch (type) { case 1: case 2: case 3: case 4: case 5: case 6: case 7: bus_data[0].bus_id = default_data[type - 1][1]; bus_data[0].bus_type = default_data[type - 1][2]; bus_data[1].bus_id = default_data[type - 1][3]; bus_data[1].bus_type = default_data[type - 1][4]; break; /* case 4: case 7: MCA NOT supported */ default: /* illegal/reserved */ panic("BAD default MP config: %d", type); /* NOTREACHED */ } #if defined(APIC_IO) /* general cases from MP v1.4, table 5-2 */ 
for (pin = 0; pin < 16; ++pin) { io_apic_ints[pin].int_type = 0; io_apic_ints[pin].int_flags = 0x05; /* edge/active-hi */ io_apic_ints[pin].src_bus_id = 0; io_apic_ints[pin].src_bus_irq = pin; /* IRQ2 caught below */ io_apic_ints[pin].dst_apic_id = io_apic_id; io_apic_ints[pin].dst_apic_int = pin; /* 1-to-1 */ } /* special cases from MP v1.4, table 5-2 */ if (type == 2) { io_apic_ints[2].int_type = 0xff; /* N/C */ io_apic_ints[13].int_type = 0xff; /* N/C */ #if !defined(APIC_MIXED_MODE) /** FIXME: ??? */ panic("sorry, can't support type 2 default yet"); #endif /* APIC_MIXED_MODE */ } else io_apic_ints[2].src_bus_irq = 0; /* ISA IRQ0 is on APIC INT 2 */ if (type == 7) io_apic_ints[0].int_type = 0xff; /* N/C */ else io_apic_ints[0].int_type = 3; /* vectored 8259 */ #endif /* APIC_IO */ } /* * start each AP in our list */ static int start_all_aps(u_int boot_addr) { int x, i, pg; u_char mpbiosreason; u_long mpbioswarmvec; struct globaldata *gd; char *stack; uintptr_t kptbase; POSTCODE(START_ALL_APS_POST); mtx_init(&ap_boot_mtx, "ap boot", MTX_SPIN); /* initialize BSP's local APIC */ apic_initialize(); bsp_apic_ready = 1; /* install the AP 1st level boot code */ install_ap_tramp(boot_addr); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_long *) WARMBOOT_OFF); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); #endif /* set up temporary P==V mapping for AP boot */ /* XXX this is a hack, we should boot the AP on its own stack/PTD */ kptbase = (uintptr_t)(void *)KPTphys; for (x = 0; x < NKPT; x++) PTD[x] = (pd_entry_t)(PG_V | PG_RW | ((kptbase + x * PAGE_SIZE) & PG_FRAME)); invltlb(); /* start each AP */ for (x = 1; x <= mp_naps; ++x) { /* This is a bit verbose, it will go away soon. 
*/ /* first page of AP's private space */ pg = x * i386_btop(sizeof(struct privatespace)); /* allocate a new private data page */ gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE); /* wire it into the private page table page */ SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd)); /* allocate and set up an idle stack data page */ stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE); for (i = 0; i < UPAGES; i++) SMPpt[pg + 1 + i] = (pt_entry_t) (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); /* prime data page for it to use */ gd->gd_cpuid = x; globaldata_register(gd); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ #endif bootSTK = &SMP_prvspace[x].idlestack[UPAGES*PAGE_SIZE]; bootAP = x; /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(x, boot_addr)) { printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x)); CHECK_PRINT("trace"); /* show checkpoints */ /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } CHECK_PRINT("trace"); /* show checkpoints */ /* record its version info */ cpu_apic_versions[x] = cpu_apic_versions[0]; all_cpus |= (1 << x); /* record AP in CPU map */ } /* build our map of 'other' CPUs */ PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid))); /* fill in our (BSP) APIC version */ cpu_apic_versions[0] = lapic.version; /* restore the warmstart vector */ *(u_long *) WARMBOOT_OFF = mpbioswarmvec; #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); #endif /* * Set up the idle context for the BSP. Similar to above except * that some was done by locore, some by pmap.c and some is implicit * because the BSP is cpu#0 and the page is initially zero, and also * because we can refer to variables by name on the BSP.. 
*/ /* Allocate and setup BSP idle stack */ stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE); for (i = 0; i < UPAGES; i++) SMPpt[1 + i] = (pt_entry_t) (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); for (x = 0; x < NKPT; x++) PTD[x] = 0; pmap_set_opt(); /* number of APs actually started */ return mp_ncpus - 1; } /* * load the 1st level AP boot code into base memory. */ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(u_int boot_addr) { int x; int size = *(int *) ((u_long) & bootMP_size); u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) boot_addr + KERNBASE; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; POSTCODE(INSTALL_AP_TRAMP_POST); for (x = 0; x < size; ++x) *dst++ = *src++; /* * modify addresses in code we just moved to basemem. unfortunately we * need fairly detailed info about mpboot.s for this to work. changes * to mpboot.s might require changes here. 
*/ /* boot code is located in KERNEL space */ dst = (u_char *) boot_addr + KERNBASE; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_addr + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); *dst32 = ((u_int) MPentry - KERNBASE); /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_addr & 0xffff; *dst8 = ((u_int) boot_addr >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_addr & 0xffff; *dst8 = ((u_int) boot_addr >> 16) & 0xff; } /* * this function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It ain't pretty, * but it seems to work. */ static int start_ap(int logical_cpu, u_int boot_addr) { int physical_cpu; int vector; int cpus; u_long icr_lo, icr_hi; POSTCODE(START_AP_POST); /* get the PHYSICAL APIC ID# */ physical_cpu = CPU_TO_ID(logical_cpu); /* calculate the vector */ vector = (boot_addr >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_ncpus; /* * first we do an INIT/RESET IPI this INIT IPI might be run, reseting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. 
*/ /* setup the address for the target AP */ icr_hi = lapic.icr_hi & ~APIC_ID_MASK; icr_hi |= (physical_cpu << 24); lapic.icr_hi = icr_hi; /* do an INIT IPI: assert RESET */ icr_lo = lapic.icr_lo & 0xfff00000; lapic.icr_lo = icr_lo | 0x0000c500; /* wait for pending status end */ while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; /* do an INIT IPI: deassert RESET */ lapic.icr_lo = icr_lo | 0x00008500; /* wait for pending status end */ u_sleep(10000); /* wait ~10mS */ while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ /* do a STARTUP IPI */ lapic.icr_lo = icr_lo | 0x00000600 | vector; while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; u_sleep(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic.icr_lo = icr_lo | 0x00000600 | vector; while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; u_sleep(200); /* wait ~200uS */ /* wait for it to start */ set_apic_timer(5000000);/* == 5 seconds */ while (read_apic_timer()) if (mp_ncpus > cpus) return 1; /* return SUCCESS */ return 0; /* return FAILURE */ } /* * Flush the TLB on all other CPU's * * XXX: Needs to handshake and wait for completion before proceding. 
*/ void smp_invltlb(void) { #if defined(APIC_IO) if (smp_started && invltlb_ok) ipi_all_but_self(IPI_INVLTLB); #endif /* APIC_IO */ } void invlpg(u_int addr) { __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); /* send a message to the other CPUs */ smp_invltlb(); } void invltlb(void) { u_long temp; /* * This should be implemented as load_cr3(rcr3()) when load_cr3() is * inlined. */ __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); /* send a message to the other CPUs */ smp_invltlb(); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ extern void enable_sse(void); void ap_init(void) { u_int apic_id; /* spin until all the AP's are ready */ while (!aps_ready) /* spin */ ; /* * Set curproc to our per-cpu idleproc so that mutexes have * something unique to lock with. */ PCPU_SET(curproc, PCPU_GET(idleproc)); PCPU_SET(spinlocks, NULL); /* lock against other AP's that are waking up */ mtx_lock_spin(&ap_boot_mtx); /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); smp_cpus++; #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid))); printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); /* set up CPU registers and state */ cpu_setregs(); /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); /* set up SSE registers */ enable_sse(); /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); printf("SMP: apic_id = %d\n", apic_id); printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]); panic("cpuid mismatch! 
boom!!"); } /* Init local apic for irq's */ apic_initialize(); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); /* * Activate smp_invltlb, although strictly speaking, this isn't * quite correct yet. We should have a bitfield for cpus willing * to accept TLB flush IPI's or something and sync them. */ if (smp_cpus == mp_ncpus) { invltlb_ok = 1; smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } /* let other AP's wake up now */ mtx_unlock_spin(&ap_boot_mtx); /* wait until all the AP's are up */ while (smp_started == 0) ; /* nothing */ microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* ok, now grab sched_lock and enter the scheduler */ enable_intr(); mtx_lock_spin(&sched_lock); cpu_throw(); /* doesn't return */ panic("scheduler returned us to ap_init"); } /* * For statclock, we send an IPI to all CPU's to have them call this * function. */ void forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); statclock_process(curproc, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_statclock(void) { int map; CTR0(KTR_SMP, "forward_statclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_STATCLOCK); } /* * For each hardclock(), we send an IPI to all other CPU's to have them * execute this function. It would be nice to reduce contention on * sched_lock if we could simply peek at the CPU to determine the user/kernel * state and call hardclock_process() on the CPU receiving the clock interrupt * and then just use a simple IPI to handle any ast's if needed. 
*/ void forwarded_hardclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); hardclock_process(curproc, TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_hardclock(void) { u_int map; CTR0(KTR_SMP, "forward_hardclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_HARDCLOCK); } #ifdef APIC_INTR_REORDER /* * Maintain mapping from softintr vector to isr bit in local apic. */ void set_lapic_isrloc(int intr, int vector) { if (intr < 0 || intr > 32) panic("set_apic_isrloc: bad intr argument: %d",intr); if (vector < ICU_OFFSET || vector > 255) panic("set_apic_isrloc: bad vector argument: %d",vector); apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2); apic_isrbit_location[intr].bit = (1<<(vector & 31)); } #endif /* * send an IPI to a set of cpus. */ void ipi_selected(u_int32_t cpus, u_int ipi) { CTR2(KTR_SMP, __func__ ": cpus: %x ipi: %x", cpus, ipi); selected_apic_ipi(cpus, ipi, APIC_DELMODE_FIXED); } /* * send an IPI INTerrupt containing 'vector' to all CPUs, including myself */ void ipi_all(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLISELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLESELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to myself */ void ipi_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_SELF, ipi, APIC_DELMODE_FIXED); } void release_aps(void *dummy __unused) { atomic_store_rel_int(&aps_ready, 1); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: head/sys/i386/i386/mptable.c =================================================================== --- head/sys/i386/i386/mptable.c (revision 82308) +++ head/sys/i386/i386/mptable.c (revision 82309) @@ -1,2440 +1,2442 @@ /* * Copyright (c) 1996, by Steve Passe 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "opt_cpu.h" +#include "opt_upages.h" #ifdef SMP #include #else #error #endif #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** TEST_DEFAULT_CONFIG, TEST_TEST1 */ #include #include #include +#include #if defined(APIC_IO) #include /* setidt() */ #include /* IPIs */ #include /* IPIs */ #endif /* APIC_IO */ #if defined(TEST_DEFAULT_CONFIG) #define MPFPS_MPFB1 TEST_DEFAULT_CONFIG #else #define MPFPS_MPFB1 mpfps->mpfb1 #endif /* TEST_DEFAULT_CONFIG */ #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #ifdef PC98 #define BIOS_BASE (0xe8000) #define BIOS_SIZE (0x18000) #else #define BIOS_BASE (0xf0000) #define BIOS_SIZE (0x10000) #endif #define BIOS_COUNT (BIOS_SIZE/4) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) #define PROCENTRY_FLAG_EN 0x01 #define PROCENTRY_FLAG_BP 0x02 #define IOAPICENTRY_FLAG_EN 0x01 /* MP Floating Pointer Structure */ typedef struct MPFPS { char signature[4]; void *pap; u_char length; u_char spec_rev; u_char checksum; u_char mpfb1; u_char mpfb2; u_char mpfb3; u_char mpfb4; u_char mpfb5; } *mpfps_t; /* MP Configuration Table Header */ typedef struct MPCTH { char signature[4]; u_short base_table_length; u_char spec_rev; u_char checksum; u_char oem_id[8]; u_char product_id[12]; void *oem_table_pointer; u_short oem_table_size; u_short entry_count; void *apic_address; u_short extended_table_length; u_char extended_table_checksum; u_char reserved; } *mpcth_t; typedef struct PROCENTRY { u_char type; u_char apic_id; u_char apic_version; u_char cpu_flags; u_long cpu_signature; u_long feature_flags; u_long reserved1; u_long reserved2; } *proc_entry_ptr; typedef struct 
BUSENTRY { u_char type; u_char bus_id; char bus_type[6]; } *bus_entry_ptr; typedef struct IOAPICENTRY { u_char type; u_char apic_id; u_char apic_version; u_char apic_flags; void *apic_address; } *io_apic_entry_ptr; typedef struct INTENTRY { u_char type; u_char int_type; u_short int_flags; u_char src_bus_id; u_char src_bus_irq; u_char dst_apic_id; u_char dst_apic_int; } *int_entry_ptr; /* descriptions of MP basetable entries */ typedef struct BASETABLE_ENTRY { u_char type; u_char length; char name[16]; } basetable_entry; /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) && !defined(PC98) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #endif /* CHECK_POINTS */ /* * Values to send to the POST hardware. */ #define MP_BOOTADDRESS_POST 0x10 #define MP_PROBE_POST 0x11 #define MPTABLE_PASS1_POST 0x12 #define MP_START_POST 0x13 #define MP_ENABLE_POST 0x14 #define MPTABLE_PASS2_POST 0x15 #define START_ALL_APS_POST 0x16 #define INSTALL_AP_TRAMP_POST 0x17 #define START_AP_POST 0x18 #define MP_ANNOUNCE_POST 0x19 /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; /** XXX FIXME: what system files declare these??? 
*/
extern struct region_descriptor r_gdt, r_idt;

int bsp_apic_ready = 0;	/* flags usability of the BSP's APIC */
int mp_naps;		/* # of Application Processors (APs) */
int mp_nbusses;		/* # of busses */
int mp_napics;		/* # of IO APICs */
int boot_cpu_id;	/* designated BSP */
vm_offset_t cpu_apic_address;
vm_offset_t io_apic_address[NAPICID];	/* NAPICID is more than enough */
extern int nkpt;

/* Version words, indexed by logical CPU number / logical IO APIC number. */
u_int32_t cpu_apic_versions[MAXCPU];
u_int32_t *io_apic_versions;

#ifdef APIC_INTR_REORDER
struct {
	volatile int *location;
	int bit;
} apic_isrbit_location[32];
#endif

/* Per-IRQ record of the IO APIC and pin that the IRQ is routed through. */
struct apic_intmapinfo int_to_apicintpin[APIC_INTMAPSIZE];

/*
 * APIC ID logical/physical mapping structures.
 * We oversize these to simplify boot-time config.
 */
int cpu_num_to_apic_id[NAPICID];
int io_num_to_apic_id[NAPICID];
int apic_id_to_logical[NAPICID];


/* The APs use this during bootstrap.  Do not make it static. */
char *bootSTK;
static int bootAP;

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* SMP page table page */
extern pt_entry_t *SMPpt;

struct pcb stoppcbs[MAXCPU];

int invltlb_ok = 0;	/* throttle smp_invltlb() till safe */
SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, "");

/*
 * Local data and functions.
 */

/* Set to 1 once we're ready to let the APs out of the pen.
*/ static volatile int aps_ready = 0; static int mp_capable; static u_int boot_address; static u_int base_memory; static int picmode; /* 0: virtual wire mode, 1: PIC mode */ static mpfps_t mpfps; static int search_for_sig(u_int32_t target, int count); static void mp_enable(u_int boot_addr); static void mptable_pass1(void); static int mptable_pass2(void); static void default_mp_table(int type); static void fix_mp_table(void); static void setup_apic_irq_mapping(void); static void init_locks(void); static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); void ap_init(void); static int apic_int_is_bus_type(int intr, int bus_type); static void release_aps(void *dummy); /* * initialize all the SMP locks */ /* critical region around IO APIC, apic_imen */ struct mtx imen_mtx; /* lock region used by kernel profiling */ int mcount_lock; #ifdef USE_COMLOCK /* locks com (tty) data/hardware accesses: a FASTINTR() */ struct mtx com_mtx; #endif /* USE_COMLOCK */ static void init_locks(void) { #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", MTX_SPIN); #endif /* USE_COMLOCK */ } /* * Calculate usable address in base memory for AP trampoline code. */ u_int mp_bootaddress(u_int basemem) { POSTCODE(MP_BOOTADDRESS_POST); base_memory = basemem * 1024; /* convert to bytes */ boot_address = base_memory & ~0xfff; /* round down to 4k boundary */ if ((base_memory - boot_address) < bootMP_size) boot_address -= 4096; /* not enough, lower by 4k */ return boot_address; } /* * Look for an Intel MP spec table (ie, SMP capable hardware). 
*/ void i386_mp_probe(void) { int x; u_long segment; u_int32_t target; POSTCODE(MP_PROBE_POST); /* see if EBDA exists */ if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) { /* search first 1K of EBDA */ target = (u_int32_t) (segment << 4); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } else { /* last 1K of base memory, effective 'top of base' passed in */ target = (u_int32_t) (base_memory - 0x400); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } /* search the BIOS */ target = (u_int32_t) BIOS_BASE; if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) goto found; /* nothing found */ mpfps = (mpfps_t)0; mp_capable = 0; return; found: /* calculate needed resources */ mpfps = (mpfps_t)x; mptable_pass1(); /* flag fact that we are running multiple processors */ mp_capable = 1; } int cpu_mp_probe(void) { /* * Record BSP in CPU map * This is done here so that MBUF init code works correctly. */ all_cpus = 1; return (mp_capable); } /* * Initialize the SMP hardware and the APIC and start up the AP's. */ void cpu_mp_start(void) { POSTCODE(MP_START_POST); /* look for MP capable motherboard */ if (mp_capable) mp_enable(boot_address); else panic("MP hardware not found!"); cpu_setregs(); } /* * Print various information about the SMP system hardware and setup. 
*/
void
cpu_mp_announce(void)
{
	int x;

	POSTCODE(MP_ANNOUNCE_POST);

	/* logical CPU 0 is the BSP */
	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
	printf(", version: 0x%08x", cpu_apic_versions[0]);
	printf(", at 0x%08x\n", cpu_apic_address);

	/* logical CPUs 1..mp_naps are the APs; all share cpu_apic_address */
	for (x = 1; x <= mp_naps; ++x) {
		printf(" cpu%d (AP): apic id: %2d", x, CPU_TO_ID(x));
		printf(", version: 0x%08x", cpu_apic_versions[x]);
		printf(", at 0x%08x\n", cpu_apic_address);
	}

#if defined(APIC_IO)
	/* one line per IO APIC, each with its own address */
	for (x = 0; x < mp_napics; ++x) {
		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
		printf(", version: 0x%08x", io_apic_versions[x]);
		printf(", at 0x%08x\n", io_apic_address[x]);
	}
#else
	printf(" Warning: APIC I/O disabled\n");
#endif	/* APIC_IO */
}

/*
 * AP cpu's call this to sync up protected mode.
 *
 * The CPU's logical number is taken from bootAP.  The routine builds this
 * CPU's slice of the gdt[] array (NGDT descriptors starting at
 * myid * NGDT) from gdt_segs[], loads the GDT, IDT and default LDT, fills
 * in the per-CPU TSS fields and loads the task register.
 */
void
init_secondary(void)
{
	int gsel_tss;
	int x, myid = bootAP;

	/* point the private and TSS segments at this CPU's private space */
	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &SMP_prvspace[myid].globaldata.gd_common_tss;
	SMP_prvspace[myid].globaldata.gd_prvspace =
		&SMP_prvspace[myid].globaldata;

	/* convert the soft descriptors into this CPU's GDT slice */
	for (x = 0; x < NGDT; x++) {
		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int) &gdt[myid * NGDT];
	lgdt(&r_gdt);			/* does magic intra-segment return */

	lidt(&r_idt);

	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* set up the per-CPU TSS and load the task register */
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	ltr(gsel_tss);

	pmap_set_opt();
}


#if defined(APIC_IO)
/*
 * Final configuration of the BSP's local APIC:
 *  - disable 'pic mode'.
 *  - disable 'virtual wire mode'.
 *  - enable NMI.
*/
void
bsp_apic_configure(void)
{
	u_char byte;
	u_int32_t temp;

	/* leave 'pic mode' if necessary */
	if (picmode) {
		outb(0x22, 0x70);	/* select IMCR */
		byte = inb(0x23);	/* current contents */
		byte |= 0x01;		/* mask external INTR */
		outb(0x23, byte);	/* disconnect 8259s/NMI */
	}

	/* mask lint0 (the 8259 'virtual wire' connection) */
	temp = lapic.lvt_lint0;
	temp |= APIC_LVT_M;		/* set the mask */
	lapic.lvt_lint0 = temp;

	/* setup lint1 to handle NMI */
	temp = lapic.lvt_lint1;
	temp &= ~APIC_LVT_M;		/* clear the mask */
	lapic.lvt_lint1 = temp;

	if (bootverbose)
		apic_dump("bsp_apic_configure()");
}
#endif  /* APIC_IO */


/*******************************************************************
 * local functions and data
 */

/*
 * start the SMP system
 *
 * Runs the second MP-table pass under a temporary V == P mapping, applies
 * the table fix-ups, builds the IRQ mapping, programs the IO APICs and
 * installs the IPI vectors (APIC_IO only), then starts the APs at
 * boot_addr.
 */
static void
mp_enable(u_int boot_addr)
{
	int x;
#if defined(APIC_IO)
	int apic;
	u_int ux;
#endif	/* APIC_IO */

	POSTCODE(MP_ENABLE_POST);

	/* turn on 4MB of V == P addressing so we can get to MP table */
	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
	invltlb();

	/* examine the MP table for needed info, uses physical addresses */
	x = mptable_pass2();

	/* drop the temporary mapping again */
	*(int *)PTD = 0;
	invltlb();

	/* can't process default configs till the CPU APIC is pmapped */
	if (x)
		default_mp_table(x);

	/* post scan cleanup */
	fix_mp_table();
	setup_apic_irq_mapping();

#if defined(APIC_IO)

	/* fill the LOGICAL io_apic_versions table */
	for (apic = 0; apic < mp_napics; ++apic) {
		ux = io_apic_read(apic, IOAPIC_VER);
		io_apic_versions[apic] = ux;
		io_apic_set_id(apic, IO_TO_ID(apic));
	}

	/* program each IO APIC in the system */
	for (apic = 0; apic < mp_napics; ++apic)
		if (io_apic_setup(apic) < 0)
			panic("IO APIC setup failure");

	/* install a 'Spurious INTerrupt' vector */
	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for TLB invalidation */
	setidt(XINVLTLB_OFFSET, Xinvltlb,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding hardclock() */
	setidt(XHARDCLOCK_OFFSET, Xhardclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding statclock() */
	setidt(XSTATCLOCK_OFFSET, Xstatclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for all-CPU rendezvous */
	setidt(XRENDEZVOUS_OFFSET, Xrendezvous,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forcing an additional software trap */
	setidt(XCPUAST_OFFSET, Xcpuast,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for CPU stop/restart */
	setidt(XCPUSTOP_OFFSET, Xcpustop,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

#if defined(TEST_TEST1)
	/* install a "fake hardware INTerrupt" vector */
	setidt(XTEST1_OFFSET, Xtest1,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif  /** TEST_TEST1 */

#endif	/* APIC_IO */

	/* initialize all SMP locks */
	init_locks();

	/* start each Application Processor */
	start_all_aps(boot_addr);
}


/*
 * look for the MP spec signature
 */

/* string defined by the Intel MP Spec as identifying the MP table */
#define MP_SIG		0x5f504d5f	/* _MP_ */
#define NEXT(X)		((X) += 4)

/*
 * Scan for MP_SIG starting at physical address target (accessed through
 * KERNBASE).  count is the size of the area in 32-bit words; NEXT()
 * advances the word index by 4, so one word in every 16 bytes is tested.
 *
 * Returns the physical address of the signature, or -1 if none was found.
 */
static int
search_for_sig(u_int32_t target, int count)
{
	int x;
	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);

	for (x = 0; x < count; NEXT(x))
		if (addr[x] == MP_SIG)
			/* make array index a byte index */
			return (target + (x * sizeof(u_int32_t)));

	return -1;
}


/*
 * Base table entry descriptions: {type, length in bytes, name}.
 * The table walkers index this array by the entry's type byte.
 */
static basetable_entry basetable_entry_types[] =
{
	{0, 20, "Processor"},
	{1, 8, "Bus"},
	{2, 8, "I/O APIC"},
	{3, 8, "I/O INT"},
	{4, 8, "Local INT"}
};

/* one recorded bus entry: the ID from the table and the decoded type */
typedef struct BUSDATA {
	u_char bus_id;
	enum busTypes bus_type;
} bus_datum;

/* one recorded interrupt entry; int_vector is 0xff while unassigned */
typedef struct INTDATA {
	u_char int_type;
	u_short int_flags;
	u_char src_bus_id;
	u_char src_bus_irq;
	u_char dst_apic_id;
	u_char dst_apic_int;
	u_char int_vector;
} io_int, local_int;

/* maps a bus type string from the MP table to its bus type value */
typedef struct BUSTYPENAME {
	u_char type;
	char name[7];
} bus_type_name;

static bus_type_name bus_type_table[] =
{
	{CBUS, "CBUS"},
	{CBUSII, "CBUSII"},
	{EISA, "EISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{ISA, "ISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{PCI, "PCI"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{XPRESS, "XPRESS"},
	{UNKNOWN_BUSTYPE, "---"}
};
/* from MP spec v1.4, table 5-1 */
static int default_data[7][5] =
{
/*   nbus, id0, type0, id1, type1 */
	{1, 0, ISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, MCA, 255, 255},
	{2, 0, ISA, 1, PCI},
	{2, 0, EISA, 1, PCI},
	{2, 0, MCA, 1, PCI}
};


/* the bus data */
static bus_datum *bus_data;

/* the IO INT data, one entry per possible APIC INTerrupt */
static io_int *io_apic_ints;

static int nintrs;

static int processor_entry __P((proc_entry_ptr entry, int cpu));
static int bus_entry __P((bus_entry_ptr entry, int bus));
static int io_apic_entry __P((io_apic_entry_ptr entry, int apic));
static int int_entry __P((int_entry_ptr entry, int intr));
static int lookup_bus_type __P((char *name));


/*
 * 1st pass on motherboard's Intel MP specification table.
*
 * initializes:
 *	mp_ncpus = 1
 *
 * determines:
 *	cpu_apic_address (common to all CPUs)
 *	io_apic_address[N]
 *	mp_naps
 *	mp_nbusses
 *	mp_napics
 *	nintrs
 *
 * This pass only counts entries and records APIC addresses; the per-entry
 * data is recorded by mptable_pass2(), which sizes its tables from the
 * counts gathered here.
 */
static void
mptable_pass1(void)
{
	int x;
	mpcth_t cth;
	int totalSize;
	void* position;
	int count;
	int type;

	POSTCODE(MPTABLE_PASS1_POST);

	/* clear various tables */
	for (x = 0; x < NAPICID; ++x) {
		io_apic_address[x] = ~0;	/* IO APIC address table */
	}

	/* init everything to empty */
	mp_naps = 0;
	mp_nbusses = 0;
	mp_napics = 0;
	nintrs = 0;

	/* check for use of 'default' configuration */
	if (MPFPS_MPFB1 != 0) {
		/* use default addresses */
		cpu_apic_address = DEFAULT_APIC_BASE;
		io_apic_address[0] = DEFAULT_IO_APIC_BASE;

		/* fill in with defaults */
		mp_naps = 2;		/* includes BSP */
		mp_nbusses = default_data[MPFPS_MPFB1 - 1][0];
#if defined(APIC_IO)
		mp_napics = 1;
		nintrs = 16;
#endif	/* APIC_IO */
	}
	else {
		if ((cth = mpfps->pap) == 0)
			panic("MP Configuration Table Header MISSING!");

		cpu_apic_address = (vm_offset_t) cth->apic_address;

		/* walk the table, recording info of interest */
		totalSize = cth->base_table_length - sizeof(struct MPCTH);
		position = (u_char *) cth + sizeof(struct MPCTH);
		count = cth->entry_count;

		while (count--) {
			switch (type = *(u_char *) position) {
			case 0: /* processor_entry */
				/* only enabled processors are counted */
				if (((proc_entry_ptr)position)->cpu_flags
					& PROCENTRY_FLAG_EN)
					++mp_naps;
				break;
			case 1: /* bus_entry */
				++mp_nbusses;
				break;
			case 2: /* io_apic_entry */
				/*
				 * NOTE(review): mp_napics is not checked
				 * against NAPICID, the size of
				 * io_apic_address[], before the store.
				 */
				if (((io_apic_entry_ptr)position)->apic_flags
					& IOAPICENTRY_FLAG_EN)
					io_apic_address[mp_napics++] =
					    (vm_offset_t)((io_apic_entry_ptr)
						position)->apic_address;
				break;
			case 3: /* int_entry */
				++nintrs;
				break;
			case 4:	/* local int_entry, not counted */
				break;
			default:
				panic("mpfps Base Table HOSED!");
				/* NOTREACHED */
			}

			/*
			 * Step over the entry using its per-type length.
			 * The cast on the left-hand side relies on the
			 * compiler's cast-as-lvalue extension.
			 */
			totalSize -= basetable_entry_types[type].length;
			(u_char*)position += basetable_entry_types[type].length;
		}
	}

	/* qualify the numbers */
	if (mp_naps > MAXCPU) {
		printf("Warning: only using %d of %d available CPUs!\n",
			MAXCPU, mp_naps);
		mp_naps = MAXCPU;
	}

	/*
	 * Count the BSP.
	 * This is also used as a counter while starting the APs.
	 */
	mp_ncpus = 1;

	--mp_naps;	/* subtract the BSP */
}


/*
 * 2nd pass on motherboard's Intel MP specification table.
 *
 * sets:
 *	boot_cpu_id
 *	ID_TO_IO(N), phy APIC ID to log CPU/IO table
 *	CPU_TO_ID(N), logical CPU to APIC ID table
 *	IO_TO_ID(N), logical IO to APIC ID table
 *	bus_data[N]
 *	io_apic_ints[N]
 *
 * returns:
 *	the default configuration type when one is in use, 0 otherwise
 */
static int
mptable_pass2(void)
{
	int x;
	mpcth_t cth;
	int totalSize;
	void* position;
	int count;
	int type;
	int apic, bus, cpu, intr;
	int i, j;
	int pgeflag;

	POSTCODE(MPTABLE_PASS2_POST);

	pgeflag = 0;		/* XXX - Not used under SMP yet.  */

	/*
	 * Allocate the tables sized by pass 1.  io_apic_ints gets one
	 * spare slot beyond nintrs; fix_mp_table() may append an entry.
	 */
	MALLOC(io_apic_versions, u_int32_t *, sizeof(u_int32_t) * mp_napics,
	    M_DEVBUF, M_WAITOK);
	MALLOC(ioapic, volatile ioapic_t **, sizeof(ioapic_t *) * mp_napics,
	    M_DEVBUF, M_WAITOK);
	MALLOC(io_apic_ints, io_int *, sizeof(io_int) * (nintrs + 1),
	    M_DEVBUF, M_WAITOK);
	MALLOC(bus_data, bus_datum *, sizeof(bus_datum) * mp_nbusses,
	    M_DEVBUF, M_WAITOK);

	bzero(ioapic, sizeof(ioapic_t *) * mp_napics);

	/*
	 * Give each IO APIC a virtual address.  Slots are taken from the
	 * SMP page table counting down from NPTEPG-2; an IO APIC whose page
	 * frame is already mapped shares that slot.
	 */
	for (i = 0; i < mp_napics; i++) {
		for (j = 0; j < mp_napics; j++) {
			/* same page frame as a previous IO apic? */
			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) ==
			    (io_apic_address[i] & PG_FRAME)) {
				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
					+ (NPTEPG-2-j) * PAGE_SIZE
					+ (io_apic_address[i] & PAGE_MASK));
				break;
			}
			/* use this slot if available */
			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == 0) {
				SMPpt[NPTEPG-2-j] = (pt_entry_t)(PG_V | PG_RW |
				    pgeflag | (io_apic_address[i] & PG_FRAME));
				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
					+ (NPTEPG-2-j) * PAGE_SIZE
					+ (io_apic_address[i] & PAGE_MASK));
				break;
			}
		}
	}

	/* clear various tables */
	for (x = 0; x < NAPICID; ++x) {
		ID_TO_IO(x) = -1;	/* phy APIC ID to log CPU/IO table */
		CPU_TO_ID(x) = -1;	/* logical CPU to APIC ID table */
		IO_TO_ID(x) = -1;	/* logical IO to APIC ID table */
	}

	/* clear bus data table */
	for (x = 0; x < mp_nbusses; ++x)
		bus_data[x].bus_id = 0xff;

	/* clear IO APIC INT table */
	for (x = 0; x < (nintrs + 1); ++x) {
		io_apic_ints[x].int_type = 0xff;
		io_apic_ints[x].int_vector = 0xff;
	}

	/* setup the cpu/apic mapping arrays */
	boot_cpu_id = -1;

	/* record whether PIC or virtual-wire mode */
	picmode = (mpfps->mpfb2 & 0x80) ?
1 : 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) return MPFPS_MPFB1; /* return default configuration type */ if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; apic = bus = intr = 0; cpu = 1; /* pre-count the BSP */ while (count--) { switch (type = *(u_char *) position) { case 0: if (processor_entry(position, cpu)) ++cpu; break; case 1: if (bus_entry(position, bus)) ++bus; break; case 2: if (io_apic_entry(position, apic)) ++apic; break; case 3: if (int_entry(position, intr)) ++intr; break; case 4: /* int_entry(position); */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char *) position += basetable_entry_types[type].length; } if (boot_cpu_id == -1) panic("NO BSP found!"); /* report fact that its NOT a default configuration */ return 0; } void assign_apic_irq(int apic, int intpin, int irq) { int x; if (int_to_apicintpin[irq].ioapic != -1) panic("assign_apic_irq: inconsistent table"); int_to_apicintpin[irq].ioapic = apic; int_to_apicintpin[irq].int_pin = intpin; int_to_apicintpin[irq].apic_address = ioapic[apic]; int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) && io_apic_ints[x].dst_apic_int == intpin) io_apic_ints[x].int_vector = irq; } } void revoke_apic_irq(int irq) { int x; int oldapic; int oldintpin; if (int_to_apicintpin[irq].ioapic == -1) panic("assign_apic_irq: inconsistent table"); oldapic = int_to_apicintpin[irq].ioapic; oldintpin = int_to_apicintpin[irq].int_pin; int_to_apicintpin[irq].ioapic = -1; int_to_apicintpin[irq].int_pin = 0; 
int_to_apicintpin[irq].apic_address = NULL; int_to_apicintpin[irq].redirindex = 0; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(oldapic) && io_apic_ints[x].dst_apic_int == oldintpin) io_apic_ints[x].int_vector = 0xff; } } static void allocate_apic_irq(int intr) { int apic; int intpin; int irq; if (io_apic_ints[intr].int_vector != 0xff) return; /* Interrupt handler already assigned */ if (io_apic_ints[intr].int_type != 0 && (io_apic_ints[intr].int_type != 3 || (io_apic_ints[intr].dst_apic_id == IO_TO_ID(0) && io_apic_ints[intr].dst_apic_int == 0))) return; /* Not INT or ExtInt on != (0, 0) */ irq = 0; while (irq < APIC_INTMAPSIZE && int_to_apicintpin[irq].ioapic != -1) irq++; if (irq >= APIC_INTMAPSIZE) return; /* No free interrupt handlers */ apic = ID_TO_IO(io_apic_ints[intr].dst_apic_id); intpin = io_apic_ints[intr].dst_apic_int; assign_apic_irq(apic, intpin, irq); io_apic_setup_intpin(apic, intpin); } static void swap_apic_id(int apic, int oldid, int newid) { int x; int oapic; if (oldid == newid) return; /* Nothing to do */ printf("Changing APIC ID for IO APIC #%d from %d to %d in MP table\n", apic, oldid, newid); /* Swap physical APIC IDs in interrupt entries */ for (x = 0; x < nintrs; x++) { if (io_apic_ints[x].dst_apic_id == oldid) io_apic_ints[x].dst_apic_id = newid; else if (io_apic_ints[x].dst_apic_id == newid) io_apic_ints[x].dst_apic_id = oldid; } /* Swap physical APIC IDs in IO_TO_ID mappings */ for (oapic = 0; oapic < mp_napics; oapic++) if (IO_TO_ID(oapic) == newid) break; if (oapic < mp_napics) { printf("Changing APIC ID for IO APIC #%d from " "%d to %d in MP table\n", oapic, newid, oldid); IO_TO_ID(oapic) = oldid; } IO_TO_ID(apic) = newid; } static void fix_id_to_io_mapping(void) { int x; for (x = 0; x < NAPICID; x++) ID_TO_IO(x) = -1; for (x = 0; x <= mp_naps; x++) if (CPU_TO_ID(x) < NAPICID) ID_TO_IO(CPU_TO_ID(x)) 
= x;

	/* IO APIC entries are written last, after the CPU entries */
	for (x = 0; x < mp_napics; x++)
		if (IO_TO_ID(x) < NAPICID)
			ID_TO_IO(IO_TO_ID(x)) = x;
}


/*
 * Return the lowest physical APIC ID that is used by neither a CPU nor an
 * IO APIC.  Returns NAPICID when every ID is taken.
 */
static int
first_free_apic_id(void)
{
	int freeid, x;

	for (freeid = 0; freeid < NAPICID; freeid++) {
		/* in use by a CPU? */
		for (x = 0; x <= mp_naps; x++)
			if (CPU_TO_ID(x) == freeid)
				break;
		if (x <= mp_naps)
			continue;
		/* in use by an IO APIC? */
		for (x = 0; x < mp_napics; x++)
			if (IO_TO_ID(x) == freeid)
				break;
		if (x < mp_napics)
			continue;
		return freeid;
	}
	return freeid;
}


/*
 * Return 1 if physical ID id may be used for logical IO APIC apic, else 0.
 * Only IO APICs with a lower logical number than apic are checked for
 * conflicts.
 */
static int
io_apic_id_acceptable(int apic, int id)
{
	int cpu;		/* Logical CPU number */
	int oapic;		/* Logical IO APIC number for other IO APIC */

	if (id >= NAPICID)
		return 0;	/* Out of range */

	for (cpu = 0; cpu <= mp_naps; cpu++)
		if (CPU_TO_ID(cpu) == id)
			return 0;	/* Conflict with CPU */

	for (oapic = 0; oapic < mp_napics && oapic < apic; oapic++)
		if (IO_TO_ID(oapic) == id)
			return 0;	/* Conflict with other APIC */

	return 1;		/* ID is acceptable for IO APIC */
}


/*
 * Post-scan fix-ups of the data recorded from the MP table: PCI bus
 * numbering, IO APIC ID conflicts, and a missing 8259->APIC entry.
 */
static void
fix_mp_table(void)
{
	int	x;
	int	id;
	int	bus_0 = 0;	/* Stop GCC warning */
	int	bus_pci = 0;	/* Stop GCC warning */
	int	num_pci_bus;
	int	apic;		/* IO APIC unit number */
	int	freeid;		/* Free physical APIC ID */
	int	physid;		/* Current physical IO APIC ID */

	/*
	 * Fix mis-numbering of the PCI bus and its INT entries if the BIOS
	 * did it wrong.  The MP spec says that when more than 1 PCI bus
	 * exists the BIOS must begin with bus entries for the PCI bus and use
	 * actual PCI bus numbering.  This implies that when only 1 PCI bus
	 * exists the BIOS can choose to ignore this ordering, and indeed many
	 * MP motherboards do ignore it.  This causes a problem when the PCI
	 * sub-system makes requests of the MP sub-system based on PCI bus
	 * numbers.	So here we look for the situation and renumber the
	 * busses and associated INTs in an effort to "make it right".
	 */

	/* find bus 0, PCI bus, count the number of PCI busses */
	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
		if (bus_data[x].bus_id == 0) {
			bus_0 = x;
		}
		if (bus_data[x].bus_type == PCI) {
			++num_pci_bus;
			bus_pci = x;
		}
	}
	/*
	 * bus_0 == slot of bus with ID of 0
	 * bus_pci == slot of last PCI bus encountered
	 */

	/* check the 1 PCI bus case for sanity */
	/* if it is number 0 all is well */
	if (num_pci_bus == 1 &&
	    bus_data[bus_pci].bus_id != 0) {

		/* mis-numbered, swap with whichever bus uses slot 0 */

		/* swap the bus entry types */
		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
		bus_data[bus_0].bus_type = PCI;

		/* swap each relevant INTerrupt entry */
		id = bus_data[bus_pci].bus_id;
		for (x = 0; x < nintrs; ++x) {
			if (io_apic_ints[x].src_bus_id == id) {
				io_apic_ints[x].src_bus_id = 0;
			}
			else if (io_apic_ints[x].src_bus_id == 0) {
				io_apic_ints[x].src_bus_id = id;
			}
		}
	}

	/* Assign IO APIC IDs.
	 *
	 * First try the existing ID. If a conflict is detected, try
	 * the ID in the MP table.  If a conflict is still detected, find
	 * a free id.
	 *
	 * We cannot use the ID_TO_IO table before all conflicts have been
	 * resolved and the table has been corrected.
	 */
	for (apic = 0; apic < mp_napics; ++apic) { /* For all IO APICs */

		/* First try to use the value set by the BIOS */
		physid = io_apic_get_id(apic);
		if (io_apic_id_acceptable(apic, physid)) {
			if (IO_TO_ID(apic) != physid)
				swap_apic_id(apic, IO_TO_ID(apic), physid);
			continue;
		}

		/* Then check if the value in the MP table is acceptable */
		if (io_apic_id_acceptable(apic, IO_TO_ID(apic)))
			continue;

		/* Last resort, find a free APIC ID and use it */
		freeid = first_free_apic_id();
		if (freeid >= NAPICID)
			panic("No free physical APIC IDs found");

		if (io_apic_id_acceptable(apic, freeid)) {
			swap_apic_id(apic, IO_TO_ID(apic), freeid);
			continue;
		}
		panic("Free physical APIC ID not usable");
	}
	fix_id_to_io_mapping();

	/* detect and fix broken Compaq MP table */
	if (apic_int_type(0, 0) == -1) {
		printf("APIC_IO: MP table broken: 8259->APIC entry missing!\n");
		/* uses the spare slot allocated beyond nintrs */
		io_apic_ints[nintrs].int_type = 3;	/* ExtInt */
		io_apic_ints[nintrs].int_vector = 0xff;	/* Unassigned */
		/* XXX fixme, set src bus id etc, but it doesn't seem to hurt */
		io_apic_ints[nintrs].dst_apic_id = IO_TO_ID(0);
		io_apic_ints[nintrs].dst_apic_int = 0;	/* Pin 0 */
		nintrs++;
	}
}


/*
 * Assign low level interrupt handlers.
 *
 * Resets int_to_apicintpin[], then gives every ISA/EISA INT entry the IRQ
 * number equal to its source bus IRQ, and finally binds IRQ 0 to an
 * ExtInt entry on (apic 0, pin 0) if IRQ 0 is still free.
 */
static void
setup_apic_irq_mapping(void)
{
	int	x;
	int	int_vector;

	/* Clear array */
	for (x = 0; x < APIC_INTMAPSIZE; x++) {
		int_to_apicintpin[x].ioapic = -1;
		int_to_apicintpin[x].int_pin = 0;
		int_to_apicintpin[x].apic_address = NULL;
		int_to_apicintpin[x].redirindex = 0;
	}

	/* First assign ISA/EISA interrupts */
	for (x = 0; x < nintrs; x++) {
		int_vector = io_apic_ints[x].src_bus_irq;
		if (int_vector < APIC_INTMAPSIZE &&
		    io_apic_ints[x].int_vector == 0xff &&
		    int_to_apicintpin[int_vector].ioapic == -1 &&
		    (apic_int_is_bus_type(x, ISA) ||
		     apic_int_is_bus_type(x, EISA)) &&
		    io_apic_ints[x].int_type == 0) {
			assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
					io_apic_ints[x].dst_apic_int,
					int_vector);
		}
	}

	/* Assign ExtInt entry if no ISA/EISA interrupt 0 entry */
	for (x = 0; x < nintrs; x++) {
		if
(io_apic_ints[x].dst_apic_int == 0 && io_apic_ints[x].dst_apic_id == IO_TO_ID(0) && io_apic_ints[x].int_vector == 0xff && int_to_apicintpin[0].ioapic == -1 && io_apic_ints[x].int_type == 3) { assign_apic_irq(0, 0, 0); break; } } /* PCI interrupt assignment is deferred */ } static int processor_entry(proc_entry_ptr entry, int cpu) { /* check for usability */ if (!(entry->cpu_flags & PROCENTRY_FLAG_EN)) return 0; if(entry->apic_id >= NAPICID) panic("CPU APIC ID out of range (0..%d)", NAPICID - 1); /* check for BSP flag */ if (entry->cpu_flags & PROCENTRY_FLAG_BP) { boot_cpu_id = entry->apic_id; CPU_TO_ID(0) = entry->apic_id; ID_TO_CPU(entry->apic_id) = 0; return 0; /* its already been counted */ } /* add another AP to list, if less than max number of CPUs */ else if (cpu < MAXCPU) { CPU_TO_ID(cpu) = entry->apic_id; ID_TO_CPU(entry->apic_id) = cpu; return 1; } return 0; } static int bus_entry(bus_entry_ptr entry, int bus) { int x; char c, name[8]; /* encode the name into an index */ for (x = 0; x < 6; ++x) { if ((c = entry->bus_type[x]) == ' ') break; name[x] = c; } name[x] = '\0'; if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE) panic("unknown bus type: '%s'", name); bus_data[bus].bus_id = entry->bus_id; bus_data[bus].bus_type = x; return 1; } static int io_apic_entry(io_apic_entry_ptr entry, int apic) { if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN)) return 0; IO_TO_ID(apic) = entry->apic_id; if (entry->apic_id < NAPICID) ID_TO_IO(entry->apic_id) = apic; return 1; } static int lookup_bus_type(char *name) { int x; for (x = 0; x < MAX_BUSTYPE; ++x) if (strcmp(bus_type_table[x].name, name) == 0) return bus_type_table[x].type; return UNKNOWN_BUSTYPE; } static int int_entry(int_entry_ptr entry, int intr) { int apic; io_apic_ints[intr].int_type = entry->int_type; io_apic_ints[intr].int_flags = entry->int_flags; io_apic_ints[intr].src_bus_id = entry->src_bus_id; io_apic_ints[intr].src_bus_irq = entry->src_bus_irq; if (entry->dst_apic_id == 255) { /* This signal goes to 
all IO APICS. Select an IO APIC with sufficient number of interrupt pins */ for (apic = 0; apic < mp_napics; apic++) if (((io_apic_read(apic, IOAPIC_VER) & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >= entry->dst_apic_int) break; if (apic < mp_napics) io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic); else io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; } else io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; io_apic_ints[intr].dst_apic_int = entry->dst_apic_int; return 1; } static int apic_int_is_bus_type(int intr, int bus_type) { int bus; for (bus = 0; bus < mp_nbusses; ++bus) if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id) && ((int) bus_data[bus].bus_type == bus_type)) return 1; return 0; } /* * Given a traditional ISA INT mask, return an APIC mask. */ u_int isa_apic_mask(u_int isa_mask) { int isa_irq; int apic_pin; #if defined(SKIP_IRQ15_REDIRECT) if (isa_mask == (1 << 15)) { printf("skipping ISA IRQ15 redirect\n"); return isa_mask; } #endif /* SKIP_IRQ15_REDIRECT */ isa_irq = ffs(isa_mask); /* find its bit position */ if (isa_irq == 0) /* doesn't exist */ return 0; --isa_irq; /* make it zero based */ apic_pin = isa_apic_irq(isa_irq); /* look for APIC connection */ if (apic_pin == -1) return 0; return (1 << apic_pin); /* convert pin# to a mask */ } /* * Determine which APIC pin an ISA/EISA INT is attached to. 
*/
/* Field accessors for io_apic_ints[]; removed again by the #undefs below. */
#define INTTYPE(I)	(io_apic_ints[(I)].int_type)
#define INTPIN(I)	(io_apic_ints[(I)].dst_apic_int)
#define INTIRQ(I)	(io_apic_ints[(I)].int_vector)
#define INTAPIC(I)	(ID_TO_IO(io_apic_ints[(I)].dst_apic_id))

#define SRCBUSIRQ(I)	(io_apic_ints[(I)].src_bus_irq)

/*
 * Returns the int_vector recorded for the first standard INT entry on an
 * ISA or EISA bus whose source bus IRQ equals isa_irq, or -1 when that
 * entry is unassigned (0xff) or no such entry exists.
 */
int
isa_apic_irq(int isa_irq)
{
	int     intr;

	for (intr = 0; intr < nintrs; ++intr) {		/* check each record */
		if (INTTYPE(intr) == 0) {		/* standard INT */
			if (SRCBUSIRQ(intr) == isa_irq) {
				if (apic_int_is_bus_type(intr, ISA) ||
			            apic_int_is_bus_type(intr, EISA)) {
					if (INTIRQ(intr) == 0xff)
						return -1; /* unassigned */
					return INTIRQ(intr);	/* found */
				}
			}
		}
	}
	return -1;					/* NOT found */
}


/*
 * Determine which APIC pin a PCI INT is attached to.
 *
 * pciInt is one based on entry.  For PCI entries src_bus_irq holds the
 * device number in bits 2-6 and the interrupt line in bits 0-1.  An
 * unassigned matching entry gets an IRQ allocated on demand through
 * allocate_apic_irq().
 *
 * Returns the recorded int_vector, or -1 if there is no match or no IRQ
 * could be assigned.
 */
#define SRCBUSID(I)	(io_apic_ints[(I)].src_bus_id)
#define SRCBUSDEVICE(I)	((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f)
#define SRCBUSLINE(I)	(io_apic_ints[(I)].src_bus_irq & 0x03)
int
pci_apic_irq(int pciBus, int pciDevice, int pciInt)
{
	int     intr;

	--pciInt;					/* zero based */

	for (intr = 0; intr < nintrs; ++intr)		/* check each record */
		if ((INTTYPE(intr) == 0)		/* standard INT */
		    && (SRCBUSID(intr) == pciBus)
		    && (SRCBUSDEVICE(intr) == pciDevice)
		    && (SRCBUSLINE(intr) == pciInt))	/* a candidate IRQ */
			if (apic_int_is_bus_type(intr, PCI)) {
				if (INTIRQ(intr) == 0xff)
					allocate_apic_irq(intr);
				if (INTIRQ(intr) == 0xff)
					return -1;	/* unassigned */
				return INTIRQ(intr);	/* exact match */
			}

	return -1;					/* NOT found */
}

/*
 * Given an assigned IRQ number, find a later table entry that describes
 * the same interrupt source (same bus, and same device/line for PCI or
 * same bus IRQ for ISA/EISA) but arrives on a different pin.
 *
 * Returns the int_vector of that entry, or -1 if irq is not found or no
 * such entry exists.
 */
int
next_apic_irq(int irq)
{
	int intr, ointr;
	int bus, bustype;

	bus = 0;
	bustype = 0;
	/* locate the entry that owns irq */
	for (intr = 0; intr < nintrs; intr++) {
		if (INTIRQ(intr) != irq || INTTYPE(intr) != 0)
			continue;
		bus = SRCBUSID(intr);
		bustype = apic_bus_type(bus);
		if (bustype != ISA &&
		    bustype != EISA &&
		    bustype != PCI)
			continue;
		break;
	}
	if (intr >= nintrs) {
		return -1;
	}
	/* search the remaining entries for the same source on another pin */
	for (ointr = intr + 1; ointr < nintrs; ointr++) {
		if (INTTYPE(ointr) != 0)
			continue;
		if (bus != SRCBUSID(ointr))
			continue;
		if (bustype == PCI) {
			if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr))
				continue;
			if (SRCBUSLINE(intr) != SRCBUSLINE(ointr))
				continue;
		}
		if (bustype == ISA || bustype == EISA) {
			if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr))
				continue;
		}
		if (INTPIN(intr) == INTPIN(ointr))
			continue;
		break;
	}
	if (ointr >= nintrs) {
		return -1;
	}
	return INTIRQ(ointr);
}
#undef SRCBUSLINE
#undef SRCBUSDEVICE
#undef SRCBUSID
#undef SRCBUSIRQ

#undef INTPIN
#undef INTIRQ
#undef INTAPIC
#undef INTTYPE


/*
 * Reprogram the MB chipset to NOT redirect an ISA INTerrupt.
 *
 * XXX FIXME:
 *  Exactly what this means is unclear at this point.  It is a solution
 *  for motherboards that redirect the MBIRQ0 pin.  Generically a motherboard
 *  could route any of the ISA INTs to upper (>15) IRQ values.  But most would
 *  NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an
 *  option.
 *
 * Without READY defined this only logs (when bootverbose) and returns 0.
 */
int
undirect_isa_irq(int rirq)
{
#if defined(READY)
	if (bootverbose)
	    printf("Freeing redirected ISA irq %d.\n", rirq);
	/** FIXME: tickle the MB redirector chip */
	return -1;
#else
	if (bootverbose)
	    printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq);
	return 0;
#endif  /* READY */
}


/*
 * Reprogram the MB chipset to NOT redirect a PCI INTerrupt
 *
 * Without READY defined this only logs (when bootverbose) and returns 0.
 */
int
undirect_pci_irq(int rirq)
{
#if defined(READY)
	if (bootverbose)
		printf("Freeing redirected PCI irq %d.\n", rirq);

	/** FIXME: tickle the MB redirector chip */
	return -1;
#else
	if (bootverbose)
		printf("Freeing (NOT implemented) redirected PCI irq %d.\n",
		       rirq);
	return 0;
#endif  /* READY */
}


/*
 * given a bus ID, return:
 *  the bus type if found
 *  -1 if NOT found
 */
int
apic_bus_type(int id)
{
	int     x;

	for (x = 0; x < mp_nbusses; ++x)
		if (bus_data[x].bus_id == id)
			return bus_data[x].bus_type;

	return -1;
}


/*
 * given a LOGICAL APIC# and pin#, return:
 *  the associated src bus ID if found
 *  -1 if NOT found
 */
int
apic_src_bus_id(int apic, int pin)
{
	int     x;

	/* search each of the possible INTerrupt sources */
	for (x = 0; x < nintrs; ++x)
		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
		    (pin == io_apic_ints[x].dst_apic_int))
			return (io_apic_ints[x].src_bus_id);

	return -1;		/* NOT found */
}


/*
 * given a LOGICAL APIC# and pin#, return:
 *  the associated src bus IRQ if found
 *  -1 if NOT found
 */
int
apic_src_bus_irq(int apic, int pin)
{
	int     x;

	for (x = 0; x < nintrs; x++)
		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
		    (pin == io_apic_ints[x].dst_apic_int))
			return (io_apic_ints[x].src_bus_irq);

	return -1;		/* NOT found */
}


/*
 * given a LOGICAL APIC# and pin#, return:
 *  the associated INTerrupt type if found
 *  -1 if NOT found
 */
int
apic_int_type(int apic, int pin)
{
	int     x;

	/* search each of the possible INTerrupt sources */
	for (x = 0; x < nintrs; ++x)
		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
		    (pin == io_apic_ints[x].dst_apic_int))
			return (io_apic_ints[x].int_type);

	return -1;		/* NOT found */
}

/*
 * given a LOGICAL APIC# and pin#, return:
 *  the assigned IRQ number (int_vector) if found and assigned
 *  -1 if NOT found or unassigned
 *
 * Panics when int_to_apicintpin[] disagrees with the entry found.
 */
int
apic_irq(int apic, int pin)
{
	int x;
	int res;

	for (x = 0; x < nintrs; ++x)
		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
		    (pin == io_apic_ints[x].dst_apic_int)) {
			res = io_apic_ints[x].int_vector;
			if (res == 0xff)
				return -1;
			/* cross-check against the per-IRQ table */
			if (apic != int_to_apicintpin[res].ioapic)
				panic("apic_irq: inconsistent table");
			if (pin != int_to_apicintpin[res].int_pin)
				panic("apic_irq inconsistent table (2)");
			return res;
		}
	return -1;
}


/*
 * given a LOGICAL APIC# and pin#, return:
 *  the associated trigger mode if found (bits 2-3 of int_flags)
 *  -1 if NOT found
 */
int
apic_trigger(int apic, int pin)
{
	int     x;

	/* search each of the possible INTerrupt sources */
	for (x = 0; x < nintrs; ++x)
		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
		    (pin == io_apic_ints[x].dst_apic_int))
			return ((io_apic_ints[x].int_flags >> 2) & 0x03);

	return -1;		/* NOT found */
}


/*
 * given a LOGICAL APIC# and pin#, return:
 *  the associated 'active' level if found (bits 0-1 of int_flags)
 *  -1 if NOT found
 */
int
apic_polarity(int apic, int pin)
{
	int     x;

	/* search each of the possible INTerrupt sources */
	for (x = 0; x < nintrs; ++x)
		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
		    (pin == io_apic_ints[x].dst_apic_int))
			return (io_apic_ints[x].int_flags & 0x03);

	return -1;		/* NOT found */
}


/*
 * set
/*
 * set data according to MP defaults
 * FIXME: probably not complete yet...
 *
 * 'type' is the MP default configuration number; only 1 through 7 are
 * accepted, anything else panics.  The function fills in:
 *  - the CPU <-> APIC ID tables for exactly two CPUs (BSP and one AP),
 *  - with APIC_IO, the ID tables for a single IO APIC,
 *  - bus_data[0] and bus_data[1] from default_data[type - 1],
 *  - with APIC_IO, io_apic_ints[0..15].
 */
static void
default_mp_table(int type)
{
	int     ap_cpu_id;
#if defined(APIC_IO)
	int     io_apic_id;
	int     pin;
#endif	/* APIC_IO */

	/* disabled debugging aid: describes the selected default config */
#if 0
	printf(" MP default config type: %d\n", type);
	switch (type) {
	case 1:
		printf(" bus: ISA, APIC: 82489DX\n");
		break;
	case 2:
		printf(" bus: EISA, APIC: 82489DX\n");
		break;
	case 3:
		printf(" bus: EISA, APIC: 82489DX\n");
		break;
	case 4:
		printf(" bus: MCA, APIC: 82489DX\n");
		break;
	case 5:
		printf(" bus: ISA+PCI, APIC: Integrated\n");
		break;
	case 6:
		printf(" bus: EISA+PCI, APIC: Integrated\n");
		break;
	case 7:
		printf(" bus: MCA+PCI, APIC: Integrated\n");
		break;
	default:
		printf(" future type\n");
		break;
		/* NOTREACHED */
	}
#endif	/* 0 */

	/* the APIC ID is taken from the masked lapic.id bits, shifted down by 24 */
	boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24;
	/* only two CPUs are described: the AP gets whichever of 0/1 is free */
	ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0;

	/* BSP */
	CPU_TO_ID(0) = boot_cpu_id;
	ID_TO_CPU(boot_cpu_id) = 0;

	/* one and only AP */
	CPU_TO_ID(1) = ap_cpu_id;
	ID_TO_CPU(ap_cpu_id) = 1;

#if defined(APIC_IO)
	/* one and only IO APIC */
	io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24;

	/*
	 * sanity check, refer to MP spec section 3.6.6, last paragraph
	 * necessary as some hardware isn't properly setting up the IO APIC
	 *
	 * Note that the 'if' header below is selected by the preprocessor;
	 * both variants share the same body, which forces the ID to 2.
	 */
#if defined(REALLY_ANAL_IOAPICID_VALUE)
	if (io_apic_id != 2) {
#else
	if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) {
#endif	/* REALLY_ANAL_IOAPICID_VALUE */
		io_apic_set_id(0, 2);
		io_apic_id = 2;
	}
	IO_TO_ID(0) = io_apic_id;
	ID_TO_IO(io_apic_id) = 0;
#endif	/* APIC_IO */

	/* fill out bus entries */
	switch (type) {
	case 1:
	case 2:
	case 3:
	case 4:
	case 5:
	case 6:
	case 7:
		/* columns 1..4 of the row hold: bus 0 id, type, bus 1 id, type */
		bus_data[0].bus_id = default_data[type - 1][1];
		bus_data[0].bus_type = default_data[type - 1][2];
		bus_data[1].bus_id = default_data[type - 1][3];
		bus_data[1].bus_type = default_data[type - 1][4];
		break;

	/* case 4: case 7: MCA NOT supported */
	default:		/* illegal/reserved */
		panic("BAD default MP config: %d", type);
		/* NOTREACHED */
	}

#if defined(APIC_IO)
	/* general cases from MP v1.4, table 5-2 */
	for (pin = 0; pin < 16; ++pin) {
		io_apic_ints[pin].int_type = 0;
		io_apic_ints[pin].int_flags = 0x05;	/* edge/active-hi */
		io_apic_ints[pin].src_bus_id = 0;
		io_apic_ints[pin].src_bus_irq = pin;	/* IRQ2 caught below */
		io_apic_ints[pin].dst_apic_id = io_apic_id;
		io_apic_ints[pin].dst_apic_int = pin;	/* 1-to-1 */
	}

	/* special cases from MP v1.4, table 5-2 */
	if (type == 2) {
		io_apic_ints[2].int_type = 0xff;	/* N/C */
		io_apic_ints[13].int_type = 0xff;	/* N/C */
#if !defined(APIC_MIXED_MODE)
		/** FIXME: ??? */
		panic("sorry, can't support type 2 default yet");
#endif	/* APIC_MIXED_MODE */
	}
	else
		io_apic_ints[2].src_bus_irq = 0;	/* ISA IRQ0 is on APIC INT 2 */

	if (type == 7)
		io_apic_ints[0].int_type = 0xff;	/* N/C */
	else
		io_apic_ints[0].int_type = 3;	/* vectored 8259 */
#endif	/* APIC_IO */
}
*/ /* first page of AP's private space */ pg = x * i386_btop(sizeof(struct privatespace)); /* allocate a new private data page */ gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE); /* wire it into the private page table page */ SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd)); /* allocate and set up an idle stack data page */ stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE); for (i = 0; i < UPAGES; i++) SMPpt[pg + 1 + i] = (pt_entry_t) (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); /* prime data page for it to use */ gd->gd_cpuid = x; globaldata_register(gd); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ #endif bootSTK = &SMP_prvspace[x].idlestack[UPAGES*PAGE_SIZE]; bootAP = x; /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(x, boot_addr)) { printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x)); CHECK_PRINT("trace"); /* show checkpoints */ /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } CHECK_PRINT("trace"); /* show checkpoints */ /* record its version info */ cpu_apic_versions[x] = cpu_apic_versions[0]; all_cpus |= (1 << x); /* record AP in CPU map */ } /* build our map of 'other' CPUs */ PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid))); /* fill in our (BSP) APIC version */ cpu_apic_versions[0] = lapic.version; /* restore the warmstart vector */ *(u_long *) WARMBOOT_OFF = mpbioswarmvec; #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); #endif /* * Set up the idle context for the BSP. Similar to above except * that some was done by locore, some by pmap.c and some is implicit * because the BSP is cpu#0 and the page is initially zero, and also * because we can refer to variables by name on the BSP.. 
*/ /* Allocate and setup BSP idle stack */ stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE); for (i = 0; i < UPAGES; i++) SMPpt[1 + i] = (pt_entry_t) (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); for (x = 0; x < NKPT; x++) PTD[x] = 0; pmap_set_opt(); /* number of APs actually started */ return mp_ncpus - 1; } /* * load the 1st level AP boot code into base memory. */ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(u_int boot_addr) { int x; int size = *(int *) ((u_long) & bootMP_size); u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) boot_addr + KERNBASE; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; POSTCODE(INSTALL_AP_TRAMP_POST); for (x = 0; x < size; ++x) *dst++ = *src++; /* * modify addresses in code we just moved to basemem. unfortunately we * need fairly detailed info about mpboot.s for this to work. changes * to mpboot.s might require changes here. 
*/ /* boot code is located in KERNEL space */ dst = (u_char *) boot_addr + KERNBASE; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_addr + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); *dst32 = ((u_int) MPentry - KERNBASE); /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_addr & 0xffff; *dst8 = ((u_int) boot_addr >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_addr & 0xffff; *dst8 = ((u_int) boot_addr >> 16) & 0xff; } /* * this function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It ain't pretty, * but it seems to work. */ static int start_ap(int logical_cpu, u_int boot_addr) { int physical_cpu; int vector; int cpus; u_long icr_lo, icr_hi; POSTCODE(START_AP_POST); /* get the PHYSICAL APIC ID# */ physical_cpu = CPU_TO_ID(logical_cpu); /* calculate the vector */ vector = (boot_addr >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_ncpus; /* * first we do an INIT/RESET IPI this INIT IPI might be run, reseting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. 
*/ /* setup the address for the target AP */ icr_hi = lapic.icr_hi & ~APIC_ID_MASK; icr_hi |= (physical_cpu << 24); lapic.icr_hi = icr_hi; /* do an INIT IPI: assert RESET */ icr_lo = lapic.icr_lo & 0xfff00000; lapic.icr_lo = icr_lo | 0x0000c500; /* wait for pending status end */ while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; /* do an INIT IPI: deassert RESET */ lapic.icr_lo = icr_lo | 0x00008500; /* wait for pending status end */ u_sleep(10000); /* wait ~10mS */ while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ /* do a STARTUP IPI */ lapic.icr_lo = icr_lo | 0x00000600 | vector; while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; u_sleep(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic.icr_lo = icr_lo | 0x00000600 | vector; while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; u_sleep(200); /* wait ~200uS */ /* wait for it to start */ set_apic_timer(5000000);/* == 5 seconds */ while (read_apic_timer()) if (mp_ncpus > cpus) return 1; /* return SUCCESS */ return 0; /* return FAILURE */ } /* * Flush the TLB on all other CPU's * * XXX: Needs to handshake and wait for completion before proceding. 
/*
 * Flush the TLB on all other CPU's by sending them IPI_INVLTLB.
 *
 * No IPI is sent until both smp_started and invltlb_ok are set, and the
 * function body is empty when APIC_IO is not defined.
 *
 * XXX: Needs to handshake and wait for completion before proceeding.
 */
void
smp_invltlb(void)
{
#ifdef APIC_IO
	if (!smp_started || !invltlb_ok)
		return;

	ipi_all_but_self(IPI_INVLTLB);
#endif	/* APIC_IO */
}
boom!!"); } /* Init local apic for irq's */ apic_initialize(); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); /* * Activate smp_invltlb, although strictly speaking, this isn't * quite correct yet. We should have a bitfield for cpus willing * to accept TLB flush IPI's or something and sync them. */ if (smp_cpus == mp_ncpus) { invltlb_ok = 1; smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } /* let other AP's wake up now */ mtx_unlock_spin(&ap_boot_mtx); /* wait until all the AP's are up */ while (smp_started == 0) ; /* nothing */ microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* ok, now grab sched_lock and enter the scheduler */ enable_intr(); mtx_lock_spin(&sched_lock); cpu_throw(); /* doesn't return */ panic("scheduler returned us to ap_init"); } /* * For statclock, we send an IPI to all CPU's to have them call this * function. */ void forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); statclock_process(curproc, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_statclock(void) { int map; CTR0(KTR_SMP, "forward_statclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_STATCLOCK); } /* * For each hardclock(), we send an IPI to all other CPU's to have them * execute this function. It would be nice to reduce contention on * sched_lock if we could simply peek at the CPU to determine the user/kernel * state and call hardclock_process() on the CPU receiving the clock interrupt * and then just use a simple IPI to handle any ast's if needed. 
*/ void forwarded_hardclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); hardclock_process(curproc, TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_hardclock(void) { u_int map; CTR0(KTR_SMP, "forward_hardclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_HARDCLOCK); } #ifdef APIC_INTR_REORDER /* * Maintain mapping from softintr vector to isr bit in local apic. */ void set_lapic_isrloc(int intr, int vector) { if (intr < 0 || intr > 32) panic("set_apic_isrloc: bad intr argument: %d",intr); if (vector < ICU_OFFSET || vector > 255) panic("set_apic_isrloc: bad vector argument: %d",vector); apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2); apic_isrbit_location[intr].bit = (1<<(vector & 31)); } #endif /* * send an IPI to a set of cpus. */ void ipi_selected(u_int32_t cpus, u_int ipi) { CTR2(KTR_SMP, __func__ ": cpus: %x ipi: %x", cpus, ipi); selected_apic_ipi(cpus, ipi, APIC_DELMODE_FIXED); } /* * send an IPI INTerrupt containing 'vector' to all CPUs, including myself */ void ipi_all(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLISELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLESELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to myself */ void ipi_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_SELF, ipi, APIC_DELMODE_FIXED); } void release_aps(void *dummy __unused) { atomic_store_rel_int(&aps_ready, 1); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 82308) +++ head/sys/i386/i386/pmap.c (revision 82309) @@ -1,3397 +1,3398 @@ /* * Copyright (c) 1991 Regents of the University 
of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * $FreeBSD$ */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include "opt_disable_pse.h" #include "opt_pmap.h" #include "opt_msgbuf.h" +#include "opt_upages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #include #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; static struct pmap kernel_pmap_store; pmap_t kernel_pmap; LIST_HEAD(pmaplist, pmap); struct pmaplist allpmaps; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ static int pgeflag; /* PG_G or-in */ static int pseflag; /* PG_PS or-in */ static vm_object_t kptobj; static int nkpt; vm_offset_t kernel_vm_end; /* * Data for the pv entry allocation mechanism */ static vm_zone_t pvzone; static struct vm_zone pvzone_store; static struct vm_object pvzone_obj; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; static int pmap_pagedaemon_waken = 0; static struct pv_entry *pvinit; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *ptmmap; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp=0; /* * Crashdump maps. */ static pt_entry_t *pt_crashdumpmap; static caddr_t crashdumpmap; #ifdef SMP extern pt_entry_t *SMPpt; #endif static pt_entry_t *PMAP1 = 0; static unsigned *PADDR1 = 0; static PMAP_INLINE void free_pv_entry __P((pv_entry_t pv)); static unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static __inline void pmap_changebit __P((vm_page_t m, int bit, boolean_t setem)); static void pmap_remove_all __P((vm_page_t m)); static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)); static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); static int pmap_remove_entry __P((struct pmap *pmap, vm_page_t m, vm_offset_t va)); static boolean_t pmap_testbit __P((vm_page_t m, int bit)); static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)); static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex)); static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); static vm_page_t pmap_page_lookup 
/*
 * Move the kernel virtual free pointer to the next
 * 4MB.  This is used to help improve performance
 * by using a large (4MB) page for much of the kernel
 * (.text, .data, .bss)
 *
 * Returns 'addr' rounded up to a multiple of NBPDR when the CPU
 * advertises CPUID_PSE; returns 'addr' unchanged otherwise, and always
 * when DISABLE_PSE is defined.
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
#ifndef DISABLE_PSE
	if ((cpu_feature & CPUID_PSE) != 0)
		return ((addr + (NBPDR - 1)) & ~(NBPDR - 1));
#endif
	return (addr);
}
*/ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); kernel_pmap->pm_count = 1; kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(MSGBUF_SIZE))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = 0; for (i = 0; i < NKPT; i++) PTD[i] = 0; pgeflag = 0; #if !defined(SMP) /* XXX - see also mp_machdep.c */ if (cpu_feature & CPUID_PGE) { pgeflag = PG_G; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. 
*/ pdir4mb = 0; #if !defined(DISABLE_PSE) if (cpu_feature & CPUID_PSE) { unsigned ptditmp; /* * Note that we have enabled PSE mode */ pseflag = PG_PS; ptditmp = *((unsigned *)PTmap + i386_btop(KERNBASE)); ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; #if !defined(SMP) /* * Enable the PSE mode. */ load_cr4(rcr4() | CR4_PSE); /* * We can do the mapping here for the single processor * case. We simply ignore the old page table page from * now on. */ /* * For SMP, we still need 4K pages to bootstrap APs, * PSE will be enabled as soon as all APs are up. */ PTD[KPTDI] = (pd_entry_t) ptditmp; kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; invltlb(); #endif } #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic! (non-SMP hardware?)"); /* local apic is mapped on last page */ SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag | (cpu_apic_address & PG_FRAME)); #endif invltlb(); } #ifdef SMP /* * Set 4mb pdir for mp startup */ void pmap_set_opt(void) { if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); if (pdir4mb && PCPU_GET(cpuid) == 0) { /* only on BSP */ kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = (pd_entry_t)pdir4mb; cpu_invltlb(); } } } #endif /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { int i; int initial_pvs; /* * object for kernel page table pages */ kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE); /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. 
*/ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ initial_pvs = vm_page_array_size; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = &pvzone_store; pvinit = (struct pv_entry *) kmem_alloc(kernel_map, initial_pvs * sizeof (struct pv_entry)); zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, vm_page_array_size); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; pv_entry_high_water = 9 * (pv_entry_max / 10); zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. 
*/ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } static PMAP_INLINE void invltlb_1pg(vm_offset_t va) { #ifdef I386_CPU invltlb(); #else invlpg(va); #endif } static __inline void pmap_TLB_invalidate(pmap_t pmap, vm_offset_t va) { #if defined(SMP) if (pmap->pm_active & (1 << PCPU_GET(cpuid))) cpu_invlpg((void *)va); if (pmap->pm_active & PCPU_GET(other_cpus)) smp_invltlb(); #else if (pmap->pm_active) invltlb_1pg(va); #endif } static __inline void pmap_TLB_invalidate_all(pmap_t pmap) { #if defined(SMP) if (pmap->pm_active & (1 << PCPU_GET(cpuid))) cpu_invltlb(); if (pmap->pm_active & PCPU_GET(other_cpus)) smp_invltlb(); #else if (pmap->pm_active) invltlb(); #endif } static unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); #if defined(SMP) /* The page directory is not shared between CPUs */ cpu_invltlb(); #else invltlb(); #endif } return (unsigned *) APTmap; } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. */ static unsigned * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned pde, newpf; if ((pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) != 0) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; unsigned index = i386_btop(va); /* are we current address space or kernel? 
*/ if ((pmap == kernel_pmap) || (frame == (((unsigned) PTDpde) & PG_FRAME))) { return (unsigned *) PTmap + index; } newpf = pde & PG_FRAME; if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; invltlb_1pg((vm_offset_t) PADDR1); } return PADDR1 + ((unsigned) index & (NPTEPG - 1)); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t rtval; vm_offset_t pdirindex; pdirindex = va >> PDRSHIFT; if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) { unsigned *pte; if ((rtval & PG_PS) != 0) { rtval &= ~(NBPDR - 1); rtval |= va & (NBPDR - 1); return rtval; } pte = get_ptbase(pmap) + i386_btop(va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ PMAP_INLINE void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V | pgeflag; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; /*if (opte)*/ invltlb_1pg(va); /* XXX what about SMP? */ } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); /* XXX what about SMP? */ } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. 
Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t *virt; vm_offset_t start; vm_offset_t end; int prot; { vm_offset_t sva = *virt; vm_offset_t va = sva; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; pmap_kenter(tva, VM_PAGE_TO_PHYS(m[i])); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { vm_offset_t end_va; end_va = va + count*PAGE_SIZE; while (va < end_va) { unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; #ifdef SMP cpu_invlpg((void *)va); #else invltlb_1pg(va); #endif va += PAGE_SIZE; } #ifdef SMP smp_invltlb(); #endif } static vm_page_t pmap_page_lookup(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m && vm_page_sleep_busy(m, FALSE, "pplookp")) goto retry; return m; } /* * Create the UPAGES for a new process. * This routine directly affects the fork perf for a process. 
*/ void pmap_new_proc(p) struct proc *p; { #ifdef I386_CPU int updateneeded; #endif int i; vm_object_t upobj; vm_page_t m; struct user *up; unsigned *ptek, oldpte; /* * allocate object for the upages */ if ((upobj = p->p_upages_obj) == NULL) { upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES); p->p_upages_obj = upobj; } /* get a kernel virtual address for the UPAGES for this proc */ if ((up = p->p_addr) == NULL) { up = (struct user *) kmem_alloc_nofault(kernel_map, UPAGES * PAGE_SIZE); if (up == NULL) panic("pmap_new_proc: u_map allocation failed"); p->p_addr = up; } ptek = (unsigned *) vtopte((vm_offset_t) up); #ifdef I386_CPU updateneeded = 0; #endif for(i=0;iwire_count++; cnt.v_wire_count++; oldpte = *(ptek + i); /* * Enter the page into the kernel address space. */ *(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; if (oldpte) { #ifdef I386_CPU updateneeded = 1; #else invlpg((vm_offset_t) up + i * PAGE_SIZE); #endif } vm_page_wakeup(m); vm_page_flag_clear(m, PG_ZERO); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); m->valid = VM_PAGE_BITS_ALL; } #ifdef I386_CPU if (updateneeded) invltlb(); #endif } /* * Dispose the UPAGES for a process that has exited. * This routine directly impacts the exit perf of a process. */ void pmap_dispose_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; unsigned *ptek, oldpte; upobj = p->p_upages_obj; ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr); for(i=0;ip_addr + i * PAGE_SIZE); #endif vm_page_unwire(m, 0); vm_page_free(m); } #ifdef I386_CPU invltlb(); #endif } /* * Allow the UPAGES for a process to be prejudicially paged out. */ void pmap_swapout_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; /* * let the upages be paged */ for(i=0;ip_addr + PAGE_SIZE * i); } } /* * Bring the UPAGES for a specified process back in. 
*/ void pmap_swapin_proc(p) struct proc *p; { int i,rv; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; for(i=0;ip_addr) + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(upobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid); m = vm_page_lookup(upobj, i); m->valid = VM_PAGE_BITS_ALL; } vm_page_wire(m); vm_page_wakeup(m); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); } } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) ; if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); pmap_TLB_invalidate(pmap, pteva); } if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_flash(m); vm_page_busy(m); vm_page_free_zero(m); --cnt.v_wire_count; } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. 
*/ static int pmap_unuse_pt(pmap, va, mpte) pmap_t pmap; vm_offset_t va; vm_page_t mpte; { unsigned ptepindex; if (va >= UPT_MIN_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } } return pmap_unwire_pte_hold(pmap, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); pmap->pm_count = 1; pmap->pm_active = 0; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ ptdpg = vm_page_grab( pmap->pm_pteobj, PTDPTDI, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); ptdpg->wire_count = 1; ++cnt.v_wire_count; vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); /* not usually mapped*/ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); /* Wire in kernel global address entries. 
*/ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); #ifdef SMP pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; #endif /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_count = 1; pmap->pm_active = 0; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Wire in kernel global address entries. To avoid a race condition * between pmap initialization and pmap_growkernel, this procedure * should be called after the vmspace is attached to the process * but before this pmap is activated. */ void pmap_pinit2(pmap) struct pmap *pmap; { /* XXX: Remove this stub when no longer called */ } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ if (vm_page_sleep_busy(p, FALSE, "pmaprl")) return 0; vm_page_busy(p); /* * Remove the page table page from the processes address space. */ pde[p->pindex] = 0; pmap->pm_stats.resident_count--; if (p->hold_count) { panic("pmap_release: freeing held page table page"); } /* * Page directory pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); #ifdef SMP pde[MPPTDI] = 0; #endif pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) pmap->pm_ptphint = NULL; p->wire_count--; cnt.v_wire_count--; vm_page_free_zero(p); return 1; } /* * this routine is called if the page table page is not * mapped correctly. 
*/ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_offset_t pteva, ptepa; vm_page_t m; /* * Find or fabricate a new pagetable page */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_ZERO | VM_ALLOC_RETRY); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); if (m->wire_count == 0) cnt.v_wire_count++; m->wire_count++; /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); /* * Set the page table hint */ pmap->pm_ptphint = m; /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. */ if ((m->flags & PG_ZERO) == 0) { if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(ptepa); } } m->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(m, PG_ZERO); vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); return m; } static vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { unsigned ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; invltlb(); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { /* * In order to get the page table page, try the * hint first. 
*/ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = m; } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; int curgeneration; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif ptdpg = NULL; LIST_REMOVE(pmap, pm_list); retry: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } while (1) { if (!pmap_release_free_page(pmap, p) && (object->generation != curgeneration)) goto retry; } } if (ptdpg && !pmap_release_free_page(pmap, ptdpg)) goto retry; } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct pmap *pmap; int s; vm_offset_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; s = 
splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; vm_page_wire(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(ptppaddr); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; LIST_FOREACH(pmap, &allpmaps, pm_list) { *pmap_pde(pmap, kernel_vm_end) = newpdir; } kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); panic("destroying a pmap is not yet implemented"); } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv) pv_entry_t pv; { pv_entry_count--; zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
*/ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return zalloc(pvzone); } /* * This routine is very drastic, but can save the system * in a pinch. */ void pmap_collect() { int i; vm_page_t m; static int warningdone=0; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } for(i = 0; i < vm_page_array_size; i++) { m = &vm_page_array[i]; if (m->wire_count || m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; pmap_remove_all(m); } pmap_pagedaemon_waken = 0; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap, m, va) struct pmap *pmap; vm_page_t m; vm_offset_t va; { pv_entry_t pv; int rtval; int s; s = splvm(); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = TAILQ_NEXT(pv, pv_plist)) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). 
*/ static void pmap_insert_entry(pmap, va, mpte, m) pmap_t pmap; vm_offset_t va; vm_page_t mpte; vm_page_t m; { int s; pv_entry_t pv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; vm_page_t m; oldpte = atomic_readandclear_int(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) invlpg(va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", va, oldpte); } #endif if (pmap_track_modified(va)) vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! */ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); pmap_TLB_invalidate(pmap, va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. 
*/
void
pmap_remove(pmap, sva, eva)
	struct pmap *pmap;
	register vm_offset_t sva;
	register vm_offset_t eva;
{
	register unsigned *ptbase;
	vm_offset_t pdnxt;
	vm_offset_t ptpaddr;
	vm_offset_t sindex, eindex;
	int anyvalid;

	/* A NULL pmap or one with nothing resident has nothing to remove. */
	if (pmap == NULL)
		return;

	if (pmap->pm_stats.resident_count == 0)
		return;

	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
	 * code.  (Not taken when the address is covered by a PG_PS
	 * page directory entry.)
	 */
	if (((sva + PAGE_SIZE) == eva) && 
		(((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
		pmap_remove_page(pmap, sva);
		return;
	}

	/* Counts entries removed; any nonzero value forces a TLB flush below. */
	anyvalid = 0;

	/*
	 * Get a local virtual address for the mappings that are being
	 * worked with.
	 */
	ptbase = get_ptbase(pmap);

	/* From here on the range is handled as page indices, not addresses. */
	sindex = i386_btop(sva);
	eindex = i386_btop(eva);

	for (; sindex < eindex; sindex = pdnxt) {
		unsigned pdirindex;

		/*
		 * Calculate index for next page table.
		 */
		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
		/* Stop early once the pmap has no resident pages left. */
		if (pmap->pm_stats.resident_count == 0)
			break;

		pdirindex = sindex / NPDEPG;
		/*
		 * A PG_PS directory entry is dropped whole: clear the entry
		 * and subtract NBPDR / PAGE_SIZE pages from the resident
		 * count.
		 */
		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
			pmap->pm_pdir[pdirindex] = 0;
			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
			anyvalid++;
			continue;
		}

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (pdnxt > eindex) {
			pdnxt = eindex;
		}

		for ( ;sindex != pdnxt; sindex++) {
			vm_offset_t va;
			if (ptbase[sindex] == 0) {
				continue;
			}
			va = i386_ptob(sindex);
			
			anyvalid++;
			/*
			 * A nonzero return ends the scan of this page table
			 * page; the outer loop then resumes at pdnxt.
			 * NOTE(review): presumably nonzero means the page
			 * table page itself was released -- confirm against
			 * pmap_unuse_pt().
			 */
			if (pmap_remove_pte(pmap,
				ptbase + sindex, va))
				break;
		}
	}

	/* One flush for the whole range instead of one per page. */
	if (anyvalid)
		pmap_TLB_invalidate_all(pmap);
}

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
*/

static void
pmap_remove_all(m)
	vm_page_t m;
{
	register pv_entry_t pv;
	register unsigned *pte, tpte;
	int s;

#if defined(PMAP_DIAGNOSTIC)
	/*
	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
	 * pages!
	 */
	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
		panic("pmap_page_protect:  illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m));
	}
#endif

	/* The pv lists are manipulated at splvm for the whole walk. */
	s = splvm();
	/*
	 * Each iteration unlinks the head pv entry, so the loop ends when
	 * the page's pv list is empty.
	 */
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pv->pv_pmap->pm_stats.resident_count--;

		/* Find the pte in the owning pmap, which may not be current. */
		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);

		/* Read the old pte and clear it in a single atomic step. */
		tpte = atomic_readandclear_int(pte);
		if (tpte & PG_W)
			pv->pv_pmap->pm_stats.wired_count--;

		if (tpte & PG_A)
			vm_page_flag_set(m, PG_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (tpte & PG_M) {
#if defined(PMAP_DIAGNOSTIC)
			if (pmap_nw_modified((pt_entry_t) tpte)) {
				printf(
	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
				    pv->pv_va, tpte);
			}
#endif
			if (pmap_track_modified(pv->pv_va))
				vm_page_dirty(m);
		}
		pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);

		/* Unlink the entry from the per-pmap and per-page lists. */
		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		m->md.pv_list_count--;
		/* Drop the hold this mapping had on its page table page. */
		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
		free_pv_entry(pv);
	}

	/* No mappings remain, so the page is neither mapped nor writeable. */
	vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);

	splx(s);
}

/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
*/
/*
 * Behaviour by requested protection:
 *   - no read permission: the range is removed with pmap_remove();
 *   - write permission requested: nothing to do, this routine only
 *     takes permissions away;
 *   - otherwise: every mapping in [sva, eva) loses PG_RW, and for
 *     managed pages the accessed/modified bits are folded back into the
 *     vm_page before being cleared.
 * The TLB is flushed once at the end if any entry changed.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	register unsigned *ptbase;
	vm_offset_t pdnxt, ptpaddr;
	vm_pindex_t sindex, eindex;
	int anychanged;

	if (pmap == NULL)
		return;

	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

	if (prot & VM_PROT_WRITE)
		return;

	anychanged = 0;

	ptbase = get_ptbase(pmap);

	/* From here on the range is handled as page indices. */
	sindex = i386_btop(sva);
	eindex = i386_btop(eva);

	for (; sindex < eindex; sindex = pdnxt) {

		unsigned pdirindex;

		/* Index of the first page covered by the next page table. */
		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));

		pdirindex = sindex / NPDEPG;
		if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
			/*
			 * 4MB page: write-protect the directory entry itself.
			 * The mapping stays in place, so the resident count
			 * must not change (only pmap_remove() drops it).
			 * Plain assignment avoids the cast-as-lvalue
			 * extension.
			 */
			pmap->pm_pdir[pdirindex] =
			    (pd_entry_t) (ptpaddr & ~(PG_M|PG_RW));
			anychanged++;
			continue;
		}

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/* Do not scan past the end of the requested range. */
		if (pdnxt > eindex) {
			pdnxt = eindex;
		}

		for (; sindex != pdnxt; sindex++) {

			unsigned pbits;
			vm_page_t m;

			pbits = ptbase[sindex];

			if (pbits & PG_MANAGED) {
				m = NULL;
				if (pbits & PG_A) {
					m = PHYS_TO_VM_PAGE(pbits);
					vm_page_flag_set(m, PG_REFERENCED);
					pbits &= ~PG_A;
				}
				if (pbits & PG_M) {
					if (pmap_track_modified(i386_ptob(sindex))) {
						if (m == NULL)
							m = PHYS_TO_VM_PAGE(pbits);
						vm_page_dirty(m);
						pbits &= ~PG_M;
					}
				}
			}

			pbits &= ~PG_RW;

			/* Write back (and schedule a flush) only on change. */
			if (pbits != ptbase[sindex]) {
				ptbase[sindex] = pbits;
				anychanged = 1;
			}
		}
	}
	if (anychanged)
		pmap_TLB_invalidate_all(pmap);
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
*/ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_offset_t pa; register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va); if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } if (smp_active) { pdeaddr = (vm_offset_t *) IdlePTDS[PCPU_GET(cpuid)]; if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) { if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr)) printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr); printf("cpuid: %d, pdeaddr: 0x%x\n", PCPU_GET(cpuid), pdeaddr); panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], newpte, origpte, va); } } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n", (void *)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m) & PG_FRAME; origpte = *(vm_offset_t *)pte; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. 
Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { *pte |= PG_RW; #ifdef SMP cpu_invlpg((void *)va); if (pmap->pm_active & PCPU_GET(other_cpus)) smp_invltlb(); #else invltlb_1pg(va); #endif } return; } /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. 
*/ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte | PG_A; /*if (origpte)*/ { #ifdef SMP cpu_invlpg((void *)va); if (pmap->pm_active & PCPU_GET(other_cpus)) smp_invltlb(); #else invltlb_1pg(va); #endif } } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap, va, m, mpte) register pmap_t pmap; vm_offset_t va; vm_page_t m; vm_page_t mpte; { unsigned *pte; vm_offset_t pa; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { unsigned ptepindex; vm_offset_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } if (mpte == NULL) goto retry; mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); return 0; } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. 
*/ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) pmap_insert_entry(pmap, va, mpte, m); /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) *pte = pa | PG_V | PG_U; else *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_offset_t pa, int i) { pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); return ((void *)crashdumpmap); } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; int objpgs; if (pmap == NULL || object == NULL) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. 
*/ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0) ) { int i; vm_page_t m[1]; unsigned int ptepindex; int npdes; vm_offset_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) return; retry: p = vm_page_lookup(object, pindex); if (p && vm_page_sleep_busy(p, FALSE, "init4p")) goto retry; if (p == NULL) { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_free(p); return; } p = vm_page_lookup(object, pindex); vm_page_wakeup(p); } ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i=0;ipm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } vm_page_flag_set(p, PG_MAPPED); invltlb(); return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) { if (object->size < pindex) return; psize = object->size - pindex; } mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->resident_page_count >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), p, mpte); vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. 
*/ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_busy(p); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), p, mpte); vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE -4 * PAGE_SIZE, 4 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; vm_object_t object; if (!curproc || (pmap != vmspace_pmap(curproc->p_vmspace))) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = 
vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } vm_page_busy(m); mpte = pmap_enter_quick(pmap, addr, m, mpte); vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
*/
void
pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
	pmap_t dst_pmap, src_pmap;
	vm_offset_t dst_addr;
	vm_size_t len;
	vm_offset_t src_addr;
{
	vm_offset_t addr;
	vm_offset_t end_addr = src_addr + len;
	vm_offset_t pdnxt;
	unsigned src_frame, dst_frame;
	vm_page_t m;

	/* Only same-address copies are handled. */
	if (dst_addr != src_addr)
		return;

	/* The source must be the pmap whose page directory PTDpde refers to. */
	src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME;
	if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) {
		return;
	}

	/* Point APTDpde at the destination's page directory if it is not already. */
	dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME;
	if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) {
		APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V);
#if defined(SMP)
		/* The page directory is not shared between CPUs */
		cpu_invltlb();
#else
		invltlb();
#endif
	}

	/* Outer loop: one page-directory entry's worth of addresses per pass. */
	for(addr = src_addr; addr < end_addr; addr = pdnxt) {
		unsigned *src_pte, *dst_pte;
		vm_page_t dstmpte, srcmpte;
		vm_offset_t srcptepaddr;
		unsigned ptepindex;

		if (addr >= UPT_MIN_ADDRESS)
			panic("pmap_copy: invalid to pmap_copy page tables\n");

		/*
		 * Don't let optional prefaulting of pages make us go
		 * way below the low water mark of free pages or way
		 * above high water mark of used pv entries.
		 */
		if (cnt.v_free_count < cnt.v_free_reserved ||
		    pv_entry_count > pv_entry_high_water)
			break;

		/* Start of the next page-directory entry's range. */
		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
		ptepindex = addr >> PDRSHIFT;

		srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
		if (srcptepaddr == 0)
			continue;

		/* Large (PG_PS) mapping: copy the directory entry itself. */
		if (srcptepaddr & PG_PS) {
			if (dst_pmap->pm_pdir[ptepindex] == 0) {
				dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
			}
			continue;
		}

		/* Skip source page table pages that are absent, unheld or busy. */
		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
		if ((srcmpte == NULL) ||
			(srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
			continue;

		if (pdnxt > end_addr)
			pdnxt = end_addr;

		src_pte = (unsigned *) vtopte(addr);
		dst_pte = (unsigned *) avtopte(addr);
		while (addr < pdnxt) {
			unsigned ptetemp;
			ptetemp = *src_pte;
			/*
			 * we only virtual copy managed pages
			 */
			if ((ptetemp & PG_MANAGED) != 0) {
				/*
				 * We have to check after allocpte for the
				 * pte still being around...  allocpte can
				 * block.
				 */
				dstmpte = pmap_allocpte(dst_pmap, addr);
				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
					/*
					 * Clear the modified and
					 * accessed (referenced) bits
					 * during the copy.
					 */
					m = PHYS_TO_VM_PAGE(ptetemp);
					*dst_pte = ptetemp & ~(PG_M | PG_A);
					dst_pmap->pm_stats.resident_count++;
					pmap_insert_entry(dst_pmap, addr,
						dstmpte, m);
				} else {
					/* Nothing installed: drop the hold taken by pmap_allocpte. */
					pmap_unwire_pte_hold(dst_pmap, dstmpte);
				}
				/* Leave this page table once dst holds as many entries as src. */
				if (dstmpte->hold_count >= srcmpte->hold_count)
					break;
			}
			addr += PAGE_SIZE;
			src_pte++;
			dst_pte++;
		}
	}
}

/*
 *	Routine:	pmap_kernel
 *	Function:
 *		Returns the physical map handle for the kernel.
 */
pmap_t
pmap_kernel()
{
	return (kernel_pmap);
}

/*
 *	pmap_zero_page zeros the specified hardware page by mapping
 *	the page into KVM and using bzero to clear its contents.
*/ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP2 busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M; invltlb_1pg((vm_offset_t)CADDR2); #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR2); else #endif bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(phys, off, size) vm_offset_t phys; int off; int size; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP2 busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M; invltlb_1pg((vm_offset_t)CADDR2); #if defined(I686_CPU) if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE) i686_pagezero(CADDR2); else #endif bzero((char *)CADDR2 + off, size); *(int *) CMAP2 = 0; } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*(int *) CMAP2) panic("pmap_copy_page: CMAP2 busy"); *(int *) CMAP1 = PG_V | (src & PG_FRAME) | PG_A; *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else invlpg((u_int)CADDR1); invlpg((u_int)CADDR2); #endif bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. 
*/
void
pmap_pageable(pmap, sva, eva, pageable)
	pmap_t pmap;
	vm_offset_t sva, eva;
	boolean_t pageable;
{
	/* Intentionally empty: this implementation takes no action. */
}

/*
 * this routine returns true if a physical page resides
 * in the given pmap.
 */
boolean_t
pmap_page_exists(pmap, m)
	pmap_t pmap;
	vm_page_t m;
{
	register pv_entry_t pv;
	int s;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return FALSE;

	s = splvm();

	/*
	 * Check the page's current mappings, returning immediately if one
	 * belongs to the given pmap.
	 */
	for (pv = TAILQ_FIRST(&m->md.pv_list);
		pv;
		pv = TAILQ_NEXT(pv, pv_list)) {
		if (pv->pv_pmap == pmap) {
			splx(s);
			return TRUE;
		}
	}
	splx(s);
	return (FALSE);
}

#define PMAP_REMOVE_PAGES_CURPROC_ONLY
/*
 * Remove all pages from specified address space
 * this aids process exit speeds.  Also, this code
 * is special cased for current process only, but
 * can have the more generic (and slightly slower)
 * mode enabled.  This is much faster than pmap_remove
 * in the case of running down an entire address space.
 *
 * Only pv entries with sva <= va < eva are touched; wired mappings are
 * left in place.
 */
void
pmap_remove_pages(pmap, sva, eva)
	pmap_t pmap;
	vm_offset_t sva, eva;
{
	unsigned *pte, tpte;
	pv_entry_t pv, npv;
	int s;
	vm_page_t m;

#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
	if (!curproc || (pmap != vmspace_pmap(curproc->p_vmspace))) {
		printf("warning: pmap_remove_pages called with non-current pmap\n");
		return;
	}
#endif

	s = splvm();
	/* npv is fetched before pv is unlinked and freed. */
	for(pv = TAILQ_FIRST(&pmap->pm_pvlist);
		pv;
		pv = npv) {

		if (pv->pv_va >= eva || pv->pv_va < sva) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}

#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
		pte = (unsigned *)vtopte(pv->pv_va);
#else
		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
#endif
		tpte = *pte;

/*
 * We cannot remove wired pages from a process' mapping at this time
 */
		if (tpte & PG_W) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}
		*pte = 0;

		m = PHYS_TO_VM_PAGE(tpte);

		KASSERT(m < &vm_page_array[vm_page_array_size],
			("pmap_remove_pages: bad tpte %x", tpte));

		pv->pv_pmap->pm_stats.resident_count--;

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (tpte & PG_M) {
			vm_page_dirty(m);
		}

		/* Unlink the pv entry from both the pmap's and the page's lists. */
		npv = TAILQ_NEXT(pv, pv_plist);
		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);

		m->md.pv_list_count--;
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
			/* That was the page's last mapping. */
			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
		}

		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
		free_pv_entry(pv);
	}
	splx(s);
	pmap_TLB_invalidate_all(pmap);
}

/*
 * pmap_testbit tests bits in pte's
 * note that the testbit/changebit routines are inline,
 * and a lot of things compile-time evaluate.
 *
 * Returns TRUE as soon as any mapping of page m has one of the requested
 * bits set in its pte, FALSE otherwise.
 */
static boolean_t
pmap_testbit(m, bit)
	vm_page_t m;
	int bit;
{
	pv_entry_t pv;
	unsigned *pte;
	int s;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return FALSE;

	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
		return FALSE;

	s = splvm();

	for (pv = TAILQ_FIRST(&m->md.pv_list);
		pv;
		pv = TAILQ_NEXT(pv, pv_list)) {

		/*
		 * if the bit being tested is the modified or accessed bit,
		 * skip mappings whose address pmap_track_modified() rejects,
		 * so they are treated as never modified.
		 */
		if (bit & (PG_A|PG_M)) {
			if (!pmap_track_modified(pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (!pv->pv_pmap) {
			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
			continue;
		}
#endif
		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
		if (*pte & bit) {
			splx(s);
			return TRUE;
		}
	}
	splx(s);
	return (FALSE);
}

/*
 * this routine is used to modify bits in ptes
 *
 * For every mapping of page m, set (setem true) or clear (setem false)
 * the given pte bit and invalidate the TLB entry when the pte changed.
 */
static __inline void
pmap_changebit(m, bit, setem)
	vm_page_t m;
	int bit;
	boolean_t setem;
{
	register pv_entry_t pv;
	register unsigned *pte;
	int s;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return;

	s = splvm();

	/*
	 * Loop over all current mappings setting/clearing as appropos If
	 * setting RO do we need to clear the VAC?
	 */
	for (pv = TAILQ_FIRST(&m->md.pv_list);
		pv;
		pv = TAILQ_NEXT(pv, pv_list)) {

		/*
		 * don't write protect pager mappings
		 */
		if (!setem && (bit == PG_RW)) {
			if (!pmap_track_modified(pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (!pv->pv_pmap) {
			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
			continue;
		}
#endif

		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);

		if (setem) {
			*(int *)pte |= bit;
			pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);
		} else {
			vm_offset_t pbits = *(vm_offset_t *)pte;
			if (pbits & bit) {
				if (bit == PG_RW) {
					/*
					 * Removing write access also clears
					 * PG_M, so record the dirty state in
					 * the vm_page first.
					 */
					if (pbits & PG_M) {
						vm_page_dirty(m);
					}
					*(int *)pte = pbits & ~(PG_M|PG_RW);
				} else {
					*(int *)pte = pbits & ~bit;
				}
				pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);
			}
		}
	}
	splx(s);
}

/*
 *      pmap_page_protect:
 *
 *      Lower the permission for all mappings to a given page.
 *
 *      A prot that still allows writing is a no-op; read and/or execute
 *      only write-protects the mappings; anything else removes them.
 */
void
pmap_page_protect(vm_page_t m, vm_prot_t prot)
{
	if ((prot & VM_PROT_WRITE) == 0) {
		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
			pmap_changebit(m, PG_RW, FALSE);
		} else {
			pmap_remove_all(m);
		}
	}
}

/* Convert a physical page number to a physical byte address. */
vm_offset_t
pmap_phys_address(ppn)
	int ppn;
{
	return (i386_ptob(ppn));
}

/*
 *	pmap_ts_referenced:
 *
 *	Return the count of reference bits for a page, clearing the ones
 *	it counts.  The scan stops once the count exceeds 4, so not every
 *	mapping is necessarily visited.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	register pv_entry_t pv, pvf, pvn;
	unsigned *pte;
	int s;
	int rtval = 0;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return (rtval);

	s = splvm();

	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {

		/* Remember the first entry so one full rotation ends the loop. */
		pvf = pv;

		do {
			pvn = TAILQ_NEXT(pv, pv_list);

			/* Move each visited entry to the tail of the list. */
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);

			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);

			if (!pmap_track_modified(pv->pv_va))
				continue;

			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);

			if (pte && (*pte & PG_A)) {
				*pte &= ~PG_A;

				pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);

				rtval++;
				if (rtval > 4) {
					break;
				}
			}
		} while ((pv = pvn) != NULL && pv != pvf);
	}
	splx(s);

	return (rtval);
}

/*
 *	pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
*/ boolean_t pmap_is_modified(vm_page_t m) { return pmap_testbit(m, PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_changebit(m, PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pmap_changebit(m, PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
*/ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva, offset; unsigned *pte; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); GIANT_REQUIRED; va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } invltlb(); return ((void *)(va + offset)); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { vm_offset_t base, offset; base = va & PG_FRAME; offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); kmem_free(kernel_map, base, size); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; vm_page_t m; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if ((pte = *ptep) != 0) { vm_offset_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } } return val; } void pmap_activate(struct proc *p) { pmap_t pmap; pmap = vmspace_pmap(p->p_vmspace); #if defined(SMP) pmap->pm_active |= 1 << PCPU_GET(cpuid); #else pmap->pm_active |= 1; #endif #if defined(SWTCH_OPTIM_STATS) tlb_flush_count++; #endif load_cr3(p->p_addr->u_pcb.pcb_cr3 = vtophys(pmap->pm_pdir)); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != 
OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte_quick( pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #ifdef used_to_be printf(" -> pmap %p, va %x, flags %x", (void *)pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); 
pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/i386/sys_machdep.c =================================================================== --- head/sys/i386/i386/sys_machdep.c (revision 82308) +++ head/sys/i386/i386/sys_machdep.c (revision 82309) @@ -1,535 +1,537 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)sys_machdep.c 5.5 (Berkeley) 1/19/91
 * $FreeBSD$
 *
 */

+#include "opt_upages.h"
+
/* NOTE(review): the header names on the #include lines below are missing
 * in this copy of the file - presumably lost in extraction; confirm
 * against the repository. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include /* pcb.h included by sys/user.h */
#include
#include /* for kernel_map */

/* LDT sizing: descriptors are allocated in multiples of LD_PER_PAGE. */
#define MAX_LD 8192
#define LD_PER_PAGE 512
#define NEW_MAX_LD(num) ((num + LD_PER_PAGE) & ~(LD_PER_PAGE-1))
#define SIZE_FROM_LARGEST_LD(num) (NEW_MAX_LD(num) << 3)


static int i386_get_ldt __P((struct proc *, char *));
static int i386_set_ldt __P((struct proc *, char *));
static int i386_get_ioperm __P((struct proc *, char *));
static int i386_set_ioperm __P((struct proc *, char *));
#ifdef SMP
static void set_user_ldt_rv __P((struct pcb *));
#endif

#ifndef _SYS_SYSPROTO_H_
struct sysarch_args {
	int op;
	char *parms;
};
#endif

/*
 * sysarch system call: dispatch on uap->op to the matching i386-specific
 * handler, passing the user pointer uap->parms through untouched.
 * Unknown operations return EOPNOTSUPP.
 */
int
sysarch(p, uap)
	struct proc *p;
	register struct sysarch_args *uap;
{
	int error = 0;

	switch(uap->op) {
	case I386_GET_LDT:
		error = i386_get_ldt(p, uap->parms);
		break;

	case I386_SET_LDT:
		error = i386_set_ldt(p, uap->parms);
		break;
	case I386_GET_IOPERM:
		error = i386_get_ioperm(p, uap->parms);
		break;
	case I386_SET_IOPERM:
		error = i386_set_ioperm(p, uap->parms);
		break;
	case I386_VM86:
		error = vm86_sysarch(p, uap->parms);
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

/*
 * Allocate and initialise a pcb extension (private TSS, i/o permission
 * bitmap and vm86 interrupt map) for the current process and attach it to
 * the pcb.  Returns 0 on success or ENOMEM if the allocation fails.
 */
int
i386_extend_pcb(struct proc *p)
{
	int i, offset;
	u_long *addr;
	struct pcb_ext *ext;
	struct soft_segment_descriptor ssd = {
		0,			/* segment base address (overwritten) */
		ctob(IOPAGES + 1) - 1,	/* length */
		SDT_SYS386TSS,		/* segment type */
		0,			/* priority level */
		1,			/* descriptor present */
		0, 0,
		0,			/* default 32 size */
		0			/* granularity */
	};

	ext = (struct pcb_ext *)kmem_alloc(kernel_map, ctob(IOPAGES+1));
	if (ext == 0)
		return (ENOMEM);
	bzero(ext, sizeof(struct pcb_ext));
	ext->ext_tss.tss_esp0 = (unsigned)p->p_addr + ctob(UPAGES) - 16;
	ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	/*
	 * The last byte of the i/o map must be followed by an 0xff byte.
	 * We arbitrarily allocate 16 bytes here, to keep the starting
	 * address on a doubleword boundary.
	 */
	offset = PAGE_SIZE - 16;
	ext->ext_tss.tss_ioopt =
	    (offset - ((unsigned)&ext->ext_tss - (unsigned)ext)) << 16;
	ext->ext_iomap = (caddr_t)ext + offset;
	ext->ext_vm86.vm86_intmap = (caddr_t)ext + offset - 32;

	/* Set every bit from the interrupt map through the end of the i/o map. */
	addr = (u_long *)ext->ext_vm86.vm86_intmap;
	for (i = 0; i < (ctob(IOPAGES) + 32 + 16) / sizeof(u_long); i++)
		*addr++ = ~0;

	/* The descriptor covers the TSS, which sits inside the extension. */
	ssd.ssd_base = (unsigned)&ext->ext_tss;
	ssd.ssd_limit -= ((unsigned)&ext->ext_tss - (unsigned)ext);
	ssdtosd(&ssd, &ext->ext_tssd);

	KASSERT(p == curproc, ("giving a TSS to non-curproc"));
	KASSERT(p->p_addr->u_pcb.pcb_ext == 0, ("already have a TSS!"));
	mtx_lock_spin(&sched_lock);
	p->p_addr->u_pcb.pcb_ext = ext;
	/* switch to the new TSS after syscall completes */
	p->p_sflag |= PS_NEEDRESCHED;
	mtx_unlock_spin(&sched_lock);

	return 0;
}

/*
 * I386_SET_IOPERM: enable or disable access to the i/o port range
 * [start, start + length) in the process's i/o bitmap.  Requires superuser
 * and securelevel <= 0.  A cleared bit in the map grants access.
 */
static int
i386_set_ioperm(p, args)
	struct proc *p;
	char *args;
{
	int i, error;
	struct i386_ioperm_args ua;
	char *iomap;

	if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0)
		return (error);

	if ((error = suser(p)) != 0)
		return (error);
	if (securelevel > 0)
		return (EPERM);
	/*
	 * XXX
	 * While this is restricted to root, we should probably figure out
	 * whether any other driver is using this i/o address, as so not to
	 * cause confusion.  This probably requires a global 'usage registry'.
	 */

	/* Create the pcb extension (and its bitmap) on first use. */
	if (p->p_addr->u_pcb.pcb_ext == 0)
		if ((error = i386_extend_pcb(p)) != 0)
			return (error);
	iomap = (char *)p->p_addr->u_pcb.pcb_ext->ext_iomap;

	if (ua.start + ua.length > IOPAGES * PAGE_SIZE * NBBY)
		return (EINVAL);

	for (i = ua.start; i < ua.start + ua.length; i++) {
		if (ua.enable)
			iomap[i >> 3] &= ~(1 << (i & 7));
		else
			iomap[i >> 3] |= (1 << (i & 7));
	}
	return (error);
}

/*
 * I386_GET_IOPERM: report the access state of port ua.start and the
 * length of the run of ports, starting there, that share that state.
 * A process without a pcb extension gets length 0.
 */
static int
i386_get_ioperm(p, args)
	struct proc *p;
	char *args;
{
	int i, state, error;
	struct i386_ioperm_args ua;
	char *iomap;

	if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0)
		return (error);
	if (ua.start >= IOPAGES * PAGE_SIZE * NBBY)
		return (EINVAL);

	if (p->p_addr->u_pcb.pcb_ext == 0) {
		ua.length = 0;
		goto done;
	}

	iomap = (char *)p->p_addr->u_pcb.pcb_ext->ext_iomap;

	/* The map bit is inverted relative to "enable". */
	i = ua.start;
	state = (iomap[i >> 3] >> (i & 7)) & 1;
	ua.enable = !state;
	ua.length = 1;

	for (i = ua.start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) {
		if (state != ((iomap[i >> 3] >> (i & 7)) & 1))
			break;
		ua.length++;
	}

done:
	error = copyout(&ua, args, sizeof(struct i386_ioperm_args));
	return (error);
}

/*
 * Update the GDT entry pointing to the LDT to point to the LDT of the
 * current process.
 *
 * This must be called with sched_lock held.  Unfortunately, we can't use a
 * mtx_assert() here because cpu_switch() calls this function after changing
 * curproc but before sched_lock's owner is updated in mi_switch().
 */
void
set_user_ldt(struct pcb *pcb)
{
	struct pcb_ldt *pcb_ldt;

	pcb_ldt = pcb->pcb_ldt;
#ifdef SMP
	gdt[PCPU_GET(cpuid) * NGDT + GUSERLDT_SEL].sd = pcb_ldt->ldt_sd;
#else
	gdt[GUSERLDT_SEL].sd = pcb_ldt->ldt_sd;
#endif
	lldt(GSEL(GUSERLDT_SEL, SEL_KPL));
	PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL));
}

#ifdef SMP
/*
 * Rendezvous handler: reload the LDT, under sched_lock, only on the CPU
 * whose current pcb is the one given.
 */
static void
set_user_ldt_rv(struct pcb *pcb)
{

	if (pcb != PCPU_GET(curpcb))
		return;

	mtx_lock_spin(&sched_lock);
	set_user_ldt(pcb);
	mtx_unlock_spin(&sched_lock);
}
#endif

/*
 * Must be called with either sched_lock free or held but not recursed.
* If it does not return NULL, it will return with it owned. */ struct pcb_ldt * user_ldt_alloc(struct pcb *pcb, int len) { struct pcb_ldt *pcb_ldt, *new_ldt; if (mtx_owned(&sched_lock)) mtx_unlock_spin(&sched_lock); mtx_assert(&sched_lock, MA_NOTOWNED); MALLOC(new_ldt, struct pcb_ldt *, sizeof(struct pcb_ldt), M_SUBPROC, M_WAITOK); new_ldt->ldt_len = len = NEW_MAX_LD(len); new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map, len * sizeof(union descriptor)); if (new_ldt->ldt_base == NULL) { FREE(new_ldt, M_SUBPROC); return NULL; } new_ldt->ldt_refcnt = 1; new_ldt->ldt_active = 0; mtx_lock_spin(&sched_lock); gdt_segs[GUSERLDT_SEL].ssd_base = (unsigned)new_ldt->ldt_base; gdt_segs[GUSERLDT_SEL].ssd_limit = len * sizeof(union descriptor) - 1; ssdtosd(&gdt_segs[GUSERLDT_SEL], &new_ldt->ldt_sd); if ((pcb_ldt = pcb->pcb_ldt)) { if (len > pcb_ldt->ldt_len) len = pcb_ldt->ldt_len; bcopy(pcb_ldt->ldt_base, new_ldt->ldt_base, len * sizeof(union descriptor)); } else { bcopy(ldt, new_ldt->ldt_base, sizeof(ldt)); } return new_ldt; } /* * Must be called either with sched_lock free or held but not recursed. * If pcb->pcb_ldt is not NULL, it will return with sched_lock released. 
*/ void user_ldt_free(struct pcb *pcb) { struct pcb_ldt *pcb_ldt = pcb->pcb_ldt; if (pcb_ldt == NULL) return; if (!mtx_owned(&sched_lock)) mtx_lock_spin(&sched_lock); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); if (pcb == PCPU_GET(curpcb)) { lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); } pcb->pcb_ldt = NULL; if (--pcb_ldt->ldt_refcnt == 0) { mtx_unlock_spin(&sched_lock); kmem_free(kernel_map, (vm_offset_t)pcb_ldt->ldt_base, pcb_ldt->ldt_len * sizeof(union descriptor)); FREE(pcb_ldt, M_SUBPROC); } else mtx_unlock_spin(&sched_lock); } static int i386_get_ldt(p, args) struct proc *p; char *args; { int error = 0; struct pcb *pcb = &p->p_addr->u_pcb; struct pcb_ldt *pcb_ldt = pcb->pcb_ldt; int nldt, num; union descriptor *lp; struct i386_ldt_args ua, *uap = &ua; if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) return(error); #ifdef DEBUG printf("i386_get_ldt: start=%d num=%d descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif /* verify range of LDTs exist */ if ((uap->start < 0) || (uap->num <= 0)) return(EINVAL); if (pcb_ldt) { nldt = pcb_ldt->ldt_len; num = min(uap->num, nldt); lp = &((union descriptor *)(pcb_ldt->ldt_base))[uap->start]; } else { nldt = sizeof(ldt)/sizeof(ldt[0]); num = min(uap->num, nldt); lp = &ldt[uap->start]; } if (uap->start > nldt) return(EINVAL); error = copyout(lp, uap->descs, num * sizeof(union descriptor)); if (!error) p->p_retval[0] = num; return(error); } static int i386_set_ldt(p, args) struct proc *p; char *args; { int error = 0, i, n; int largest_ld; struct pcb *pcb = &p->p_addr->u_pcb; struct pcb_ldt *pcb_ldt = pcb->pcb_ldt; struct i386_ldt_args ua, *uap = &ua; caddr_t old_ldt_base; int old_ldt_len; critical_t savecrit; if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) return(error); #ifdef DEBUG printf("i386_set_ldt: start=%d num=%d descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif /* verify range of descriptors to modify */ if ((uap->start < 0) || 
(uap->start >= MAX_LD) || (uap->num < 0) || (uap->num > MAX_LD)) { return(EINVAL); } largest_ld = uap->start + uap->num - 1; if (largest_ld >= MAX_LD) return(EINVAL); /* allocate user ldt */ if (!pcb_ldt || largest_ld >= pcb_ldt->ldt_len) { struct pcb_ldt *new_ldt = user_ldt_alloc(pcb, largest_ld); if (new_ldt == NULL) return ENOMEM; if (pcb_ldt) { old_ldt_base = pcb_ldt->ldt_base; old_ldt_len = pcb_ldt->ldt_len; pcb_ldt->ldt_sd = new_ldt->ldt_sd; pcb_ldt->ldt_base = new_ldt->ldt_base; pcb_ldt->ldt_len = new_ldt->ldt_len; mtx_unlock_spin(&sched_lock); kmem_free(kernel_map, (vm_offset_t)old_ldt_base, old_ldt_len * sizeof(union descriptor)); FREE(new_ldt, M_SUBPROC); #ifndef SMP mtx_lock_spin(&sched_lock); #endif } else { pcb->pcb_ldt = pcb_ldt = new_ldt; #ifdef SMP mtx_unlock_spin(&sched_lock); #endif } #ifdef SMP /* signal other cpus to reload ldt */ smp_rendezvous(NULL, (void (*)(void *))set_user_ldt_rv, NULL, pcb); #else set_user_ldt(pcb); mtx_unlock_spin(&sched_lock); #endif } /* Check descriptors for access violations */ for (i = 0, n = uap->start; i < uap->num; i++, n++) { union descriptor desc, *dp; dp = &uap->descs[i]; error = copyin(dp, &desc, sizeof(union descriptor)); if (error) return(error); switch (desc.sd.sd_type) { case SDT_SYSNULL: /* system null */ desc.sd.sd_p = 0; break; case SDT_SYS286TSS: /* system 286 TSS available */ case SDT_SYSLDT: /* system local descriptor table */ case SDT_SYS286BSY: /* system 286 TSS busy */ case SDT_SYSTASKGT: /* system task gate */ case SDT_SYS286IGT: /* system 286 interrupt gate */ case SDT_SYS286TGT: /* system 286 trap gate */ case SDT_SYSNULL2: /* undefined by Intel */ case SDT_SYS386TSS: /* system 386 TSS available */ case SDT_SYSNULL3: /* undefined by Intel */ case SDT_SYS386BSY: /* system 386 TSS busy */ case SDT_SYSNULL4: /* undefined by Intel */ case SDT_SYS386IGT: /* system 386 interrupt gate */ case SDT_SYS386TGT: /* system 386 trap gate */ case SDT_SYS286CGT: /* system 286 call gate */ case SDT_SYS386CGT: 
/* system 386 call gate */ /* I can't think of any reason to allow a user proc * to create a segment of these types. They are * for OS use only. */ return EACCES; /*NOTREACHED*/ /* memory segment types */ case SDT_MEMEC: /* memory execute only conforming */ case SDT_MEMEAC: /* memory execute only accessed conforming */ case SDT_MEMERC: /* memory execute read conforming */ case SDT_MEMERAC: /* memory execute read accessed conforming */ /* Must be "present" if executable and conforming. */ if (desc.sd.sd_p == 0) return (EACCES); break; case SDT_MEMRO: /* memory read only */ case SDT_MEMROA: /* memory read only accessed */ case SDT_MEMRW: /* memory read write */ case SDT_MEMRWA: /* memory read write accessed */ case SDT_MEMROD: /* memory read only expand dwn limit */ case SDT_MEMRODA: /* memory read only expand dwn lim accessed */ case SDT_MEMRWD: /* memory read write expand dwn limit */ case SDT_MEMRWDA: /* memory read write expand dwn lim acessed */ case SDT_MEME: /* memory execute only */ case SDT_MEMEA: /* memory execute only accessed */ case SDT_MEMER: /* memory execute read */ case SDT_MEMERA: /* memory execute read accessed */ break; default: return(EINVAL); /*NOTREACHED*/ } /* Only user (ring-3) descriptors may be present. */ if ((desc.sd.sd_p != 0) && (desc.sd.sd_dpl != SEL_UPL)) return (EACCES); } /* Fill in range */ savecrit = critical_enter(); error = copyin(uap->descs, &((union descriptor *)(pcb_ldt->ldt_base))[uap->start], uap->num * sizeof(union descriptor)); if (!error) p->p_retval[0] = uap->start; critical_exit(savecrit); return(error); } Index: head/sys/i386/i386/vm_machdep.c =================================================================== --- head/sys/i386/i386/vm_machdep.c (revision 82308) +++ head/sys/i386/i386/vm_machdep.c (revision 82309) @@ -1,587 +1,588 @@ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * All rights reserved. 
* * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ * $FreeBSD$ */ #include "opt_npx.h" #ifdef PC98 #include "opt_pc98.h" #endif #include "opt_reset.h" #include "opt_isa.h" +#include "opt_upages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PC98 #include #else #include #endif static void cpu_reset_real __P((void)); #ifdef SMP static void cpu_reset_proxy __P((void)); static u_int cpu_reset_proxyid; static volatile u_int cpu_reset_proxy_active; #endif extern int _ucodesel, _udatasel; /* * quick version of vm_fault */ int vm_fault_quick(v, prot) caddr_t v; int prot; { int r; if (prot & VM_PROT_WRITE) r = subyte(v, fubyte(v)); else r = fubyte(v); return(r); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child * ready to run and return to user mode. */ void cpu_fork(p1, p2, flags) register struct proc *p1, *p2; int flags; { struct pcb *pcb2; #ifdef DEV_NPX int savecrit; #endif if ((flags & RFPROC) == 0) { if ((flags & RFMEM) == 0) { /* unshare user LDT */ struct pcb *pcb1 = &p1->p_addr->u_pcb; struct pcb_ldt *pcb_ldt = pcb1->pcb_ldt; if (pcb_ldt && pcb_ldt->ldt_refcnt > 1) { pcb_ldt = user_ldt_alloc(pcb1,pcb_ldt->ldt_len); if (pcb_ldt == NULL) panic("could not copy LDT"); pcb1->pcb_ldt = pcb_ldt; set_user_ldt(pcb1); user_ldt_free(pcb1); } } return; } /* Ensure that p1's pcb is up to date. */ #ifdef DEV_NPX if (p1 == curproc) p1->p_addr->u_pcb.pcb_gs = rgs(); savecrit = critical_enter(); if (PCPU_GET(npxproc) == p1) npxsave(&p1->p_addr->u_pcb.pcb_save); critical_exit(savecrit); #endif /* Copy p1's pcb. */ p2->p_addr->u_pcb = p1->p_addr->u_pcb; pcb2 = &p2->p_addr->u_pcb; /* * Create a new fresh stack for the new process. 
* Copy the trap frame for the return to user mode as if from a * syscall. This copies most of the user mode register values. */ p2->p_frame = (struct trapframe *) ((int)p2->p_addr + UPAGES * PAGE_SIZE - 16) - 1; bcopy(p1->p_frame, p2->p_frame, sizeof(struct trapframe)); p2->p_frame->tf_eax = 0; /* Child returns zero */ p2->p_frame->tf_eflags &= ~PSL_C; /* success */ p2->p_frame->tf_edx = 1; /* * Set registers for trampoline to user mode. Leave space for the * return address on stack. These are the kernel mode register values. */ pcb2->pcb_cr3 = vtophys(vmspace_pmap(p2->p_vmspace)->pm_pdir); pcb2->pcb_edi = 0; pcb2->pcb_esi = (int)fork_return; /* fork_trampoline argument */ pcb2->pcb_ebp = 0; pcb2->pcb_esp = (int)p2->p_frame - sizeof(void *); pcb2->pcb_ebx = (int)p2; /* fork_trampoline argument */ pcb2->pcb_eip = (int)fork_trampoline; /*- * pcb2->pcb_dr*: cloned above. * pcb2->pcb_ldt: duplicated below, if necessary. * pcb2->pcb_savefpu: cloned above. * pcb2->pcb_flags: cloned above. * pcb2->pcb_onfault: cloned above (always NULL here?). * pcb2->pcb_gs: cloned above. * pcb2->pcb_ext: cleared below. */ /* * XXX don't copy the i/o pages. this should probably be fixed. */ pcb2->pcb_ext = 0; /* Copy the LDT, if necessary. */ mtx_lock_spin(&sched_lock); if (pcb2->pcb_ldt != 0) { if (flags & RFMEM) { pcb2->pcb_ldt->ldt_refcnt++; } else { pcb2->pcb_ldt = user_ldt_alloc(pcb2, pcb2->pcb_ldt->ldt_len); if (pcb2->pcb_ldt == NULL) panic("could not copy LDT"); } } mtx_unlock_spin(&sched_lock); /* * Now, cpu_switch() can schedule the new process. * pcb_esp is loaded pointing to the cpu_switch() stack frame * containing the return address when exiting cpu_switch. * This will normally be to fork_trampoline(), which will have * %ebx loaded with the new proc's pointer. fork_trampoline() * will set up a stack to call fork_return(p, frame); to complete * the return to user-mode. */ } /* * Intercept the return address from a freshly forked process that has NOT * been scheduled yet. 
*
 * This is needed to make kernel threads stay in kernel mode.
 *
 * The handler and its argument are stored in p's pcb (pcb_esi and pcb_ebx);
 * no other pcb field is modified here.
 */
void
cpu_set_fork_handler(p, func, arg)
	struct proc *p;
	void (*func) __P((void *));
	void *arg;
{
	/*
	 * Note that the trap frame follows the args, so the function
	 * is really called like this:  func(arg, frame);
	 */
	p->p_addr->u_pcb.pcb_esi = (int) func;	/* function */
	p->p_addr->u_pcb.pcb_ebx = (int) arg;	/* first arg */
}

/*
 * Machine-dependent part of process exit for p.
 *
 * First releases the state hanging off p's pcb: the pcb_ext pages, the
 * user LDT and the hardware debug registers.  Then, with the proc lock and
 * sched_lock taken and Giant released, it marks p SZOMB, wakes the parent,
 * drops the proc lock and calls cpu_throw().  The trailing panic() is only
 * reached if cpu_throw() returns.
 */
void
cpu_exit(p)
	register struct proc *p;
{
	struct pcb *pcb = &p->p_addr->u_pcb;

#ifdef DEV_NPX
	npxexit(p);
#endif
	if (pcb->pcb_ext != 0) {
		/*
		 * XXX do we need to move the TSS off the allocated pages
		 * before freeing them?  (not done here)
		 */
		kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ext,
		    ctob(IOPAGES + 1));
		pcb->pcb_ext = 0;
	}
	if (pcb->pcb_ldt)
		user_ldt_free(pcb);
	if (pcb->pcb_flags & PCB_DBREGS) {
		/*
		 * disable all hardware breakpoints
		 */
		reset_dbregs();
		pcb->pcb_flags &= ~PCB_DBREGS;
	}
	PROC_LOCK(p);
	mtx_lock_spin(&sched_lock);
	/* Keep unlocking Giant until this context no longer owns it. */
	while (mtx_owned(&Giant))
		mtx_unlock_flags(&Giant, MTX_NOSWITCH);

	/*
	 * We have to wait until after releasing all locks before
	 * changing p_stat.  If we block on a mutex then we will be
	 * back at SRUN when we resume and our parent will never
	 * harvest us.
	 */
	p->p_stat = SZOMB;

	wakeup(p->p_pptr);
	PROC_UNLOCK_NOSWITCH(p);

	cnt.v_swtch++;		/* account for the switch away from p */
	cpu_throw();
	panic("cpu_exit");
}

/*
 * Final machine-dependent cleanup of an exited process p: disposes of its
 * pmap-level per-process resources and calls vmspace_free() on its vmspace.
 * NOTE(review): GIANT_REQUIRED presumably asserts that Giant is held by
 * the caller -- confirm against its definition.
 */
void
cpu_wait(p)
	struct proc *p;
{
	GIANT_REQUIRED;

	/* drop per-process resources */
	pmap_dispose_proc(p);

	/* and clean-out the vmspace */
	vmspace_free(p->p_vmspace);
}

/*
 * Dump the machine specific header information at the start of a core dump.
*/ int cpu_coredump(p, vp, cred) struct proc *p; struct vnode *vp; struct ucred *cred; { int error; caddr_t tempuser; tempuser = malloc(ctob(UPAGES), M_TEMP, M_WAITOK | M_ZERO); if (!tempuser) return EINVAL; bcopy(p->p_addr, tempuser, sizeof(struct user)); bcopy(p->p_frame, tempuser + ((caddr_t) p->p_frame - (caddr_t) p->p_addr), sizeof(struct trapframe)); error = vn_rdwr(UIO_WRITE, vp, (caddr_t) tempuser, ctob(UPAGES), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p); free(tempuser, M_TEMP); return error; } #ifdef notyet static void setredzone(pte, vaddr) u_short *pte; caddr_t vaddr; { /* eventually do this by setting up an expand-down stack segment for ss0: selector, allowing stack access down to top of u. this means though that protection violations need to be handled thru a double fault exception that must do an integral task switch to a known good context, within which a dump can be taken. a sensible scheme might be to save the initial context used by sched (that has physical memory mapped 1:1 at bottom) and take the dump while still in mapped mode */ } #endif /* * Convert kernel VA to physical address */ u_long kvtop(void *addr) { vm_offset_t va; va = pmap_kextract((vm_offset_t)addr); if (va == 0) panic("kvtop: zero page frame"); return((int)va); } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. */ void vmapbuf(bp) register struct buf *bp; { register caddr_t addr, v, kva; vm_offset_t pa; GIANT_REQUIRED; if ((bp->b_flags & B_PHYS) == 0) panic("vmapbuf"); for (v = bp->b_saveaddr, addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE, v += PAGE_SIZE) { /* * Do the vm_fault if needed; do the copy-on-write thing * when reading stuff off device into memory. 
*/ vm_fault_quick(addr, (bp->b_iocmd == BIO_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ); pa = trunc_page(pmap_kextract((vm_offset_t) addr)); if (pa == 0) panic("vmapbuf: page not present"); vm_page_hold(PHYS_TO_VM_PAGE(pa)); pmap_kenter((vm_offset_t) v, pa); } kva = bp->b_saveaddr; bp->b_saveaddr = bp->b_data; bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(bp) register struct buf *bp; { register caddr_t addr; vm_offset_t pa; GIANT_REQUIRED; if ((bp->b_flags & B_PHYS) == 0) panic("vunmapbuf"); for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE) { pa = trunc_page(pmap_kextract((vm_offset_t) addr)); pmap_kremove((vm_offset_t) addr); vm_page_unhold(PHYS_TO_VM_PAGE(pa)); } bp->b_data = bp->b_saveaddr; } /* * Force reset the processor by invalidating the entire address space! */ #ifdef SMP static void cpu_reset_proxy() { cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) ; /* Wait for other cpu to see that we've started */ stop_cpus((1<" */ invltlb(); /* NOTREACHED */ while(1); } int grow_stack(p, sp) struct proc *p; u_int sp; { int rv; rv = vm_map_growstack (p, sp); if (rv != KERN_SUCCESS) return (0); return (1); } /* * Software interrupt handler for queued VM system processing. */ void swi_vm(void *dummy) { if (busdma_swi_pending != 0) busdma_swi(); } /* * Tell whether this address is in some physical memory region. * Currently used by the kernel coredump code in order to avoid * dumping the ``ISA memory hole'' which could cause indefinite hangs, * or other unpredictable behaviour. */ int is_physical_memory(addr) vm_offset_t addr; { #ifdef DEV_ISA /* The ISA ``memory hole''. */ if (addr >= 0xa0000 && addr < 0x100000) return 0; #endif /* * stuff other tests for known memory-mapped devices (PCI?) 
* here */ return 1; } Index: head/sys/i386/include/globaldata.h =================================================================== --- head/sys/i386/include/globaldata.h (revision 82308) +++ head/sys/i386/include/globaldata.h (revision 82309) @@ -1,99 +1,79 @@ /*- * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_GLOBALDATA_H_ #define _MACHINE_GLOBALDATA_H_ #ifdef _KERNEL #include #include /* XXX */ #ifdef KTR_PERCPU #include #endif /* * This structure maps out the global data that needs to be kept on a * per-cpu basis. genassym uses this to generate offsets for the assembler * code, which also provides external symbols so that C can get at them as * though they were really globals. 
* * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. * The reason for doing it via a struct is so that an array of pointers * to each CPU's data can be set up for things like "check curproc on all * other processors" */ struct globaldata { struct globaldata *gd_prvspace; /* self-reference */ struct proc *gd_curproc; /* current process */ struct proc *gd_idleproc; /* idle process */ struct proc *gd_npxproc; struct pcb *gd_curpcb; /* current pcb */ struct timeval gd_switchtime; struct i386tss gd_common_tss; int gd_switchticks; struct segment_descriptor gd_common_tssd; struct segment_descriptor *gd_tss_gdt; int gd_currentldt; u_int gd_cpuid; /* this cpu number */ u_int gd_other_cpus; /* all other cpus */ SLIST_ENTRY(globaldata) gd_allcpu; struct lock_list_entry *gd_spinlocks; #ifdef KTR_PERCPU volatile int gd_ktr_idx; /* Index into trace table */ char *gd_ktr_buf; char gd_ktr_buf_data[KTR_SIZE]; #endif }; -#ifdef SMP -/* - * This is the upper (0xff800000) address space layout that is per-cpu. - * It is setup in locore.s and pmap.c for the BSP and in mp_machdep.c for - * each AP. genassym helps export this to the assembler code. - */ -struct privatespace { - /* page 0 - data page */ - struct globaldata globaldata; - char __filler0[PAGE_SIZE - sizeof(struct globaldata)]; - - /* page 1 - idle stack (UPAGES pages) */ - char idlestack[UPAGES * PAGE_SIZE]; - /* page 1+UPAGES... */ -}; - -extern struct privatespace SMP_prvspace[]; - -#endif - #endif /* _KERNEL */ #endif /* ! _MACHINE_GLOBALDATA_H_ */ Index: head/sys/i386/include/mptable.h =================================================================== --- head/sys/i386/include/mptable.h (revision 82308) +++ head/sys/i386/include/mptable.h (revision 82309) @@ -1,2440 +1,2442 @@ /* * Copyright (c) 1996, by Steve Passe * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "opt_cpu.h" +#include "opt_upages.h" #ifdef SMP #include #else #error #endif #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** TEST_DEFAULT_CONFIG, TEST_TEST1 */ #include #include #include +#include #if defined(APIC_IO) #include /* setidt() */ #include /* IPIs */ #include /* IPIs */ #endif /* APIC_IO */ #if defined(TEST_DEFAULT_CONFIG) #define MPFPS_MPFB1 TEST_DEFAULT_CONFIG #else #define MPFPS_MPFB1 mpfps->mpfb1 #endif /* TEST_DEFAULT_CONFIG */ #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #ifdef PC98 #define BIOS_BASE (0xe8000) #define BIOS_SIZE (0x18000) #else #define BIOS_BASE (0xf0000) #define BIOS_SIZE (0x10000) #endif #define BIOS_COUNT (BIOS_SIZE/4) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) #define PROCENTRY_FLAG_EN 0x01 #define PROCENTRY_FLAG_BP 0x02 #define IOAPICENTRY_FLAG_EN 0x01 /* MP Floating Pointer Structure */ typedef struct MPFPS { char signature[4]; void *pap; u_char length; u_char spec_rev; u_char checksum; u_char mpfb1; u_char mpfb2; u_char mpfb3; u_char mpfb4; u_char mpfb5; } *mpfps_t; /* MP Configuration Table Header */ typedef struct MPCTH { char signature[4]; u_short base_table_length; u_char spec_rev; u_char checksum; u_char oem_id[8]; u_char product_id[12]; void *oem_table_pointer; u_short oem_table_size; u_short entry_count; void *apic_address; u_short extended_table_length; u_char extended_table_checksum; u_char reserved; } *mpcth_t; typedef struct PROCENTRY { u_char type; u_char apic_id; u_char apic_version; u_char cpu_flags; u_long cpu_signature; u_long feature_flags; u_long reserved1; u_long reserved2; } *proc_entry_ptr; typedef struct 
BUSENTRY { u_char type; u_char bus_id; char bus_type[6]; } *bus_entry_ptr; typedef struct IOAPICENTRY { u_char type; u_char apic_id; u_char apic_version; u_char apic_flags; void *apic_address; } *io_apic_entry_ptr; typedef struct INTENTRY { u_char type; u_char int_type; u_short int_flags; u_char src_bus_id; u_char src_bus_irq; u_char dst_apic_id; u_char dst_apic_int; } *int_entry_ptr; /* descriptions of MP basetable entries */ typedef struct BASETABLE_ENTRY { u_char type; u_char length; char name[16]; } basetable_entry; /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) && !defined(PC98) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #endif /* CHECK_POINTS */ /* * Values to send to the POST hardware. */ #define MP_BOOTADDRESS_POST 0x10 #define MP_PROBE_POST 0x11 #define MPTABLE_PASS1_POST 0x12 #define MP_START_POST 0x13 #define MP_ENABLE_POST 0x14 #define MPTABLE_PASS2_POST 0x15 #define START_ALL_APS_POST 0x16 #define INSTALL_AP_TRAMP_POST 0x17 #define START_AP_POST 0x18 #define MP_ANNOUNCE_POST 0x19 /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; /** XXX FIXME: what system files declare these??? 
*/
extern struct region_descriptor r_gdt, r_idt;

int	bsp_apic_ready = 0;	/* flags usability of BSP apic */
int	mp_naps;		/* # of Application processors */
int	mp_nbusses;		/* # of busses */
int	mp_napics;		/* # of IO APICs */
int	boot_cpu_id;		/* designated BSP */
vm_offset_t cpu_apic_address;	/* local APIC address, common to all CPUs */
vm_offset_t io_apic_address[NAPICID];	/* NAPICID is more than enough */
extern int nkpt;

u_int32_t cpu_apic_versions[MAXCPU];	/* indexed by logical cpu number */
u_int32_t *io_apic_versions;		/* indexed by logical IO APIC number */

#ifdef APIC_INTR_REORDER
struct {
	volatile int *location;
	int bit;
} apic_isrbit_location[32];
#endif

struct apic_intmapinfo	int_to_apicintpin[APIC_INTMAPSIZE];

/*
 * APIC ID logical/physical mapping structures.
 * We oversize these to simplify boot-time config.
 */
int	cpu_num_to_apic_id[NAPICID];
int	io_num_to_apic_id[NAPICID];
int	apic_id_to_logical[NAPICID];


/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
/* Read by init_secondary() as the index of the AP being set up. */
static int bootAP;

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* SMP page table page */
extern pt_entry_t *SMPpt;

struct pcb stoppcbs[MAXCPU];

int invltlb_ok = 0;	/* throttle smp_invltlb() till safe */
SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, "");

/*
 * Local data and functions.
 */

/* Set to 1 once we're ready to let the APs out of the pen.
*/ static volatile int aps_ready = 0; static int mp_capable; static u_int boot_address; static u_int base_memory; static int picmode; /* 0: virtual wire mode, 1: PIC mode */ static mpfps_t mpfps; static int search_for_sig(u_int32_t target, int count); static void mp_enable(u_int boot_addr); static void mptable_pass1(void); static int mptable_pass2(void); static void default_mp_table(int type); static void fix_mp_table(void); static void setup_apic_irq_mapping(void); static void init_locks(void); static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); void ap_init(void); static int apic_int_is_bus_type(int intr, int bus_type); static void release_aps(void *dummy); /* * initialize all the SMP locks */ /* critical region around IO APIC, apic_imen */ struct mtx imen_mtx; /* lock region used by kernel profiling */ int mcount_lock; #ifdef USE_COMLOCK /* locks com (tty) data/hardware accesses: a FASTINTR() */ struct mtx com_mtx; #endif /* USE_COMLOCK */ static void init_locks(void) { #ifdef USE_COMLOCK mtx_init(&com_mtx, "com", MTX_SPIN); #endif /* USE_COMLOCK */ } /* * Calculate usable address in base memory for AP trampoline code. */ u_int mp_bootaddress(u_int basemem) { POSTCODE(MP_BOOTADDRESS_POST); base_memory = basemem * 1024; /* convert to bytes */ boot_address = base_memory & ~0xfff; /* round down to 4k boundary */ if ((base_memory - boot_address) < bootMP_size) boot_address -= 4096; /* not enough, lower by 4k */ return boot_address; } /* * Look for an Intel MP spec table (ie, SMP capable hardware). 
*/ void i386_mp_probe(void) { int x; u_long segment; u_int32_t target; POSTCODE(MP_PROBE_POST); /* see if EBDA exists */ if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) { /* search first 1K of EBDA */ target = (u_int32_t) (segment << 4); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } else { /* last 1K of base memory, effective 'top of base' passed in */ target = (u_int32_t) (base_memory - 0x400); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } /* search the BIOS */ target = (u_int32_t) BIOS_BASE; if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) goto found; /* nothing found */ mpfps = (mpfps_t)0; mp_capable = 0; return; found: /* calculate needed resources */ mpfps = (mpfps_t)x; mptable_pass1(); /* flag fact that we are running multiple processors */ mp_capable = 1; } int cpu_mp_probe(void) { /* * Record BSP in CPU map * This is done here so that MBUF init code works correctly. */ all_cpus = 1; return (mp_capable); } /* * Initialize the SMP hardware and the APIC and start up the AP's. */ void cpu_mp_start(void) { POSTCODE(MP_START_POST); /* look for MP capable motherboard */ if (mp_capable) mp_enable(boot_address); else panic("MP hardware not found!"); cpu_setregs(); } /* * Print various information about the SMP system hardware and setup. 
*/
void
cpu_mp_announce(void)
{
	int     x;

	POSTCODE(MP_ANNOUNCE_POST);

	/* The BSP is always logical cpu 0. */
	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
	printf(", version: 0x%08x", cpu_apic_versions[0]);
	printf(", at 0x%08x\n", cpu_apic_address);
	/* The APs are logical cpus 1..mp_naps; all share cpu_apic_address. */
	for (x = 1; x <= mp_naps; ++x) {
		printf(" cpu%d (AP): apic id: %2d", x, CPU_TO_ID(x));
		printf(", version: 0x%08x", cpu_apic_versions[x]);
		printf(", at 0x%08x\n", cpu_apic_address);
	}

#if defined(APIC_IO)
	/* One line per IO APIC, each with its own address. */
	for (x = 0; x < mp_napics; ++x) {
		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
		printf(", version: 0x%08x", io_apic_versions[x]);
		printf(", at 0x%08x\n", io_apic_address[x]);
	}
#else
	printf(" Warning: APIC I/O disabled\n");
#endif	/* APIC_IO */
}

/*
 * AP cpu's call this to sync up protected mode.
 *
 * The AP being set up is identified by bootAP.  The routine builds that
 * AP's own slice of the GDT (entries myid * NGDT ... myid * NGDT + NGDT - 1),
 * loads the GDT, IDT and default LDT, fills in the per-CPU TSS fields and
 * loads the task register, then calls pmap_set_opt().
 */
void
init_secondary(void)
{
	int	gsel_tss;
	int	x, myid = bootAP;

	/* Point the private-space and TSS descriptors at this AP's data. */
	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &SMP_prvspace[myid].globaldata.gd_common_tss;
	SMP_prvspace[myid].globaldata.gd_prvspace =
		&SMP_prvspace[myid].globaldata;

	/* Convert the soft descriptors into this AP's GDT slice. */
	for (x = 0; x < NGDT; x++) {
		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int) &gdt[myid * NGDT];
	lgdt(&r_gdt);			/* does magic intra-segment return */

	lidt(&r_idt);

	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* Set up the per-CPU TSS and load it into the task register. */
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	ltr(gsel_tss);

	pmap_set_opt();
}


#if defined(APIC_IO)
/*
 * Final configuration of the BSP's local APIC:
 *  - disable 'pic mode'.
 *  - disable 'virtual wire mode'.
 *  - enable NMI.
*/
void
bsp_apic_configure(void)
{
	u_char		byte;
	u_int32_t	temp;

	/* leave 'pic mode' if necessary */
	if (picmode) {
		outb(0x22, 0x70);	/* select IMCR */
		byte = inb(0x23);	/* current contents */
		byte |= 0x01;		/* mask external INTR */
		outb(0x23, byte);	/* disconnect 8259s/NMI */
	}

	/* mask lint0 (the 8259 'virtual wire' connection) */
	temp = lapic.lvt_lint0;
	temp |= APIC_LVT_M;		/* set the mask */
	lapic.lvt_lint0 = temp;

	/* setup lint1 to handle NMI */
	temp = lapic.lvt_lint1;
	temp &= ~APIC_LVT_M;		/* clear the mask */
	lapic.lvt_lint1 = temp;

	if (bootverbose)
		apic_dump("bsp_apic_configure()");
}
#endif  /* APIC_IO */


/*******************************************************************
 * local functions and data
 */

/*
 * start the SMP system
 *
 * boot_addr is handed unchanged to start_all_aps().  Before the APs are
 * started, the MP table is scanned (2nd pass) under a temporary V == P
 * mapping and, when APIC_IO is defined, the IO APICs are programmed and
 * the IPI vectors are installed in the IDT.
 */
static void
mp_enable(u_int boot_addr)
{
	int     x;
#if defined(APIC_IO)
	int     apic;
	u_int   ux;
#endif	/* APIC_IO */

	POSTCODE(MP_ENABLE_POST);

	/* turn on 4MB of V == P addressing so we can get to MP table */
	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
	invltlb();

	/* examine the MP table for needed info, uses physical addresses */
	x = mptable_pass2();

	/* tear the temporary mapping down again */
	*(int *)PTD = 0;
	invltlb();

	/* can't process default configs till the CPU APIC is pmapped */
	if (x)
		default_mp_table(x);

	/* post scan cleanup */
	fix_mp_table();
	setup_apic_irq_mapping();

#if defined(APIC_IO)

	/* fill the LOGICAL io_apic_versions table */
	for (apic = 0; apic < mp_napics; ++apic) {
		ux = io_apic_read(apic, IOAPIC_VER);
		io_apic_versions[apic] = ux;
		io_apic_set_id(apic, IO_TO_ID(apic));
	}

	/* program each IO APIC in the system */
	for (apic = 0; apic < mp_napics; ++apic)
		if (io_apic_setup(apic) < 0)
			panic("IO APIC setup failure");

	/* install a 'Spurious INTerrupt' vector */
	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for TLB invalidation */
	setidt(XINVLTLB_OFFSET, Xinvltlb,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding hardclock() */
	setidt(XHARDCLOCK_OFFSET, Xhardclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forwarding statclock() */
	setidt(XSTATCLOCK_OFFSET, Xstatclock,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for all-CPU rendezvous */
	setidt(XRENDEZVOUS_OFFSET, Xrendezvous,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forcing an additional software trap */
	setidt(XCPUAST_OFFSET, Xcpuast,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for CPU stop/restart */
	setidt(XCPUSTOP_OFFSET, Xcpustop,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

#if defined(TEST_TEST1)
	/* install a "fake hardware INTerrupt" vector */
	setidt(XTEST1_OFFSET, Xtest1,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif  /** TEST_TEST1 */

#endif	/* APIC_IO */

	/* initialize all SMP locks */
	init_locks();

	/* start each Application Processor */
	start_all_aps(boot_addr);
}


/*
 * look for the MP spec signature
 */

/* string defined by the Intel MP Spec as identifying the MP table */
#define MP_SIG		0x5f504d5f	/* _MP_ */
/* advance a 32-bit-word index by four words, i.e. 16 bytes */
#define NEXT(X)		((X) += 4)
/*
 * Scan `count' 32-bit words of physical memory starting at `target'
 * (accessed through KERNBASE + target), testing every fourth word for
 * MP_SIG.  Returns the physical byte address of the match, or -1 when the
 * signature is not found.
 */
static int
search_for_sig(u_int32_t target, int count)
{
	int     x;
	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);

	for (x = 0; x < count; NEXT(x))
		if (addr[x] == MP_SIG)
			/* make array index a byte index */
			return (target + (x * sizeof(u_int32_t)));

	return -1;
}


/* {type, length in bytes, name} for each MP base table entry type */
static basetable_entry basetable_entry_types[] =
{
	{0, 20, "Processor"},
	{1, 8, "Bus"},
	{2, 8, "I/O APIC"},
	{3, 8, "I/O INT"},
	{4, 8, "Local INT"}
};

typedef struct BUSDATA {
	u_char  bus_id;
	enum busTypes bus_type;
}       bus_datum;

typedef struct INTDATA {
	u_char  int_type;
	u_short int_flags;
	u_char  src_bus_id;
	u_char  src_bus_irq;
	u_char  dst_apic_id;
	u_char  dst_apic_int;
	u_char	int_vector;
}       io_int, local_int;

typedef struct BUSTYPENAME {
	u_char  type;
	char    name[7];
}       bus_type_name;

/* bus type codes paired with their printable names */
static bus_type_name bus_type_table[] =
{
	{CBUS, "CBUS"},
	{CBUSII, "CBUSII"},
	{EISA, "EISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{ISA, "ISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{PCI, "PCI"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{XPRESS, "XPRESS"},
	{UNKNOWN_BUSTYPE, "---"}
};
/* from MP spec v1.4, table 5-1 */
/* row N - 1 describes default configuration type N (see mptable_pass1) */
static int default_data[7][5] =
{
/*   nbus, id0, type0, id1, type1 */
	{1, 0, ISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, MCA, 255, 255},
	{2, 0, ISA, 1, PCI},
	{2, 0, EISA, 1, PCI},
	{2, 0, MCA, 1, PCI}
};


/* the bus data */
static bus_datum *bus_data;

/* the IO INT data, one entry per possible APIC INTerrupt */
static io_int  *io_apic_ints;

static int nintrs;

static int processor_entry	__P((proc_entry_ptr entry, int cpu));
static int bus_entry		__P((bus_entry_ptr entry, int bus));
static int io_apic_entry	__P((io_apic_entry_ptr entry, int apic));
static int int_entry		__P((int_entry_ptr entry, int intr));
static int lookup_bus_type	__P((char *name));


/*
 * 1st pass on motherboard's Intel MP specification table.
* * initializes: * mp_ncpus = 1 * * determines: * cpu_apic_address (common to all CPUs) * io_apic_address[N] * mp_naps * mp_nbusses * mp_napics * nintrs */ static void mptable_pass1(void) { int x; mpcth_t cth; int totalSize; void* position; int count; int type; POSTCODE(MPTABLE_PASS1_POST); /* clear various tables */ for (x = 0; x < NAPICID; ++x) { io_apic_address[x] = ~0; /* IO APIC address table */ } /* init everything to empty */ mp_naps = 0; mp_nbusses = 0; mp_napics = 0; nintrs = 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) { /* use default addresses */ cpu_apic_address = DEFAULT_APIC_BASE; io_apic_address[0] = DEFAULT_IO_APIC_BASE; /* fill in with defaults */ mp_naps = 2; /* includes BSP */ mp_nbusses = default_data[MPFPS_MPFB1 - 1][0]; #if defined(APIC_IO) mp_napics = 1; nintrs = 16; #endif /* APIC_IO */ } else { if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); cpu_apic_address = (vm_offset_t) cth->apic_address; /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; while (count--) { switch (type = *(u_char *) position) { case 0: /* processor_entry */ if (((proc_entry_ptr)position)->cpu_flags & PROCENTRY_FLAG_EN) ++mp_naps; break; case 1: /* bus_entry */ ++mp_nbusses; break; case 2: /* io_apic_entry */ if (((io_apic_entry_ptr)position)->apic_flags & IOAPICENTRY_FLAG_EN) io_apic_address[mp_napics++] = (vm_offset_t)((io_apic_entry_ptr) position)->apic_address; break; case 3: /* int_entry */ ++nintrs; break; case 4: /* int_entry */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char*)position += basetable_entry_types[type].length; } } /* qualify the numbers */ if (mp_naps > MAXCPU) { printf("Warning: only using %d of %d available CPUs!\n", MAXCPU, mp_naps); mp_naps = MAXCPU; } /* * Count the BSP. 
* This is also used as a counter while starting the APs. */ mp_ncpus = 1; --mp_naps; /* subtract the BSP */ } /* * 2nd pass on motherboard's Intel MP specification table. * * sets: * boot_cpu_id * ID_TO_IO(N), phy APIC ID to log CPU/IO table * CPU_TO_ID(N), logical CPU to APIC ID table * IO_TO_ID(N), logical IO to APIC ID table * bus_data[N] * io_apic_ints[N] */ static int mptable_pass2(void) { int x; mpcth_t cth; int totalSize; void* position; int count; int type; int apic, bus, cpu, intr; int i, j; int pgeflag; POSTCODE(MPTABLE_PASS2_POST); pgeflag = 0; /* XXX - Not used under SMP yet. */ MALLOC(io_apic_versions, u_int32_t *, sizeof(u_int32_t) * mp_napics, M_DEVBUF, M_WAITOK); MALLOC(ioapic, volatile ioapic_t **, sizeof(ioapic_t *) * mp_napics, M_DEVBUF, M_WAITOK); MALLOC(io_apic_ints, io_int *, sizeof(io_int) * (nintrs + 1), M_DEVBUF, M_WAITOK); MALLOC(bus_data, bus_datum *, sizeof(bus_datum) * mp_nbusses, M_DEVBUF, M_WAITOK); bzero(ioapic, sizeof(ioapic_t *) * mp_napics); for (i = 0; i < mp_napics; i++) { for (j = 0; j < mp_napics; j++) { /* same page frame as a previous IO apic? 
*/ if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == (io_apic_address[i] & PG_FRAME)) { ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace + (NPTEPG-2-j) * PAGE_SIZE + (io_apic_address[i] & PAGE_MASK)); break; } /* use this slot if available */ if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == 0) { SMPpt[NPTEPG-2-j] = (pt_entry_t)(PG_V | PG_RW | pgeflag | (io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace + (NPTEPG-2-j) * PAGE_SIZE + (io_apic_address[i] & PAGE_MASK)); break; } } } /* clear various tables */ for (x = 0; x < NAPICID; ++x) { ID_TO_IO(x) = -1; /* phy APIC ID to log CPU/IO table */ CPU_TO_ID(x) = -1; /* logical CPU to APIC ID table */ IO_TO_ID(x) = -1; /* logical IO to APIC ID table */ } /* clear bus data table */ for (x = 0; x < mp_nbusses; ++x) bus_data[x].bus_id = 0xff; /* clear IO APIC INT table */ for (x = 0; x < (nintrs + 1); ++x) { io_apic_ints[x].int_type = 0xff; io_apic_ints[x].int_vector = 0xff; } /* setup the cpu/apic mapping arrays */ boot_cpu_id = -1; /* record whether PIC or virtual-wire mode */ picmode = (mpfps->mpfb2 & 0x80) ? 
1 : 0; /* check for use of 'default' configuration */ if (MPFPS_MPFB1 != 0) return MPFPS_MPFB1; /* return default configuration type */ if ((cth = mpfps->pap) == 0) panic("MP Configuration Table Header MISSING!"); /* walk the table, recording info of interest */ totalSize = cth->base_table_length - sizeof(struct MPCTH); position = (u_char *) cth + sizeof(struct MPCTH); count = cth->entry_count; apic = bus = intr = 0; cpu = 1; /* pre-count the BSP */ while (count--) { switch (type = *(u_char *) position) { case 0: if (processor_entry(position, cpu)) ++cpu; break; case 1: if (bus_entry(position, bus)) ++bus; break; case 2: if (io_apic_entry(position, apic)) ++apic; break; case 3: if (int_entry(position, intr)) ++intr; break; case 4: /* int_entry(position); */ break; default: panic("mpfps Base Table HOSED!"); /* NOTREACHED */ } totalSize -= basetable_entry_types[type].length; (u_char *) position += basetable_entry_types[type].length; } if (boot_cpu_id == -1) panic("NO BSP found!"); /* report fact that its NOT a default configuration */ return 0; } void assign_apic_irq(int apic, int intpin, int irq) { int x; if (int_to_apicintpin[irq].ioapic != -1) panic("assign_apic_irq: inconsistent table"); int_to_apicintpin[irq].ioapic = apic; int_to_apicintpin[irq].int_pin = intpin; int_to_apicintpin[irq].apic_address = ioapic[apic]; int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) && io_apic_ints[x].dst_apic_int == intpin) io_apic_ints[x].int_vector = irq; } } void revoke_apic_irq(int irq) { int x; int oldapic; int oldintpin; if (int_to_apicintpin[irq].ioapic == -1) panic("assign_apic_irq: inconsistent table"); oldapic = int_to_apicintpin[irq].ioapic; oldintpin = int_to_apicintpin[irq].int_pin; int_to_apicintpin[irq].ioapic = -1; int_to_apicintpin[irq].int_pin = 0; 
int_to_apicintpin[irq].apic_address = NULL; int_to_apicintpin[irq].redirindex = 0; for (x = 0; x < nintrs; x++) { if ((io_apic_ints[x].int_type == 0 || io_apic_ints[x].int_type == 3) && io_apic_ints[x].int_vector == 0xff && io_apic_ints[x].dst_apic_id == IO_TO_ID(oldapic) && io_apic_ints[x].dst_apic_int == oldintpin) io_apic_ints[x].int_vector = 0xff; } } static void allocate_apic_irq(int intr) { int apic; int intpin; int irq; if (io_apic_ints[intr].int_vector != 0xff) return; /* Interrupt handler already assigned */ if (io_apic_ints[intr].int_type != 0 && (io_apic_ints[intr].int_type != 3 || (io_apic_ints[intr].dst_apic_id == IO_TO_ID(0) && io_apic_ints[intr].dst_apic_int == 0))) return; /* Not INT or ExtInt on != (0, 0) */ irq = 0; while (irq < APIC_INTMAPSIZE && int_to_apicintpin[irq].ioapic != -1) irq++; if (irq >= APIC_INTMAPSIZE) return; /* No free interrupt handlers */ apic = ID_TO_IO(io_apic_ints[intr].dst_apic_id); intpin = io_apic_ints[intr].dst_apic_int; assign_apic_irq(apic, intpin, irq); io_apic_setup_intpin(apic, intpin); } static void swap_apic_id(int apic, int oldid, int newid) { int x; int oapic; if (oldid == newid) return; /* Nothing to do */ printf("Changing APIC ID for IO APIC #%d from %d to %d in MP table\n", apic, oldid, newid); /* Swap physical APIC IDs in interrupt entries */ for (x = 0; x < nintrs; x++) { if (io_apic_ints[x].dst_apic_id == oldid) io_apic_ints[x].dst_apic_id = newid; else if (io_apic_ints[x].dst_apic_id == newid) io_apic_ints[x].dst_apic_id = oldid; } /* Swap physical APIC IDs in IO_TO_ID mappings */ for (oapic = 0; oapic < mp_napics; oapic++) if (IO_TO_ID(oapic) == newid) break; if (oapic < mp_napics) { printf("Changing APIC ID for IO APIC #%d from " "%d to %d in MP table\n", oapic, newid, oldid); IO_TO_ID(oapic) = oldid; } IO_TO_ID(apic) = newid; } static void fix_id_to_io_mapping(void) { int x; for (x = 0; x < NAPICID; x++) ID_TO_IO(x) = -1; for (x = 0; x <= mp_naps; x++) if (CPU_TO_ID(x) < NAPICID) ID_TO_IO(CPU_TO_ID(x)) 
= x; for (x = 0; x < mp_napics; x++) if (IO_TO_ID(x) < NAPICID) ID_TO_IO(IO_TO_ID(x)) = x; } static int first_free_apic_id(void) { int freeid, x; for (freeid = 0; freeid < NAPICID; freeid++) { for (x = 0; x <= mp_naps; x++) if (CPU_TO_ID(x) == freeid) break; if (x <= mp_naps) continue; for (x = 0; x < mp_napics; x++) if (IO_TO_ID(x) == freeid) break; if (x < mp_napics) continue; return freeid; } return freeid; } static int io_apic_id_acceptable(int apic, int id) { int cpu; /* Logical CPU number */ int oapic; /* Logical IO APIC number for other IO APIC */ if (id >= NAPICID) return 0; /* Out of range */ for (cpu = 0; cpu <= mp_naps; cpu++) if (CPU_TO_ID(cpu) == id) return 0; /* Conflict with CPU */ for (oapic = 0; oapic < mp_napics && oapic < apic; oapic++) if (IO_TO_ID(oapic) == id) return 0; /* Conflict with other APIC */ return 1; /* ID is acceptable for IO APIC */ } /* * parse an Intel MP specification table */ static void fix_mp_table(void) { int x; int id; int bus_0 = 0; /* Stop GCC warning */ int bus_pci = 0; /* Stop GCC warning */ int num_pci_bus; int apic; /* IO APIC unit number */ int freeid; /* Free physical APIC ID */ int physid; /* Current physical IO APIC ID */ /* * Fix mis-numbering of the PCI bus and its INT entries if the BIOS * did it wrong. The MP spec says that when more than 1 PCI bus * exists the BIOS must begin with bus entries for the PCI bus and use * actual PCI bus numbering. This implies that when only 1 PCI bus * exists the BIOS can choose to ignore this ordering, and indeed many * MP motherboards do ignore it. This causes a problem when the PCI * sub-system makes requests of the MP sub-system based on PCI bus * numbers. So here we look for the situation and renumber the * busses and associated INTs in an effort to "make it right". 
*/ /* find bus 0, PCI bus, count the number of PCI busses */ for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) { if (bus_data[x].bus_id == 0) { bus_0 = x; } if (bus_data[x].bus_type == PCI) { ++num_pci_bus; bus_pci = x; } } /* * bus_0 == slot of bus with ID of 0 * bus_pci == slot of last PCI bus encountered */ /* check the 1 PCI bus case for sanity */ /* if it is number 0 all is well */ if (num_pci_bus == 1 && bus_data[bus_pci].bus_id != 0) { /* mis-numbered, swap with whichever bus uses slot 0 */ /* swap the bus entry types */ bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type; bus_data[bus_0].bus_type = PCI; /* swap each relavant INTerrupt entry */ id = bus_data[bus_pci].bus_id; for (x = 0; x < nintrs; ++x) { if (io_apic_ints[x].src_bus_id == id) { io_apic_ints[x].src_bus_id = 0; } else if (io_apic_ints[x].src_bus_id == 0) { io_apic_ints[x].src_bus_id = id; } } } /* Assign IO APIC IDs. * * First try the existing ID. If a conflict is detected, try * the ID in the MP table. If a conflict is still detected, find * a free id. * * We cannot use the ID_TO_IO table before all conflicts has been * resolved and the table has been corrected. 
*/ for (apic = 0; apic < mp_napics; ++apic) { /* For all IO APICs */ /* First try to use the value set by the BIOS */ physid = io_apic_get_id(apic); if (io_apic_id_acceptable(apic, physid)) { if (IO_TO_ID(apic) != physid) swap_apic_id(apic, IO_TO_ID(apic), physid); continue; } /* Then check if the value in the MP table is acceptable */ if (io_apic_id_acceptable(apic, IO_TO_ID(apic))) continue; /* Last resort, find a free APIC ID and use it */ freeid = first_free_apic_id(); if (freeid >= NAPICID) panic("No free physical APIC IDs found"); if (io_apic_id_acceptable(apic, freeid)) { swap_apic_id(apic, IO_TO_ID(apic), freeid); continue; } panic("Free physical APIC ID not usable"); } fix_id_to_io_mapping(); /* detect and fix broken Compaq MP table */ if (apic_int_type(0, 0) == -1) { printf("APIC_IO: MP table broken: 8259->APIC entry missing!\n"); io_apic_ints[nintrs].int_type = 3; /* ExtInt */ io_apic_ints[nintrs].int_vector = 0xff; /* Unassigned */ /* XXX fixme, set src bus id etc, but it doesn't seem to hurt */ io_apic_ints[nintrs].dst_apic_id = IO_TO_ID(0); io_apic_ints[nintrs].dst_apic_int = 0; /* Pin 0 */ nintrs++; } } /* Assign low level interrupt handlers */ static void setup_apic_irq_mapping(void) { int x; int int_vector; /* Clear array */ for (x = 0; x < APIC_INTMAPSIZE; x++) { int_to_apicintpin[x].ioapic = -1; int_to_apicintpin[x].int_pin = 0; int_to_apicintpin[x].apic_address = NULL; int_to_apicintpin[x].redirindex = 0; } /* First assign ISA/EISA interrupts */ for (x = 0; x < nintrs; x++) { int_vector = io_apic_ints[x].src_bus_irq; if (int_vector < APIC_INTMAPSIZE && io_apic_ints[x].int_vector == 0xff && int_to_apicintpin[int_vector].ioapic == -1 && (apic_int_is_bus_type(x, ISA) || apic_int_is_bus_type(x, EISA)) && io_apic_ints[x].int_type == 0) { assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id), io_apic_ints[x].dst_apic_int, int_vector); } } /* Assign ExtInt entry if no ISA/EISA interrupt 0 entry */ for (x = 0; x < nintrs; x++) { if 
(io_apic_ints[x].dst_apic_int == 0 && io_apic_ints[x].dst_apic_id == IO_TO_ID(0) && io_apic_ints[x].int_vector == 0xff && int_to_apicintpin[0].ioapic == -1 && io_apic_ints[x].int_type == 3) { assign_apic_irq(0, 0, 0); break; } } /* PCI interrupt assignment is deferred */ } static int processor_entry(proc_entry_ptr entry, int cpu) { /* check for usability */ if (!(entry->cpu_flags & PROCENTRY_FLAG_EN)) return 0; if(entry->apic_id >= NAPICID) panic("CPU APIC ID out of range (0..%d)", NAPICID - 1); /* check for BSP flag */ if (entry->cpu_flags & PROCENTRY_FLAG_BP) { boot_cpu_id = entry->apic_id; CPU_TO_ID(0) = entry->apic_id; ID_TO_CPU(entry->apic_id) = 0; return 0; /* its already been counted */ } /* add another AP to list, if less than max number of CPUs */ else if (cpu < MAXCPU) { CPU_TO_ID(cpu) = entry->apic_id; ID_TO_CPU(entry->apic_id) = cpu; return 1; } return 0; } static int bus_entry(bus_entry_ptr entry, int bus) { int x; char c, name[8]; /* encode the name into an index */ for (x = 0; x < 6; ++x) { if ((c = entry->bus_type[x]) == ' ') break; name[x] = c; } name[x] = '\0'; if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE) panic("unknown bus type: '%s'", name); bus_data[bus].bus_id = entry->bus_id; bus_data[bus].bus_type = x; return 1; } static int io_apic_entry(io_apic_entry_ptr entry, int apic) { if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN)) return 0; IO_TO_ID(apic) = entry->apic_id; if (entry->apic_id < NAPICID) ID_TO_IO(entry->apic_id) = apic; return 1; } static int lookup_bus_type(char *name) { int x; for (x = 0; x < MAX_BUSTYPE; ++x) if (strcmp(bus_type_table[x].name, name) == 0) return bus_type_table[x].type; return UNKNOWN_BUSTYPE; } static int int_entry(int_entry_ptr entry, int intr) { int apic; io_apic_ints[intr].int_type = entry->int_type; io_apic_ints[intr].int_flags = entry->int_flags; io_apic_ints[intr].src_bus_id = entry->src_bus_id; io_apic_ints[intr].src_bus_irq = entry->src_bus_irq; if (entry->dst_apic_id == 255) { /* This signal goes to 
all IO APICS. Select an IO APIC with sufficient number of interrupt pins */ for (apic = 0; apic < mp_napics; apic++) if (((io_apic_read(apic, IOAPIC_VER) & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >= entry->dst_apic_int) break; if (apic < mp_napics) io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic); else io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; } else io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; io_apic_ints[intr].dst_apic_int = entry->dst_apic_int; return 1; } static int apic_int_is_bus_type(int intr, int bus_type) { int bus; for (bus = 0; bus < mp_nbusses; ++bus) if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id) && ((int) bus_data[bus].bus_type == bus_type)) return 1; return 0; } /* * Given a traditional ISA INT mask, return an APIC mask. */ u_int isa_apic_mask(u_int isa_mask) { int isa_irq; int apic_pin; #if defined(SKIP_IRQ15_REDIRECT) if (isa_mask == (1 << 15)) { printf("skipping ISA IRQ15 redirect\n"); return isa_mask; } #endif /* SKIP_IRQ15_REDIRECT */ isa_irq = ffs(isa_mask); /* find its bit position */ if (isa_irq == 0) /* doesn't exist */ return 0; --isa_irq; /* make it zero based */ apic_pin = isa_apic_irq(isa_irq); /* look for APIC connection */ if (apic_pin == -1) return 0; return (1 << apic_pin); /* convert pin# to a mask */ } /* * Determine which APIC pin an ISA/EISA INT is attached to. 
*/ #define INTTYPE(I) (io_apic_ints[(I)].int_type) #define INTPIN(I) (io_apic_ints[(I)].dst_apic_int) #define INTIRQ(I) (io_apic_ints[(I)].int_vector) #define INTAPIC(I) (ID_TO_IO(io_apic_ints[(I)].dst_apic_id)) #define SRCBUSIRQ(I) (io_apic_ints[(I)].src_bus_irq) int isa_apic_irq(int isa_irq) { int intr; for (intr = 0; intr < nintrs; ++intr) { /* check each record */ if (INTTYPE(intr) == 0) { /* standard INT */ if (SRCBUSIRQ(intr) == isa_irq) { if (apic_int_is_bus_type(intr, ISA) || apic_int_is_bus_type(intr, EISA)) { if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* found */ } } } } return -1; /* NOT found */ } /* * Determine which APIC pin a PCI INT is attached to. */ #define SRCBUSID(I) (io_apic_ints[(I)].src_bus_id) #define SRCBUSDEVICE(I) ((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f) #define SRCBUSLINE(I) (io_apic_ints[(I)].src_bus_irq & 0x03) int pci_apic_irq(int pciBus, int pciDevice, int pciInt) { int intr; --pciInt; /* zero based */ for (intr = 0; intr < nintrs; ++intr) /* check each record */ if ((INTTYPE(intr) == 0) /* standard INT */ && (SRCBUSID(intr) == pciBus) && (SRCBUSDEVICE(intr) == pciDevice) && (SRCBUSLINE(intr) == pciInt)) /* a candidate IRQ */ if (apic_int_is_bus_type(intr, PCI)) { if (INTIRQ(intr) == 0xff) allocate_apic_irq(intr); if (INTIRQ(intr) == 0xff) return -1; /* unassigned */ return INTIRQ(intr); /* exact match */ } return -1; /* NOT found */ } int next_apic_irq(int irq) { int intr, ointr; int bus, bustype; bus = 0; bustype = 0; for (intr = 0; intr < nintrs; intr++) { if (INTIRQ(intr) != irq || INTTYPE(intr) != 0) continue; bus = SRCBUSID(intr); bustype = apic_bus_type(bus); if (bustype != ISA && bustype != EISA && bustype != PCI) continue; break; } if (intr >= nintrs) { return -1; } for (ointr = intr + 1; ointr < nintrs; ointr++) { if (INTTYPE(ointr) != 0) continue; if (bus != SRCBUSID(ointr)) continue; if (bustype == PCI) { if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr)) continue; if (SRCBUSLINE(intr) != 
SRCBUSLINE(ointr)) continue; } if (bustype == ISA || bustype == EISA) { if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr)) continue; } if (INTPIN(intr) == INTPIN(ointr)) continue; break; } if (ointr >= nintrs) { return -1; } return INTIRQ(ointr); } #undef SRCBUSLINE #undef SRCBUSDEVICE #undef SRCBUSID #undef SRCBUSIRQ #undef INTPIN #undef INTIRQ #undef INTAPIC #undef INTTYPE /* * Reprogram the MB chipset to NOT redirect an ISA INTerrupt. * * XXX FIXME: * Exactly what this means is unclear at this point. It is a solution * for motherboards that redirect the MBIRQ0 pin. Generically a motherboard * could route any of the ISA INTs to upper (>15) IRQ values. But most would * NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an * option. */ int undirect_isa_irq(int rirq) { #if defined(READY) if (bootverbose) printf("Freeing redirected ISA irq %d.\n", rirq); /** FIXME: tickle the MB redirector chip */ return -1; #else if (bootverbose) printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq); return 0; #endif /* READY */ } /* * Reprogram the MB chipset to NOT redirect a PCI INTerrupt */ int undirect_pci_irq(int rirq) { #if defined(READY) if (bootverbose) printf("Freeing redirected PCI irq %d.\n", rirq); /** FIXME: tickle the MB redirector chip */ return -1; #else if (bootverbose) printf("Freeing (NOT implemented) redirected PCI irq %d.\n", rirq); return 0; #endif /* READY */ } /* * given a bus ID, return: * the bus type if found * -1 if NOT found */ int apic_bus_type(int id) { int x; for (x = 0; x < mp_nbusses; ++x) if (bus_data[x].bus_id == id) return bus_data[x].bus_type; return -1; } /* * given a LOGICAL APIC# and pin#, return: * the associated src bus ID if found * -1 if NOT found */ int apic_src_bus_id(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].src_bus_id); return -1; 
/* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated src bus IRQ if found * -1 if NOT found */ int apic_src_bus_irq(int apic, int pin) { int x; for (x = 0; x < nintrs; x++) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].src_bus_irq); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated INTerrupt type if found * -1 if NOT found */ int apic_int_type(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_type); return -1; /* NOT found */ } int apic_irq(int apic, int pin) { int x; int res; for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) { res = io_apic_ints[x].int_vector; if (res == 0xff) return -1; if (apic != int_to_apicintpin[res].ioapic) panic("apic_irq: inconsistent table"); if (pin != int_to_apicintpin[res].int_pin) panic("apic_irq inconsistent table (2)"); return res; } return -1; } /* * given a LOGICAL APIC# and pin#, return: * the associated trigger mode if found * -1 if NOT found */ int apic_trigger(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return ((io_apic_ints[x].int_flags >> 2) & 0x03); return -1; /* NOT found */ } /* * given a LOGICAL APIC# and pin#, return: * the associated 'active' level if found * -1 if NOT found */ int apic_polarity(int apic, int pin) { int x; /* search each of the possible INTerrupt sources */ for (x = 0; x < nintrs; ++x) if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && (pin == io_apic_ints[x].dst_apic_int)) return (io_apic_ints[x].int_flags & 0x03); return -1; /* NOT found */ } /* * set 
data according to MP defaults * FIXME: probably not complete yet... */ static void default_mp_table(int type) { int ap_cpu_id; #if defined(APIC_IO) int io_apic_id; int pin; #endif /* APIC_IO */ #if 0 printf(" MP default config type: %d\n", type); switch (type) { case 1: printf(" bus: ISA, APIC: 82489DX\n"); break; case 2: printf(" bus: EISA, APIC: 82489DX\n"); break; case 3: printf(" bus: EISA, APIC: 82489DX\n"); break; case 4: printf(" bus: MCA, APIC: 82489DX\n"); break; case 5: printf(" bus: ISA+PCI, APIC: Integrated\n"); break; case 6: printf(" bus: EISA+PCI, APIC: Integrated\n"); break; case 7: printf(" bus: MCA+PCI, APIC: Integrated\n"); break; default: printf(" future type\n"); break; /* NOTREACHED */ } #endif /* 0 */ boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24; ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0; /* BSP */ CPU_TO_ID(0) = boot_cpu_id; ID_TO_CPU(boot_cpu_id) = 0; /* one and only AP */ CPU_TO_ID(1) = ap_cpu_id; ID_TO_CPU(ap_cpu_id) = 1; #if defined(APIC_IO) /* one and only IO APIC */ io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24; /* * sanity check, refer to MP spec section 3.6.6, last paragraph * necessary as some hardware isn't properly setting up the IO APIC */ #if defined(REALLY_ANAL_IOAPICID_VALUE) if (io_apic_id != 2) { #else if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) { #endif /* REALLY_ANAL_IOAPICID_VALUE */ io_apic_set_id(0, 2); io_apic_id = 2; } IO_TO_ID(0) = io_apic_id; ID_TO_IO(io_apic_id) = 0; #endif /* APIC_IO */ /* fill out bus entries */ switch (type) { case 1: case 2: case 3: case 4: case 5: case 6: case 7: bus_data[0].bus_id = default_data[type - 1][1]; bus_data[0].bus_type = default_data[type - 1][2]; bus_data[1].bus_id = default_data[type - 1][3]; bus_data[1].bus_type = default_data[type - 1][4]; break; /* case 4: case 7: MCA NOT supported */ default: /* illegal/reserved */ panic("BAD default MP config: %d", type); /* NOTREACHED */ } #if defined(APIC_IO) /* general cases from MP v1.4, table 5-2 */ 
for (pin = 0; pin < 16; ++pin) { io_apic_ints[pin].int_type = 0; io_apic_ints[pin].int_flags = 0x05; /* edge/active-hi */ io_apic_ints[pin].src_bus_id = 0; io_apic_ints[pin].src_bus_irq = pin; /* IRQ2 caught below */ io_apic_ints[pin].dst_apic_id = io_apic_id; io_apic_ints[pin].dst_apic_int = pin; /* 1-to-1 */ } /* special cases from MP v1.4, table 5-2 */ if (type == 2) { io_apic_ints[2].int_type = 0xff; /* N/C */ io_apic_ints[13].int_type = 0xff; /* N/C */ #if !defined(APIC_MIXED_MODE) /** FIXME: ??? */ panic("sorry, can't support type 2 default yet"); #endif /* APIC_MIXED_MODE */ } else io_apic_ints[2].src_bus_irq = 0; /* ISA IRQ0 is on APIC INT 2 */ if (type == 7) io_apic_ints[0].int_type = 0xff; /* N/C */ else io_apic_ints[0].int_type = 3; /* vectored 8259 */ #endif /* APIC_IO */ } /* * start each AP in our list */ static int start_all_aps(u_int boot_addr) { int x, i, pg; u_char mpbiosreason; u_long mpbioswarmvec; struct globaldata *gd; char *stack; uintptr_t kptbase; POSTCODE(START_ALL_APS_POST); mtx_init(&ap_boot_mtx, "ap boot", MTX_SPIN); /* initialize BSP's local APIC */ apic_initialize(); bsp_apic_ready = 1; /* install the AP 1st level boot code */ install_ap_tramp(boot_addr); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_long *) WARMBOOT_OFF); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); #endif /* set up temporary P==V mapping for AP boot */ /* XXX this is a hack, we should boot the AP on its own stack/PTD */ kptbase = (uintptr_t)(void *)KPTphys; for (x = 0; x < NKPT; x++) PTD[x] = (pd_entry_t)(PG_V | PG_RW | ((kptbase + x * PAGE_SIZE) & PG_FRAME)); invltlb(); /* start each AP */ for (x = 1; x <= mp_naps; ++x) { /* This is a bit verbose, it will go away soon. 
*/ /* first page of AP's private space */ pg = x * i386_btop(sizeof(struct privatespace)); /* allocate a new private data page */ gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE); /* wire it into the private page table page */ SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd)); /* allocate and set up an idle stack data page */ stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE); for (i = 0; i < UPAGES; i++) SMPpt[pg + 1 + i] = (pt_entry_t) (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); /* prime data page for it to use */ gd->gd_cpuid = x; globaldata_register(gd); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ #endif bootSTK = &SMP_prvspace[x].idlestack[UPAGES*PAGE_SIZE]; bootAP = x; /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(x, boot_addr)) { printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x)); CHECK_PRINT("trace"); /* show checkpoints */ /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } CHECK_PRINT("trace"); /* show checkpoints */ /* record its version info */ cpu_apic_versions[x] = cpu_apic_versions[0]; all_cpus |= (1 << x); /* record AP in CPU map */ } /* build our map of 'other' CPUs */ PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid))); /* fill in our (BSP) APIC version */ cpu_apic_versions[0] = lapic.version; /* restore the warmstart vector */ *(u_long *) WARMBOOT_OFF = mpbioswarmvec; #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); #endif /* * Set up the idle context for the BSP. Similar to above except * that some was done by locore, some by pmap.c and some is implicit * because the BSP is cpu#0 and the page is initially zero, and also * because we can refer to variables by name on the BSP.. 
*/ /* Allocate and setup BSP idle stack */ stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE); for (i = 0; i < UPAGES; i++) SMPpt[1 + i] = (pt_entry_t) (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); for (x = 0; x < NKPT; x++) PTD[x] = 0; pmap_set_opt(); /* number of APs actually started */ return mp_ncpus - 1; } /* * load the 1st level AP boot code into base memory. */ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(u_int boot_addr) { int x; int size = *(int *) ((u_long) & bootMP_size); u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) boot_addr + KERNBASE; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; POSTCODE(INSTALL_AP_TRAMP_POST); for (x = 0; x < size; ++x) *dst++ = *src++; /* * modify addresses in code we just moved to basemem. unfortunately we * need fairly detailed info about mpboot.s for this to work. changes * to mpboot.s might require changes here. 
*/ /* boot code is located in KERNEL space */ dst = (u_char *) boot_addr + KERNBASE; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_addr + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); *dst32 = ((u_int) MPentry - KERNBASE); /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_addr & 0xffff; *dst8 = ((u_int) boot_addr >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_addr & 0xffff; *dst8 = ((u_int) boot_addr >> 16) & 0xff; } /* * this function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It ain't pretty, * but it seems to work. */ static int start_ap(int logical_cpu, u_int boot_addr) { int physical_cpu; int vector; int cpus; u_long icr_lo, icr_hi; POSTCODE(START_AP_POST); /* get the PHYSICAL APIC ID# */ physical_cpu = CPU_TO_ID(logical_cpu); /* calculate the vector */ vector = (boot_addr >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_ncpus; /* * first we do an INIT/RESET IPI this INIT IPI might be run, reseting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. 
*/ /* setup the address for the target AP */ icr_hi = lapic.icr_hi & ~APIC_ID_MASK; icr_hi |= (physical_cpu << 24); lapic.icr_hi = icr_hi; /* do an INIT IPI: assert RESET */ icr_lo = lapic.icr_lo & 0xfff00000; lapic.icr_lo = icr_lo | 0x0000c500; /* wait for pending status end */ while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; /* do an INIT IPI: deassert RESET */ lapic.icr_lo = icr_lo | 0x00008500; /* wait for pending status end */ u_sleep(10000); /* wait ~10mS */ while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ /* do a STARTUP IPI */ lapic.icr_lo = icr_lo | 0x00000600 | vector; while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; u_sleep(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic.icr_lo = icr_lo | 0x00000600 | vector; while (lapic.icr_lo & APIC_DELSTAT_MASK) /* spin */ ; u_sleep(200); /* wait ~200uS */ /* wait for it to start */ set_apic_timer(5000000);/* == 5 seconds */ while (read_apic_timer()) if (mp_ncpus > cpus) return 1; /* return SUCCESS */ return 0; /* return FAILURE */ } /* * Flush the TLB on all other CPU's * * XXX: Needs to handshake and wait for completion before proceding. 
*/ void smp_invltlb(void) { #if defined(APIC_IO) if (smp_started && invltlb_ok) ipi_all_but_self(IPI_INVLTLB); #endif /* APIC_IO */ } void invlpg(u_int addr) { __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); /* send a message to the other CPUs */ smp_invltlb(); } void invltlb(void) { u_long temp; /* * This should be implemented as load_cr3(rcr3()) when load_cr3() is * inlined. */ __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); /* send a message to the other CPUs */ smp_invltlb(); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ extern void enable_sse(void); void ap_init(void) { u_int apic_id; /* spin until all the AP's are ready */ while (!aps_ready) /* spin */ ; /* * Set curproc to our per-cpu idleproc so that mutexes have * something unique to lock with. */ PCPU_SET(curproc, PCPU_GET(idleproc)); PCPU_SET(spinlocks, NULL); /* lock against other AP's that are waking up */ mtx_lock_spin(&ap_boot_mtx); /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); smp_cpus++; #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid))); printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); /* set up CPU registers and state */ cpu_setregs(); /* set up FPU state on the AP */ npxinit(__INITIAL_NPXCW__); /* set up SSE registers */ enable_sse(); /* A quick check from sanity claus */ apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); if (PCPU_GET(cpuid) != apic_id) { printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); printf("SMP: apic_id = %d\n", apic_id); printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]); panic("cpuid mismatch! 
boom!!"); } /* Init local apic for irq's */ apic_initialize(); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); /* * Activate smp_invltlb, although strictly speaking, this isn't * quite correct yet. We should have a bitfield for cpus willing * to accept TLB flush IPI's or something and sync them. */ if (smp_cpus == mp_ncpus) { invltlb_ok = 1; smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } /* let other AP's wake up now */ mtx_unlock_spin(&ap_boot_mtx); /* wait until all the AP's are up */ while (smp_started == 0) ; /* nothing */ microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* ok, now grab sched_lock and enter the scheduler */ enable_intr(); mtx_lock_spin(&sched_lock); cpu_throw(); /* doesn't return */ panic("scheduler returned us to ap_init"); } /* * For statclock, we send an IPI to all CPU's to have them call this * function. */ void forwarded_statclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); statclock_process(curproc, TRAPF_PC(&frame), TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_statclock(void) { int map; CTR0(KTR_SMP, "forward_statclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_STATCLOCK); } /* * For each hardclock(), we send an IPI to all other CPU's to have them * execute this function. It would be nice to reduce contention on * sched_lock if we could simply peek at the CPU to determine the user/kernel * state and call hardclock_process() on the CPU receiving the clock interrupt * and then just use a simple IPI to handle any ast's if needed. 
*/ void forwarded_hardclock(struct trapframe frame) { mtx_lock_spin(&sched_lock); hardclock_process(curproc, TRAPF_USERMODE(&frame)); mtx_unlock_spin(&sched_lock); } void forward_hardclock(void) { u_int map; CTR0(KTR_SMP, "forward_hardclock"); if (!smp_started || !invltlb_ok || cold || panicstr) return; map = PCPU_GET(other_cpus) & ~stopped_cpus ; if (map != 0) ipi_selected(map, IPI_HARDCLOCK); } #ifdef APIC_INTR_REORDER /* * Maintain mapping from softintr vector to isr bit in local apic. */ void set_lapic_isrloc(int intr, int vector) { if (intr < 0 || intr > 32) panic("set_apic_isrloc: bad intr argument: %d",intr); if (vector < ICU_OFFSET || vector > 255) panic("set_apic_isrloc: bad vector argument: %d",vector); apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2); apic_isrbit_location[intr].bit = (1<<(vector & 31)); } #endif /* * send an IPI to a set of cpus. */ void ipi_selected(u_int32_t cpus, u_int ipi) { CTR2(KTR_SMP, __func__ ": cpus: %x ipi: %x", cpus, ipi); selected_apic_ipi(cpus, ipi, APIC_DELMODE_FIXED); } /* * send an IPI INTerrupt containing 'vector' to all CPUs, including myself */ void ipi_all(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLISELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_ALLESELF, ipi, APIC_DELMODE_FIXED); } /* * send an IPI to myself */ void ipi_self(u_int ipi) { CTR1(KTR_SMP, __func__ ": ipi: %x", ipi); apic_ipi(APIC_DEST_SELF, ipi, APIC_DELMODE_FIXED); } void release_aps(void *dummy __unused) { atomic_store_rel_int(&aps_ready, 1); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); Index: head/sys/i386/include/param.h =================================================================== --- head/sys/i386/include/param.h (revision 82308) +++ head/sys/i386/include/param.h (revision 82309) @@ -1,185 +1,188 @@ /*- * Copyright (c) 1990 The Regents of 
the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)param.h 5.8 (Berkeley) 6/28/91 * $FreeBSD$ */ /* * Machine dependent constants for Intel 386. 
*/ /* * Round p (pointer or byte index) up to a correctly-aligned value * for all data types (int, long, ...). The result is unsigned int * and must be cast to any desired pointer type. */ #ifndef _ALIGNBYTES #define _ALIGNBYTES (sizeof(int) - 1) #endif #ifndef _ALIGN #define _ALIGN(p) (((unsigned)(p) + _ALIGNBYTES) & ~_ALIGNBYTES) #endif #ifndef _MACHINE #define _MACHINE i386 #endif #ifndef _MACHINE_ARCH #define _MACHINE_ARCH i386 #endif #ifndef _NO_NAMESPACE_POLLUTION #ifndef _MACHINE_PARAM_H_ #define _MACHINE_PARAM_H_ #ifndef MACHINE #define MACHINE "i386" #endif #ifndef MACHINE_ARCH #define MACHINE_ARCH "i386" #endif #define MID_MACHINE MID_I386 /* * OBJFORMAT_NAMES is a comma-separated list of the object formats * that are supported on the architecture. */ #define OBJFORMAT_NAMES "elf", "aout" #define OBJFORMAT_DEFAULT "elf" #ifdef SMP #define MAXCPU 16 #else #define MAXCPU 1 #endif /* SMP */ #define ALIGNBYTES _ALIGNBYTES #define ALIGN(p) _ALIGN(p) #define PAGE_SHIFT 12 /* LOG2(PAGE_SIZE) */ #define PAGE_SIZE (1<>PAGE_SHIFT) /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? 
(daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) /* * Mach derived conversion macros */ #define trunc_page(x) ((x) & ~PAGE_MASK) #define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK) #define trunc_4mpage(x) ((unsigned)(x) & ~PDRMASK) #define round_4mpage(x) ((((unsigned)(x)) + PDRMASK) & ~PDRMASK) #define atop(x) ((unsigned)(x) >> PAGE_SHIFT) #define ptoa(x) ((unsigned)(x) << PAGE_SHIFT) #define i386_btop(x) ((unsigned)(x) >> PAGE_SHIFT) #define i386_ptob(x) ((unsigned)(x) << PAGE_SHIFT) #define pgtok(x) ((x) * (PAGE_SIZE / 1024)) #endif /* !_MACHINE_PARAM_H_ */ #endif /* !_NO_NAMESPACE_POLLUTION */ Index: head/sys/i386/include/pcpu.h =================================================================== --- head/sys/i386/include/pcpu.h (revision 82308) +++ head/sys/i386/include/pcpu.h (revision 82309) @@ -1,99 +1,79 @@ /*- * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_GLOBALDATA_H_ #define _MACHINE_GLOBALDATA_H_ #ifdef _KERNEL #include #include /* XXX */ #ifdef KTR_PERCPU #include #endif /* * This structure maps out the global data that needs to be kept on a * per-cpu basis. genassym uses this to generate offsets for the assembler * code, which also provides external symbols so that C can get at them as * though they were really globals. * * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. 
* The reason for doing it via a struct is so that an array of pointers * to each CPU's data can be set up for things like "check curproc on all * other processors" */ struct globaldata { struct globaldata *gd_prvspace; /* self-reference */ struct proc *gd_curproc; /* current process */ struct proc *gd_idleproc; /* idle process */ struct proc *gd_npxproc; struct pcb *gd_curpcb; /* current pcb */ struct timeval gd_switchtime; struct i386tss gd_common_tss; int gd_switchticks; struct segment_descriptor gd_common_tssd; struct segment_descriptor *gd_tss_gdt; int gd_currentldt; u_int gd_cpuid; /* this cpu number */ u_int gd_other_cpus; /* all other cpus */ SLIST_ENTRY(globaldata) gd_allcpu; struct lock_list_entry *gd_spinlocks; #ifdef KTR_PERCPU volatile int gd_ktr_idx; /* Index into trace table */ char *gd_ktr_buf; char gd_ktr_buf_data[KTR_SIZE]; #endif }; -#ifdef SMP -/* - * This is the upper (0xff800000) address space layout that is per-cpu. - * It is setup in locore.s and pmap.c for the BSP and in mp_machdep.c for - * each AP. genassym helps export this to the assembler code. - */ -struct privatespace { - /* page 0 - data page */ - struct globaldata globaldata; - char __filler0[PAGE_SIZE - sizeof(struct globaldata)]; - - /* page 1 - idle stack (UPAGES pages) */ - char idlestack[UPAGES * PAGE_SIZE]; - /* page 1+UPAGES... */ -}; - -extern struct privatespace SMP_prvspace[]; - -#endif - #endif /* _KERNEL */ #endif /* ! _MACHINE_GLOBALDATA_H_ */ Index: head/sys/i386/include/privatespace.h =================================================================== --- head/sys/i386/include/privatespace.h (nonexistent) +++ head/sys/i386/include/privatespace.h (revision 82309) @@ -0,0 +1,49 @@ +/*- + * Copyright (c) Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MACHINE_PRIVATESPACE_H_ +#define _MACHINE_PRIVATESPACE_H_ + +/* + * This is the upper (0xff800000) address space layout that is per-cpu. + * It is setup in locore.s and pmap.c for the BSP and in mp_machdep.c for + * each AP. This is only applicable to the x86 SMP kernel. + */ +struct privatespace { + /* page 0 - data page */ + struct globaldata globaldata; + char __filler0[PAGE_SIZE - sizeof(struct globaldata)]; + + /* page 1 - idle stack (UPAGES pages) */ + char idlestack[UPAGES * PAGE_SIZE]; + /* page 1+UPAGES... */ +}; + +extern struct privatespace SMP_prvspace[]; + +#endif /* ! 
_MACHINE_PRIVATESPACE_H_ */ Property changes on: head/sys/i386/include/privatespace.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/kern/imgact_aout.c =================================================================== --- head/sys/kern/imgact_aout.c (revision 82308) +++ head/sys/kern/imgact_aout.c (revision 82309) @@ -1,281 +1,283 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ +#include "opt_upages.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_aout_imgact __P((struct image_params *imgp)); struct sysentvec aout_sysvec = { SYS_MAXSYSCALL, sysent, 0, 0, 0, 0, 0, 0, 0, sendsig, sigcode, &szsigcode, 0, "FreeBSD a.out", aout_coredump, NULL, MINSIGSTKSZ }; static int exec_aout_imgact(imgp) struct image_params *imgp; { const struct exec *a_out = (const struct exec *) imgp->image_header; struct vmspace *vmspace; struct vnode *vp; vm_map_t map; vm_object_t object; vm_offset_t text_end, data_end; unsigned long virtual_offset; unsigned long file_offset; unsigned long bss_size; int error; GIANT_REQUIRED; /* * Linux and *BSD binaries look very much alike, * only the machine id is different: * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. * NetBSD is in network byte order.. ugh. */ if (((a_out->a_magic >> 16) & 0xff) != 0x86 && ((a_out->a_magic >> 16) & 0xff) != 0 && ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) return -1; /* * Set file/virtual offset based on a.out variant. * We do two cases: host byte order and network byte order * (for NetBSD compatibility) */ switch ((int)(a_out->a_magic & 0xffff)) { case ZMAGIC: virtual_offset = 0; if (a_out->a_text) { file_offset = PAGE_SIZE; } else { /* Bill's "screwball mode" */ file_offset = 0; } break; case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; /* Pass PS_STRINGS for BSD/OS binaries only. */ if (N_GETMID(*a_out) == MID_ZERO) imgp->ps_strings = PS_STRINGS; break; default: /* NetBSD compatibility */ switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; break; default: return (-1); } } bss_size = roundup(a_out->a_bss, PAGE_SIZE); /* * Check various fields in header for validity/bounds. 
*/ if (/* entry point must lay with text region */ a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || /* text and data size must each be page rounded */ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ mtx_assert(&Giant, MA_OWNED); if (/* text can't exceed maximum text size */ a_out->a_text > MAXTSIZ || /* data + bss can't exceed rlimit */ a_out->a_data + bss_size > imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur) return (ENOMEM); /* copy in arguments and/or environment from old process */ error = exec_extract_strings(imgp); if (error) return (error); /* * Destroy old process VM and create a new one (with a new stack) */ exec_new_vmspace(imgp); /* * The vm space can be changed by exec_new_vmspace */ vmspace = imgp->proc->p_vmspace; vp = imgp->vp; map = &vmspace->vm_map; vm_map_lock(map); VOP_GETVOBJECT(vp, &object); vm_object_reference(object); text_end = virtual_offset + a_out->a_text; error = vm_map_insert(map, object, file_offset, virtual_offset, text_end, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); return (error); } data_end = text_end + a_out->a_data; if (a_out->a_data) { vm_object_reference(object); error = vm_map_insert(map, object, file_offset + a_out->a_text, text_end, data_end, VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); return (error); } } if (bss_size) { error = vm_map_insert(map, NULL, 0, data_end, data_end + bss_size, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_map_unlock(map); return (error); } } vm_map_unlock(map); /* Fill in process VM information */ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; 
vmspace->vm_daddr = (caddr_t) (uintptr_t) (virtual_offset + a_out->a_text); /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &aout_sysvec; /* Indicate that this file should not be modified */ imgp->vp->v_flag |= VTEXT; return (0); } /* * Dump core, into a file named as described in the comments for * expand_name(), unless the process was setuid/setgid. */ int aout_coredump(p, vp, limit) register struct proc *p; register struct vnode *vp; off_t limit; { register struct ucred *cred = p->p_ucred; register struct vmspace *vm = p->p_vmspace; int error; if (ctob(UPAGES + vm->vm_dsize + vm->vm_ssize) >= limit) return (EFAULT); fill_kinfo_proc(p, &p->p_addr->u_kproc); error = cpu_coredump(p, vp, cred); if (error == 0) error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr, (int)ctob(vm->vm_dsize), (off_t)ctob(UPAGES), UIO_USERSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p); if (error == 0) error = vn_rdwr(UIO_WRITE, vp, (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)), round_page(ctob(vm->vm_ssize)), (off_t)ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p); return (error); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; EXEC_SET(aout, aout_execsw); Index: head/sys/pc98/i386/machdep.c =================================================================== --- head/sys/pc98/i386/machdep.c (revision 82308) +++ head/sys/pc98/i386/machdep.c (revision 82309) @@ -1,2594 +1,2598 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD$ */ #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" +#include "opt_upages.h" /* #include "opt_userconfig.h" */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #include #ifdef PERFMON #include +#endif +#ifdef SMP +#include #endif #include #include #ifdef PC98 #include #include #else #include #endif #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup __P((void *)); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm __P((struct save87 *, struct savexmm *)); static void fill_fpregs_xmm __P((struct savexmm *, struct save87 *)); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) #ifdef PC98 int need_pre_dma_flush; /* If 1, use wbinvd befor DMA transfer. */ int need_post_dma_flush; /* If 1, use invd after DMA transfer. 
*/ #endif int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int physmem = 0; int cold = 1; #ifdef COMPAT_43 static void osendsig __P((sig_t catcher, int sig, sigset_t *mask, u_long code)); #endif static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "IU", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "IU", ""); static int sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); int Maxmem = 0; #ifdef PC98 int Maxmem_under16M = 0; #endif long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; #ifndef SMP static struct globaldata __globaldata; #endif struct mtx sched_lock; struct mtx Giant; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. 
*/ earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { unsigned int size1; size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %u bytes (%u pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); #if 0 /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); /* * Discount the physical memory larger than the size of kernel_map * to avoid eating up all of KVA space. 
*/ if (kernel_map->first_free == NULL) { printf("Warning: no free entries in kernel_map.\n"); physmem_est = physmem; } else { physmem_est = min(physmem, btoc(kernel_map->max_offset - kernel_map->min_offset)); } /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / PAGE_SIZE; nbuf = 50; if (physmem_est > 1024) nbuf += min((physmem_est - 1024) / factor, 16384 / factor); if (physmem_est > 16384) nbuf += (physmem_est - 16384) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; } /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. */ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } nswbuf = max(min(nbuf/4, 256), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); buffer_map->system_map = 1; pager_map = kmem_suballoc(clean_map, 
&pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * XXX: Mbuf system machine-specific initializations should * go here, if anywhere. */ /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i], 0); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } mtx_init(&callout_lock, "callout", MTX_SPIN | MTX_RECURSE); #endif #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); globaldata_register(GLOBALDATA); #ifndef SMP /* For SMP, we delay the cpu_setregs() until after SMP startup. */ cpu_setregs(); #endif } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct osigframe sf; struct osigframe *fp; struct proc *p; struct sigacts *psp; struct trapframe *regs; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; regs = p->p_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate and validate space for the signal handler context. 
*/ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *fp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)fp) == 0 || !useracc((caddr_t)fp, sizeof(*fp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); PROC_UNLOCK(p); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* Save most if not all of trap frame. 
*/ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. 
*/ PROC_LOCK(p); sigexit(p, SIGILL); /* NOTREACHED */ } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } #endif void sendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct sigframe sf; struct proc *p; struct sigacts *psp; struct trapframe *regs; struct sigframe *sfp; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { PROC_UNLOCK(p); osendsig(catcher, sig, mask, code); return; } #endif regs = p->p_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = p->p_sigstk; sf.sf_uc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct sigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *sfp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)sfp) == 0 || !useracc((caddr_t)sfp, sizeof(*sfp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. 
*/ #ifdef DEBUG printf("process %d has trashed its stack\n", p->p_pid); #endif PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); PROC_UNLOCK(p); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill siginfo structure. */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = code; sf.sf_si.si_addr = (void *)regs->tf_err; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. * * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. 
PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ PROC_LOCK(p); sigexit(p, SIGILL); /* NOTREACHED */ } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ #ifdef COMPAT_43 int osigreturn(p, uap) struct proc *p; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct trapframe *regs; struct osigcontext *scp; int eflags; regs = p->p_frame; scp = uap->sigcntxp; if (!useracc((caddr_t)scp, sizeof(*scp), VM_PROT_READ)) return (EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. 
*/ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { trapsignal(p, SIGBUS, T_PROTFLT); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. 
*/ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; PROC_LOCK(p); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) if (scp->sc_onstack & 1) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; #endif SIGSETOLD(p->p_sigmask, scp->sc_mask); SIG_CANTMASK(p->p_sigmask); PROC_UNLOCK(p); regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; return (EJUSTRETURN); } #endif int sigreturn(p, uap) struct proc *p; struct sigreturn_args /* { ucontext_t *sigcntxp; } */ *uap; { struct trapframe *regs; ucontext_t *ucp; int cs, eflags; ucp = uap->sigcntxp; #ifdef COMPAT_43 if (!useracc((caddr_t)ucp, sizeof(struct osigcontext), VM_PROT_READ)) return (EFAULT); if (((struct osigcontext *)ucp)->sc_trapno == 0x01d516) return (osigreturn(p, (struct osigreturn_args *)uap)); /* * Since ucp is not an osigcontext but a ucontext_t, we have to * check again if all of it is accessible. A ucontext_t is * much larger, so instead of just checking for the pointer * being valid for the size of an osigcontext, now check for * it being valid for a whole, new-style ucontext_t. */ #endif if (!useracc((caddr_t)ucp, sizeof(*ucp), VM_PROT_READ)) return (EFAULT); regs = p->p_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. 
*/ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); trapsignal(p, SIGBUS, T_PROTFLT); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) if (ucp->uc_mcontext.mc_onstack & 1) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; #endif p->p_sigmask = ucp->uc_sigmask; SIG_CANTMASK(p->p_sigmask); PROC_UNLOCK(p); return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. This currently only works in * the !SMP case, as there is no clean way to ensure that a CPU will be * woken when there is work available for it. */ static int cpu_idle_hlt = 1; SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); /* * Note that we have to be careful here to avoid a race between checking * procrunnable() and actually halting. If we don't do this, we may waste * the time between calling hlt and the next interrupt even though there * is a runnable process. 
*/ void cpu_idle(void) { #ifndef SMP if (cpu_idle_hlt) { disable_intr(); if (procrunnable()) enable_intr(); else { enable_intr(); __asm __volatile("hlt"); } } #endif } /* * Clear registers on exec */ void setregs(p, entry, stack, ps_strings) struct proc *p; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = p->p_frame; struct pcb *pcb = &p->p_addr->u_pcb; if (pcb->pcb_ldt) user_ldt_free(pcb); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* reset %gs as well */ if (pcb == PCPU_GET(curpcb)) load_gs(_udatasel); else pcb->pcb_gs = _udatasel; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). 
This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #ifdef DEV_NPX /* Initialize the npx (if any) for the current process. */ npxinit(__INITIAL_NPXCW__); #endif /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ p->p_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); cr0 |= CR0_NE; /* Done by npxinit() */ cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ #ifndef I386_CPU cr0 |= CR0_WP | CR0_AM; #endif load_cr0(cr0); load_gs(_udatasel); } static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int private_tss; /* flag indicating private tss */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; 
static char dblfault_stack[PAGE_SIZE];

extern  struct user *proc0paddr;


/*
 * software prototypes -- in more palatable form
 *
 * Each entry is a positional struct soft_segment_descriptor initializer.
 * The per-field comments give the meaning of each position; entries are
 * indexed by the G*_SEL selector index named in the comment above them.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GPRIV_SEL	3 SMP Per-Processor Private Data Descriptor */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	4 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct i386tss)-1,/* length - all address space */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GLDT_SEL	5 LDT Descriptor */
{	(int) ldt,		/* segment base address  */
	sizeof(ldt)-1,		/* length - all address space */
	SDT_SYSLDT,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GUSERLDT_SEL	6 User LDT Descriptor per process */
{	(int) ldt,		/* segment base address  */
	(512 * sizeof(union descriptor)-1),		/* length */
	SDT_SYSLDT,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GTGATE_SEL	7 Null Descriptor - Placeholder */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{	0x400,			/* segment base address */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GPANIC_SEL	9 Panic Tss Descriptor */
{	(int) &dblfault_tss,	/* segment base address  */
	sizeof(struct i386tss)-1,/* length - all address space */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
};

/*
 * Software prototypes for the local descriptor table, in the same
 * positional layout as gdt_segs[] above.  Only the user code and user
 * data entries are populated here; the remaining slots are null.
 */
static struct soft_segment_descriptor ldt_segs[] = {
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
};

/*
 * Fill in interrupt descriptor table entry `idx'.
 *
 * idx:   index into the table pointed to by `idt'.
 * func:  handler entry point; its address is split between gd_looffset
 *        and gd_hioffset (upper 16 bits).
 * typ:   value stored in gd_type.
 * dpl:   value stored in gd_dpl.
 * selec: value stored in gd_selector.
 *
 * The entry is always marked present (gd_p = 1); gd_stkcpy and gd_xx
 * are cleared.
 */
void
setidt(idx, func, typ, dpl, selec)
	int idx;
	inthand_t *func;
	int typ;
	int dpl;
	int selec;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16 ;
}

/* IDTVEC(name) expands to the handler symbol Xname. */
#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);

/*
 * Convert a hardware segment descriptor `sd' into the software form
 * `ssd': the split base and limit fields are recombined, and the type,
 * dpl, present, def32 and granularity fields are copied across.
 */
void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
#define PHYSMAP_SIZE (2 * 8) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. */ static void getmemsize(int first) { int i, physmap_idx, pa_indx; u_int basemem, extmem; #ifdef PC98 int pg_n; u_int under16; #else struct vm86frame vmf; struct vm86context vmc; #endif vm_offset_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t pte; const char *cp; #ifndef PC98 struct bios_smap *smap; #endif #ifdef PC98 /* XXX - some of EPSON machines can't use PG_N */ pg_n = PG_N; if (pc98_machine_type & M_EPSON_PC98) { switch (epson_machine_id) { #ifdef WB_CACHE default: #endif case 0x34: /* PC-486HX */ case 0x35: /* PC-486HG */ case 0x3B: /* PC-486HA */ pg_n = 0; break; } } #else bzero(&vmf, sizeof(struct vm86frame)); #endif bzero(physmap, sizeof(physmap)); /* * Perform "base memory" related probes & setup */ #ifdef PC98 under16 = pc98_getmemsize(&basemem, &extmem); #else vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; #endif if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. 
* The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; #ifndef PC98 /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); goto next_run; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; goto next_run; } physmap_idx += 
2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); if (physmap[1] != 0) goto physmap_done; /* * If we failed above, try memory map with INT 15:E801 */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory. */ extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; #endif physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; #ifdef PC98 if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) { /* 15M - 16M region is cut off, so need to divide chunk */ physmap[physmap_idx + 1] = under16 * 1024; physmap_idx += 2; physmap[physmap_idx] = 0x1000000; physmap[physmap_idx + 1] = physmap[2] + extmem * 1024; } #else physmap_done: #endif /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ i386_mp_probe(); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". 
We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif /* * hw.physmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. This overrides MAXMEM. */ if ((cp = getenv("hw.physmem")) != NULL) { u_int64_t AllowMem, sanity; char *ep; sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Ignoring invalid memory size of '%s'\n", cp); else Maxmem = atop(AllowMem); } if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %uK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa(Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. 
*/ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ #ifdef PC98 *pte = pa | PG_V | PG_RW | pg_n; #else *pte = pa | PG_V | PG_RW | PG_N; #endif invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, off, x; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; #ifdef PC98 /* * Initialize DMAC */ pc98_init_dmac(); #endif metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* Init basic tunables, hz etc */ init_param(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. 
*/ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); #ifdef SMP gdt_segs[GPRIV_SEL].ssd_limit = atop(sizeof(struct privatespace) - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[0].globaldata.gd_common_tss; SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0].globaldata; #else gdt_segs[GPRIV_SEL].ssd_limit = atop(sizeof(struct globaldata) - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) &__globaldata; gdt_segs[GPROC0_SEL].ssd_base = (int) &__globaldata.gd_common_tss; __globaldata.gd_prvspace = &__globaldata; #endif for (x = 0; x < NGDT; x++) { #ifdef BDE_DEBUGGER /* avoid overwriting db entries with APM ones */ if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL) continue; #endif ssdtosd(&gdt_segs[x], &gdt[x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); /* setup curproc so that mutexes work */ PCPU_SET(curproc, &proc0); PCPU_SET(spinlocks, NULL); LIST_INIT(&proc0.p_contested); /* * Initialize mutexes. */ mtx_init(&Giant, "Giant", MTX_DEF | MTX_RECURSE); mtx_init(&sched_lock, "sched lock", MTX_SPIN | MTX_RECURSE); mtx_init(&proc0.p_mtx, "process lock", MTX_DEF); mtx_init(&clock_lock, "clk", MTX_SPIN | MTX_RECURSE); #ifdef SMP mtx_init(&imen_mtx, "imen", MTX_SPIN); #endif mtx_lock(&Giant); /* make ldt memory segments */ /* * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... 
*/ ldt_segs[LUCODE_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1); for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL , GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), 
SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the console before we print anything out. */ cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); #ifdef DEV_ISA isa_defaultirq(); #endif #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ PCPU_SET(common_tss.tss_esp0, (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. 
*/
	/*
	 * Walk the message buffer one page at a time, handing pmap_kenter()
	 * the address msgbufp + off together with avail_end + off.
	 */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* make a call gate to reenter kernel with */
	gdp = &ldt[LSYS5CALLS_SEL].gd;

	/*
	 * The address of the lcall_syscall entry point is stored in two
	 * pieces: gd_looffset receives x and gd_hioffset receives x >> 16.
	 */
	x = (int) &IDTVEC(lcall_syscall);
	gdp->gd_looffset = x;
	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
	gdp->gd_stkcpy = 1;
	gdp->gd_type = SDT_SYS386CGT;
	gdp->gd_dpl = SEL_UPL;
	gdp->gd_p = 1;
	gdp->gd_hioffset = x >> 16;

	/* XXX does this work? */
	/* Two further LDT slots are filled with a copy of the gate just built. */
	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];

	/* transfer to user mode */
	/* (only the user code/data selectors are recorded at this point) */
	_ucodesel = LSEL(LUCODE_SEL, SEL_UPL);
	_udatasel = LSEL(LUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	proc0.p_addr->u_pcb.pcb_flags = 0;
	proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD;
	proc0.p_addr->u_pcb.pcb_ext = 0;
	proc0.p_frame = &proc0_tf;
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Install the workaround for the Pentium F00F bug.
 *
 * Registered through SYSINIT at SI_SUB_INTRINSIC / SI_ORDER_FIRST; the
 * argument is ignored.  Nothing is done unless has_f00f_bug is set.
 * Otherwise a two-page region is allocated from kernel_map, the IDT is
 * copied so that it begins 7*8 bytes before the end of the first page,
 * the copy is loaded with lidt(), the global idt pointer is switched to
 * it, and the first page is then restricted to VM_PROT_READ.
 *
 * Panics if the allocation fails, if the allocation is not page aligned,
 * or if vm_map_protect() does not return KERN_SUCCESS.
 */
static void
f00f_hack(void *unused) {
	struct gate_descriptor *new_idt;
#ifndef SMP
	/*
	 * NOTE(review): when SMP is defined r_idt is not declared here, so
	 * it presumably comes from file scope -- not visible in this chunk.
	 */
	struct region_descriptor r_idt;
#endif
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	GIANT_REQUIRED;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	/* The relocated table keeps the size of idt0. */
	r_idt.rd_limit = sizeof(idt0) - 1;

	tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
	if (tmp == 0)
		panic("kmem_alloc returned 0");
	if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0)
		panic("kmem_alloc returned non-page-aligned memory");
	/* Put the first seven entries in the lower page */
	new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (int)new_idt;
	/* Load the copy first, then publish it through the idt pointer. */
	lidt(&r_idt);
	idt = new_idt;
	/* Only the first of the two pages is made read-only. */
	if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
			   VM_PROT_READ, FALSE) != KERN_SUCCESS)
		panic("vm_map_protect failed");
	return;
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */

/*
 * Store addr as the saved instruction pointer (tf_eip) in the trap frame
 * of process p.  Always returns 0.
 */
int
ptrace_set_pc(p, addr)
	struct proc *p;
	unsigned long addr;
{
	p->p_frame->tf_eip = addr;
	return (0);
}

/*
 * Turn on the PSL_T bit in the saved eflags of process p.
 * Always returns 0.
 */
int
ptrace_single_step(p)
	struct
proc *p; { p->p_frame->tf_eflags |= PSL_T; return (0); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_frame; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } #ifdef CPU_ENABLE_SSE static void fill_fpregs_xmm(sv_xmm, sv_87) struct savexmm *sv_xmm; struct save87 *sv_87; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_tw = penv_xmm->en_tw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; } static void set_fpregs_xmm(sv_87, sv_xmm) struct save87 
*sv_87; struct savexmm *sv_xmm; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_tw = penv_87->en_tw; penv_xmm->en_fip = penv_87->en_fip; penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; } #endif /* CPU_ENABLE_SSE */ int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { fill_fpregs_xmm(&p->p_addr->u_pcb.pcb_save.sv_xmm, (struct save87 *)fpregs); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(&p->p_addr->u_pcb.pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { set_fpregs_xmm((struct save87 *)fpregs, &p->p_addr->u_pcb.pcb_save.sv_xmm); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &p->p_addr->u_pcb.pcb_save.sv_87, sizeof *fpregs); return (0); } int fill_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; if (p == NULL) { dbregs->dr0 = rdr0(); dbregs->dr1 = rdr1(); dbregs->dr2 = rdr2(); dbregs->dr3 = rdr3(); dbregs->dr4 = rdr4(); dbregs->dr5 = rdr5(); dbregs->dr6 = rdr6(); dbregs->dr7 = rdr7(); } else { pcb = &p->p_addr->u_pcb; dbregs->dr0 = pcb->pcb_dr0; dbregs->dr1 = pcb->pcb_dr1; dbregs->dr2 = pcb->pcb_dr2; dbregs->dr3 = pcb->pcb_dr3; dbregs->dr4 = 0; dbregs->dr5 = 0; dbregs->dr6 = pcb->pcb_dr6; dbregs->dr7 = pcb->pcb_dr7; } return (0); } int set_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; int i; u_int32_t mask1, mask2; if (p == NULL) { load_dr0(dbregs->dr0); load_dr1(dbregs->dr1); load_dr2(dbregs->dr2); load_dr3(dbregs->dr3); load_dr4(dbregs->dr4); 
load_dr5(dbregs->dr5); load_dr6(dbregs->dr6); load_dr7(dbregs->dr7); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8; i++, mask1 <<= 2, mask2 <<= 2) if ((dbregs->dr7 & mask1) == mask2) return (EINVAL); pcb = &p->p_addr->u_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space, unless, perhaps, we were called by * uid 0. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (suser(p) != 0) { if (dbregs->dr7 & 0x3) { /* dr0 is enabled */ if (dbregs->dr0 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<2)) { /* dr1 is enabled */ if (dbregs->dr1 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<4)) { /* dr2 is enabled */ if (dbregs->dr2 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<6)) { /* dr3 is enabled */ if (dbregs->dr3 >= VM_MAXUSER_ADDRESS) return (EINVAL); } } pcb->pcb_dr0 = dbregs->dr0; pcb->pcb_dr1 = dbregs->dr1; pcb->pcb_dr2 = dbregs->dr2; pcb->pcb_dr3 = dbregs->dr3; pcb->pcb_dr6 = dbregs->dr6; pcb->pcb_dr7 = dbregs->dr7; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. 
*/ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i=0; i /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct bio *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->bio_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->bio_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->bio_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->bio_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->bio_blkno + p->p_offset <= DOSBBSECTOR && (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #endif /* beyond partition? 
*/ if (bp->bio_blkno < 0 || bp->bio_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->bio_blkno == maxsz) { bp->bio_resid = bp->bio_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->bio_blkno; if (sz <= 0) { bp->bio_error = EINVAL; goto bad; } bp->bio_bcount = sz << DEV_BSHIFT; } bp->bio_pblkno = bp->bio_blkno + p->p_offset; return(1); bad: bp->bio_flags |= BIO_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: head/sys/pc98/pc98/machdep.c =================================================================== --- head/sys/pc98/pc98/machdep.c (revision 82308) +++ head/sys/pc98/pc98/machdep.c (revision 82309) @@ -1,2594 +1,2598 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD$ */ #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" +#include "opt_upages.h" /* #include "opt_userconfig.h" */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #include #ifdef PERFMON #include +#endif +#ifdef SMP +#include #endif #include #include #ifdef PC98 #include #include #else #include #endif #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup __P((void *)); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm __P((struct save87 *, struct savexmm *)); static void fill_fpregs_xmm __P((struct savexmm *, struct save87 *)); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) #ifdef PC98 int need_pre_dma_flush; /* If 1, use wbinvd befor DMA transfer. */ int need_post_dma_flush; /* If 1, use invd after DMA transfer. 
*/ #endif int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int physmem = 0; int cold = 1; #ifdef COMPAT_43 static void osendsig __P((sig_t catcher, int sig, sigset_t *mask, u_long code)); #endif static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "IU", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "IU", ""); static int sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); int Maxmem = 0; #ifdef PC98 int Maxmem_under16M = 0; #endif long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; #ifndef SMP static struct globaldata __globaldata; #endif struct mtx sched_lock; struct mtx Giant; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. 
*/ earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { unsigned int size1; size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %u bytes (%u pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); #if 0 /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); /* * Discount the physical memory larger than the size of kernel_map * to avoid eating up all of KVA space. 
*/ if (kernel_map->first_free == NULL) { printf("Warning: no free entries in kernel_map.\n"); physmem_est = physmem; } else { physmem_est = min(physmem, btoc(kernel_map->max_offset - kernel_map->min_offset)); } /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / PAGE_SIZE; nbuf = 50; if (physmem_est > 1024) nbuf += min((physmem_est - 1024) / factor, 16384 / factor); if (physmem_est > 16384) nbuf += (physmem_est - 16384) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; } /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. */ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } nswbuf = max(min(nbuf/4, 256), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); buffer_map->system_map = 1; pager_map = kmem_suballoc(clean_map, 
&pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * XXX: Mbuf system machine-specific initializations should * go here, if anywhere. */ /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i], 0); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } mtx_init(&callout_lock, "callout", MTX_SPIN | MTX_RECURSE); #endif #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); globaldata_register(GLOBALDATA); #ifndef SMP /* For SMP, we delay the cpu_setregs() until after SMP startup. */ cpu_setregs(); #endif } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct osigframe sf; struct osigframe *fp; struct proc *p; struct sigacts *psp; struct trapframe *regs; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; regs = p->p_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate and validate space for the signal handler context. 
*/ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *fp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)fp) == 0 || !useracc((caddr_t)fp, sizeof(*fp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); PROC_UNLOCK(p); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* Save most if not all of trap frame. 
*/ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. 
*/ PROC_LOCK(p); sigexit(p, SIGILL); /* NOTREACHED */ } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } #endif void sendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct sigframe sf; struct proc *p; struct sigacts *psp; struct trapframe *regs; struct sigframe *sfp; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { PROC_UNLOCK(p); osendsig(catcher, sig, mask, code); return; } #endif regs = p->p_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = p->p_sigstk; sf.sf_uc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct sigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *sfp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)sfp) == 0 || !useracc((caddr_t)sfp, sizeof(*sfp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. 
*/ #ifdef DEBUG printf("process %d has trashed its stack\n", p->p_pid); #endif PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); PROC_UNLOCK(p); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill siginfo structure. */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = code; sf.sf_si.si_addr = (void *)regs->tf_err; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. * * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. 
PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ PROC_LOCK(p); sigexit(p, SIGILL); /* NOTREACHED */ } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ #ifdef COMPAT_43 int osigreturn(p, uap) struct proc *p; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct trapframe *regs; struct osigcontext *scp; int eflags; regs = p->p_frame; scp = uap->sigcntxp; if (!useracc((caddr_t)scp, sizeof(*scp), VM_PROT_READ)) return (EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. 
*/ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { trapsignal(p, SIGBUS, T_PROTFLT); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. 
*/ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; PROC_LOCK(p); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) if (scp->sc_onstack & 1) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; #endif SIGSETOLD(p->p_sigmask, scp->sc_mask); SIG_CANTMASK(p->p_sigmask); PROC_UNLOCK(p); regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; return (EJUSTRETURN); } #endif int sigreturn(p, uap) struct proc *p; struct sigreturn_args /* { ucontext_t *sigcntxp; } */ *uap; { struct trapframe *regs; ucontext_t *ucp; int cs, eflags; ucp = uap->sigcntxp; #ifdef COMPAT_43 if (!useracc((caddr_t)ucp, sizeof(struct osigcontext), VM_PROT_READ)) return (EFAULT); if (((struct osigcontext *)ucp)->sc_trapno == 0x01d516) return (osigreturn(p, (struct osigreturn_args *)uap)); /* * Since ucp is not an osigcontext but a ucontext_t, we have to * check again if all of it is accessible. A ucontext_t is * much larger, so instead of just checking for the pointer * being valid for the size of an osigcontext, now check for * it being valid for a whole, new-style ucontext_t. */ #endif if (!useracc((caddr_t)ucp, sizeof(*ucp), VM_PROT_READ)) return (EFAULT); regs = p->p_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. 
*/ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			printf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(p, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}

		/*
		 * Checks passed: overwrite the trap frame with sizeof(*regs)
		 * bytes of the user-supplied context, starting at mc_fs.
		 */
		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

	PROC_LOCK(p);
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	/* Bit 0 of mc_onstack records whether we were on the signal stack. */
	if (ucp->uc_mcontext.mc_onstack & 1)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	/* Take the mask from the context, then apply SIG_CANTMASK to it. */
	p->p_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(p->p_sigmask);
	PROC_UNLOCK(p);
	return (EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
/* The howto argument is accepted but unused; the body is empty. */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
/* Never returns: executes `hlt' in an endless loop. */
void
cpu_halt(void)
{
	for (;;)
		__asm__ ("hlt");
}

/*
 * Hook to idle the CPU when possible.  This currently only works in
 * the !SMP case, as there is no clean way to ensure that a CPU will be
 * woken when there is work available for it.
 */
/* Run-time switch (machdep.cpu_idle_hlt, read/write); defaults to 1. */
static int	cpu_idle_hlt = 1;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");

/*
 * Note that we have to be careful here to avoid a race between checking
 * procrunnable() and actually halting.  If we don't do this, we may waste
 * the time between calling hlt and the next interrupt even though there
 * is a runnable process.
*/ void cpu_idle(void) { #ifndef SMP if (cpu_idle_hlt) { disable_intr(); if (procrunnable()) enable_intr(); else { enable_intr(); __asm __volatile("hlt"); } } #endif } /* * Clear registers on exec */ void setregs(p, entry, stack, ps_strings) struct proc *p; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = p->p_frame; struct pcb *pcb = &p->p_addr->u_pcb; if (pcb->pcb_ldt) user_ldt_free(pcb); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* reset %gs as well */ if (pcb == PCPU_GET(curpcb)) load_gs(_udatasel); else pcb->pcb_gs = _udatasel; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). 
This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #ifdef DEV_NPX /* Initialize the npx (if any) for the current process. */ npxinit(__INITIAL_NPXCW__); #endif /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ p->p_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); cr0 |= CR0_NE; /* Done by npxinit() */ cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ #ifndef I386_CPU cr0 |= CR0_WP | CR0_AM; #endif load_cr0(cr0); load_gs(_udatasel); } static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int private_tss; /* flag indicating private tss */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; 
static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 4 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 5 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, 
/* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 6 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 7 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 9 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 
0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit 
granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } 
#define PHYSMAP_SIZE (2 * 8) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. */ static void getmemsize(int first) { int i, physmap_idx, pa_indx; u_int basemem, extmem; #ifdef PC98 int pg_n; u_int under16; #else struct vm86frame vmf; struct vm86context vmc; #endif vm_offset_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t pte; const char *cp; #ifndef PC98 struct bios_smap *smap; #endif #ifdef PC98 /* XXX - some of EPSON machines can't use PG_N */ pg_n = PG_N; if (pc98_machine_type & M_EPSON_PC98) { switch (epson_machine_id) { #ifdef WB_CACHE default: #endif case 0x34: /* PC-486HX */ case 0x35: /* PC-486HG */ case 0x3B: /* PC-486HA */ pg_n = 0; break; } } #else bzero(&vmf, sizeof(struct vm86frame)); #endif bzero(physmap, sizeof(physmap)); /* * Perform "base memory" related probes & setup */ #ifdef PC98 under16 = pc98_getmemsize(&basemem, &extmem); #else vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; #endif if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. 
* The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; #ifndef PC98 /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); goto next_run; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; goto next_run; } physmap_idx += 
2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); if (physmap[1] != 0) goto physmap_done; /* * If we failed above, try memory map with INT 15:E801 */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory. */ extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; #endif physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; #ifdef PC98 if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) { /* 15M - 16M region is cut off, so need to divide chunk */ physmap[physmap_idx + 1] = under16 * 1024; physmap_idx += 2; physmap[physmap_idx] = 0x1000000; physmap[physmap_idx + 1] = physmap[2] + extmem * 1024; } #else physmap_done: #endif /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ i386_mp_probe(); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". 
We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif /* * hw.physmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. This overrides MAXMEM. */ if ((cp = getenv("hw.physmem")) != NULL) { u_int64_t AllowMem, sanity; char *ep; sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Ignoring invalid memory size of '%s'\n", cp); else Maxmem = atop(AllowMem); } if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %uK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa(Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. 
*/ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ #ifdef PC98 *pte = pa | PG_V | PG_RW | pg_n; #else *pte = pa | PG_V | PG_RW | PG_N; #endif invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
*/
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);

	avail_end = phys_avail[pa_indx];
}

/*
 * Machine-dependent early initialization.
 *
 * In order, this routine: records proc0's u-area and the ISA hole address,
 * picks up the loader's preload metadata and environment, builds and loads
 * the GDT, makes curproc valid and initializes the first mutexes (taking
 * Giant), builds the LDT and the IDT, brings up the console, finishes CPU
 * identification and setup, installs the initial TSS and the double-fault
 * TSS, calls vm86_initialize() and getmemsize(first), maps the message
 * buffer, builds the LDT call gate and finally fills in proc0's pcb.
 *
 * The ordering is significant: nothing is printed before cninit() (the
 * "metadata missing" warning is deferred for that reason), and the message
 * buffer is mapped only after getmemsize() has set avail_end.
 *
 * first: forwarded unchanged to getmemsize(); not otherwise used here.
 */
void
init386(first)
	int first;
{
	struct gate_descriptor *gdp;
	int gsel_tss, metadata_missing, off, x;
#ifndef SMP
	/* table descriptors - used to load tables by microp */
	/*
	 * NOTE(review): with SMP defined, r_gdt and r_idt are not declared
	 * here, so they presumably exist at file scope -- confirm.
	 */
	struct region_descriptor r_gdt, r_idt;
#endif

	proc0.p_addr = proc0paddr;

	atdevbase = ISA_HOLE_START + KERNBASE;

#ifdef PC98
	/*
	 * Initialize DMAC
	 */
	pc98_init_dmac();
#endif

	/*
	 * Remember whether the loader handed us module metadata; the warning
	 * itself is printed further down, once the console is up.
	 */
	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	/* Init basic tunables, hz etc */
	init_param();

	/*
	 * make gdt memory segments, the code segment goes up to end of the
	 * page with etext in it, the data segment goes to the end of
	 * the address space
	 */
	/*
	 * XXX text protection is temporarily (?) disabled.  The limit was
	 * i386_btop(round_page(etext)) - 1.
	 */
	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
	/*
	 * The private segment and the proc0 TSS descriptor point either into
	 * the first SMP private space or into the single __globaldata.
	 */
#ifdef SMP
	gdt_segs[GPRIV_SEL].ssd_limit =
		atop(sizeof(struct privatespace) - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0];
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &SMP_prvspace[0].globaldata.gd_common_tss;
	SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0].globaldata;
#else
	gdt_segs[GPRIV_SEL].ssd_limit =
		atop(sizeof(struct globaldata) - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int) &__globaldata;
	gdt_segs[GPROC0_SEL].ssd_base =
		(int) &__globaldata.gd_common_tss;
	__globaldata.gd_prvspace = &__globaldata;
#endif

	/* Convert every software descriptor into its hardware form. */
	for (x = 0; x < NGDT; x++) {
#ifdef BDE_DEBUGGER
		/* avoid overwriting db entries with APM ones */
		if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL)
			continue;
#endif
		ssdtosd(&gdt_segs[x], &gdt[x].sd);
	}

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base =  (int) gdt;
	lgdt(&r_gdt);

	/* setup curproc so that mutexes work */
	PCPU_SET(curproc, &proc0);
	PCPU_SET(spinlocks, NULL);

	LIST_INIT(&proc0.p_contested);

	/*
	 * Initialize mutexes.
	 */
	mtx_init(&Giant, "Giant", MTX_DEF | MTX_RECURSE);
	mtx_init(&sched_lock, "sched lock", MTX_SPIN | MTX_RECURSE);
	mtx_init(&proc0.p_mtx, "process lock", MTX_DEF);
	mtx_init(&clock_lock, "clk", MTX_SPIN | MTX_RECURSE);
#ifdef SMP
	mtx_init(&imen_mtx, "imen", MTX_SPIN);
#endif
	/* Giant is taken here and is still held when this function returns. */
	mtx_lock(&Giant);

	/* make ldt memory segments */
	/*
	 * XXX - VM_MAXUSER_ADDRESS is an end address, not a max.  And it
	 * should be spelled ...MAX_USER...
	 */
	ldt_segs[LUCODE_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1);
	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* exceptions */
	/* Point every vector at the "rsvd" handler, then fill in the real ones. */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
	setidt(0, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(1, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(2, &IDTVEC(nmi),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	/* Vectors 3, 4 and 0x80 are the ones installed with SEL_UPL. */
 	setidt(3, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(4, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(5, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(6, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(7, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL
	    , GSEL(GCODE_SEL, SEL_KPL));
	/* Vector 8 is a task gate through GPANIC_SEL rather than a handler. */
	setidt(8, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
	setidt(9, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(10, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(11, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(12, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(14, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(15, &IDTVEC(rsvd),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(16, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(18, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(0x80, &IDTVEC(int0x80_syscall),
			SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

#ifdef DEV_ISA
	isa_defaultirq();
#endif

#ifdef DDB
	kdb_init();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#endif

	finishidentcpu();	/* Final stage of CPU initialization */
	/*
	 * Vectors 6 and 13 are installed again with the same handlers as
	 * above.  NOTE(review): presumably finishidentcpu() may replace them
	 * while probing the CPU -- confirm.
	 */
	setidt(6, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	initializecpu();	/* Initialize CPU registers */

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	/* The ring-0 stack pointer is 16 bytes below the top of proc0's u-area. */
	PCPU_SET(common_tss.tss_esp0,
	    (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16);
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	private_tss = 0;
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	/* tss_ioopt gets the size of the TSS structure in its upper 16 bits. */
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	ltr(gsel_tss);

	/*
	 * Fill in the double-fault TSS: every stack pointer refers to the end
	 * of dblfault_stack and execution starts at dblfault_handler on the
	 * IdlePTD page directory.
	 */
	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_cr3 = (int)IdlePTD;
	dblfault_tss.tss_eip = (int)dblfault_handler;
	dblfault_tss.tss_eflags = PSL_KERNEL;
	dblfault_tss.tss_ds = dblfault_tss.tss_es =
	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);

	vm86_initialize();
	getmemsize(first);

	/* now running on new page tables, configured,and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* make a call gate to reenter kernel with */
	gdp = &ldt[LSYS5CALLS_SEL].gd;

	/* The handler address is split into the gate's low and high halves. */
	x = (int) &IDTVEC(lcall_syscall);
	gdp->gd_looffset = x;
	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
	gdp->gd_stkcpy = 1;
	gdp->gd_type = SDT_SYS386CGT;
	gdp->gd_dpl = SEL_UPL;
	gdp->gd_p = 1;
	gdp->gd_hioffset = x >> 16;

	/* XXX does this work? */
	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];

	/* transfer to user mode */

	_ucodesel = LSEL(LUCODE_SEL, SEL_UPL);
	_udatasel = LSEL(LUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	proc0.p_addr->u_pcb.pcb_flags = 0;
	proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD;
	proc0.p_addr->u_pcb.pcb_ext = 0;
	proc0.p_frame = &proc0_tf;
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Workaround for the Pentium F00F bug, run through SYSINIT.
 *
 * Does nothing unless has_f00f_bug is set.  Otherwise it allocates two
 * pages, copies the IDT so that its first seven 8-byte entries end exactly
 * at the end of the first page (the rest falls in the second page), loads
 * the relocated table, and then makes the first page read-only.
 *
 * Panics if the allocation fails, is not page aligned, or if the protection
 * change fails.  The argument is unused.
 */
static void
f00f_hack(void *unused) {
	struct gate_descriptor *new_idt;
#ifndef SMP
	struct region_descriptor r_idt;
#endif
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	GIANT_REQUIRED;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	r_idt.rd_limit = sizeof(idt0) - 1;

	tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
	if (tmp == 0)
		panic("kmem_alloc returned 0");
	if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0)
		panic("kmem_alloc returned non-page-aligned memory");
	/* Put the first seven entries in the lower page */
	new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (int)new_idt;
	lidt(&r_idt);
	/* From here on the global idt pointer refers to the relocated copy. */
	idt = new_idt;
	if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
			   VM_PROT_READ, FALSE) != KERN_SUCCESS)
		panic("vm_map_protect failed");
	return;
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */

/*
 * Set the saved instruction pointer (tf_eip) in p's trap frame to addr.
 * Always returns 0.
 */
int
ptrace_set_pc(p, addr)
	struct proc *p;
	unsigned long addr;
{
	p->p_frame->tf_eip = addr;
	return (0);
}

/*
 * Set the trace flag (PSL_T) in the saved eflags of p's trap frame.
 * Always returns 0.
 */
int
ptrace_single_step(p)
	struct
proc *p; { p->p_frame->tf_eflags |= PSL_T; return (0); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_frame; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } #ifdef CPU_ENABLE_SSE static void fill_fpregs_xmm(sv_xmm, sv_87) struct savexmm *sv_xmm; struct save87 *sv_87; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_tw = penv_xmm->en_tw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; } static void set_fpregs_xmm(sv_87, sv_xmm) struct save87 
*sv_87; struct savexmm *sv_xmm; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_tw = penv_87->en_tw; penv_xmm->en_fip = penv_87->en_fip; penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; } #endif /* CPU_ENABLE_SSE */ int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { fill_fpregs_xmm(&p->p_addr->u_pcb.pcb_save.sv_xmm, (struct save87 *)fpregs); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(&p->p_addr->u_pcb.pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { set_fpregs_xmm((struct save87 *)fpregs, &p->p_addr->u_pcb.pcb_save.sv_xmm); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &p->p_addr->u_pcb.pcb_save.sv_87, sizeof *fpregs); return (0); } int fill_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; if (p == NULL) { dbregs->dr0 = rdr0(); dbregs->dr1 = rdr1(); dbregs->dr2 = rdr2(); dbregs->dr3 = rdr3(); dbregs->dr4 = rdr4(); dbregs->dr5 = rdr5(); dbregs->dr6 = rdr6(); dbregs->dr7 = rdr7(); } else { pcb = &p->p_addr->u_pcb; dbregs->dr0 = pcb->pcb_dr0; dbregs->dr1 = pcb->pcb_dr1; dbregs->dr2 = pcb->pcb_dr2; dbregs->dr3 = pcb->pcb_dr3; dbregs->dr4 = 0; dbregs->dr5 = 0; dbregs->dr6 = pcb->pcb_dr6; dbregs->dr7 = pcb->pcb_dr7; } return (0); } int set_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; int i; u_int32_t mask1, mask2; if (p == NULL) { load_dr0(dbregs->dr0); load_dr1(dbregs->dr1); load_dr2(dbregs->dr2); load_dr3(dbregs->dr3); load_dr4(dbregs->dr4); 
load_dr5(dbregs->dr5); load_dr6(dbregs->dr6); load_dr7(dbregs->dr7); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8; i++, mask1 <<= 2, mask2 <<= 2) if ((dbregs->dr7 & mask1) == mask2) return (EINVAL); pcb = &p->p_addr->u_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space, unless, perhaps, we were called by * uid 0. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (suser(p) != 0) { if (dbregs->dr7 & 0x3) { /* dr0 is enabled */ if (dbregs->dr0 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<2)) { /* dr1 is enabled */ if (dbregs->dr1 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<4)) { /* dr2 is enabled */ if (dbregs->dr2 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<6)) { /* dr3 is enabled */ if (dbregs->dr3 >= VM_MAXUSER_ADDRESS) return (EINVAL); } } pcb->pcb_dr0 = dbregs->dr0; pcb->pcb_dr1 = dbregs->dr1; pcb->pcb_dr2 = dbregs->dr2; pcb->pcb_dr3 = dbregs->dr3; pcb->pcb_dr6 = dbregs->dr6; pcb->pcb_dr7 = dbregs->dr7; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. 
*/ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i=0; i /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct bio *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->bio_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->bio_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->bio_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->bio_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->bio_blkno + p->p_offset <= DOSBBSECTOR && (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #endif /* beyond partition? 
*/ if (bp->bio_blkno < 0 || bp->bio_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->bio_blkno == maxsz) { bp->bio_resid = bp->bio_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->bio_blkno; if (sz <= 0) { bp->bio_error = EINVAL; goto bad; } bp->bio_bcount = sz << DEV_BSHIFT; } bp->bio_pblkno = bp->bio_blkno + p->p_offset; return(1); bad: bp->bio_flags |= BIO_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */