Index: head/sys/amd64/amd64/cpu_switch.S
===================================================================
--- head/sys/amd64/amd64/cpu_switch.S	(revision 31708)
+++ head/sys/amd64/amd64/cpu_switch.S	(revision 31709)
@@ -1,778 +1,815 @@
/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- *	$Id: swtch.s,v 1.63 1997/09/21 15:03:58 peter Exp $
+ *	$Id: swtch.s,v 1.64 1997/10/10 09:44:06 peter Exp $
 */

#include "npx.h"
#include "opt_user_ldt.h"
#include "opt_vm86.h"

#include
#include

#ifdef SMP
#include
#include
#include			/** GRAB_LOPRIO */
#endif /* SMP */

#include "assym.s"

/*****************************************************************************/
/* Scheduling                                                                */
/*****************************************************************************/

/*
 * The following primitives manipulate the run queues.
 * _whichqs tells which of the 32 queues _qs
 * have processes in them.  setrunqueue puts processes into queues, Remrq
 * removes them from queues.  The running process is on no queue,
 * other processes are on a queue related to p->p_priority, divided by 4
 * actually to shrink the 0-127 range of priorities into the 32 available
 * queues.
*/ .data #ifndef SMP .globl _curpcb _curpcb: .long 0 /* pointer to curproc's PCB area */ #endif /* !SMP */ .globl _whichqs, _whichrtqs, _whichidqs _whichqs: .long 0 /* which run queues have data */ _whichrtqs: .long 0 /* which realtime run qs have data */ _whichidqs: .long 0 /* which idletime run qs have data */ .globl _hlt_vector _hlt_vector: .long _default_halt /* pointer to halt routine */ .globl _qs,_cnt,_panic .globl _want_resched _want_resched: .long 0 /* we need to re-run the scheduler */ +#if defined(SWTCH_OPTIM_STATS) + .globl _swtch_optim_stats, _tlb_flush_count +_swtch_optim_stats: .long 0 /* number of _swtch_optims */ +_tlb_flush_count: .long 0 +#endif .text /* * setrunqueue(p) * * Call should be made at spl6(), and p->p_stat should be SRUN */ ENTRY(setrunqueue) movl 4(%esp),%eax #ifdef DIAGNOSTIC cmpb $SRUN,P_STAT(%eax) je set1 pushl $set2 call _panic set1: #endif cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je set_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* realtime priority? */ jne set_id /* must be idle priority */ set_rt: btsl %edx,_whichrtqs /* set q full bit */ shll $3,%edx addl $_rtqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_id: btsl %edx,_whichidqs /* set q full bit */ shll $3,%edx addl $_idqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_nort: /* Normal (RTOFF) code */ movzbl P_PRI(%eax),%edx shrl $2,%edx btsl %edx,_whichqs /* set q full bit */ shll $3,%edx addl $_qs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set2: .asciz "setrunqueue" /* * Remrq(p) * * Call should be made at spl6(). */ ENTRY(remrq) movl 4(%esp),%eax cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je rem_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* normal priority process? */ jne rem_id btrl %edx,_whichrtqs /* clear full bit, panic if clear already */ jb rem1rt pushl $rem3rt call _panic rem1rt: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_rtqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2rt shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichrtqs rem2rt: ret rem_id: btrl %edx,_whichidqs /* clear full bit, panic if clear already */ jb rem1id pushl $rem3id call _panic rem1id: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_idqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? 
*/ je rem2id shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichidqs rem2id: ret rem_nort: movzbl P_PRI(%eax),%edx shrl $2,%edx btrl %edx,_whichqs /* clear full bit, panic if clear already */ jb rem1 pushl $rem3 call _panic rem1: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_qs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2 shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichqs rem2: ret rem3: .asciz "remrq" rem3rt: .asciz "remrq.rt" rem3id: .asciz "remrq.id" /* * When no processes are on the runq, cpu_switch() branches to _idle * to wait for something to come ready. */ ALIGN_TEXT _idle: #ifdef SMP /* when called, we have the mplock, intr disabled */ xorl %ebp,%ebp /* use our idleproc's "context" */ movl _my_idlePTD,%ecx movl %ecx,%cr3 +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl $_idlestack_top,%ecx movl %ecx,%esp /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %ecx, TSS_ESP0(%eax) + movl %ecx, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 cli /* * _REALLY_ free the lock, no matter how deep the prior nesting. * We will recover the nesting on the way out when we have a new * proc to load. * * XXX: we had damn well better be sure we had it before doing this! */ movl $FREE_LOCK, %eax movl %eax, _mp_lock /* do NOT have lock, intrs disabled */ .globl idle_loop idle_loop: +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl %cr3,%eax /* ouch! */ movl %eax,%cr3 cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f jmp 2f 1: cmpl $0,_whichrtqs /* real-time queue */ jne 3f cmpl $0,_whichqs /* normal queue */ jne 3f cmpl $0,_whichidqs /* 'idle' queue */ jne 3f cmpl $0,_do_page_zero_idle je 2f /* XXX appears to cause panics */ /* * Inside zero_idle we enable interrupts and grab the mplock * as needed. It needs to be careful about entry/exit mutexes. 
*/ call _vm_page_zero_idle /* internal locking */ testl %eax, %eax jnz idle_loop 2: /* enable intrs for a halt */ #ifdef SMP movl $0, lapic_tpr /* 1st candidate for an INT */ #endif sti call *_hlt_vector /* wait for interrupt */ cli jmp idle_loop 3: #ifdef SMP movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ #endif call _get_mplock cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _rel_mplock jmp idle_loop #else xorl %ebp,%ebp movl $HIDENAME(tmpstk),%esp - movl _IdlePTD,%ecx - movl %ecx,%cr3 +#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif + movl _IdlePTD, %ecx + movl %cr3, %eax + cmpl %ecx, %eax + je 2f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl %ecx, %cr3 +2: +#endif /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %esp, TSS_ESP0(%eax) + movl %esp, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 ALIGN_TEXT idle_loop: cli cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _vm_page_zero_idle testl %eax, %eax jnz idle_loop sti call *_hlt_vector /* wait for interrupt */ jmp idle_loop #endif CROSSJUMPTARGET(_idle) ENTRY(default_halt) #ifndef SMP hlt /* XXX: until a wakeup IPI */ #endif ret /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. first, save context as needed */ movl _curproc,%ecx /* if no process to save, don't bother */ testl %ecx,%ecx je sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ #endif /* SMP */ movl P_ADDR(%ecx),%ecx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #ifdef SMP movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ #ifdef DIAGNOSTIC cmpl $FREE_LOCK, %eax /* is it free? */ je badsw4 /* yes, bad medicine! */ #endif /* DIAGNOSTIC */ andl $COUNT_FIELD, %eax /* clear CPU portion */ movl %eax, PCB_MPNEST(%ecx) /* store it */ #endif /* SMP */ #if NNPX > 0 /* have we used fp, and need a save? */ movl _curproc,%eax cmpl %eax,_npxproc jne 1f addl $PCB_SAVEFPU,%ecx /* h/w bugs make saving complicated */ pushl %ecx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* NNPX > 0 */ movl $0,_curproc /* out of process */ /* save is done, now choose a new process or idle */ sw1: cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f CROSSJUMP(je, _idle, jne) /* wind down */ 1: #endif sw1a: movl _whichrtqs,%edi /* pick next p. 
from rtqs */ testl %edi,%edi jz nortqr /* no realtime procs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz nortqr /* no proc on rt q - try normal ... */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _rtqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je rt3 btsl %ebx,%edi /* nope, set to indicate not empty */ rt3: movl %edi,_whichrtqs /* update q status */ jmp swtch_com /* old sw1a */ /* Normal process priority's */ nortqr: movl _whichqs,%edi 2: /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz idqr /* if none, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _qs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je 3f btsl %ebx,%edi /* nope, set to indicate not empty */ 3: movl %edi,_whichqs /* update q status */ jmp swtch_com idqr: /* was sw1a */ movl _whichidqs,%edi /* pick next p. from idqs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ CROSSJUMP(je, _idle, jne) /* if no proc, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _idqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je id3 btsl %ebx,%edi /* nope, set to indicate not empty */ id3: movl %edi,_whichidqs /* update q status */ swtch_com: movl $0,%eax movl %eax,_want_resched #ifdef DIAGNOSTIC cmpl %eax,P_WCHAN(%ecx) jne badsw1 cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl %eax,P_BACK(%ecx) /* isolate process to run */ movl P_ADDR(%ecx),%edx - movl PCB_CR3(%edx),%ebx #ifdef SMP + movl PCB_CR3(%edx),%ebx /* Grab the private PT pointer from the outgoing process's PTD */ movl $_PTD, %esi movl 4*MPPTDI(%esi), %eax /* fetch cpu's prv pt */ -#endif /* SMP */ - +#else +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif /* switch address space */ + movl %cr3,%ebx + cmpl PCB_CR3(%edx),%ebx + je 4f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl PCB_CR3(%edx),%ebx +#endif /* SMP */ movl %ebx,%cr3 +4: #ifdef SMP /* Copy the private PT to the new process's PTD */ /* XXX yuck, the _PTD changes when we switch, so we have to * reload %cr3 after changing the address space. * We need to fix this by storing a pointer to the virtual * location of the per-process PTD in the PCB or something quick. * Dereferencing proc->vm_map->pmap->p_pdir[] is painful in asm. */ movl %eax, 4*MPPTDI(%esi) /* restore cpu's prv page */ +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif /* XXX: we have just changed the page tables.. reload.. */ movl %ebx, %cr3 #endif /* SMP */ #ifdef VM86 movl _my_tr, %esi cmpl $0, PCB_EXT(%edx) /* has pcb extension? 
*/ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: #endif /* update common_tss.tss_esp0 pointer */ movl $_common_tss, %eax movl %edx, %ebx /* pcb */ #ifdef VM86 addl $(UPAGES * PAGE_SIZE - 16), %ebx #else addl $(UPAGES * PAGE_SIZE), %ebx #endif /* VM86 */ movl %ebx, TSS_ESP0(%eax) #ifdef VM86 btrl %esi, _private_tss je 3f movl $_common_tssd, %edi 2: /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: #endif /* VM86 */ /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, lapic_tpr #else andl $~APIC_TPR_PRIO, lapic_tpr #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ movl _cpuid,%eax movb %al, P_ONCPU(%ecx) #endif /* SMP */ movl %edx, _curpcb movl %ecx, _curproc /* into next process */ #ifdef SMP movl _cpu_lockid, %eax orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl _currentldt,%eax je 2f lldt __default_ldt movl %eax,_currentldt jmp 2f 1: pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_fs cpu_switch_load_fs: movl PCB_FS(%edx),%fs .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs sti ret CROSSJUMPTARGET(idqr) CROSSJUMPTARGET(nortqr) CROSSJUMPTARGET(sw1a) #ifdef DIAGNOSTIC badsw1: pushl $sw0_1 call _panic sw0_1: .asciz "cpu_switch: has wchan" badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" #endif #if defined(SMP) && defined(DIAGNOSTIC) badsw4: pushl $sw0_4 call _panic sw0_4: .asciz "cpu_switch: do not have lock" #endif /* SMP && DIAGNOSTIC */ /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #if NNPX > 0 /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
*/ movl _npxproc,%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* NNPX > 0 */ 1: ret Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c (revision 31708) +++ head/sys/amd64/amd64/machdep.c (revision 31709) @@ -1,1798 +1,1806 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.277 1997/12/04 14:35:39 jkh Exp $ + * $Id: machdep.c,v 1.278 1997/12/04 21:21:24 jmg Exp $ */ #include "apm.h" #include "npx.h" #include "opt_bounce.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_maxmem.h" #include "opt_perfmon.h" #include "opt_smp.h" #include "opt_sysvipc.h" #include "opt_userconfig.h" #include "opt_vm86.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if NAPM > 0 #include #endif #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #ifdef SMP #include #endif #ifdef PERFMON #include #endif #include #include #include #include extern void init386 __P((int first)); extern int ptrace_set_pc __P((struct proc *p, unsigned int addr)); extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); #ifdef BOUNCE_BUFFERS #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif #endif /* BOUNCE_BUFFERS */ int msgbufmapped = 0; /* set when safe to use msgbuf */ int _udatasel, _ucodesel; u_int atdevbase; + +#if defined(SWTCH_OPTIM_STATS) +extern int swtch_optim_stats; +SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, + CTLFLAG_RD, &swtch_optim_stats, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, + CTLFLAG_RD, &tlb_flush_count, 0, ""); +#endif int physmem = 0; int cold = 1; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); int bootverbose = 0, Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static void setup_netisrs __P((struct linker_set *)); /* XXX declare elsewhere */ static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; extern struct linker_set netisr_set; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. 
*/ printf(version); earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %d (%dK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Quickly wire in netisrs. */ setup_netisrs(&netisr_set); /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 8, 2048); } nswbuf = max(min(nbuf/4, 128), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifdef BOUNCE_BUFFERS /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) { bouncepages = 64; bouncepages += ((Maxmem - 4096) / 2048) * 32; if (bouncepages > 128) bouncepages = 128; } v = (caddr_t)((vm_offset_t)round_page(v)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); #ifdef BOUNCE_BUFFERS clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + maxbkva + pager_map_size, TRUE); io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); #else clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size, TRUE); #endif buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size, TRUE); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*ARG_MAX), TRUE); u_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (maxproc*UPAGES*PAGE_SIZE), FALSE); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ { vm_offset_t mb_map_size; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size, FALSE); mb_map->system_map = 1; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) #if defined(USERCONFIG_BOOT) if (1) { #else if (boothowto & RB_CONFIG) { #endif userconfig(); cninit(); /* the preferred console may have changed */ } #endif #ifdef BOUNCE_BUFFERS /* * init bounce buffers */ vm_bounce_init(); #endif printf("avail memory = %d (%dK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! 
*/ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } static void setup_netisrs(ls) struct linker_set *ls; { int i; const struct netisrtab *nit; for(i = 0; ls->ls_items[i]; i++) { nit = (const struct netisrtab *)ls->ls_items[i]; register_netisr(nit->nit_num, nit->nit_isr); } } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; u_long code; { register struct proc *p = curproc; register struct trapframe *regs; register struct sigframe *fp; struct sigframe sf; struct sigacts *psp = p->p_sigacts; int oonstack; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SS_ONSTACK; /* * Allocate and validate space for the signal handler context. */ if ((psp->ps_flags & SAS_ALTSTACK) && !oonstack && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_sp + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SS_ONSTACK; } else { fp = (struct sigframe *)regs->tf_esp - 1; } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof(struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) { if (sig < p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[sig]; else sig = p->p_sysent->sv_sigsize + 1; } sf.sf_signum = sig; sf.sf_code = code; sf.sf_scp = &fp->sf_sc; sf.sf_addr = (char *) regs->tf_err; sf.sf_handler = catcher; /* save scratch registers */ sf.sf_sc.sc_eax = regs->tf_eax; sf.sf_sc.sc_ebx = regs->tf_ebx; sf.sf_sc.sc_ecx = regs->tf_ecx; sf.sf_sc.sc_edx = regs->tf_edx; sf.sf_sc.sc_esi = regs->tf_esi; sf.sf_sc.sc_edi = regs->tf_edi; sf.sf_sc.sc_cs = regs->tf_cs; sf.sf_sc.sc_ds = regs->tf_ds; sf.sf_sc.sc_ss = regs->tf_ss; sf.sf_sc.sc_es = regs->tf_es; sf.sf_sc.sc_isp = regs->tf_isp; /* * Build the signal context to be used by sigreturn. */ sf.sf_sc.sc_onstack = oonstack; sf.sf_sc.sc_mask = mask; sf.sf_sc.sc_sp = regs->tf_esp; sf.sf_sc.sc_fp = regs->tf_ebp; sf.sf_sc.sc_pc = regs->tf_eip; sf.sf_sc.sc_ps = regs->tf_eflags; sf.sf_sc.sc_trapno = regs->tf_trapno; sf.sf_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. 
*/ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_sc.sc_gs = tf->tf_vm86_gs; sf.sf_sc.sc_fs = tf->tf_vm86_fs; sf.sf_sc.sc_es = tf->tf_vm86_es; sf.sf_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. */ tf->tf_eflags &= ~(PSL_VM | PSL_T | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = (int)(((char *)PS_STRINGS) - *(p->p_sysent->sv_szsigcode)); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int sigreturn(p, uap) struct proc *p; struct sigreturn_args /* { struct sigcontext *sigcntxp; } */ *uap; { register struct sigcontext *scp; register struct sigframe *fp; register struct trapframe *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs->tf_esp points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), B_WRITE) == 0) return(EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* go back to user mode if both flags are set */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); #define VM_USERCHANGE (PSL_USERCHANGE | PSL_RF) #define VME_USERCHANGE (VM_USERCHANGE | PSL_VIP | PSL_VIF) if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. 
* Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { #ifdef DEBUG printf("sigreturn: eflags = 0x%x\n", eflags); #endif return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(scp->sc_cs)) { #ifdef DEBUG printf("sigreturn: cs = 0x%x\n", scp->sc_cs); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; } /* restore scratch registers */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), B_WRITE) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SS_ONSTACK; p->p_sigmask = scp->sc_mask & ~sigcantmask; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; return(EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Turn the power off. */ void cpu_power_down(void) { #if NAPM > 0 apm_power_off(); #endif } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { struct trapframe *regs = p->p_md.md_regs; #ifdef USER_LDT struct pcb *pcb = &p->p_addr->u_pcb; /* was i386_user_cleanup() in NetBSD */ if (pcb->pcb_ldt) { if (pcb == curpcb) lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ldt, pcb->pcb_ldt_len * sizeof(union descriptor)); pcb->pcb_ldt_len = (int)pcb->pcb_ldt = 0; } #endif bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_cs = _ucodesel; /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #if NNPX > 0 /* Initialize the npx (if any) for the current process. 
*/ npxinit(__INITIAL_NPXCW__); #endif } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int currentldt; int _default_ldt; #ifdef SMP union descriptor gdt[NGDT + NCPU]; /* global descriptor table */ #else union descriptor gdt[NGDT]; /* global descriptor table */ #endif struct gate_descriptor idt[NIDT]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif #ifdef SMP extern struct i386tss common_tss; /* One tss per cpu */ #ifdef VM86 extern struct segment_descriptor common_tssd; extern int private_tss; extern u_int my_tr; #endif /* VM86 */ #else struct i386tss common_tss; #ifdef VM86 struct segment_descriptor common_tssd; u_int private_tss; /* flag indicating private tss */ u_int my_tr; /* which task register setting */ #endif /* VM86 */ #endif #if defined(I586_CPU) && !defined(NO_F00F_HACK) struct gate_descriptor *t_idt; extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[ #ifdef SMP NGDT + NCPU #endif ] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 3 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 4 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 5 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* 
length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { (int) &common_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 7 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 8 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 9 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 10 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void 
setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void init386(first) int first; { int x; unsigned biosbasemem, biosextmem; struct gate_descriptor *gdp; int gsel_tss; struct isa_device *idp; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int pagesinbase, pagesinext; int target_page, pa_indx; int off; int speculative_mprobe; /* * Prevent lowering of the ipl if we call tsleep() early. */ safepri = cpl; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; /* * Initialize the console before we print anything out. */ cninit(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef BDE_DEBUGGER #define NGDT1 8 /* avoid overwriting db entries with APM ones */ #else #define NGDT1 (sizeof gdt_segs / sizeof gdt_segs[0]) #endif for (x = 0; x < NGDT1; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); #ifdef VM86 common_tssd = gdt[GPROC0_SEL].sd; #endif /* VM86 */ #ifdef SMP /* * Spin these up now. init_secondary() grabs them. We could use * #for(x,y,z) / #endfor cpp directives if they existed. */ for (x = 0; x < NCPU; x++) { gdt_segs[NGDT + x] = gdt_segs[GPROC0_SEL]; ssdtosd(&gdt_segs[NGDT + x], &gdt[NGDT + x].sd); } #endif /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); r_gdt.rd_limit = sizeof(gdt) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); r_idt.rd_limit = sizeof(idt) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); currentldt = _default_ldt; #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* Use BIOS values stored in RTC CMOS RAM, since probing * breaks certain 386 AT relics. */ biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8); biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8); /* * If BIOS tells us that it has more than 640k in the basemem, * don't believe it - set it to 640k. */ if (biosbasemem > 640) { printf("Preposterous RTC basemem of %dK, truncating to 640K\n", biosbasemem); biosbasemem = 640; } if (bootinfo.bi_memsizes_valid && bootinfo.bi_basemem > 640) { printf("Preposterous BIOS basemem of %dK, truncating to 640K\n", bootinfo.bi_basemem); bootinfo.bi_basemem = 640; } /* * Warn if the official BIOS interface disagrees with the RTC * interface used above about the amount of base memory or the * amount of extended memory. Prefer the BIOS value for the base * memory. This is necessary for machines that `steal' base * memory for use as BIOS memory, at least if we are going to use * the BIOS for apm. Prefer the RTC value for extended memory. 
* Eventually the hackish interface shouldn't even be looked at. */ if (bootinfo.bi_memsizes_valid) { if (bootinfo.bi_basemem != biosbasemem) { vm_offset_t pa; printf( "BIOS basemem (%ldK) != RTC basemem (%dK), setting to BIOS value\n", bootinfo.bi_basemem, biosbasemem); biosbasemem = bootinfo.bi_basemem; /* * XXX if biosbasemem is now < 640, there is `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from 0 to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(biosbasemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { unsigned *pte; pte = (unsigned *)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } } if (bootinfo.bi_extmem != biosextmem) printf("BIOS extmem (%ldK) != RTC extmem (%dK)\n", bootinfo.bi_extmem, biosextmem); } #ifdef SMP /* make hole for AP bootstrap code */ pagesinbase = mp_bootaddress(biosbasemem) / PAGE_SIZE; #else pagesinbase = biosbasemem * 1024 / PAGE_SIZE; #endif pagesinext = biosextmem * 1024 / PAGE_SIZE; /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. */ /* * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((pagesinext > 3840) && (pagesinext < 4096)) pagesinext = 3840; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = pagesinext + 0x100000/PAGE_SIZE; /* * Indicate that we wish to do a speculative search for memory beyond * the end of the reported size if the indicated amount is 64MB (0x4000 * pages) - which is the largest amount that the BIOS/bootblocks can * currently report. If a specific amount of memory is indicated via * the MAXMEM option or the npx0 "msize", then don't do the speculative * memory probe. */ if (Maxmem >= 0x4000) speculative_mprobe = TRUE; else speculative_mprobe = FALSE; #ifdef MAXMEM Maxmem = MAXMEM/4; speculative_mprobe = FALSE; #endif #if NNPX > 0 idp = find_isadev(isa_devtab_null, &npxdriver, 0); if (idp != NULL && idp->id_msize != 0) { Maxmem = idp->id_msize / 4; speculative_mprobe = FALSE; } #endif #ifdef SMP /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap (first, 0); /* * Size up each available chunk of physical memory. */ /* * We currently don't bother testing base memory. * XXX ...but we probably should. 
*/ pa_indx = 0; if (pagesinbase > 1) { phys_avail[pa_indx++] = PAGE_SIZE; /* skip first page of memory */ phys_avail[pa_indx] = ptoa(pagesinbase);/* memory up to the ISA hole */ physmem = pagesinbase - 1; } else { /* point at first chunk end */ pa_indx++; } for (target_page = avail_start; target_page < ptoa(Maxmem); target_page += PAGE_SIZE) { int tmp, page_bad; page_bad = FALSE; /* * map page into kernel: valid, read/write, non-cacheable */ *(int *)CMAP1 = PG_V | PG_RW | PG_N | target_page; invltlb(); tmp = *(int *)CADDR1; /* * Test for alternating 1's and 0's */ *(volatile int *)CADDR1 = 0xaaaaaaaa; if (*(volatile int *)CADDR1 != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)CADDR1 = 0x55555555; if (*(volatile int *)CADDR1 != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)CADDR1 = 0xffffffff; if (*(volatile int *)CADDR1 != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)CADDR1 = 0x0; if (*(volatile int *)CADDR1 != 0x0) { /* * test of page failed */ page_bad = TRUE; } /* * Restore original value. */ *(int *)CADDR1 = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == FALSE) { /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == target_page) { phys_avail[pa_indx] += PAGE_SIZE; if (speculative_mprobe == TRUE && phys_avail[pa_indx] >= (64*1024*1024)) Maxmem++; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = target_page; /* start */ phys_avail[pa_indx] = target_page + PAGE_SIZE; /* end */ } physmem++; } } *(int *)CMAP1 = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(sizeof(struct msgbuf)) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(sizeof(struct msgbuf)); avail_end = phys_avail[pa_indx]; /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. */ for (off = 0; off < round_page(sizeof(struct msgbuf)); off += PAGE_SIZE) pmap_enter(kernel_pmap, (vm_offset_t)msgbufp + off, avail_end + off, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ #ifdef VM86 common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; #else common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE; #endif /* VM86 */ common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; common_tss.tss_ioopt = (sizeof common_tss) << 16; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); #ifdef VM86 private_tss = 0; my_tr = GPROC0_SEL; #endif dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_fs = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; proc0.p_addr->u_pcb.pcb_mpnest = 1; proc0.p_addr->u_pcb.pcb_ext = 0; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) void f00f_hack(void); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); void f00f_hack(void) { struct region_descriptor r_idt; unsigned char *tmp; int i; if (!has_f00f_bug) return; printf("Intel Pentium F00F detected, installing workaround\n"); r_idt.rd_limit = sizeof(idt) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ t_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, t_idt, sizeof(idt)); r_idt.rd_base = (int)t_idt; lidt(&r_idt); if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned int addr; { p->p_md.md_regs->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; int data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. 
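An illustrative sketch of the F00F layout arithmetic above: with 8-byte descriptors and the table starting 7*8 bytes before the end of the first of two pages, vectors 0-6 land on the page that is then made read-only. The constants assume i386 values; this is only a demonstration of the offset math:

#include <stdio.h>

#define PAGE_SIZE 4096  /* assumption: i386 page size */
#define DESC_SIZE 8     /* assumption: size of one gate descriptor */

int
main(void)
{
        unsigned long base = PAGE_SIZE - 7 * DESC_SIZE; /* offset of idt[0] */
        int vec;

        for (vec = 0; vec < 10; vec++) {
                unsigned long off = base + vec * DESC_SIZE;
                printf("vector %d -> page %lu\n", vec, off / PAGE_SIZE);
        }
        return (0);
}

Running this shows vectors 0 through 6 on page 0 and vector 7 onward spilling to page 1, which is why protecting only the first page blocks the locked-CMPXCHG8B lookup without breaking normal interrupt delivery.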
*/ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_fs = pcb->pcb_fs; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_fs = regs->r_fs; pcb->pcb_gs = regs->r_gs; return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. 
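A sketch of the clamping policy in bounds_check_with_label() above, using plain integers instead of struct buf; the names are hypothetical and 512-byte sectors are assumed:

#include <stdio.h>

#define DEV_BSHIFT 9    /* assumption: 512-byte device blocks */

/*
 * Returns 1 = do the transfer (count may have been truncated),
 * 0 = EOF exactly at the end of the partition, -1 = error.
 */
static int
clamp_transfer(long blkno, long *countp /* bytes */, long part_size)
{
        long sz = (*countp + (1 << DEV_BSHIFT) - 1) >> DEV_BSHIFT;

        if (blkno < 0 || blkno + sz > part_size) {
                if (blkno == part_size)
                        return (0);             /* exactly at end: EOF */
                sz = part_size - blkno;
                if (sz <= 0)
                        return (-1);            /* entirely outside */
                *countp = sz << DEV_BSHIFT;     /* truncate to fit */
        }
        return (1);
}

int
main(void)
{
        long count = 4096;                              /* 8 sectors */
        int r = clamp_transfer(1022, &count, 1024);     /* 1024-sector part. */

        printf("r=%d count=%ld\n", r, count);           /* truncated to 1024 bytes */
        return (0);
}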
* * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 31708) +++ head/sys/amd64/amd64/pmap.c (revision 31709) @@ -1,3374 +1,3399 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.172 1997/11/07 19:58:34 tegge Exp $ + * $Id: pmap.c,v 1.173 1997/11/20 19:30:31 bde Exp $ */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. 
These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define PTPHINT /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; #define pa_index(pa) atop((pa) - vm_first_phys) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) static struct pmap kernel_pmap_store; pmap_t kernel_pmap; extern pd_entry_t my_idlePTD; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ static vm_offset_t vm_first_phys; int pgeflag; /* PG_G or-in */ int pseflag; /* PG_PS or-in */ int pv_npg; int nkpt; vm_offset_t kernel_vm_end; /* * Data for the pv entry allocation mechanism */ vm_zone_t pvzone; struct vm_zone pvzone_store; struct vm_object pvzone_obj; int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; int pmap_pagedaemon_waken = 0; struct pv_entry *pvinit; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *ptmmap; static pv_table_t *pv_table; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp=0; #ifdef SMP extern char prv_CPAGE1[], prv_CPAGE2[], prv_CPAGE3[]; extern pt_entry_t *prv_CMAP1, *prv_CMAP2, *prv_CMAP3; extern pd_entry_t *IdlePTDS[]; extern pt_entry_t SMP_prvpt[]; #endif pt_entry_t *PMAP1 = 0; unsigned *PADDR1 = 0; static PMAP_INLINE void free_pv_entry __P((pv_entry_t pv)); static unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static PMAP_INLINE int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_page_t mpte)); static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv, vm_offset_t va)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa)); static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex)); static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); static vm_page_t pmap_page_alloc __P((vm_object_t object, vm_pindex_t pindex)); static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex)); static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); vm_offset_t pmap_kmem_choose(vm_offset_t addr) ; void pmap_collect(void); #define PDSTACKMAX 6 static vm_offset_t pdstack[PDSTACKMAX]; static int pdstackptr; unsigned pdir4mb; /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. */ PMAP_INLINE unsigned * pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned *pdeaddr; if (pmap) { pdeaddr = (unsigned *) pmap_pde(pmap, va); if (*pdeaddr & PG_PS) return pdeaddr; if (*pdeaddr) { return get_ptbase(pmap) + i386_btop(va); } } return (0); } /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) { newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); } #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. 
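A small sketch of the rounding performed by pmap_kmem_choose() above: the kernel virtual free pointer is aligned up to the next 4MB (NBPDR) boundary so the kernel text/data/bss can be covered by a single PSE page. The helper name is hypothetical:

#include <stdio.h>

#define NBPDR (1UL << 22)       /* bytes mapped by one PDE with PSE (4MB) */

static unsigned long
round_to_pde(unsigned long addr)
{
        return ((addr + (NBPDR - 1)) & ~(NBPDR - 1));
}

int
main(void)
{
        printf("0x%lx -> 0x%lx\n", 0xc0321000UL, round_to_pde(0xc0321000UL));
        return (0);
}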
* [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; int i, j; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); kernel_pmap->pm_count = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(sizeof(struct msgbuf)))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = 0; *(int *) PTD = 0; pgeflag = 0; #if !defined(SMP) if (cpu_feature & CPUID_PGE) { pgeflag = PG_G; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. */ pdir4mb = 0; #if !defined(DISABLE_PSE) if (cpu_feature & CPUID_PSE) { unsigned ptditmp; /* * Enable the PSE mode */ load_cr4(rcr4() | CR4_PSE); /* * Note that we have enabled PSE mode */ pseflag = PG_PS; ptditmp = (unsigned) kernel_pmap->pm_pdir[KPTDI]; ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; /* * We can do the mapping here for the single processor * case. We simply ignore the old page table page from * now on. */ #if !defined(SMP) PTD[KPTDI] = (pd_entry_t) ptditmp; kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; invltlb(); #endif } #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic!"); /* 0 = private page */ /* 1 = page table page */ /* 2 = local apic */ /* 16-31 = io apics */ SMP_prvpt[2] = (pt_entry_t)(PG_V | PG_RW | pgeflag | ((u_long)cpu_apic_address & PG_FRAME)); for (i = 0; i < mp_napics; i++) { for (j = 0; j < 16; j++) { /* same page frame as a previous IO apic? 
*/ if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) == ((u_long)io_apic_address[0] & PG_FRAME)) { ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } /* use this slot if available */ if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) == 0) { SMP_prvpt[j + 16] = (pt_entry_t)(PG_V | PG_RW | pgeflag | ((u_long)io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } } if (j == 16) panic("no space to map IO apic %d!", i); } /* BSP does this itself, AP's get it pre-set */ prv_CMAP1 = (pt_entry_t *)&SMP_prvpt[3 + UPAGES]; prv_CMAP2 = (pt_entry_t *)&SMP_prvpt[4 + UPAGES]; prv_CMAP3 = (pt_entry_t *)&SMP_prvpt[5 + UPAGES]; #endif invltlb(); } /* * Set 4mb pdir for mp startup, and global flags */ void pmap_set_opt(unsigned *pdir) { int i; if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); if (pdir4mb) { (unsigned) pdir[KPTDI] = pdir4mb; } } if (pgeflag && (cpu_feature & CPUID_PGE)) { load_cr4(rcr4() | CR4_PGE); for(i = KPTDI; i < KPTDI + nkpt; i++) { if (pdir[i]) { pdir[i] |= PG_G; } } } } /* * Setup the PTD for the boot processor */ void pmap_set_opt_bsp(void) { pmap_set_opt((unsigned *)kernel_pmap->pm_pdir); pmap_set_opt((unsigned *)PTD); invltlb(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t s; int i; int initial_pvs; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); pv_npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ s = (vm_size_t) (sizeof(pv_table_t) * pv_npg); s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_table_t *) addr; for(i = 0; i < pv_npg; i++) { vm_offset_t pa; TAILQ_INIT(&pv_table[i].pv_list); pv_table[i].pv_list_count = 0; pa = vm_first_phys + i * PAGE_SIZE; pv_table[i].pv_vm_page = PHYS_TO_VM_PAGE(pa); } /* * init the pv free list */ initial_pvs = pv_npg; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = &pvzone_store; pvinit = (struct pv_entry *) kmem_alloc(kernel_map, initial_pvs * sizeof (struct pv_entry)); zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, pv_npg); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg; pv_entry_high_water = 9 * (pv_entry_max / 10); zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. 
* This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified( vm_offset_t va) { if ((va < clean_sva) || (va >= clean_eva)) return 1; else return 0; } static PMAP_INLINE void invltlb_1pg( vm_offset_t va) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va); } } static PMAP_INLINE void invltlb_2pg( vm_offset_t va1, vm_offset_t va2) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va1); invlpg(va2); } } static unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); invltlb(); } return (unsigned *) APTmap; } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. */ static unsigned * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned pde, newpf; if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; unsigned index = i386_btop(va); /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == (((unsigned) PTDpde) & PG_FRAME))) { return (unsigned *) PTmap + index; } newpf = pde & PG_FRAME; if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; invltlb_1pg((vm_offset_t) PADDR1); } return PADDR1 + ((unsigned) index & (NPTEPG - 1)); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t rtval; vm_offset_t pdirindex; pdirindex = va >> PDRSHIFT; if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) { unsigned *pte; if ((rtval & PG_PS) != 0) { rtval &= ~(NBPDR - 1); rtval |= va & (NBPDR - 1); return rtval; } pte = get_ptbase(pmap) + i386_btop(va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /* * determine if a page is managed (memory vs. device) */ static PMAP_INLINE int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa < phys_avail[i + 1] && pa >= phys_avail[i]) return 1; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. 
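An illustrative sketch of the address composition done by pmap_extract() above: a 4MB (PG_PS) directory entry supplies the upper bits and the low 22 bits come from the virtual address, while a 4K PTE supplies a frame and the low 12 bits come from the VA. Constants are the usual i386 values; the helpers are hypothetical:

#include <stdio.h>

#define PAGE_MASK 0xfffUL       /* low 12 bits of a 4K page */
#define NBPDR     (1UL << 22)   /* bytes mapped by one PDE (4MB) */
#define PG_FRAME  0xfffff000UL  /* frame portion of a PTE */

static unsigned long
extract_4m(unsigned long pde, unsigned long va)
{
        return ((pde & ~(NBPDR - 1)) | (va & (NBPDR - 1)));
}

static unsigned long
extract_4k(unsigned long pte, unsigned long va)
{
        return ((pte & PG_FRAME) | (va & PAGE_MASK));
}

int
main(void)
{
        printf("4M: 0x%lx\n", extract_4m(0x00800083UL, 0x00923456UL));
        printf("4K: 0x%lx\n", extract_4k(0x00123067UL, 0xc0000abcUL));
        return (0);
}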
*/ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | pgeflag; unsigned opte; pte = (unsigned *)vtopte(tva); opte = *pte; *pte = npte; if (opte) invltlb_1pg(tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ PMAP_INLINE void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V | pgeflag; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; if (opte) invltlb_1pg(va); } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); } static vm_page_t pmap_page_alloc(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; m = vm_page_alloc(object, pindex, VM_ALLOC_ZERO); if (m == NULL) { VM_WAIT; } return m; } static vm_page_t pmap_page_lookup(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m) { if (m->flags & PG_BUSY) { m->flags |= PG_WANTED; tsleep(m, PVM, "pplookp", 0); goto retry; } } return m; } /* * Create the UPAGES for a new process. * This routine directly affects the fork perf for a process. */ void pmap_new_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; struct user *up; unsigned *ptek; /* * allocate object for the upages */ upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES); p->p_upages_obj = upobj; /* get a kernel virtual address for the UPAGES for this proc */ up = (struct user *) kmem_alloc_pageable(u_map, UPAGES * PAGE_SIZE); if (up == NULL) panic("pmap_new_proc: u_map allocation failed"); ptek = (unsigned *) vtopte((vm_offset_t) up); for(i=0;iwire_count++; ++cnt.v_wire_count; /* * Enter the page into the kernel address space. */ *(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; m->flags &= ~(PG_ZERO|PG_BUSY); m->flags |= PG_MAPPED|PG_WRITEABLE; m->valid = VM_PAGE_BITS_ALL; } p->p_addr = up; } /* * Dispose the UPAGES for a process that has exited. * This routine directly impacts the exit perf of a process. */ void pmap_dispose_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; unsigned *ptek; ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr); upobj = p->p_upages_obj; for(i=0;ip_addr + i * PAGE_SIZE); vm_page_unwire(m); vm_page_free(m); } vm_object_deallocate(upobj); kmem_free(u_map, (vm_offset_t)p->p_addr, ctob(UPAGES)); } /* * Allow the UPAGES for a process to be prejudicially paged out. */ void pmap_swapout_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; /* * let the upages be paged */ for(i=0;idirty = VM_PAGE_BITS_ALL; vm_page_unwire(m); vm_page_deactivate(m); pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i); } } /* * Bring the UPAGES for a specified process back in. 
*/ void pmap_swapin_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; for(i=0;iflags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "swinuw",0); goto retry; } m->flags |= PG_BUSY; } vm_page_wire(m); splx(s); pmap_kenter(((vm_offset_t) p->p_addr) + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { int rv; rv = vm_pager_get_pages(upobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid); m->valid = VM_PAGE_BITS_ALL; } PAGE_WAKEUP(m); m->flags |= PG_MAPPED|PG_WRITEABLE; } } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { int s; if (m->flags & PG_BUSY) { s = splvm(); while (m->flags & PG_BUSY) { m->flags |= PG_WANTED; tsleep(m, PVM, "pmuwpt", 0); } splx(s); } if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); invltlb_1pg(pteva); } #if defined(PTPHINT) if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; #endif /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { if (m->flags & PG_WANTED) { m->flags &= ~PG_WANTED; wakeup(m); } vm_page_free_zero(m); --cnt.v_wire_count; } return 1; } return 0; } __inline static int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap, va, mpte) pmap_t pmap; vm_offset_t va; vm_page_t mpte; { unsigned ptepindex; if (va >= UPT_MIN_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } #else mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif } return pmap_unwire_pte_hold(pmap, mpte); } +#if !defined(SMP) +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + pmap->pm_pdir = + (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); + pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); + pmap->pm_flags = 0; + pmap->pm_count = 1; + pmap->pm_ptphint = NULL; + TAILQ_INIT(&pmap->pm_pvlist); +} +#else +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + pmap_pinit(pmap); +} +#endif + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
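A sketch of the hold/wire accounting behind pmap_unwire_pte_hold()/_pmap_unwire_pte_hold() above: releasing the last hold on a page-table page also drops its wire reference, and the page is freed when the wire count reaches zero. The struct and function here are simplified stand-ins, not the kernel's vm_page:

#include <stdio.h>

struct ptpage {
        int hold_count;         /* transient references from mappings */
        int wire_count;         /* keeps the page resident */
};

static int
unwire_pte_hold(struct ptpage *m)
{
        if (--m->hold_count > 0)
                return (0);
        /* last hold gone: the PDE for this page would be cleared here */
        if (--m->wire_count == 0) {
                printf("page table page freed\n");
                return (1);
        }
        return (0);
}

int
main(void)
{
        struct ptpage m = { 2, 1 };

        unwire_pte_hold(&m);    /* one hold left, nothing happens */
        unwire_pte_hold(&m);    /* last hold: wire dropped, page freed */
        return (0);
}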
*/ if (pdstackptr > 0) { --pdstackptr; pmap->pm_pdir = (pd_entry_t *)pdstack[pdstackptr]; } else { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); } /* * allocate object for the ptes */ pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ retry: ptdpg = pmap_page_alloc( pmap->pm_pteobj, PTDPTDI); if (ptdpg == NULL) goto retry; ptdpg->wire_count = 1; ++cnt.v_wire_count; ptdpg->flags &= ~(PG_MAPPED|PG_BUSY); /* not mapped normally */ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); /* wire in kernel global address entries */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW; pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { int s; unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ s = splvm(); if (p->flags & PG_BUSY) { p->flags |= PG_WANTED; tsleep(p, PVM, "pmaprl", 0); splx(s); return 0; } if (p->flags & PG_WANTED) { p->flags &= ~PG_WANTED; wakeup(p); } /* * Remove the page table page from the processes address space. */ pde[p->pindex] = 0; --pmap->pm_stats.resident_count; if (p->hold_count) { panic("pmap_release: freeing held page table page"); } /* * Page directory pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); #ifdef SMP pde[MPPTDI] = 0; #endif pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) pmap->pm_ptphint = NULL; #endif vm_page_free_zero(p); splx(s); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_offset_t pteva, ptepa; vm_page_t m; int needszero = 0; /* * Find or fabricate a new pagetable page */ retry: m = vm_page_lookup(pmap->pm_pteobj, ptepindex); if (m == NULL) { m = pmap_page_alloc(pmap->pm_pteobj, ptepindex); if (m == NULL) goto retry; if ((m->flags & PG_ZERO) == 0) needszero = 1; m->flags &= ~(PG_ZERO|PG_BUSY); m->valid = VM_PAGE_BITS_ALL; } else { if ((m->flags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "ptewai", 0); goto retry; } } if (m->queue != PQ_NONE) { int s = splvm(); vm_page_unqueue(m); splx(s); } if (m->wire_count == 0) ++cnt.v_wire_count; ++m->wire_count; /* * Increment the hold count for the page table page * (denoting a new mapping.) */ ++m->hold_count; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V); #if defined(PTPHINT) /* * Set the page table hint */ pmap->pm_ptphint = m; #endif /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. 
*/ if (needszero) { if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(ptepa); } } m->valid = VM_PAGE_BITS_ALL; m->flags |= PG_MAPPED; return m; } static vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { unsigned ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; invltlb(); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { #if defined(PTPHINT) /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = m; } #else m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif ++m->hold_count; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif ptdpg = NULL; retry: for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } if (!pmap_release_free_page(pmap, p)) goto retry; } if (ptdpg && !pmap_release_free_page(pmap, ptdpg)) goto retry; vm_object_deallocate(object); if (pdstackptr < PDSTACKMAX) { pdstack[pdstackptr] = (vm_offset_t) pmap->pm_pdir; ++pdstackptr; } else { int pdstmp = pdstackptr - 1; kmem_free(kernel_map, pdstack[pdstmp], PAGE_SIZE); pdstack[pdstmp] = (vm_offset_t) pmap->pm_pdir; } pmap->pm_pdir = 0; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; vm_offset_t ptpkva, ptppaddr; vm_page_t nkpg; #ifdef SMP int i; #endif pd_entry_t newpdir; vm_pindex_t ptpidx; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; ptpkva = (vm_offset_t) vtopte(addr); ptpidx = (ptpkva >> PAGE_SHIFT); /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(kernel_object, ptpidx, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(ptppaddr); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW); pdir_pde(PTD, kernel_vm_end) = newpdir; #ifdef 
SMP for (i = 0; i < mp_ncpus; i++) { if (IdlePTDS[i]) pdir_pde(IdlePTDS[i], kernel_vm_end) = newpdir; } #endif for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = newpdir; } } *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); panic("destroying a pmap is not yet implemented"); /* free((caddr_t) pmap, M_VMPMAP); */ } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static inline void free_pv_entry(pv) pv_entry_t pv; { pv_entry_count--; zfreei(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return zalloci(pvzone); } /* * This routine is very drastic, but can save the system * in a pinch. */ void pmap_collect() { pv_table_t *ppv; int i; vm_offset_t pa; vm_page_t m; static int warningdone=0; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } for(i = 0; i < pv_npg; i++) { if ((ppv = &pv_table[i]) == 0) continue; m = ppv->pv_vm_page; if ((pa = VM_PAGE_TO_PHYS(m)) == 0) continue; if (m->wire_count || m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; pmap_remove_all(pa); } pmap_pagedaemon_waken = 0; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap, ppv, va) struct pmap *pmap; pv_table_t *ppv; vm_offset_t va; { pv_entry_t pv; int rtval; int s; s = splvm(); if (ppv->pv_list_count < pmap->pm_stats.resident_count) { for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = TAILQ_NEXT(pv, pv_plist)) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); --ppv->pv_list_count; if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); } TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). 
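A sketch of the high-water check in get_pv_entry() above; the limit itself is set to 90% of pv_entry_max in pmap_init2() earlier. The sizes passed in main() are hypothetical, and the wakeup is reduced to a printf:

#include <stdio.h>

static int pv_entry_count, pv_entry_max, pv_entry_high_water;
static int pagedaemon_waken;

static void
init2_sketch(int shpgperproc, int maxproc, int pv_npg)
{
        pv_entry_max = shpgperproc * maxproc + pv_npg;
        pv_entry_high_water = 9 * (pv_entry_max / 10);
}

static void
get_pv_entry_sketch(void)
{
        pv_entry_count++;
        if (pv_entry_high_water &&
            pv_entry_count > pv_entry_high_water && pagedaemon_waken == 0) {
                pagedaemon_waken = 1;   /* would wakeup() the pagedaemon once */
                printf("high water (%d) crossed at %d entries\n",
                    pv_entry_high_water, pv_entry_count);
        }
}

int
main(void)
{
        int i;

        init2_sketch(200, 64, 4096);    /* hypothetical sizes */
        for (i = 0; i < pv_entry_max; i++)
                get_pv_entry_sketch();
        return (0);
}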
*/ static void pmap_insert_entry(pmap, va, mpte, pa) pmap_t pmap; vm_offset_t va; vm_page_t mpte; vm_offset_t pa; { int s; pv_entry_t pv; pv_table_t *ppv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); ppv = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); ++ppv->pv_list_count; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; pv_table_t *ppv; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) invlpg(va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { ppv = pa_to_pvh(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf("pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, (int) oldpte); } #endif if (pmap_track_modified(va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! */ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); invltlb_1pg(va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (((sva + PAGE_SIZE) == eva) && (((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; /* * Get a local virtual address for the mappings that are being * worked with. */ ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); if (pmap->pm_stats.resident_count == 0) break; pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
*/ if (pdnxt > eindex) { pdnxt = eindex; } for ( ;sindex != pdnxt; sindex++) { vm_offset_t va; if (ptbase[sindex] == 0) { continue; } va = i386_ptob(sindex); anyvalid++; if (pmap_remove_pte(pmap, ptbase + sindex, va)) break; } } if (anyvalid) { invltlb(); } } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte, tpte; int nmodify; int update_needed; int s; nmodify = 0; update_needed = 0; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_is_managed(pa)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", pa); } #endif s = splvm(); ppv = pa_to_pvh(pa); while ((pv = TAILQ_FIRST(&ppv->pv_list)) != NULL) { pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); pv->pv_pmap->pm_stats.resident_count--; tpte = *pte; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf("pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (!update_needed && ((!curproc || (&curproc->p_vmspace->vm_pmap == pv->pv_pmap)) || (pv->pv_pmap == kernel_pmap))) { update_needed = 1; } TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); --ppv->pv_list_count; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); if (update_needed) invltlb(); splx(s); return; } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } anychanged = 0; ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { (unsigned) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. 
*/ if (ptpaddr == 0) continue; if (pdnxt > eindex) { pdnxt = eindex; } for (; sindex != pdnxt; sindex++) { unsigned pbits = ptbase[sindex]; if (prot & VM_PROT_WRITE) { if ((pbits & (PG_RW|PG_V)) == PG_V) { if (pbits & PG_MANAGED) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->flags |= PG_WRITEABLE; m->object->flags |= OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY; } ptbase[sindex] = pbits | PG_RW; anychanged = 1; } } else if (pbits & PG_RW) { if (pbits & PG_M) { vm_offset_t sva1 = i386_ptob(sindex); if ((pbits & PG_MANAGED) && pmap_track_modified(sva1)) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->dirty = VM_PAGE_BITS_ALL; } } ptbase[sindex] = pbits & ~(PG_M|PG_RW); anychanged = 1; } } } if (anychanged) invltlb(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_prot_t prot, boolean_t wired) { register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va); if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } if (smp_active) { pdeaddr = (vm_offset_t *) IdlePTDS[cpuid]; if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) { if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr)) printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr); printf("cpuid: %d, pdeaddr: 0x%x\n", cpuid, pdeaddr); panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], newpte, origpte, va); } } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); } origpte = *(vm_offset_t *)pte; pa &= PG_FRAME; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf("pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, origpte); } #endif /* * We might be turning off write access to the page, * so we go ahead and sense modify status. 
*/ if (origpte & PG_MANAGED) { vm_page_t m; if (origpte & PG_M) { if (pmap_track_modified(va)) { m = PHYS_TO_VM_PAGE(pa); m->dirty = VM_PAGE_BITS_ALL; } } pa |= PG_MANAGED; } if (mpte) --mpte->hold_count; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { pmap_insert_entry(pmap, va, mpte, pa); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte; if (origpte) invltlb_1pg(va); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap, va, pa, mpte) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_page_t mpte; { register unsigned *pte; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { unsigned ptepindex; vm_offset_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { ++mpte->hold_count; } else { retry: /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } #else mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif if (mpte == NULL) goto retry; ++mpte->hold_count; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); return 0; } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pmap_insert_entry(pmap, va, mpte, pa); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. 
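An illustrative sketch of the final test in pmap_enter() above: the PTE is rewritten (and the stale TLB entry flushed) only if something other than the hardware-maintained accessed/modified bits differs. The flag values are the i386 ones; the helper is hypothetical:

#include <stdio.h>

#define PG_V  0x001UL
#define PG_RW 0x002UL
#define PG_A  0x020UL
#define PG_M  0x040UL

static int
pte_needs_update(unsigned long origpte, unsigned long newpte)
{
        return ((origpte & ~(PG_M | PG_A)) != newpte);
}

int
main(void)
{
        unsigned long frame = 0x00123000UL;

        /* same mapping, only PG_A/PG_M set by hardware: no update (0) */
        printf("%d\n", pte_needs_update(frame | PG_V | PG_RW | PG_A | PG_M,
            frame | PG_V | PG_RW));
        /* write permission added: update required (1) */
        printf("%d\n", pte_needs_update(frame | PG_V,
            frame | PG_V | PG_RW));
        return (0);
}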
*/ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; int objpgs; if (!pmap) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0) ) { int i; int s; vm_page_t m[1]; unsigned int ptepindex; int npdes; vm_offset_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) return; s = splhigh(); retry: p = vm_page_lookup(object, pindex); if (p && (p->flags & PG_BUSY)) { tsleep(p, PVM, "init4p", 0); goto retry; } splx(s); if (p == NULL) { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { PAGE_WAKEUP(p); vm_page_free(p); return; } p = vm_page_lookup(object, pindex); PAGE_WAKEUP(p); } ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i=0;ipm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } p->flags |= PG_MAPPED; invltlb(); return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) psize = object->size - pindex; mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. 
*/ #define PFBAK 2 #define PFFOR 2 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry, object) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; vm_object_t object; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; if (entry->object.vm_object != object) return; if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) return; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } m->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); m->flags |= PG_MAPPED; PAGE_WAKEUP(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
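 * When it does copy, only managed PTEs are duplicated (with PG_M and PG_A * cleared in the copy), and 4MB (PG_PS) page directory entries are shared * directly into an empty destination slot.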
*/ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; unsigned src_frame, dst_frame; if (dst_addr != src_addr) return; src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) { return; } dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); invltlb(); } for(addr = src_addr; addr < end_addr; addr = pdnxt) { unsigned *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; vm_offset_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); ptepindex = addr >> PDRSHIFT; srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = (unsigned *) vtopte(addr); dst_pte = (unsigned *) avtopte(addr); while (addr < pdnxt) { unsigned ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_M|PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, (ptetemp & PG_FRAME)); } else { pmap_unwire_pte_hold(dst_pmap, dstmpte); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; ++src_pte; ++dst_pte; } } } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { #ifdef SMP if (*(int *) prv_CMAP3) panic("pmap_zero_page: prv_CMAP3 busy"); *(int *) prv_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME); invltlb_1pg((vm_offset_t) &prv_CPAGE3); bzero(&prv_CPAGE3, PAGE_SIZE); *(int *) prv_CMAP3 = 0; invltlb_1pg((vm_offset_t) &prv_CPAGE3); #else if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME); bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; invltlb_1pg((vm_offset_t) CADDR2); #endif } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
*/ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { #ifdef SMP if (*(int *) prv_CMAP1) panic("pmap_copy_page: prv_CMAP1 busy"); if (*(int *) prv_CMAP2) panic("pmap_copy_page: prv_CMAP2 busy"); *(int *) prv_CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) prv_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2); bcopy(&prv_CPAGE1, &prv_CPAGE2, PAGE_SIZE); *(int *) prv_CMAP1 = 0; *(int *) prv_CMAP2 = 0; invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2); #else if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *) CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; invltlb_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); #endif } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; int s; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { unsigned *pte, tpte; pv_table_t *ppv; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif s = splvm(); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = (unsigned *)vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } *pte = 0; ppv = pa_to_pvh(tpte); pv->pv_pmap->pm_stats.resident_count--; /* * Update the vm_page_t clean and reference bits. 
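 * A PTE with PG_M set means the page was written through this mapping, so the * whole page is marked dirty here before the pv entry is unlinked and freed.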
*/ if (tpte & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); --ppv->pv_list_count; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); invltlb(); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; if (!pmap_is_managed(pa)) return FALSE; ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) return FALSE; s = splvm(); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (bit & (PG_A|PG_M)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (*pte & bit) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte; int changed; int s; if (!pmap_is_managed(pa)) return; s = splvm(); changed = 0; ppv = pa_to_pvh(pa); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (setem) { *(int *)pte |= bit; changed = 1; } else { vm_offset_t pbits = *(vm_offset_t *)pte; if (pbits & bit) { changed = 1; if (bit == PG_RW) { if (pbits & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } *(int *)pte = pbits & ~(PG_M|PG_RW); } else { *(int *)pte = pbits & ~bit; } } } } splx(s); if (changed) invltlb(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_offset_t phys, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(phys, PG_RW, FALSE); } else { pmap_remove_all(phys); } } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. * */ int pmap_ts_referenced(vm_offset_t pa) { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; int rtval = 0; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { splx(s); return 0; } /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. 
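 * (For pmap_ts_referenced only mappings in ranges where modifications are * tracked are examined; each referenced mapping bumps the count and has its * PG_A bit cleared.)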
*/ if (!pmap_track_modified(pv->pv_va)) continue; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte == NULL) { continue; } if (*pte & PG_A) { rtval++; *pte &= ~PG_A; } } splx(s); if (rtval) { invltlb(); } return (rtval); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; unsigned *pte; size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } invltlb(); return ((void *) va); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if (pte = *ptep) { vm_offset_t pa; val = MINCORE_INCORE; pa = pte & PG_FRAME; /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (PHYS_TO_VM_PAGE(pa)->dirty || pmap_is_modified(pa)) val |= MINCORE_MODIFIED_OTHER; /* * Referenced by us */ if (pte & PG_U) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((PHYS_TO_VM_PAGE(pa)->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; PHYS_TO_VM_PAGE(pa)->flags |= PG_REFERENCED; } } return val; } void pmap_activate(struct proc *p) { +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif load_cr3(p->p_addr->u_pcb.pcb_cr3 = vtophys(p->p_vmspace->vm_pmap.pm_pdir)); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = 
&p->p_vmspace->vm_pmap; for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } return npte; } pte = pmap_pte_quick( pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { pv_table_t *ppv; register pv_entry_t pv; printf("pa %x", pa); ppv = pa_to_pvh(pa); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/amd64/amd64/support.S =================================================================== --- head/sys/amd64/amd64/support.S (revision 31708) +++ head/sys/amd64/amd64/support.S (revision 31709) @@ -1,1571 +1,1574 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.56 1997/08/09 00:02:44 dyson Exp $ + * $Id: support.s,v 1.57 1997/09/02 20:05:30 bde Exp $ */ #include "npx.h" #include #include #include #include #include "assym.s" #define KDSEL 0x10 /* kernel data selector */ #define KCSEL 0x8 /* kernel code selector */ #define IDXSHIFT 10 .data .globl _bcopy_vector _bcopy_vector: .long _generic_bcopy .globl _bzero _bzero: .long _generic_bzero .globl _copyin_vector _copyin_vector: .long _generic_copyin .globl _copyout_vector _copyout_vector: .long _generic_copyout .globl _ovbcopy_vector _ovbcopy_vector: .long _generic_bcopy #if defined(I586_CPU) && NNPX > 0 kernel_fpu_lock: .byte 0xfe .space 3 #endif .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(generic_bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret #if defined(I486_CPU) ENTRY(i486_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax /* * do 64 byte chunks first * * XXX this is probably over-unrolled at least for DX2's */ 2: cmpl $64,%ecx jb 3f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) movl %eax,16(%edx) movl %eax,20(%edx) movl %eax,24(%edx) movl %eax,28(%edx) movl %eax,32(%edx) movl %eax,36(%edx) movl %eax,40(%edx) movl %eax,44(%edx) movl %eax,48(%edx) movl %eax,52(%edx) movl %eax,56(%edx) movl %eax,60(%edx) addl $64,%edx subl $64,%ecx jnz 2b ret /* * do 16 byte chunks */ SUPERALIGN_TEXT 3: cmpl $16,%ecx jb 4f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) addl $16,%edx subl $16,%ecx jnz 3b ret /* * do 4 byte chunks */ SUPERALIGN_TEXT 4: cmpl $4,%ecx jb 5f movl %eax,(%edx) addl $4,%edx subl $4,%ecx jnz 4b ret /* * do 1 byte chunks * a jump table seems to be faster than a loop or more range reductions * * XXX need a const section for non-text */ .data jtab: .long do0 .long do1 .long do2 .long do3 .text SUPERALIGN_TEXT 5: jmp jtab(,%ecx,4) SUPERALIGN_TEXT do3: movw %ax,(%edx) movb %al,2(%edx) ret SUPERALIGN_TEXT do2: movw %ax,(%edx) ret SUPERALIGN_TEXT do1: movb %al,(%edx) ret SUPERALIGN_TEXT do0: ret #endif #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx /* * The FPU register method is twice as fast as the integer register * method unless the target is in the L1 cache and we pre-allocate a * cache line for it (then the integer register method is 4-5 times * faster). However, we never pre-allocate cache lines, since that * would make the integer method 25% or more slower for the common * case when the target isn't in either the L1 cache or the L2 cache. * Thus we normally use the FPU register method unless the overhead * would be too large. */ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ jb intreg_i586_bzero /* * The FPU registers may belong to an application or to fastmove() * or to another invocation of bcopy() or ourself in a higher level * interrupt or trap handler. 
Preserving the registers is * complicated since we avoid it if possible at all levels. We * want to localize the complications even when that increases them. * Here the extra work involves preserving CR0_TS in TS. * `npxproc != NULL' is supposed to be the condition that all the * FPU resources belong to an application, but npxproc and CR0_TS * aren't set atomically enough for this condition to work in * interrupt handlers. * * Case 1: FPU registers belong to the application: we must preserve * the registers if we use them, so we only use the FPU register * method if the target size is large enough to amortize the extra * overhead for preserving them. CR0_TS must be preserved although * it is very likely to end up as set. * * Case 2: FPU registers belong to fastmove(): fastmove() currently * makes the registers look like they belong to an application so * that cpu_switch() and savectx() don't have to know about it, so * this case reduces to case 1. * * Case 3: FPU registers belong to the kernel: don't use the FPU * register method. This case is unlikely, and supporting it would * be more complicated and might take too much stack. * * Case 4: FPU registers don't belong to anyone: the FPU registers * don't need to be preserved, so we always use the FPU register * method. CR0_TS must be preserved although it is very likely to * always end up as clear. */ cmpl $0,_npxproc je i586_bz1 cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ jb intreg_i586_bzero sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts subl $108,%esp fnsave 0(%esp) jmp i586_bz2 i586_bz1: sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts fninit /* XXX should avoid needing this */ i586_bz2: fldz /* * Align to an 8 byte boundary (misalignment in the main loop would * cost a factor of >= 2). Avoid jumps (at little cost if it is * already aligned) by always zeroing 8 bytes and using the part up * to the _next_ alignment position. */ fstl 0(%edx) addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ addl $8,%edx andl $~7,%edx subl %edx,%ecx /* * Similarly align `len' to a multiple of 8. */ fstl -8(%edx,%ecx) decl %ecx andl $~7,%ecx /* * This wouldn't be any faster if it were unrolled, since the loop * control instructions are much faster than the fstl and/or done * in parallel with it so their overhead is insignificant. */ fpureg_i586_bzero_loop: fstl 0(%edx) addl $8,%edx subl $8,%ecx cmpl $8,%ecx jae fpureg_i586_bzero_loop cmpl $0,_npxproc je i586_bz3 frstor 0(%esp) addl $108,%esp lmsw %ax movb $0xfe,kernel_fpu_lock ret i586_bz3: fstpl %st(0) lmsw %ax movb $0xfe,kernel_fpu_lock ret intreg_i586_bzero: /* * `rep stos' seems to be the best method in practice for small * counts. Fancy methods usually take too long to start up due * to cache and BTB misses. */ pushl %edi movl %edx,%edi xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx jne 1f popl %edi ret 1: rep stosb popl %edi ret #endif /* I586_CPU && NNPX > 0 */ /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx cld rep stosw popl %edi ret ENTRY(bcopyb) bcopyb: pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cld /* nope, copy forwards */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards. 
*/ addl %ecx,%esi decl %edi decl %esi std rep movsb popl %edi popl %esi cld ret ENTRY(bcopy) MEXITCOUNT jmp *_bcopy_vector ENTRY(ovbcopy) MEXITCOUNT jmp *_ovbcopy_vector /* * generic_bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(generic_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 20(%esp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cmpl $1024,%ecx jb small_i586_bcopy sarb $1,kernel_fpu_lock jc small_i586_bcopy cmpl $0,_npxproc je i586_bc1 smsw %dx clts subl $108,%esp fnsave 0(%esp) jmp 4f i586_bc1: smsw %dx clts fninit /* XXX should avoid needing this */ ALIGN_TEXT 4: pushl %ecx #define DCACHE_SIZE 8192 cmpl $(DCACHE_SIZE-512)/2,%ecx jbe 2f movl $(DCACHE_SIZE-512)/2,%ecx 2: subl %ecx,0(%esp) cmpl $256,%ecx jb 5f /* XXX should prefetch if %ecx >= 32 */ pushl %esi pushl %ecx ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b popl %ecx popl %esi 5: ALIGN_TEXT large_i586_bcopy_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $64,%esi addl $64,%edi subl $64,%ecx cmpl $64,%ecx jae large_i586_bcopy_loop popl %eax addl %eax,%ecx cmpl $64,%ecx jae 4b cmpl $0,_npxproc je i586_bc2 frstor 0(%esp) addl $108,%esp i586_bc2: lmsw %dx movb $0xfe,kernel_fpu_lock /* * This is a duplicate of the main part of generic_bcopy. See the comments * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and * would mess up high resolution profiling. */ ALIGN_TEXT small_i586_bcopy: shrl $2,%ecx cld rep movsl movl 20(%esp),%ecx andl $3,%ecx rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi addl %ecx,%esi decl %edi decl %esi andl $3,%ecx std rep movsb movl 20(%esp),%ecx shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #endif /* I586_CPU && NNPX > 0 */ /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines and possibly * the math- and DOS emulators should be the only places that do this. 
* * We have to access the memory with user's permissions, so use a segment * selector with RPL 3. For writes to user space we have to additionally * check the PTE for write permission, because the 386 does not check * write permissions when we are executing with EPL 0. The 486 does check * this if the WP bit is set in CR0, so we can use a simpler version here. * * These routines set curpcb->onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->onfault instead of the function. */ /* copyout(from_kernel, to_user, len) */ ENTRY(copyout) MEXITCOUNT jmp *_copyout_vector ENTRY(generic_copyout) movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 3f #endif /* * We have to check each PTE for user write permission. * The checking may cause a page fault, so it is important to set * up everything for return via copyout_fault before here. */ /* compute number of pages */ movl %edi,%ecx andl $PAGE_MASK,%ecx addl %ebx,%ecx decl %ecx shrl $IDXSHIFT+2,%ecx incl %ecx /* compute PTE offset for start address */ movl %edi,%edx shrl $IDXSHIFT,%edx andb $0xfc,%dl 1: /* check PTE for each page */ leal _PTmap(%edx),%eax shrl $IDXSHIFT,%eax andb $0xfc,%al testb $PG_V,_PTmap(%eax) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%al andb $PG_V|PG_RW|PG_U,%al /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%al je 2f 4: /* simulate a trap */ pushl %edx pushl %ecx shll $IDXSHIFT,%edx pushl %edx call _trapwrite /* trapwrite(addr) */ popl %edx popl %ecx popl %edx testl %eax,%eax /* if not ok, return EFAULT */ jnz copyout_fault 2: addl $4,%edx decl %ecx jnz 1b /* check next page */ #endif /* I386_CPU */ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyout: #endif shrl $2,%ecx cld rep movsl movb %bl,%cl andb $3,%cl rep movsb done_copyout: popl %ebx popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyout_fault: popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyout) /* * Duplicated from generic_copyout. Could be done a bit better. */ movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. 
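 * The addl sets carry if the destination address plus length wraps past 2^32, * which can never be a valid user range, so a carry goes straight to * copyout_fault.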
*/ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyout pushl %ecx call _fastmove addl $4,%esp jmp done_copyout #endif /* I586_CPU && NNPX > 0 */ /* copyin(from_user, to_kernel, len) */ ENTRY(copyin) MEXITCOUNT jmp *_copyin_vector ENTRY(generic_copyin) movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyin: #endif movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT done_copyin: #endif popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyin_fault: popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyin) /* * Duplicated from generic_copyin. Could be done a bit better. */ movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyin pushl %ebx /* XXX prepare for fastmove_fault */ pushl %ecx call _fastmove addl $8,%esp jmp done_copyin #endif /* I586_CPU && NNPX > 0 */ #if defined(I586_CPU) && NNPX > 0 /* fastmove(src, dst, len) src in %esi dst in %edi len in %ecx XXX changed to on stack for profiling uses %eax and %edx for tmp. storage */ /* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */ ENTRY(fastmove) pushl %ebp movl %esp,%ebp subl $PCB_SAVEFPU_SIZE+3*4,%esp movl 8(%ebp),%ecx cmpl $63,%ecx jbe fastmove_tail testl $7,%esi /* check if src addr is multiple of 8 */ jnz fastmove_tail testl $7,%edi /* check if dst addr is multiple of 8 */ jnz fastmove_tail /* if (npxproc != NULL) { */ cmpl $0,_npxproc je 6f /* fnsave(&curpcb->pcb_savefpu); */ movl _curpcb,%eax fnsave PCB_SAVEFPU(%eax) /* npxproc = NULL; */ movl $0,_npxproc /* } */ 6: /* now we own the FPU. */ /* * The process' FP state is saved in the pcb, but if we get * switched, the cpu_switch() will store our FP state in the * pcb. It should be possible to avoid all the copying for * this, e.g., by setting a flag to tell cpu_switch() to * save the state somewhere else. 
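 * For now the code below copies pcb_savefpu into the local stack frame, runs * the 64-byte FPU copy loop, copies pcb_savefpu back, and then sets CR0_TS * again (start_emulating) before falling into fastmove_tail.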
*/ /* tmp = curpcb->pcb_savefpu; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl %esp,%edi movl _curpcb,%esi addl $PCB_SAVEFPU,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* stop_emulating(); */ clts /* npxproc = curproc; */ movl _curproc,%eax movl %eax,_npxproc movl _curpcb,%eax movl $fastmove_fault,PCB_ONFAULT(%eax) 4: movl %ecx,-12(%ebp) cmpl $1792,%ecx jbe 2f movl $1792,%ecx 2: subl %ecx,-12(%ebp) cmpl $256,%ecx jb 5f movl %ecx,-8(%ebp) movl %esi,-4(%ebp) ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b movl -8(%ebp),%ecx movl -4(%ebp),%esi 5: ALIGN_TEXT fastmove_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $-64,%ecx addl $64,%esi addl $64,%edi cmpl $63,%ecx ja fastmove_loop movl -12(%ebp),%eax addl %eax,%ecx cmpl $64,%ecx jae 4b /* curpcb->pcb_savefpu = tmp; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* start_emulating(); */ smsw %ax orb $CR0_TS,%al lmsw %ax /* npxproc = NULL; */ movl $0,_npxproc ALIGN_TEXT fastmove_tail: movl _curpcb,%eax movl $fastmove_tail_fault,PCB_ONFAULT(%eax) movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb movl %ebp,%esp popl %ebp ret ALIGN_TEXT fastmove_fault: movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl smsw %ax orb $CR0_TS,%al lmsw %ax movl $0,_npxproc fastmove_tail_fault: movl %ebp,%esp popl %ebp addl $8,%esp popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #endif /* I586_CPU && NNPX > 0 */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory */ ENTRY(fuword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* from */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault movl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret /* * These two routines are called from the profiling code, potentially * at interrupt time. If they fail, that's okay, good things will * happen later. Fail all the time for now - until the trap code is * able to deal with this. 
*/ ALTENTRY(suswintr) ENTRY(fuswintr) movl $-1,%eax ret ENTRY(fusword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx ja fusufault movzwl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ENTRY(fubyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx ja fusufault movzbl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ALIGN_TEXT fusufault: movl _curpcb,%ecx xorl %eax,%eax movl %eax,PCB_ONFAULT(%ecx) decl %eax ret /* * su{byte,sword,word}: write a byte (word, longword) to user memory */ ENTRY(suword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f /* we only have to set the right segment selector */ #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ ja fusufault movl 8(%esp),%eax movl %eax,(%edx) xorl %eax,%eax movl _curpcb,%ecx movl %eax,PCB_ONFAULT(%ecx) ret ENTRY(susword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ ja fusufault movw 8(%esp),%ax movw %ax,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret ALTENTRY(suibyte) ENTRY(subyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ ja fusufault movb 8(%esp),%al movb %al,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret /* * copyinstr(from, to, maxlen, int *lencopied) * copy a string from from to to, 
stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ ENTRY(copyinstr) pushl %esi pushl %edi movl _curpcb,%ecx movl $cpystrflt,PCB_ONFAULT(%ecx) movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ movl $VM_MAXUSER_ADDRESS,%eax /* make sure 'from' is within bounds */ subl %esi,%eax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpl %edx,%eax jae 1f movl %eax,%edx movl %eax,20(%esp) 1: incl %edx cld 2: decl %edx jz 3f lodsb stosb orb %al,%al jnz 2b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp cpystrflt_x 3: /* edx is zero - return ENAMETOOLONG or EFAULT */ cmpl $VM_MAXUSER_ADDRESS,%esi jae cpystrflt 4: movl $ENAMETOOLONG,%eax jmp cpystrflt_x cpystrflt: movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ movl _curpcb,%ecx movl $0,PCB_ONFAULT(%ecx) movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 1f movl %ecx,(%edx) 1: popl %edi popl %esi ret /* * copystr(from, to, maxlen, int *lencopied) */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx cld 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx xorl %eax,%eax movl %edx,%ecx shrl $2,%ecx cld /* compare forwards */ repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb je 2f 1: incl %eax 2: popl %esi popl %edi ret /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %ax,%ds movl %ax,%es movl %ax,%fs movl %ax,%gs movl %ax,%ss /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) lret /* * void lidt(struct region_descriptor *rdp); */ ENTRY(lidt) movl 4(%esp),%eax lidt (%eax) ret /* * void lldt(u_short sel) */ ENTRY(lldt) lldt 4(%esp) ret /* * void ltr(u_short sel) */ ENTRY(ltr) ltr 4(%esp) ret /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret /* load_cr0(cr0) */ ENTRY(load_cr0) movl 4(%esp),%eax movl %eax,%cr0 ret /* rcr0() */ ENTRY(rcr0) movl %cr0,%eax ret /* rcr3() */ ENTRY(rcr3) movl %cr3,%eax ret /* void load_cr3(caddr_t cr3) */ ENTRY(load_cr3) +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl 4(%esp),%eax movl %eax,%cr3 ret /* rcr4() */ ENTRY(rcr4) movl %cr4,%eax ret /* void load_cr4(caddr_t cr4) */ ENTRY(load_cr4) movl 4(%esp),%eax movl %eax,%cr4 ret /*****************************************************************************/ /* setjump, longjump */ 
/*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp */ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret /* * Here for doing BB-profiling (gcc -a). * We rely on the "bbset" instead, but need a dummy function. */ NON_GPROF_ENTRY(__bb_init_func) movl 4(%esp),%eax movl $1,(%eax) .byte 0xc3 /* avoid macro for `ret' */ Index: head/sys/amd64/amd64/support.s =================================================================== --- head/sys/amd64/amd64/support.s (revision 31708) +++ head/sys/amd64/amd64/support.s (revision 31709) @@ -1,1571 +1,1574 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: support.s,v 1.56 1997/08/09 00:02:44 dyson Exp $ + * $Id: support.s,v 1.57 1997/09/02 20:05:30 bde Exp $ */ #include "npx.h" #include #include #include #include #include "assym.s" #define KDSEL 0x10 /* kernel data selector */ #define KCSEL 0x8 /* kernel code selector */ #define IDXSHIFT 10 .data .globl _bcopy_vector _bcopy_vector: .long _generic_bcopy .globl _bzero _bzero: .long _generic_bzero .globl _copyin_vector _copyin_vector: .long _generic_copyin .globl _copyout_vector _copyout_vector: .long _generic_copyout .globl _ovbcopy_vector _ovbcopy_vector: .long _generic_bcopy #if defined(I586_CPU) && NNPX > 0 kernel_fpu_lock: .byte 0xfe .space 3 #endif .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(generic_bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret #if defined(I486_CPU) ENTRY(i486_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax /* * do 64 byte chunks first * * XXX this is probably over-unrolled at least for DX2's */ 2: cmpl $64,%ecx jb 3f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) movl %eax,16(%edx) movl %eax,20(%edx) movl %eax,24(%edx) movl %eax,28(%edx) movl %eax,32(%edx) movl %eax,36(%edx) movl %eax,40(%edx) movl %eax,44(%edx) movl %eax,48(%edx) movl %eax,52(%edx) movl %eax,56(%edx) movl %eax,60(%edx) addl $64,%edx subl $64,%ecx jnz 2b ret /* * do 16 byte chunks */ SUPERALIGN_TEXT 3: cmpl $16,%ecx jb 4f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) addl $16,%edx subl $16,%ecx jnz 3b ret /* * do 4 byte chunks */ SUPERALIGN_TEXT 4: cmpl $4,%ecx jb 5f movl %eax,(%edx) addl $4,%edx subl $4,%ecx jnz 4b ret /* * do 1 byte chunks * a jump table seems to be faster than a loop or more range reductions * * XXX need a const section for non-text */ .data jtab: .long do0 .long do1 .long do2 .long do3 .text SUPERALIGN_TEXT 5: jmp jtab(,%ecx,4) SUPERALIGN_TEXT do3: movw %ax,(%edx) movb %al,2(%edx) ret SUPERALIGN_TEXT do2: movw %ax,(%edx) ret SUPERALIGN_TEXT do1: movb %al,(%edx) ret SUPERALIGN_TEXT do0: ret #endif #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx /* * The FPU register method is twice as fast as the integer register * method unless the target is in the L1 cache and we pre-allocate a * cache line for it (then the integer register method is 4-5 times * faster). However, we never pre-allocate cache lines, since that * would make the integer method 25% or more slower for the common * case when the target isn't in either the L1 cache or the L2 cache. * Thus we normally use the FPU register method unless the overhead * would be too large. */ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ jb intreg_i586_bzero /* * The FPU registers may belong to an application or to fastmove() * or to another invocation of bcopy() or ourself in a higher level * interrupt or trap handler. Preserving the registers is * complicated since we avoid it if possible at all levels. We * want to localize the complications even when that increases them. * Here the extra work involves preserving CR0_TS in TS. * `npxproc != NULL' is supposed to be the condition that all the * FPU resources belong to an application, but npxproc and CR0_TS * aren't set atomically enough for this condition to work in * interrupt handlers. 
* * Case 1: FPU registers belong to the application: we must preserve * the registers if we use them, so we only use the FPU register * method if the target size is large enough to amortize the extra * overhead for preserving them. CR0_TS must be preserved although * it is very likely to end up as set. * * Case 2: FPU registers belong to fastmove(): fastmove() currently * makes the registers look like they belong to an application so * that cpu_switch() and savectx() don't have to know about it, so * this case reduces to case 1. * * Case 3: FPU registers belong to the kernel: don't use the FPU * register method. This case is unlikely, and supporting it would * be more complicated and might take too much stack. * * Case 4: FPU registers don't belong to anyone: the FPU registers * don't need to be preserved, so we always use the FPU register * method. CR0_TS must be preserved although it is very likely to * always end up as clear. */ cmpl $0,_npxproc je i586_bz1 cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ jb intreg_i586_bzero sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts subl $108,%esp fnsave 0(%esp) jmp i586_bz2 i586_bz1: sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts fninit /* XXX should avoid needing this */ i586_bz2: fldz /* * Align to an 8 byte boundary (misalignment in the main loop would * cost a factor of >= 2). Avoid jumps (at little cost if it is * already aligned) by always zeroing 8 bytes and using the part up * to the _next_ alignment position. */ fstl 0(%edx) addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ addl $8,%edx andl $~7,%edx subl %edx,%ecx /* * Similarly align `len' to a multiple of 8. */ fstl -8(%edx,%ecx) decl %ecx andl $~7,%ecx /* * This wouldn't be any faster if it were unrolled, since the loop * control instructions are much faster than the fstl and/or done * in parallel with it so their overhead is insignificant. */ fpureg_i586_bzero_loop: fstl 0(%edx) addl $8,%edx subl $8,%ecx cmpl $8,%ecx jae fpureg_i586_bzero_loop cmpl $0,_npxproc je i586_bz3 frstor 0(%esp) addl $108,%esp lmsw %ax movb $0xfe,kernel_fpu_lock ret i586_bz3: fstpl %st(0) lmsw %ax movb $0xfe,kernel_fpu_lock ret intreg_i586_bzero: /* * `rep stos' seems to be the best method in practice for small * counts. Fancy methods usually take too long to start up due * to cache and BTB misses. */ pushl %edi movl %edx,%edi xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx jne 1f popl %edi ret 1: rep stosb popl %edi ret #endif /* I586_CPU && NNPX > 0 */ /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx cld rep stosw popl %edi ret ENTRY(bcopyb) bcopyb: pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cld /* nope, copy forwards */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards. */ addl %ecx,%esi decl %edi decl %esi std rep movsb popl %edi popl %esi cld ret ENTRY(bcopy) MEXITCOUNT jmp *_bcopy_vector ENTRY(ovbcopy) MEXITCOUNT jmp *_ovbcopy_vector /* * generic_bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(generic_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? 
*/ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 20(%esp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cmpl $1024,%ecx jb small_i586_bcopy sarb $1,kernel_fpu_lock jc small_i586_bcopy cmpl $0,_npxproc je i586_bc1 smsw %dx clts subl $108,%esp fnsave 0(%esp) jmp 4f i586_bc1: smsw %dx clts fninit /* XXX should avoid needing this */ ALIGN_TEXT 4: pushl %ecx #define DCACHE_SIZE 8192 cmpl $(DCACHE_SIZE-512)/2,%ecx jbe 2f movl $(DCACHE_SIZE-512)/2,%ecx 2: subl %ecx,0(%esp) cmpl $256,%ecx jb 5f /* XXX should prefetch if %ecx >= 32 */ pushl %esi pushl %ecx ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b popl %ecx popl %esi 5: ALIGN_TEXT large_i586_bcopy_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $64,%esi addl $64,%edi subl $64,%ecx cmpl $64,%ecx jae large_i586_bcopy_loop popl %eax addl %eax,%ecx cmpl $64,%ecx jae 4b cmpl $0,_npxproc je i586_bc2 frstor 0(%esp) addl $108,%esp i586_bc2: lmsw %dx movb $0xfe,kernel_fpu_lock /* * This is a duplicate of the main part of generic_bcopy. See the comments * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and * would mess up high resolution profiling. */ ALIGN_TEXT small_i586_bcopy: shrl $2,%ecx cld rep movsl movl 20(%esp),%ecx andl $3,%ecx rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi addl %ecx,%esi decl %edi decl %esi andl $3,%ecx std rep movsb movl 20(%esp),%ecx shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #endif /* I586_CPU && NNPX > 0 */ /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines and possibly * the math- and DOS emulators should be the only places that do this. * * We have to access the memory with user's permissions, so use a segment * selector with RPL 3. For writes to user space we have to additionally * check the PTE for write permission, because the 386 does not check * write permissions when we are executing with EPL 0. The 486 does check * this if the WP bit is set in CR0, so we can use a simpler version here. * * These routines set curpcb->onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->onfault instead of the function. 
*/ /* copyout(from_kernel, to_user, len) */ ENTRY(copyout) MEXITCOUNT jmp *_copyout_vector ENTRY(generic_copyout) movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 3f #endif /* * We have to check each PTE for user write permission. * The checking may cause a page fault, so it is important to set * up everything for return via copyout_fault before here. */ /* compute number of pages */ movl %edi,%ecx andl $PAGE_MASK,%ecx addl %ebx,%ecx decl %ecx shrl $IDXSHIFT+2,%ecx incl %ecx /* compute PTE offset for start address */ movl %edi,%edx shrl $IDXSHIFT,%edx andb $0xfc,%dl 1: /* check PTE for each page */ leal _PTmap(%edx),%eax shrl $IDXSHIFT,%eax andb $0xfc,%al testb $PG_V,_PTmap(%eax) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%al andb $PG_V|PG_RW|PG_U,%al /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%al je 2f 4: /* simulate a trap */ pushl %edx pushl %ecx shll $IDXSHIFT,%edx pushl %edx call _trapwrite /* trapwrite(addr) */ popl %edx popl %ecx popl %edx testl %eax,%eax /* if not ok, return EFAULT */ jnz copyout_fault 2: addl $4,%edx decl %ecx jnz 1b /* check next page */ #endif /* I386_CPU */ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyout: #endif shrl $2,%ecx cld rep movsl movb %bl,%cl andb $3,%cl rep movsb done_copyout: popl %ebx popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyout_fault: popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyout) /* * Duplicated from generic_copyout. Could be done a bit better. */ movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx /* * End of duplicated code. 
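Before the per-page PTE checks, generic_copyout() (and i586_copyout(), which duplicates it) validates the user range with two cheap tests. As a C sketch, passing VM_MAXUSER_ADDRESS in as a parameter rather than hard-coding a value:

        #include <stdbool.h>
        #include <stddef.h>
        #include <stdint.h>

        static bool
        user_range_ok(uintptr_t uaddr, size_t len, uintptr_t vm_maxuser_address)
        {
                uintptr_t end = uaddr + len;

                if (end < uaddr)        /* wrapped: "addl %ebx,%eax; jc copyout_fault" */
                        return (false);
                /* VM_MAXUSER_ADDRESS is an end address, hence <= rather than < */
                return (end <= vm_maxuser_address);     /* "cmpl ...; ja copyout_fault" */
        }
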
*/ cmpl $1024,%ecx jb slow_copyout pushl %ecx call _fastmove addl $4,%esp jmp done_copyout #endif /* I586_CPU && NNPX > 0 */ /* copyin(from_user, to_kernel, len) */ ENTRY(copyin) MEXITCOUNT jmp *_copyin_vector ENTRY(generic_copyin) movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyin: #endif movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT done_copyin: #endif popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyin_fault: popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyin) /* * Duplicated from generic_copyin. Could be done a bit better. */ movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyin pushl %ebx /* XXX prepare for fastmove_fault */ pushl %ecx call _fastmove addl $8,%esp jmp done_copyin #endif /* I586_CPU && NNPX > 0 */ #if defined(I586_CPU) && NNPX > 0 /* fastmove(src, dst, len) src in %esi dst in %edi len in %ecx XXX changed to on stack for profiling uses %eax and %edx for tmp. storage */ /* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */ ENTRY(fastmove) pushl %ebp movl %esp,%ebp subl $PCB_SAVEFPU_SIZE+3*4,%esp movl 8(%ebp),%ecx cmpl $63,%ecx jbe fastmove_tail testl $7,%esi /* check if src addr is multiple of 8 */ jnz fastmove_tail testl $7,%edi /* check if dst addr is multiple of 8 */ jnz fastmove_tail /* if (npxproc != NULL) { */ cmpl $0,_npxproc je 6f /* fnsave(&curpcb->pcb_savefpu); */ movl _curpcb,%eax fnsave PCB_SAVEFPU(%eax) /* npxproc = NULL; */ movl $0,_npxproc /* } */ 6: /* now we own the FPU. */ /* * The process' FP state is saved in the pcb, but if we get * switched, the cpu_switch() will store our FP state in the * pcb. It should be possible to avoid all the copying for * this, e.g., by setting a flag to tell cpu_switch() to * save the state somewhere else. 
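Taken together, the Pentium user-copy paths reduce to a simple dispatch: transfers under 1024 bytes stay on the rep movs code, anything larger goes through fastmove(). A C sketch; fastmove_model here is only a stand-in (the real fastmove() below streams 64-bit fildq/fistpq pairs through the FPU):

        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>

        static int
        fastmove_model(const void *src, void *dst, size_t len)
        {
                memcpy(dst, src, len);          /* real code: FPU 64-bit loads/stores */
                return (0);
        }

        static int
        i586_copy_model(const void *src, void *dst, size_t len)
        {
                if (len >= 1024)                /* "cmpl $1024,%ecx; jb slow_copyin" */
                        return (fastmove_model(src, dst, len));

                /* slow path: len/4 longwords ("rep movsl"), then len%4 bytes */
                size_t nwords = len >> 2, tail = len & 3;

                memcpy(dst, src, nwords * 4);
                memcpy((uint8_t *)dst + nwords * 4,
                    (const uint8_t *)src + nwords * 4, tail);
                return (0);
        }
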
*/ /* tmp = curpcb->pcb_savefpu; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl %esp,%edi movl _curpcb,%esi addl $PCB_SAVEFPU,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* stop_emulating(); */ clts /* npxproc = curproc; */ movl _curproc,%eax movl %eax,_npxproc movl _curpcb,%eax movl $fastmove_fault,PCB_ONFAULT(%eax) 4: movl %ecx,-12(%ebp) cmpl $1792,%ecx jbe 2f movl $1792,%ecx 2: subl %ecx,-12(%ebp) cmpl $256,%ecx jb 5f movl %ecx,-8(%ebp) movl %esi,-4(%ebp) ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b movl -8(%ebp),%ecx movl -4(%ebp),%esi 5: ALIGN_TEXT fastmove_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $-64,%ecx addl $64,%esi addl $64,%edi cmpl $63,%ecx ja fastmove_loop movl -12(%ebp),%eax addl %eax,%ecx cmpl $64,%ecx jae 4b /* curpcb->pcb_savefpu = tmp; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* start_emulating(); */ smsw %ax orb $CR0_TS,%al lmsw %ax /* npxproc = NULL; */ movl $0,_npxproc ALIGN_TEXT fastmove_tail: movl _curpcb,%eax movl $fastmove_tail_fault,PCB_ONFAULT(%eax) movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb movl %ebp,%esp popl %ebp ret ALIGN_TEXT fastmove_fault: movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl smsw %ax orb $CR0_TS,%al lmsw %ax movl $0,_npxproc fastmove_tail_fault: movl %ebp,%esp popl %ebp addl $8,%esp popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #endif /* I586_CPU && NNPX > 0 */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory */ ENTRY(fuword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* from */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault movl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret /* * These two routines are called from the profiling code, potentially * at interrupt time. If they fail, that's okay, good things will * happen later. Fail all the time for now - until the trap code is * able to deal with this. 
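fuword() above and the fusword()/fubyte() variants that follow share one convention: the access must fit entirely below VM_MAXUSER_ADDRESS (hence the -4/-2/-1 in the compares), and any failure, bad address or fault, is reported as -1, which is also why fuswintr()/suswintr() can "fail" by simply returning -1. A C model; the fetch callback is a stand-in for the guarded access plus the fusufault recovery:

        #include <stddef.h>
        #include <stdint.h>

        static long
        fuword_model(uintptr_t addr, uintptr_t vm_maxuser_address,
            int (*fetch)(uintptr_t addr, uint32_t *val))
        {
                uint32_t val;

                if (addr > vm_maxuser_address - sizeof(uint32_t))
                        return (-1);            /* out of range */
                if (fetch(addr, &val) != 0)
                        return (-1);            /* faulted: fusufault path */
                return ((long)val);
        }
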
*/ ALTENTRY(suswintr) ENTRY(fuswintr) movl $-1,%eax ret ENTRY(fusword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx ja fusufault movzwl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ENTRY(fubyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx ja fusufault movzbl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ALIGN_TEXT fusufault: movl _curpcb,%ecx xorl %eax,%eax movl %eax,PCB_ONFAULT(%ecx) decl %eax ret /* * su{byte,sword,word}: write a byte (word, longword) to user memory */ ENTRY(suword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f /* we only have to set the right segment selector */ #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ ja fusufault movl 8(%esp),%eax movl %eax,(%edx) xorl %eax,%eax movl _curpcb,%ecx movl %eax,PCB_ONFAULT(%ecx) ret ENTRY(susword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ ja fusufault movw 8(%esp),%ax movw %ax,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret ALTENTRY(suibyte) ENTRY(subyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ ja fusufault movb 8(%esp),%al movb %al,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret /* * copyinstr(from, to, maxlen, int *lencopied) * copy a string from from to to, 
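The su*() stores above (and generic_copyout() earlier) carry an extra test for the 386, which ignores write protection while running in ring 0: each target page's PTE is checked by hand and trapwrite() is called to simulate the fault the hardware will not raise. The core test in C, with the usual i386 PTE bit values assumed for the sketch:

        #include <stdint.h>

        #define PG_V_BIT        0x001u          /* valid */
        #define PG_RW_BIT       0x002u          /* writable */
        #define PG_U_BIT        0x004u          /* user accessible */

        static int
        pte_user_writable(uint32_t pte)
        {
                const uint32_t need = PG_V_BIT | PG_RW_BIT | PG_U_BIT;

                /* "andb $PG_V|PG_RW|PG_U,%al; cmpb $PG_V|PG_RW|PG_U,%al" */
                return ((pte & need) == need);
        }
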
stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ ENTRY(copyinstr) pushl %esi pushl %edi movl _curpcb,%ecx movl $cpystrflt,PCB_ONFAULT(%ecx) movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ movl $VM_MAXUSER_ADDRESS,%eax /* make sure 'from' is within bounds */ subl %esi,%eax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpl %edx,%eax jae 1f movl %eax,%edx movl %eax,20(%esp) 1: incl %edx cld 2: decl %edx jz 3f lodsb stosb orb %al,%al jnz 2b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp cpystrflt_x 3: /* edx is zero - return ENAMETOOLONG or EFAULT */ cmpl $VM_MAXUSER_ADDRESS,%esi jae cpystrflt 4: movl $ENAMETOOLONG,%eax jmp cpystrflt_x cpystrflt: movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ movl _curpcb,%ecx movl $0,PCB_ONFAULT(%ecx) movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 1f movl %ecx,(%edx) 1: popl %edi popl %esi ret /* * copystr(from, to, maxlen, int *lencopied) */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx cld 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx xorl %eax,%eax movl %edx,%ecx shrl $2,%ecx cld /* compare forwards */ repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb je 2f 1: incl %eax 2: popl %esi popl %edi ret /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %ax,%ds movl %ax,%es movl %ax,%fs movl %ax,%gs movl %ax,%ss /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) lret /* * void lidt(struct region_descriptor *rdp); */ ENTRY(lidt) movl 4(%esp),%eax lidt (%eax) ret /* * void lldt(u_short sel) */ ENTRY(lldt) lldt 4(%esp) ret /* * void ltr(u_short sel) */ ENTRY(ltr) ltr 4(%esp) ret /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret /* load_cr0(cr0) */ ENTRY(load_cr0) movl 4(%esp),%eax movl %eax,%cr0 ret /* rcr0() */ ENTRY(rcr0) movl %cr0,%eax ret /* rcr3() */ ENTRY(rcr3) movl %cr3,%eax ret /* void load_cr3(caddr_t cr3) */ ENTRY(load_cr3) +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl 4(%esp),%eax movl %eax,%cr3 ret /* rcr4() */ ENTRY(rcr4) movl %cr4,%eax ret /* void load_cr4(caddr_t cr4) */ ENTRY(load_cr4) movl 4(%esp),%eax movl %eax,%cr4 ret /*****************************************************************************/ /* setjump, longjump */ 
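copystr() and copyinstr() above implement the usual bounded string copy: at most maxlen bytes including the terminating NUL are copied, ENAMETOOLONG is returned when the string does not fit, and the byte count (NUL included) is reported through lencopied; copyinstr() additionally clamps maxlen to what fits below VM_MAXUSER_ADDRESS so it can tell EFAULT apart from ENAMETOOLONG. The core loop as a C model:

        #include <errno.h>
        #include <stddef.h>

        static int
        copystr_model(const char *from, char *to, size_t maxlen, size_t *lencopied)
        {
                size_t i;
                int error = ENAMETOOLONG;

                for (i = 0; i < maxlen; i++) {
                        to[i] = from[i];
                        if (from[i] == '\0') {
                                i++;            /* the NUL counts toward lencopied */
                                error = 0;
                                break;
                        }
                }
                if (lencopied != NULL)
                        *lencopied = i;
                return (error);
        }
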
/*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp */ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret /* * Here for doing BB-profiling (gcc -a). * We rely on the "bbset" instead, but need a dummy function. */ NON_GPROF_ENTRY(__bb_init_func) movl 4(%esp),%eax movl $1,(%eax) .byte 0xc3 /* avoid macro for `ret' */ Index: head/sys/amd64/amd64/swtch.s =================================================================== --- head/sys/amd64/amd64/swtch.s (revision 31708) +++ head/sys/amd64/amd64/swtch.s (revision 31709) @@ -1,778 +1,815 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: swtch.s,v 1.63 1997/09/21 15:03:58 peter Exp $ + * $Id: swtch.s,v 1.64 1997/10/10 09:44:06 peter Exp $ */ #include "npx.h" #include "opt_user_ldt.h" #include "opt_vm86.h" #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ /* * The following primitives manipulate the run queues. 
* _whichqs tells which of the 32 queues _qs * have processes in them. setrunqueue puts processes into queues, Remrq * removes them from queues. The running process is on no queue, * other processes are on a queue related to p->p_priority, divided by 4 * actually to shrink the 0-127 range of priorities into the 32 available * queues. */ .data #ifndef SMP .globl _curpcb _curpcb: .long 0 /* pointer to curproc's PCB area */ #endif /* !SMP */ .globl _whichqs, _whichrtqs, _whichidqs _whichqs: .long 0 /* which run queues have data */ _whichrtqs: .long 0 /* which realtime run qs have data */ _whichidqs: .long 0 /* which idletime run qs have data */ .globl _hlt_vector _hlt_vector: .long _default_halt /* pointer to halt routine */ .globl _qs,_cnt,_panic .globl _want_resched _want_resched: .long 0 /* we need to re-run the scheduler */ +#if defined(SWTCH_OPTIM_STATS) + .globl _swtch_optim_stats, _tlb_flush_count +_swtch_optim_stats: .long 0 /* number of _swtch_optims */ +_tlb_flush_count: .long 0 +#endif .text /* * setrunqueue(p) * * Call should be made at spl6(), and p->p_stat should be SRUN */ ENTRY(setrunqueue) movl 4(%esp),%eax #ifdef DIAGNOSTIC cmpb $SRUN,P_STAT(%eax) je set1 pushl $set2 call _panic set1: #endif cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je set_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* realtime priority? */ jne set_id /* must be idle priority */ set_rt: btsl %edx,_whichrtqs /* set q full bit */ shll $3,%edx addl $_rtqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_id: btsl %edx,_whichidqs /* set q full bit */ shll $3,%edx addl $_idqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_nort: /* Normal (RTOFF) code */ movzbl P_PRI(%eax),%edx shrl $2,%edx btsl %edx,_whichqs /* set q full bit */ shll $3,%edx addl $_qs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set2: .asciz "setrunqueue" /* * Remrq(p) * * Call should be made at spl6(). */ ENTRY(remrq) movl 4(%esp),%eax cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je rem_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* normal priority process? */ jne rem_id btrl %edx,_whichrtqs /* clear full bit, panic if clear already */ jb rem1rt pushl $rem3rt call _panic rem1rt: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_rtqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2rt shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichrtqs rem2rt: ret rem_id: btrl %edx,_whichidqs /* clear full bit, panic if clear already */ jb rem1id pushl $rem3id call _panic rem1id: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_idqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? 
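The bookkeeping that setrunqueue()/remrq() maintain above boils down to a 32-bit summary word per queue class: a 0-127 priority is folded to a queue index by dividing by four, and one bit per queue records whether that queue is non-empty, so the switch code can find the best runnable queue with a single bsfl. A self-contained sketch (simplified: the real queues are doubly linked lists headed at _qs/_rtqs/_idqs, one summary word per class):

        #include <stdint.h>

        static uint32_t whichqs_model;  /* bit i set => run queue i is non-empty */

        static int
        queue_index(int priority)       /* p_priority is 0..127; lower is better */
        {
                return (priority >> 2); /* "shrl $2,%edx": 32 queues */
        }

        static void
        setrunqueue_mark(int priority)
        {
                whichqs_model |= 1u << queue_index(priority);   /* "btsl %edx,_whichqs" */
        }

        static void
        remrq_update(int priority, int queue_now_empty)
        {
                /* the asm clears the bit with btrl and re-sets it if the queue
                 * still has entries; only the net effect is modelled here */
                if (queue_now_empty)
                        whichqs_model &= ~(1u << queue_index(priority));
        }

        static int
        pick_next_queue(void)           /* "bsfl %edi,%ebx": lowest set bit wins */
        {
                int i;

                if (whichqs_model == 0)
                        return (-1);    /* nothing runnable: cpu_switch heads to _idle */
                for (i = 0; (whichqs_model & (1u << i)) == 0; i++)
                        ;
                return (i);
        }
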
*/ je rem2id shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichidqs rem2id: ret rem_nort: movzbl P_PRI(%eax),%edx shrl $2,%edx btrl %edx,_whichqs /* clear full bit, panic if clear already */ jb rem1 pushl $rem3 call _panic rem1: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_qs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2 shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichqs rem2: ret rem3: .asciz "remrq" rem3rt: .asciz "remrq.rt" rem3id: .asciz "remrq.id" /* * When no processes are on the runq, cpu_switch() branches to _idle * to wait for something to come ready. */ ALIGN_TEXT _idle: #ifdef SMP /* when called, we have the mplock, intr disabled */ xorl %ebp,%ebp /* use our idleproc's "context" */ movl _my_idlePTD,%ecx movl %ecx,%cr3 +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl $_idlestack_top,%ecx movl %ecx,%esp /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %ecx, TSS_ESP0(%eax) + movl %ecx, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 cli /* * _REALLY_ free the lock, no matter how deep the prior nesting. * We will recover the nesting on the way out when we have a new * proc to load. * * XXX: we had damn well better be sure we had it before doing this! */ movl $FREE_LOCK, %eax movl %eax, _mp_lock /* do NOT have lock, intrs disabled */ .globl idle_loop idle_loop: +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl %cr3,%eax /* ouch! */ movl %eax,%cr3 cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f jmp 2f 1: cmpl $0,_whichrtqs /* real-time queue */ jne 3f cmpl $0,_whichqs /* normal queue */ jne 3f cmpl $0,_whichidqs /* 'idle' queue */ jne 3f cmpl $0,_do_page_zero_idle je 2f /* XXX appears to cause panics */ /* * Inside zero_idle we enable interrupts and grab the mplock * as needed. It needs to be careful about entry/exit mutexes. 
*/ call _vm_page_zero_idle /* internal locking */ testl %eax, %eax jnz idle_loop 2: /* enable intrs for a halt */ #ifdef SMP movl $0, lapic_tpr /* 1st candidate for an INT */ #endif sti call *_hlt_vector /* wait for interrupt */ cli jmp idle_loop 3: #ifdef SMP movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ #endif call _get_mplock cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _rel_mplock jmp idle_loop #else xorl %ebp,%ebp movl $HIDENAME(tmpstk),%esp - movl _IdlePTD,%ecx - movl %ecx,%cr3 +#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif + movl _IdlePTD, %ecx + movl %cr3, %eax + cmpl %ecx, %eax + je 2f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl %ecx, %cr3 +2: +#endif /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %esp, TSS_ESP0(%eax) + movl %esp, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 ALIGN_TEXT idle_loop: cli cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _vm_page_zero_idle testl %eax, %eax jnz idle_loop sti call *_hlt_vector /* wait for interrupt */ jmp idle_loop #endif CROSSJUMPTARGET(_idle) ENTRY(default_halt) #ifndef SMP hlt /* XXX: until a wakeup IPI */ #endif ret /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. first, save context as needed */ movl _curproc,%ecx /* if no process to save, don't bother */ testl %ecx,%ecx je sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ #endif /* SMP */ movl P_ADDR(%ecx),%ecx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #ifdef SMP movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ #ifdef DIAGNOSTIC cmpl $FREE_LOCK, %eax /* is it free? */ je badsw4 /* yes, bad medicine! */ #endif /* DIAGNOSTIC */ andl $COUNT_FIELD, %eax /* clear CPU portion */ movl %eax, PCB_MPNEST(%ecx) /* store it */ #endif /* SMP */ #if NNPX > 0 /* have we used fp, and need a save? */ movl _curproc,%eax cmpl %eax,_npxproc jne 1f addl $PCB_SAVEFPU,%ecx /* h/w bugs make saving complicated */ pushl %ecx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* NNPX > 0 */ movl $0,_curproc /* out of process */ /* save is done, now choose a new process or idle */ sw1: cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f CROSSJUMP(je, _idle, jne) /* wind down */ 1: #endif sw1a: movl _whichrtqs,%edi /* pick next p. 
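The recurring pattern this revision adds, in the idle path above and again in cpu_switch() a little further down, is to compare %cr3 against the page directory about to be loaded and skip the reload (and therefore the TLB flush) when they already match, with two optional counters to measure the win. A C sketch of that pattern only; cr3_model stands in for the real register:

        #include <stdint.h>

        static uint32_t cr3_model;                      /* stands in for %cr3 */
        #if defined(SWTCH_OPTIM_STATS)
        static int swtch_optim_stats, tlb_flush_count;  /* the counters the diff adds */
        #endif

        static void
        switch_address_space(uint32_t new_pdir)
        {
        #if defined(SWTCH_OPTIM_STATS)
                swtch_optim_stats++;            /* optimistically count a skipped reload */
        #endif
                if (cr3_model == new_pdir)
                        return;                 /* same page directory: keep the TLB */
        #if defined(SWTCH_OPTIM_STATS)
                swtch_optim_stats--;            /* no luck: undo, count a real flush */
                tlb_flush_count++;
        #endif
                cr3_model = new_pdir;           /* "movl ...,%cr3": reload + TLB flush */
        }
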
from rtqs */ testl %edi,%edi jz nortqr /* no realtime procs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz nortqr /* no proc on rt q - try normal ... */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _rtqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je rt3 btsl %ebx,%edi /* nope, set to indicate not empty */ rt3: movl %edi,_whichrtqs /* update q status */ jmp swtch_com /* old sw1a */ /* Normal process priority's */ nortqr: movl _whichqs,%edi 2: /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz idqr /* if none, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _qs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je 3f btsl %ebx,%edi /* nope, set to indicate not empty */ 3: movl %edi,_whichqs /* update q status */ jmp swtch_com idqr: /* was sw1a */ movl _whichidqs,%edi /* pick next p. from idqs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ CROSSJUMP(je, _idle, jne) /* if no proc, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _idqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je id3 btsl %ebx,%edi /* nope, set to indicate not empty */ id3: movl %edi,_whichidqs /* update q status */ swtch_com: movl $0,%eax movl %eax,_want_resched #ifdef DIAGNOSTIC cmpl %eax,P_WCHAN(%ecx) jne badsw1 cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl %eax,P_BACK(%ecx) /* isolate process to run */ movl P_ADDR(%ecx),%edx - movl PCB_CR3(%edx),%ebx #ifdef SMP + movl PCB_CR3(%edx),%ebx /* Grab the private PT pointer from the outgoing process's PTD */ movl $_PTD, %esi movl 4*MPPTDI(%esi), %eax /* fetch cpu's prv pt */ -#endif /* SMP */ - +#else +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif /* switch address space */ + movl %cr3,%ebx + cmpl PCB_CR3(%edx),%ebx + je 4f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl PCB_CR3(%edx),%ebx +#endif /* SMP */ movl %ebx,%cr3 +4: #ifdef SMP /* Copy the private PT to the new process's PTD */ /* XXX yuck, the _PTD changes when we switch, so we have to * reload %cr3 after changing the address space. * We need to fix this by storing a pointer to the virtual * location of the per-process PTD in the PCB or something quick. * Dereferencing proc->vm_map->pmap->p_pdir[] is painful in asm. */ movl %eax, 4*MPPTDI(%esi) /* restore cpu's prv page */ +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif /* XXX: we have just changed the page tables.. reload.. */ movl %ebx, %cr3 #endif /* SMP */ #ifdef VM86 movl _my_tr, %esi cmpl $0, PCB_EXT(%edx) /* has pcb extension? 
*/ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: #endif /* update common_tss.tss_esp0 pointer */ movl $_common_tss, %eax movl %edx, %ebx /* pcb */ #ifdef VM86 addl $(UPAGES * PAGE_SIZE - 16), %ebx #else addl $(UPAGES * PAGE_SIZE), %ebx #endif /* VM86 */ movl %ebx, TSS_ESP0(%eax) #ifdef VM86 btrl %esi, _private_tss je 3f movl $_common_tssd, %edi 2: /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: #endif /* VM86 */ /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, lapic_tpr #else andl $~APIC_TPR_PRIO, lapic_tpr #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ movl _cpuid,%eax movb %al, P_ONCPU(%ecx) #endif /* SMP */ movl %edx, _curpcb movl %ecx, _curproc /* into next process */ #ifdef SMP movl _cpu_lockid, %eax orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl _currentldt,%eax je 2f lldt __default_ldt movl %eax,_currentldt jmp 2f 1: pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_fs cpu_switch_load_fs: movl PCB_FS(%edx),%fs .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs sti ret CROSSJUMPTARGET(idqr) CROSSJUMPTARGET(nortqr) CROSSJUMPTARGET(sw1a) #ifdef DIAGNOSTIC badsw1: pushl $sw0_1 call _panic sw0_1: .asciz "cpu_switch: has wchan" badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" #endif #if defined(SMP) && defined(DIAGNOSTIC) badsw4: pushl $sw0_4 call _panic sw0_4: .asciz "cpu_switch: do not have lock" #endif /* SMP && DIAGNOSTIC */ /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #if NNPX > 0 /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
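For reference, the per-process state that cpu_switch() and savectx() above store and later reload can be pictured as a plain struct. This is only a sketch keyed to the PCB_* offsets used in the assembly, not the actual layout of struct pcb:

        #include <stdint.h>

        struct pcb_sketch {
                uint32_t pcb_cr3;       /* page directory; compared before reloading %cr3 */
                uint32_t pcb_eip;       /* return address of the switched-out cpu_switch() call */
                uint32_t pcb_ebx, pcb_esp, pcb_ebp, pcb_esi, pcb_edi;
                uint32_t pcb_fs, pcb_gs;        /* loaded only after any user LDT switch */
                uint8_t  pcb_savefpu[108];      /* fnsave/frstor area handled by npxsave() */
                uint32_t pcb_onfault;           /* recovery address for user-access faults */
                /* plus PCB_MPNEST (SMP) and PCB_EXT (VM86) in the real structure */
        };
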
*/ movl _npxproc,%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* NNPX > 0 */ 1: ret Index: head/sys/amd64/include/cpufunc.h =================================================================== --- head/sys/amd64/include/cpufunc.h (revision 31708) +++ head/sys/amd64/include/cpufunc.h (revision 31709) @@ -1,430 +1,436 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: cpufunc.h,v 1.3 1997/09/05 20:20:31 smp Exp smp $ + * $Id: cpufunc.h,v 1.72 1997/09/07 22:01:27 fsmp Exp $ */ /* * Functions to provide access to special i386 instructions. */ #ifndef _MACHINE_CPUFUNC_H_ #define _MACHINE_CPUFUNC_H_ #include #include #include +#if defined(SWTCH_OPTIM_STATS) +extern int tlb_flush_count; +#endif #ifdef __GNUC__ static __inline void breakpoint(void) { __asm __volatile("int $3"); } static __inline void disable_intr(void) { __asm __volatile("cli" : : : "memory"); MPINTR_LOCK(); } static __inline void enable_intr(void) { MPINTR_UNLOCK(); __asm __volatile("sti"); } #define HAVE_INLINE_FFS static __inline int ffs(int mask) { int result; /* * bsfl turns out to be not all that slow on 486's. It can beaten * using a binary search to reduce to 4 bits and then a table lookup, * but only if the code is inlined and in the cache, and the code * is quite large so inlining it probably busts the cache. * * Note that gcc-2's builtin ffs would be used if we didn't declare * this inline or turn off the builtin. The builtin is faster but * broken in gcc-2.4.5 and slower but working in gcc-2.5 and 2.6. 
*/ __asm __volatile("testl %0,%0; je 1f; bsfl %0,%0; incl %0; 1:" : "=r" (result) : "0" (mask)); return (result); } #define HAVE_INLINE_FLS static __inline int fls(int mask) { int result; __asm __volatile("testl %0,%0; je 1f; bsrl %0,%0; incl %0; 1:" : "=r" (result) : "0" (mask)); return (result); } #if __GNUC__ < 2 #define inb(port) inbv(port) #define outb(port, data) outbv(port, data) #else /* __GNUC >= 2 */ /* * The following complications are to get around gcc not having a * constraint letter for the range 0..255. We still put "d" in the * constraint because "i" isn't a valid constraint when the port * isn't constant. This only matters for -O0 because otherwise * the non-working version gets optimized away. * * Use an expression-statement instead of a conditional expression * because gcc-2.6.0 would promote the operands of the conditional * and produce poor code for "if ((inb(var) & const1) == const2)". * * The unnecessary test `(port) < 0x10000' is to generate a warning if * the `port' has type u_short or smaller. Such types are pessimal. * This actually only works for signed types. The range check is * careful to avoid generating warnings. */ #define inb(port) __extension__ ({ \ u_char _data; \ if (__builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ && (port) < 0x10000) \ _data = inbc(port); \ else \ _data = inbv(port); \ _data; }) #define outb(port, data) ( \ __builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ && (port) < 0x10000 \ ? outbc(port, data) : outbv(port, data)) static __inline u_char inbc(u_int port) { u_char data; __asm __volatile("inb %1,%0" : "=a" (data) : "id" ((u_short)(port))); return (data); } static __inline void outbc(u_int port, u_char data) { __asm __volatile("outb %0,%1" : : "a" (data), "id" ((u_short)(port))); } #endif /* __GNUC <= 2 */ static __inline u_char inbv(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline u_long inl(u_int port) { u_long data; __asm __volatile("inl %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline void insb(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insb" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void insw(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insw" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void insl(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insl" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void invd(void) { __asm __volatile("invd"); } #ifdef KERNEL #ifdef SMP /* * When using APIC IPI's, the inlining cost is prohibitive since the call * executes into the IPI transmission system. */ void invlpg __P((u_int addr)); void invltlb __P((void)); #else /* !SMP */ static __inline void invlpg(u_int addr) { __asm __volatile("invlpg (%0)" : : "r" (addr) : "memory"); } static __inline void invltlb(void) { u_long temp; /* * This should be implemented as load_cr3(rcr3()) when load_cr3() * is inlined. 
*/ __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp) : : "memory"); +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif } #endif /* SMP */ #endif /* KERNEL */ static __inline u_short inw(u_int port) { u_short data; __asm __volatile("inw %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline u_int loadandclear(u_int *addr) { u_int result; __asm __volatile("xorl %0,%0; xchgl %1,%0" : "=&r" (result) : "m" (*addr)); return (result); } static __inline void outbv(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } static __inline void outl(u_int port, u_long data) { /* * outl() and outw() aren't used much so we haven't looked at * possible micro-optimizations such as the unnecessary * assignment for them. */ __asm __volatile("outl %0,%%dx" : : "a" (data), "d" (port)); } static __inline void outsb(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsb" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outsw(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsw" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outsl(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsl" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outw(u_int port, u_short data) { __asm __volatile("outw %0,%%dx" : : "a" (data), "d" (port)); } static __inline u_long rcr2(void) { u_long data; __asm __volatile("movl %%cr2,%0" : "=r" (data)); return (data); } static __inline u_long read_eflags(void) { u_long ef; __asm __volatile("pushfl; popl %0" : "=r" (ef)); return (ef); } static __inline quad_t rdmsr(u_int msr) { quad_t rv; __asm __volatile(".byte 0x0f, 0x32" : "=A" (rv) : "c" (msr)); return (rv); } static __inline quad_t rdpmc(u_int pmc) { quad_t rv; __asm __volatile(".byte 0x0f, 0x33" : "=A" (rv) : "c" (pmc)); return (rv); } static __inline quad_t rdtsc(void) { quad_t rv; __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); return (rv); } static __inline void setbits(volatile unsigned *addr, u_int bits) { __asm __volatile( #ifdef SMP "lock; " #endif "orl %1,%0" : "=m" (*addr) : "ir" (bits)); } static __inline void wbinvd(void) { __asm __volatile("wbinvd"); } static __inline void write_eflags(u_long ef) { __asm __volatile("pushl %0; popfl" : : "r" (ef)); } static __inline void wrmsr(u_int msr, quad_t newval) { __asm __volatile(".byte 0x0f, 0x30" : : "A" (newval), "c" (msr)); } #else /* !__GNUC__ */ int breakpoint __P((void)); void disable_intr __P((void)); void enable_intr __P((void)); u_char inb __P((u_int port)); u_long inl __P((u_int port)); void insb __P((u_int port, void *addr, size_t cnt)); void insl __P((u_int port, void *addr, size_t cnt)); void insw __P((u_int port, void *addr, size_t cnt)); void invd __P((void)); void invlpg __P((u_int addr)); void invltlb __P((void)); u_short inw __P((u_int port)); u_int loadandclear __P((u_int *addr)); void outb __P((u_int port, u_char data)); void outl __P((u_int port, u_long data)); void outsb __P((u_int port, void *addr, size_t cnt)); void outsl __P((u_int port, void *addr, size_t cnt)); void outsw __P((u_int port, void *addr, size_t cnt)); void outw __P((u_int port, u_short data)); u_long rcr2 __P((void)); quad_t 
rdmsr __P((u_int msr)); quad_t rdpmc __P((u_int pmc)); quad_t rdtsc __P((void)); u_long read_eflags __P((void)); void setbits __P((volatile unsigned *addr, u_int bits)); void wbinvd __P((void)); void write_eflags __P((u_long ef)); void wrmsr __P((u_int msr, quad_t newval)); #endif /* __GNUC__ */ void load_cr0 __P((u_long cr0)); void load_cr3 __P((u_long cr3)); void load_cr4 __P((u_long cr4)); void ltr __P((u_short sel)); u_int rcr0 __P((void)); u_long rcr3 __P((void)); u_long rcr4 __P((void)); #endif /* !_MACHINE_CPUFUNC_H_ */ Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 31708) +++ head/sys/i386/i386/machdep.c (revision 31709) @@ -1,1798 +1,1806 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.277 1997/12/04 14:35:39 jkh Exp $ + * $Id: machdep.c,v 1.278 1997/12/04 21:21:24 jmg Exp $ */ #include "apm.h" #include "npx.h" #include "opt_bounce.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_maxmem.h" #include "opt_perfmon.h" #include "opt_smp.h" #include "opt_sysvipc.h" #include "opt_userconfig.h" #include "opt_vm86.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if NAPM > 0 #include #endif #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #ifdef SMP #include #endif #ifdef PERFMON #include #endif #include #include #include #include extern void init386 __P((int first)); extern int ptrace_set_pc __P((struct proc *p, unsigned int addr)); extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); #ifdef BOUNCE_BUFFERS #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif #endif /* BOUNCE_BUFFERS */ int msgbufmapped = 0; /* set when safe to use msgbuf */ int _udatasel, _ucodesel; u_int atdevbase; + +#if defined(SWTCH_OPTIM_STATS) +extern int swtch_optim_stats; +SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, + CTLFLAG_RD, &swtch_optim_stats, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, + CTLFLAG_RD, &tlb_flush_count, 0, ""); +#endif int physmem = 0; int cold = 1; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); int bootverbose = 0, Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static void setup_netisrs __P((struct linker_set *)); /* XXX declare elsewhere */ static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; extern struct linker_set netisr_set; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. 
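With a kernel built with options SWTCH_OPTIM_STATS, the counters wired up by the SYSCTL_INT() lines above show up under the debug tree. A small userland sketch for watching them, using sysctlbyname(3); if the option is not compiled in, the OIDs simply do not exist and the calls fail:

        #include <sys/types.h>
        #include <sys/sysctl.h>
        #include <stdio.h>

        int
        main(void)
        {
                int optim = 0, flushes = 0;
                size_t len;

                len = sizeof(optim);
                if (sysctlbyname("debug.swtch_optim_stats", &optim, &len, NULL, 0) == -1)
                        perror("debug.swtch_optim_stats");
                len = sizeof(flushes);
                if (sysctlbyname("debug.tlb_flush_count", &flushes, &len, NULL, 0) == -1)
                        perror("debug.tlb_flush_count");
                printf("cr3 reloads avoided: %d, TLB flushes: %d\n", optim, flushes);
                return (0);
        }
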
*/ printf(version); earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %d (%dK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Quickly wire in netisrs. */ setup_netisrs(&netisr_set); /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 8, 2048); } nswbuf = max(min(nbuf/4, 128), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifdef BOUNCE_BUFFERS /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) { bouncepages = 64; bouncepages += ((Maxmem - 4096) / 2048) * 32; if (bouncepages > 128) bouncepages = 128; } v = (caddr_t)((vm_offset_t)round_page(v)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); #ifdef BOUNCE_BUFFERS clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + maxbkva + pager_map_size, TRUE); io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); #else clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size, TRUE); #endif buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size, TRUE); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*ARG_MAX), TRUE); u_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (maxproc*UPAGES*PAGE_SIZE), FALSE); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ { vm_offset_t mb_map_size; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size, FALSE); mb_map->system_map = 1; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) #if defined(USERCONFIG_BOOT) if (1) { #else if (boothowto & RB_CONFIG) { #endif userconfig(); cninit(); /* the preferred console may have changed */ } #endif #ifdef BOUNCE_BUFFERS /* * init bounce buffers */ vm_bounce_init(); #endif printf("avail memory = %d (%dK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! 
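The valloc() bookkeeping above is the classic two-pass sizing idiom: the same sequence of valloc() calls is run once against a zero base purely to measure the total, the block is allocated in one go, and the sequence is replayed to hand out real addresses. A userland-compilable sketch of the idea, with malloc standing in for kmem_alloc():

        #include <stdint.h>
        #include <stdlib.h>

        static void *
        valloc_sketch(uintptr_t *cursor, size_t nbytes)
        {
                void *p = (void *)*cursor;      /* like: (name) = (type *)v */

                *cursor += nbytes;              /* v = (caddr_t)((name) + (num)) */
                return (p);
        }

        static int
        two_pass_alloc(void)
        {
                uintptr_t v = 0;
                size_t size;
                void *base, *tbl_a, *tbl_b;

                (void)valloc_sketch(&v, 100);   /* pass 1: measure only */
                (void)valloc_sketch(&v, 200);
                size = (size_t)v;

                base = malloc(size);            /* kmem_alloc(kernel_map, ...) in the kernel */
                if (base == NULL)
                        return (-1);            /* panic("startup: no room for tables") */

                v = (uintptr_t)base;            /* pass 2: same calls, real addresses */
                tbl_a = valloc_sketch(&v, 100);
                tbl_b = valloc_sketch(&v, 200);
                (void)tbl_a; (void)tbl_b;
                return (0);
        }
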
*/ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } static void setup_netisrs(ls) struct linker_set *ls; { int i; const struct netisrtab *nit; for(i = 0; ls->ls_items[i]; i++) { nit = (const struct netisrtab *)ls->ls_items[i]; register_netisr(nit->nit_num, nit->nit_isr); } } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; u_long code; { register struct proc *p = curproc; register struct trapframe *regs; register struct sigframe *fp; struct sigframe sf; struct sigacts *psp = p->p_sigacts; int oonstack; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SS_ONSTACK; /* * Allocate and validate space for the signal handler context. */ if ((psp->ps_flags & SAS_ALTSTACK) && !oonstack && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_sp + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SS_ONSTACK; } else { fp = (struct sigframe *)regs->tf_esp - 1; } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof(struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) { if (sig < p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[sig]; else sig = p->p_sysent->sv_sigsize + 1; } sf.sf_signum = sig; sf.sf_code = code; sf.sf_scp = &fp->sf_sc; sf.sf_addr = (char *) regs->tf_err; sf.sf_handler = catcher; /* save scratch registers */ sf.sf_sc.sc_eax = regs->tf_eax; sf.sf_sc.sc_ebx = regs->tf_ebx; sf.sf_sc.sc_ecx = regs->tf_ecx; sf.sf_sc.sc_edx = regs->tf_edx; sf.sf_sc.sc_esi = regs->tf_esi; sf.sf_sc.sc_edi = regs->tf_edi; sf.sf_sc.sc_cs = regs->tf_cs; sf.sf_sc.sc_ds = regs->tf_ds; sf.sf_sc.sc_ss = regs->tf_ss; sf.sf_sc.sc_es = regs->tf_es; sf.sf_sc.sc_isp = regs->tf_isp; /* * Build the signal context to be used by sigreturn. */ sf.sf_sc.sc_onstack = oonstack; sf.sf_sc.sc_mask = mask; sf.sf_sc.sc_sp = regs->tf_esp; sf.sf_sc.sc_fp = regs->tf_ebp; sf.sf_sc.sc_pc = regs->tf_eip; sf.sf_sc.sc_ps = regs->tf_eflags; sf.sf_sc.sc_trapno = regs->tf_trapno; sf.sf_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. 
*/ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_sc.sc_gs = tf->tf_vm86_gs; sf.sf_sc.sc_fs = tf->tf_vm86_fs; sf.sf_sc.sc_es = tf->tf_vm86_es; sf.sf_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. */ tf->tf_eflags &= ~(PSL_VM | PSL_T | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = (int)(((char *)PS_STRINGS) - *(p->p_sysent->sv_szsigcode)); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int sigreturn(p, uap) struct proc *p; struct sigreturn_args /* { struct sigcontext *sigcntxp; } */ *uap; { register struct sigcontext *scp; register struct sigframe *fp; register struct trapframe *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs->tf_esp points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), B_WRITE) == 0) return(EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* go back to user mode if both flags are set */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); #define VM_USERCHANGE (PSL_USERCHANGE | PSL_RF) #define VME_USERCHANGE (VM_USERCHANGE | PSL_VIP | PSL_VIF) if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. 
* Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { #ifdef DEBUG printf("sigreturn: eflags = 0x%x\n", eflags); #endif return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(scp->sc_cs)) { #ifdef DEBUG printf("sigreturn: cs = 0x%x\n", scp->sc_cs); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; } /* restore scratch registers */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), B_WRITE) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SS_ONSTACK; p->p_sigmask = scp->sc_mask & ~sigcantmask; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; return(EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Turn the power off. */ void cpu_power_down(void) { #if NAPM > 0 apm_power_off(); #endif } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { struct trapframe *regs = p->p_md.md_regs; #ifdef USER_LDT struct pcb *pcb = &p->p_addr->u_pcb; /* was i386_user_cleanup() in NetBSD */ if (pcb->pcb_ldt) { if (pcb == curpcb) lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ldt, pcb->pcb_ldt_len * sizeof(union descriptor)); pcb->pcb_ldt_len = (int)pcb->pcb_ldt = 0; } #endif bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_cs = _ucodesel; /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #if NNPX > 0 /* Initialize the npx (if any) for the current process. 
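 * (Illustrative note, not part of the source: the load_cr0() above sets
 *  CR0_TS together with CR0_MP, so the next floating point or fwait
 *  instruction takes a "device not available" fault; that fault is the
 *  hook that the lazy npx/emulator initialization described above
 *  relies on.)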
*/ npxinit(__INITIAL_NPXCW__); #endif } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int currentldt; int _default_ldt; #ifdef SMP union descriptor gdt[NGDT + NCPU]; /* global descriptor table */ #else union descriptor gdt[NGDT]; /* global descriptor table */ #endif struct gate_descriptor idt[NIDT]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif #ifdef SMP extern struct i386tss common_tss; /* One tss per cpu */ #ifdef VM86 extern struct segment_descriptor common_tssd; extern int private_tss; extern u_int my_tr; #endif /* VM86 */ #else struct i386tss common_tss; #ifdef VM86 struct segment_descriptor common_tssd; u_int private_tss; /* flag indicating private tss */ u_int my_tr; /* which task register setting */ #endif /* VM86 */ #endif #if defined(I586_CPU) && !defined(NO_F00F_HACK) struct gate_descriptor *t_idt; extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[ #ifdef SMP NGDT + NCPU #endif ] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 3 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 4 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 5 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* 
length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { (int) &common_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 7 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 8 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 9 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 10 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void 
setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void init386(first) int first; { int x; unsigned biosbasemem, biosextmem; struct gate_descriptor *gdp; int gsel_tss; struct isa_device *idp; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int pagesinbase, pagesinext; int target_page, pa_indx; int off; int speculative_mprobe; /* * Prevent lowering of the ipl if we call tsleep() early. */ safepri = cpl; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; /* * Initialize the console before we print anything out. */ cninit(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef BDE_DEBUGGER #define NGDT1 8 /* avoid overwriting db entries with APM ones */ #else #define NGDT1 (sizeof gdt_segs / sizeof gdt_segs[0]) #endif for (x = 0; x < NGDT1; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); #ifdef VM86 common_tssd = gdt[GPROC0_SEL].sd; #endif /* VM86 */ #ifdef SMP /* * Spin these up now. init_secondary() grabs them. We could use * #for(x,y,z) / #endfor cpp directives if they existed. */ for (x = 0; x < NCPU; x++) { gdt_segs[NGDT + x] = gdt_segs[GPROC0_SEL]; ssdtosd(&gdt_segs[NGDT + x], &gdt[NGDT + x].sd); } #endif /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
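 * (Illustrative arithmetic, not part of the source: with page
 *  granularity a descriptor limit is an inclusive page count, so the
 *  i386_btop(addr) - 1 values assigned below cover bytes 0 .. addr-1,
 *  i.e. everything strictly below addr.)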
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); r_gdt.rd_limit = sizeof(gdt) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); r_idt.rd_limit = sizeof(idt) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); currentldt = _default_ldt; #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* Use BIOS values stored in RTC CMOS RAM, since probing * breaks certain 386 AT relics. */ biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8); biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8); /* * If BIOS tells us that it has more than 640k in the basemem, * don't believe it - set it to 640k. */ if (biosbasemem > 640) { printf("Preposterous RTC basemem of %dK, truncating to 640K\n", biosbasemem); biosbasemem = 640; } if (bootinfo.bi_memsizes_valid && bootinfo.bi_basemem > 640) { printf("Preposterous BIOS basemem of %dK, truncating to 640K\n", bootinfo.bi_basemem); bootinfo.bi_basemem = 640; } /* * Warn if the official BIOS interface disagrees with the RTC * interface used above about the amount of base memory or the * amount of extended memory. Prefer the BIOS value for the base * memory. This is necessary for machines that `steal' base * memory for use as BIOS memory, at least if we are going to use * the BIOS for apm. Prefer the RTC value for extended memory. 
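 * (Illustrative arithmetic, not part of the source: the RTC values read
 *  above arrive as separate low/high bytes, so for example
 *  RTC_BASELO == 0x80 and RTC_BASEHI == 0x02 combine as
 *  0x80 + (0x02 << 8) == 0x280 == 640K of base memory.)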
* Eventually the hackish interface shouldn't even be looked at. */ if (bootinfo.bi_memsizes_valid) { if (bootinfo.bi_basemem != biosbasemem) { vm_offset_t pa; printf( "BIOS basemem (%ldK) != RTC basemem (%dK), setting to BIOS value\n", bootinfo.bi_basemem, biosbasemem); biosbasemem = bootinfo.bi_basemem; /* * XXX if biosbasemem is now < 640, there is `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from 0 to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(biosbasemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { unsigned *pte; pte = (unsigned *)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } } if (bootinfo.bi_extmem != biosextmem) printf("BIOS extmem (%ldK) != RTC extmem (%dK)\n", bootinfo.bi_extmem, biosextmem); } #ifdef SMP /* make hole for AP bootstrap code */ pagesinbase = mp_bootaddress(biosbasemem) / PAGE_SIZE; #else pagesinbase = biosbasemem * 1024 / PAGE_SIZE; #endif pagesinext = biosextmem * 1024 / PAGE_SIZE; /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. */ /* * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((pagesinext > 3840) && (pagesinext < 4096)) pagesinext = 3840; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = pagesinext + 0x100000/PAGE_SIZE; /* * Indicate that we wish to do a speculative search for memory beyond * the end of the reported size if the indicated amount is 64MB (0x4000 * pages) - which is the largest amount that the BIOS/bootblocks can * currently report. If a specific amount of memory is indicated via * the MAXMEM option or the npx0 "msize", then don't do the speculative * memory probe. */ if (Maxmem >= 0x4000) speculative_mprobe = TRUE; else speculative_mprobe = FALSE; #ifdef MAXMEM Maxmem = MAXMEM/4; speculative_mprobe = FALSE; #endif #if NNPX > 0 idp = find_isadev(isa_devtab_null, &npxdriver, 0); if (idp != NULL && idp->id_msize != 0) { Maxmem = idp->id_msize / 4; speculative_mprobe = FALSE; } #endif #ifdef SMP /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap (first, 0); /* * Size up each available chunk of physical memory. */ /* * We currently don't bother testing base memory. * XXX ...but we probably should. 
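 * (Illustrative sketch, not part of the source: the memory test loop
 *  below boils down to, for each candidate page mapped at CADDR1,
 *
 *      static const int patterns[] =
 *          { 0xaaaaaaaa, 0x55555555, 0xffffffff, 0x0 };
 *      int i, page_bad = FALSE;
 *      for (i = 0; i < 4; i++) {
 *          *(volatile int *)CADDR1 = patterns[i];
 *          if (*(volatile int *)CADDR1 != patterns[i])
 *              page_bad = TRUE;
 *      }
 *
 *  with the original word saved beforehand and restored afterwards, and
 *  phys_avail[] / Maxmem updated according to the result.)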
*/ pa_indx = 0; if (pagesinbase > 1) { phys_avail[pa_indx++] = PAGE_SIZE; /* skip first page of memory */ phys_avail[pa_indx] = ptoa(pagesinbase);/* memory up to the ISA hole */ physmem = pagesinbase - 1; } else { /* point at first chunk end */ pa_indx++; } for (target_page = avail_start; target_page < ptoa(Maxmem); target_page += PAGE_SIZE) { int tmp, page_bad; page_bad = FALSE; /* * map page into kernel: valid, read/write, non-cacheable */ *(int *)CMAP1 = PG_V | PG_RW | PG_N | target_page; invltlb(); tmp = *(int *)CADDR1; /* * Test for alternating 1's and 0's */ *(volatile int *)CADDR1 = 0xaaaaaaaa; if (*(volatile int *)CADDR1 != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)CADDR1 = 0x55555555; if (*(volatile int *)CADDR1 != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)CADDR1 = 0xffffffff; if (*(volatile int *)CADDR1 != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)CADDR1 = 0x0; if (*(volatile int *)CADDR1 != 0x0) { /* * test of page failed */ page_bad = TRUE; } /* * Restore original value. */ *(int *)CADDR1 = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == FALSE) { /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == target_page) { phys_avail[pa_indx] += PAGE_SIZE; if (speculative_mprobe == TRUE && phys_avail[pa_indx] >= (64*1024*1024)) Maxmem++; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = target_page; /* start */ phys_avail[pa_indx] = target_page + PAGE_SIZE; /* end */ } physmem++; } } *(int *)CMAP1 = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(sizeof(struct msgbuf)) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(sizeof(struct msgbuf)); avail_end = phys_avail[pa_indx]; /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. */ for (off = 0; off < round_page(sizeof(struct msgbuf)); off += PAGE_SIZE) pmap_enter(kernel_pmap, (vm_offset_t)msgbufp + off, avail_end + off, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* make an initial tss so cpu can get interrupt stack on syscall! 
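 * (Illustrative note, not part of the source: tss_esp0 set below is the
 *  kernel stack pointer the CPU loads on a user-to-kernel transition,
 *  here the top of proc0's UPAGES; the extra 16 bytes reserved in the
 *  VM86 case are presumably room for the four additional segment
 *  registers carried by struct trapframe_vm86. That last point is an
 *  inference, not something the source states.)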
*/ #ifdef VM86 common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; #else common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE; #endif /* VM86 */ common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; common_tss.tss_ioopt = (sizeof common_tss) << 16; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); #ifdef VM86 private_tss = 0; my_tr = GPROC0_SEL; #endif dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_fs = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; proc0.p_addr->u_pcb.pcb_mpnest = 1; proc0.p_addr->u_pcb.pcb_ext = 0; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) void f00f_hack(void); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); void f00f_hack(void) { struct region_descriptor r_idt; unsigned char *tmp; int i; if (!has_f00f_bug) return; printf("Intel Pentium F00F detected, installing workaround\n"); r_idt.rd_limit = sizeof(idt) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ t_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, t_idt, sizeof(idt)); r_idt.rd_base = (int)t_idt; lidt(&r_idt); if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned int addr; { p->p_md.md_regs->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; int data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. 
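 * (Illustrative note, not part of the source: the stores below are
 *  int-sized, so the last acceptable offset into a region of size S
 *  starting at `min' is min + S - sizeof(int); the check
 *      off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)
 *  therefore guarantees the 4-byte write stays entirely inside the
 *  trapframe, and likewise for the pcb_savefpu area.)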
*/ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_fs = pcb->pcb_fs; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_fs = regs->r_fs; pcb->pcb_gs = regs->r_gs; return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. 
* * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 31708) +++ head/sys/i386/i386/pmap.c (revision 31709) @@ -1,3374 +1,3399 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.172 1997/11/07 19:58:34 tegge Exp $ + * $Id: pmap.c,v 1.173 1997/11/20 19:30:31 bde Exp $ */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. 
These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define PTPHINT /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; #define pa_index(pa) atop((pa) - vm_first_phys) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) static struct pmap kernel_pmap_store; pmap_t kernel_pmap; extern pd_entry_t my_idlePTD; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
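 * (Illustrative note, not part of the source: vm_first_phys, declared
 *  just below, is the base that the pa_index()/pa_to_pvh() macros above
 *  subtract off; pmap_init() later sets it to phys_avail[0], so
 *  pv_table[0] corresponds to the first page of managed memory.)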
*/ static vm_offset_t vm_first_phys; int pgeflag; /* PG_G or-in */ int pseflag; /* PG_PS or-in */ int pv_npg; int nkpt; vm_offset_t kernel_vm_end; /* * Data for the pv entry allocation mechanism */ vm_zone_t pvzone; struct vm_zone pvzone_store; struct vm_object pvzone_obj; int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; int pmap_pagedaemon_waken = 0; struct pv_entry *pvinit; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *ptmmap; static pv_table_t *pv_table; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp=0; #ifdef SMP extern char prv_CPAGE1[], prv_CPAGE2[], prv_CPAGE3[]; extern pt_entry_t *prv_CMAP1, *prv_CMAP2, *prv_CMAP3; extern pd_entry_t *IdlePTDS[]; extern pt_entry_t SMP_prvpt[]; #endif pt_entry_t *PMAP1 = 0; unsigned *PADDR1 = 0; static PMAP_INLINE void free_pv_entry __P((pv_entry_t pv)); static unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static PMAP_INLINE int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_page_t mpte)); static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv, vm_offset_t va)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa)); static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex)); static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); static vm_page_t pmap_page_alloc __P((vm_object_t object, vm_pindex_t pindex)); static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex)); static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); vm_offset_t pmap_kmem_choose(vm_offset_t addr) ; void pmap_collect(void); #define PDSTACKMAX 6 static vm_offset_t pdstack[PDSTACKMAX]; static int pdstackptr; unsigned pdir4mb; /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. */ PMAP_INLINE unsigned * pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned *pdeaddr; if (pmap) { pdeaddr = (unsigned *) pmap_pde(pmap, va); if (*pdeaddr & PG_PS) return pdeaddr; if (*pdeaddr) { return get_ptbase(pmap) + i386_btop(va); } } return (0); } /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) { newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); } #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. 
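 * (Illustrative arithmetic, not part of the source: pmap_kmem_choose()
 *  above rounds up to the next 4MB superpage boundary with
 *      newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
 *  e.g. with NBPDR == 4MB, addr == 0xc0123456 becomes 0xc0400000.)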
* [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; int i, j; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); kernel_pmap->pm_count = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(sizeof(struct msgbuf)))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = 0; *(int *) PTD = 0; pgeflag = 0; #if !defined(SMP) if (cpu_feature & CPUID_PGE) { pgeflag = PG_G; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. */ pdir4mb = 0; #if !defined(DISABLE_PSE) if (cpu_feature & CPUID_PSE) { unsigned ptditmp; /* * Enable the PSE mode */ load_cr4(rcr4() | CR4_PSE); /* * Note that we have enabled PSE mode */ pseflag = PG_PS; ptditmp = (unsigned) kernel_pmap->pm_pdir[KPTDI]; ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; /* * We can do the mapping here for the single processor * case. We simply ignore the old page table page from * now on. */ #if !defined(SMP) PTD[KPTDI] = (pd_entry_t) ptditmp; kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; invltlb(); #endif } #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic!"); /* 0 = private page */ /* 1 = page table page */ /* 2 = local apic */ /* 16-31 = io apics */ SMP_prvpt[2] = (pt_entry_t)(PG_V | PG_RW | pgeflag | ((u_long)cpu_apic_address & PG_FRAME)); for (i = 0; i < mp_napics; i++) { for (j = 0; j < 16; j++) { /* same page frame as a previous IO apic? 
*/ if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) == ((u_long)io_apic_address[0] & PG_FRAME)) { ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } /* use this slot if available */ if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) == 0) { SMP_prvpt[j + 16] = (pt_entry_t)(PG_V | PG_RW | pgeflag | ((u_long)io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } } if (j == 16) panic("no space to map IO apic %d!", i); } /* BSP does this itself, AP's get it pre-set */ prv_CMAP1 = (pt_entry_t *)&SMP_prvpt[3 + UPAGES]; prv_CMAP2 = (pt_entry_t *)&SMP_prvpt[4 + UPAGES]; prv_CMAP3 = (pt_entry_t *)&SMP_prvpt[5 + UPAGES]; #endif invltlb(); } /* * Set 4mb pdir for mp startup, and global flags */ void pmap_set_opt(unsigned *pdir) { int i; if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); if (pdir4mb) { (unsigned) pdir[KPTDI] = pdir4mb; } } if (pgeflag && (cpu_feature & CPUID_PGE)) { load_cr4(rcr4() | CR4_PGE); for(i = KPTDI; i < KPTDI + nkpt; i++) { if (pdir[i]) { pdir[i] |= PG_G; } } } } /* * Setup the PTD for the boot processor */ void pmap_set_opt_bsp(void) { pmap_set_opt((unsigned *)kernel_pmap->pm_pdir); pmap_set_opt((unsigned *)PTD); invltlb(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t s; int i; int initial_pvs; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); pv_npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ s = (vm_size_t) (sizeof(pv_table_t) * pv_npg); s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_table_t *) addr; for(i = 0; i < pv_npg; i++) { vm_offset_t pa; TAILQ_INIT(&pv_table[i].pv_list); pv_table[i].pv_list_count = 0; pa = vm_first_phys + i * PAGE_SIZE; pv_table[i].pv_vm_page = PHYS_TO_VM_PAGE(pa); } /* * init the pv free list */ initial_pvs = pv_npg; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = &pvzone_store; pvinit = (struct pv_entry *) kmem_alloc(kernel_map, initial_pvs * sizeof (struct pv_entry)); zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, pv_npg); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg; pv_entry_high_water = 9 * (pv_entry_max / 10); zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. 
* This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified( vm_offset_t va) { if ((va < clean_sva) || (va >= clean_eva)) return 1; else return 0; } static PMAP_INLINE void invltlb_1pg( vm_offset_t va) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va); } } static PMAP_INLINE void invltlb_2pg( vm_offset_t va1, vm_offset_t va2) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va1); invlpg(va2); } } static unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); invltlb(); } return (unsigned *) APTmap; } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. */ static unsigned * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned pde, newpf; if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; unsigned index = i386_btop(va); /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == (((unsigned) PTDpde) & PG_FRAME))) { return (unsigned *) PTmap + index; } newpf = pde & PG_FRAME; if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; invltlb_1pg((vm_offset_t) PADDR1); } return PADDR1 + ((unsigned) index & (NPTEPG - 1)); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t rtval; vm_offset_t pdirindex; pdirindex = va >> PDRSHIFT; if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) { unsigned *pte; if ((rtval & PG_PS) != 0) { rtval &= ~(NBPDR - 1); rtval |= va & (NBPDR - 1); return rtval; } pte = get_ptbase(pmap) + i386_btop(va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /* * determine if a page is managed (memory vs. device) */ static PMAP_INLINE int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa < phys_avail[i + 1] && pa >= phys_avail[i]) return 1; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. 
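 * (Illustrative usage, not part of the source: a caller holding an
 *  array of wired pages and a reserved kva range would typically do
 *
 *      pmap_qenter(kva, m, npages);
 *      ... access the pages through kva ...
 *      pmap_qremove(kva, npages);
 *
 *  where kva, m and npages are placeholder names.)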
*/ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | pgeflag; unsigned opte; pte = (unsigned *)vtopte(tva); opte = *pte; *pte = npte; if (opte) invltlb_1pg(tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ PMAP_INLINE void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V | pgeflag; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; if (opte) invltlb_1pg(va); } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); } static vm_page_t pmap_page_alloc(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; m = vm_page_alloc(object, pindex, VM_ALLOC_ZERO); if (m == NULL) { VM_WAIT; } return m; } static vm_page_t pmap_page_lookup(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m) { if (m->flags & PG_BUSY) { m->flags |= PG_WANTED; tsleep(m, PVM, "pplookp", 0); goto retry; } } return m; } /* * Create the UPAGES for a new process. * This routine directly affects the fork perf for a process. */ void pmap_new_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; struct user *up; unsigned *ptek; /* * allocate object for the upages */ upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES); p->p_upages_obj = upobj; /* get a kernel virtual address for the UPAGES for this proc */ up = (struct user *) kmem_alloc_pageable(u_map, UPAGES * PAGE_SIZE); if (up == NULL) panic("pmap_new_proc: u_map allocation failed"); ptek = (unsigned *) vtopte((vm_offset_t) up); for(i=0;iwire_count++; ++cnt.v_wire_count; /* * Enter the page into the kernel address space. */ *(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; m->flags &= ~(PG_ZERO|PG_BUSY); m->flags |= PG_MAPPED|PG_WRITEABLE; m->valid = VM_PAGE_BITS_ALL; } p->p_addr = up; } /* * Dispose the UPAGES for a process that has exited. * This routine directly impacts the exit perf of a process. */ void pmap_dispose_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; unsigned *ptek; ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr); upobj = p->p_upages_obj; for(i=0;ip_addr + i * PAGE_SIZE); vm_page_unwire(m); vm_page_free(m); } vm_object_deallocate(upobj); kmem_free(u_map, (vm_offset_t)p->p_addr, ctob(UPAGES)); } /* * Allow the UPAGES for a process to be prejudicially paged out. */ void pmap_swapout_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; /* * let the upages be paged */ for(i=0;idirty = VM_PAGE_BITS_ALL; vm_page_unwire(m); vm_page_deactivate(m); pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i); } } /* * Bring the UPAGES for a specified process back in. 
*/ void pmap_swapin_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; for(i=0;iflags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "swinuw",0); goto retry; } m->flags |= PG_BUSY; } vm_page_wire(m); splx(s); pmap_kenter(((vm_offset_t) p->p_addr) + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { int rv; rv = vm_pager_get_pages(upobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid); m->valid = VM_PAGE_BITS_ALL; } PAGE_WAKEUP(m); m->flags |= PG_MAPPED|PG_WRITEABLE; } } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { int s; if (m->flags & PG_BUSY) { s = splvm(); while (m->flags & PG_BUSY) { m->flags |= PG_WANTED; tsleep(m, PVM, "pmuwpt", 0); } splx(s); } if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); invltlb_1pg(pteva); } #if defined(PTPHINT) if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; #endif /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { if (m->flags & PG_WANTED) { m->flags &= ~PG_WANTED; wakeup(m); } vm_page_free_zero(m); --cnt.v_wire_count; } return 1; } return 0; } __inline static int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap, va, mpte) pmap_t pmap; vm_offset_t va; vm_page_t mpte; { unsigned ptepindex; if (va >= UPT_MIN_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } #else mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif } return pmap_unwire_pte_hold(pmap, mpte); } +#if !defined(SMP) +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + pmap->pm_pdir = + (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); + pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); + pmap->pm_flags = 0; + pmap->pm_count = 1; + pmap->pm_ptphint = NULL; + TAILQ_INIT(&pmap->pm_pvlist); +} +#else +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + pmap_pinit(pmap); +} +#endif + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
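 * (Illustrative note, not part of the source: pdstack[] acts as a small
 *  LIFO cache of up to PDSTACKMAX page directory KVAs; pmap_release()
 *  pushes pm_pdir back onto it, so most pmap_pinit()/pmap_release()
 *  pairs avoid a kmem_alloc_pageable()/kmem_free() round trip.)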
*/ if (pdstackptr > 0) { --pdstackptr; pmap->pm_pdir = (pd_entry_t *)pdstack[pdstackptr]; } else { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); } /* * allocate object for the ptes */ pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ retry: ptdpg = pmap_page_alloc( pmap->pm_pteobj, PTDPTDI); if (ptdpg == NULL) goto retry; ptdpg->wire_count = 1; ++cnt.v_wire_count; ptdpg->flags &= ~(PG_MAPPED|PG_BUSY); /* not mapped normally */ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); /* wire in kernel global address entries */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW; pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { int s; unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ s = splvm(); if (p->flags & PG_BUSY) { p->flags |= PG_WANTED; tsleep(p, PVM, "pmaprl", 0); splx(s); return 0; } if (p->flags & PG_WANTED) { p->flags &= ~PG_WANTED; wakeup(p); } /* * Remove the page table page from the processes address space. */ pde[p->pindex] = 0; --pmap->pm_stats.resident_count; if (p->hold_count) { panic("pmap_release: freeing held page table page"); } /* * Page directory pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); #ifdef SMP pde[MPPTDI] = 0; #endif pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) pmap->pm_ptphint = NULL; #endif vm_page_free_zero(p); splx(s); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_offset_t pteva, ptepa; vm_page_t m; int needszero = 0; /* * Find or fabricate a new pagetable page */ retry: m = vm_page_lookup(pmap->pm_pteobj, ptepindex); if (m == NULL) { m = pmap_page_alloc(pmap->pm_pteobj, ptepindex); if (m == NULL) goto retry; if ((m->flags & PG_ZERO) == 0) needszero = 1; m->flags &= ~(PG_ZERO|PG_BUSY); m->valid = VM_PAGE_BITS_ALL; } else { if ((m->flags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "ptewai", 0); goto retry; } } if (m->queue != PQ_NONE) { int s = splvm(); vm_page_unqueue(m); splx(s); } if (m->wire_count == 0) ++cnt.v_wire_count; ++m->wire_count; /* * Increment the hold count for the page table page * (denoting a new mapping.) */ ++m->hold_count; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V); #if defined(PTPHINT) /* * Set the page table hint */ pmap->pm_ptphint = m; #endif /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. 
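pmap_pinit() above installs a self-referential entry at slot PTDPTDI of the new page directory. The practical effect, roughly, is that all of the pmap's PTEs become visible through a fixed 4 MB virtual window, which is what vtopte() exploits elsewhere in this file. The arithmetic below is a standalone illustration; SELF_SLOT and PTMAP_BASE are example values, not the real constants.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PDRSHIFT   22
#define SELF_SLOT  0x3bfu                    /* example self-map slot index */
#define PTMAP_BASE (SELF_SLOT << PDRSHIFT)   /* VA where the PTEs show up */

static unsigned vtopte_sim(unsigned va)
{
	/* one 4-byte PTE per 4 KB page, laid out linearly inside the window */
	return PTMAP_BASE + (va >> PAGE_SHIFT) * 4;
}

int main(void)
{
	printf("PTE for 0x00400000 sits at 0x%08x\n", vtopte_sim(0x00400000u));
	printf("PTE for 0x00401000 sits at 0x%08x\n", vtopte_sim(0x00401000u));
	return 0;
}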
*/ if (needszero) { if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(ptepa); } } m->valid = VM_PAGE_BITS_ALL; m->flags |= PG_MAPPED; return m; } static vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { unsigned ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; invltlb(); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { #if defined(PTPHINT) /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = m; } #else m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif ++m->hold_count; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif ptdpg = NULL; retry: for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } if (!pmap_release_free_page(pmap, p)) goto retry; } if (ptdpg && !pmap_release_free_page(pmap, ptdpg)) goto retry; vm_object_deallocate(object); if (pdstackptr < PDSTACKMAX) { pdstack[pdstackptr] = (vm_offset_t) pmap->pm_pdir; ++pdstackptr; } else { int pdstmp = pdstackptr - 1; kmem_free(kernel_map, pdstack[pdstmp], PAGE_SIZE); pdstack[pdstmp] = (vm_offset_t) pmap->pm_pdir; } pmap->pm_pdir = 0; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; vm_offset_t ptpkva, ptppaddr; vm_page_t nkpg; #ifdef SMP int i; #endif pd_entry_t newpdir; vm_pindex_t ptpidx; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; ptpkva = (vm_offset_t) vtopte(addr); ptpidx = (ptpkva >> PAGE_SHIFT); /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(kernel_object, ptpidx, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(ptppaddr); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW); pdir_pde(PTD, kernel_vm_end) = newpdir; #ifdef 
SMP for (i = 0; i < mp_ncpus; i++) { if (IdlePTDS[i]) pdir_pde(IdlePTDS[i], kernel_vm_end) = newpdir; } #endif for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = newpdir; } } *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); panic("destroying a pmap is not yet implemented"); /* free((caddr_t) pmap, M_VMPMAP); */ } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static inline void free_pv_entry(pv) pv_entry_t pv; { pv_entry_count--; zfreei(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return zalloci(pvzone); } /* * This routine is very drastic, but can save the system * in a pinch. */ void pmap_collect() { pv_table_t *ppv; int i; vm_offset_t pa; vm_page_t m; static int warningdone=0; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } for(i = 0; i < pv_npg; i++) { if ((ppv = &pv_table[i]) == 0) continue; m = ppv->pv_vm_page; if ((pa = VM_PAGE_TO_PHYS(m)) == 0) continue; if (m->wire_count || m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; pmap_remove_all(pa); } pmap_pagedaemon_waken = 0; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap, ppv, va) struct pmap *pmap; pv_table_t *ppv; vm_offset_t va; { pv_entry_t pv; int rtval; int s; s = splvm(); if (ppv->pv_list_count < pmap->pm_stats.resident_count) { for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = TAILQ_NEXT(pv, pv_plist)) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); --ppv->pv_list_count; if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); } TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). 
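pmap_remove_entry() above searches whichever pv list is likely to be shorter: the per-page list when the page has few mappings, or the per-pmap list when the pmap has few resident pages. A toy restatement of that choice (the structure is invented for the example):

#include <stdio.h>

struct lookup_stats {
	int per_page_len;   /* mappings of this physical page (pv_list_count) */
	int per_pmap_len;   /* resident pages of this pmap (resident_count)   */
};

static const char *pick_list(const struct lookup_stats *s)
{
	return (s->per_page_len < s->per_pmap_len) ? "per-page pv list"
	                                           : "per-pmap pv list";
}

int main(void)
{
	struct lookup_stats shared_pg  = { .per_page_len = 200, .per_pmap_len = 30 };
	struct lookup_stats private_pg = { .per_page_len = 1,   .per_pmap_len = 5000 };

	printf("heavily shared page: search the %s\n", pick_list(&shared_pg));
	printf("private page:        search the %s\n", pick_list(&private_pg));
	return 0;
}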
*/ static void pmap_insert_entry(pmap, va, mpte, pa) pmap_t pmap; vm_offset_t va; vm_page_t mpte; vm_offset_t pa; { int s; pv_entry_t pv; pv_table_t *ppv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); ppv = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); ++ppv->pv_list_count; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; pv_table_t *ppv; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) invlpg(va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { ppv = pa_to_pvh(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf("pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, (int) oldpte); } #endif if (pmap_track_modified(va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! */ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); invltlb_1pg(va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (((sva + PAGE_SIZE) == eva) && (((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; /* * Get a local virtual address for the mappings that are being * worked with. */ ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); if (pmap->pm_stats.resident_count == 0) break; pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
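The outer loop of pmap_remove() above walks the range one page-table page at a time: pdnxt rounds the current page index up to the next 1024-entry boundary and is then clipped to the end of the range. A standalone illustration of that stepping:

#include <stdio.h>

#define NPTEPG 1024u

int main(void)
{
	unsigned sindex = 3000;   /* first page index in the range */
	unsigned eindex = 9000;   /* one past the last page index  */

	while (sindex < eindex) {
		unsigned pdnxt = (sindex + NPTEPG) & ~(NPTEPG - 1);

		if (pdnxt > eindex)
			pdnxt = eindex;
		printf("scan PTEs [%u, %u) in PT page %u\n",
		    sindex, pdnxt, sindex / NPTEPG);
		sindex = pdnxt;
	}
	return 0;
}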
*/ if (pdnxt > eindex) { pdnxt = eindex; } for ( ;sindex != pdnxt; sindex++) { vm_offset_t va; if (ptbase[sindex] == 0) { continue; } va = i386_ptob(sindex); anyvalid++; if (pmap_remove_pte(pmap, ptbase + sindex, va)) break; } } if (anyvalid) { invltlb(); } } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte, tpte; int nmodify; int update_needed; int s; nmodify = 0; update_needed = 0; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_is_managed(pa)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", pa); } #endif s = splvm(); ppv = pa_to_pvh(pa); while ((pv = TAILQ_FIRST(&ppv->pv_list)) != NULL) { pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); pv->pv_pmap->pm_stats.resident_count--; tpte = *pte; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf("pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (!update_needed && ((!curproc || (&curproc->p_vmspace->vm_pmap == pv->pv_pmap)) || (pv->pv_pmap == kernel_pmap))) { update_needed = 1; } TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); --ppv->pv_list_count; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); if (update_needed) invltlb(); splx(s); return; } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } anychanged = 0; ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { (unsigned) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. 
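Both pmap_remove() and pmap_remove_all() above batch their TLB work: they note whether any entries were actually changed (anyvalid, update_needed) and issue one global invltlb() at the end rather than a flush per PTE. A toy model of that batching:

#include <stdio.h>

static int flushes;                    /* simulated invltlb() counter */

static void invltlb_sim(void) { flushes++; }

static void clear_range(int *ptes, int n)
{
	int anyvalid = 0;

	for (int i = 0; i < n; i++) {
		if (ptes[i] == 0)
			continue;              /* nothing mapped here */
		ptes[i] = 0;
		anyvalid++;
	}
	if (anyvalid)                      /* one flush covers every change */
		invltlb_sim();
}

int main(void)
{
	int ptes[8] = { 1, 0, 1, 1, 0, 0, 1, 0 };

	clear_range(ptes, 8);
	printf("TLB flushes issued: %d\n", flushes);   /* prints 1 */
	return 0;
}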
*/ if (ptpaddr == 0) continue; if (pdnxt > eindex) { pdnxt = eindex; } for (; sindex != pdnxt; sindex++) { unsigned pbits = ptbase[sindex]; if (prot & VM_PROT_WRITE) { if ((pbits & (PG_RW|PG_V)) == PG_V) { if (pbits & PG_MANAGED) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->flags |= PG_WRITEABLE; m->object->flags |= OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY; } ptbase[sindex] = pbits | PG_RW; anychanged = 1; } } else if (pbits & PG_RW) { if (pbits & PG_M) { vm_offset_t sva1 = i386_ptob(sindex); if ((pbits & PG_MANAGED) && pmap_track_modified(sva1)) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->dirty = VM_PAGE_BITS_ALL; } } ptbase[sindex] = pbits & ~(PG_M|PG_RW); anychanged = 1; } } } if (anychanged) invltlb(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_prot_t prot, boolean_t wired) { register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va); if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } if (smp_active) { pdeaddr = (vm_offset_t *) IdlePTDS[cpuid]; if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) { if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr)) printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr); printf("cpuid: %d, pdeaddr: 0x%x\n", cpuid, pdeaddr); panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], newpte, origpte, va); } } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); } origpte = *(vm_offset_t *)pte; pa &= PG_FRAME; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf("pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, origpte); } #endif /* * We might be turning off write access to the page, * so we go ahead and sense modify status. 
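When pmap_protect() above revokes write access, a set PG_M bit is first folded into the page's dirty state so the modification is not lost, and PG_M and PG_RW are then cleared together. The same rule in a standalone sketch (the dirty flag stands in for m->dirty):

#include <stdio.h>

#define PG_RW 0x002u
#define PG_M  0x040u

static unsigned write_protect(unsigned pte, int *dirty)
{
	if (pte & PG_RW) {
		if (pte & PG_M)
			*dirty = 1;              /* remember the page was modified */
		pte &= ~(PG_M | PG_RW);      /* drop write access and the mod bit */
	}
	return pte;
}

int main(void)
{
	int dirty = 0;
	unsigned pte = 0x123000u | PG_RW | PG_M;

	pte = write_protect(pte, &dirty);
	printf("pte=%#x dirty=%d\n", pte, dirty);   /* write bits gone, dirty=1 */
	return 0;
}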
*/ if (origpte & PG_MANAGED) { vm_page_t m; if (origpte & PG_M) { if (pmap_track_modified(va)) { m = PHYS_TO_VM_PAGE(pa); m->dirty = VM_PAGE_BITS_ALL; } } pa |= PG_MANAGED; } if (mpte) --mpte->hold_count; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { pmap_insert_entry(pmap, va, mpte, pa); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte; if (origpte) invltlb_1pg(va); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap, va, pa, mpte) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_page_t mpte; { register unsigned *pte; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { unsigned ptepindex; vm_offset_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { ++mpte->hold_count; } else { retry: /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } #else mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif if (mpte == NULL) goto retry; ++mpte->hold_count; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); return 0; } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pmap_insert_entry(pmap, va, mpte, pa); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. 
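The validate: path of pmap_enter() above rebuilds the PTE from scratch and only rewrites it when something other than the hardware-maintained PG_A/PG_M bits differs, which avoids a needless TLB invalidation for pure wiring or protection no-ops. A sketch of that comparison; the helper and its flags are invented:

#include <stdio.h>

#define PG_V  0x001u
#define PG_RW 0x002u
#define PG_U  0x004u
#define PG_A  0x020u
#define PG_M  0x040u
#define PG_G  0x100u    /* what pgeflag contributes for kernel mappings */
#define PG_W  0x200u    /* software "wired" bit */

static unsigned build_pte(unsigned pa, unsigned prot, int wired,
                          int is_user, int is_kernel)
{
	unsigned pte = pa | prot | PG_V;

	if (wired)     pte |= PG_W;
	if (is_user)   pte |= PG_U;
	if (is_kernel) pte |= PG_G;
	return pte;
}

int main(void)
{
	unsigned origpte = 0x123000u | PG_RW | PG_V | PG_U | PG_A | PG_M;
	unsigned newpte  = build_pte(0x123000u, PG_RW, 0, 1, 0);

	/* identical apart from PG_A/PG_M: no PTE write, no TLB flush needed */
	printf("update needed: %d\n", (origpte & ~(PG_M | PG_A)) != newpte);
	return 0;
}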
*/ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; int objpgs; if (!pmap) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0) ) { int i; int s; vm_page_t m[1]; unsigned int ptepindex; int npdes; vm_offset_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) return; s = splhigh(); retry: p = vm_page_lookup(object, pindex); if (p && (p->flags & PG_BUSY)) { tsleep(p, PVM, "init4p", 0); goto retry; } splx(s); if (p == NULL) { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { PAGE_WAKEUP(p); vm_page_free(p); return; } p = vm_page_lookup(object, pindex); PAGE_WAKEUP(p); } ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i=0;ipm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } p->flags |= PG_MAPPED; invltlb(); return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) psize = object->size - pindex; mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. 
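For large, properly aligned device objects, pmap_object_init_pt() above installs 4 MB PG_PS page-directory entries instead of individual 4 KB mappings. The eligibility arithmetic, shown standalone with example numbers:

#include <stdio.h>

#define PDRSHIFT 22
#define NBPDR    (1u << PDRSHIFT)    /* 4 MB */

int main(void)
{
	unsigned size  = 64u << 20;      /* say, a 64 MB device object  */
	unsigned ptepa = 0x40000000u;    /* physical base of the object */

	if (ptepa & (NBPDR - 1)) {       /* must be 4 MB aligned for PG_PS */
		printf("not superpage aligned, fall back to 4 KB mappings\n");
		return 0;
	}
	printf("map with %u PG_PS entries of %u MB each\n",
	    size >> PDRSHIFT, NBPDR >> 20);
	return 0;
}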
*/ #define PFBAK 2 #define PFFOR 2 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry, object) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; vm_object_t object; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; if (entry->object.vm_object != object) return; if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) return; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } m->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); m->flags |= PG_MAPPED; PAGE_WAKEUP(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
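pmap_prefault() above probes neighbours of the faulting address in the fixed order -1, +1, -2, +2 pages and skips candidates that fall outside the map entry. The candidate walk, restated as a small standalone program:

#include <stdio.h>

#define PAGE_SIZE 4096

static const int pageorder[] = {
	-PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE
};

int main(void)
{
	unsigned addra = 0x08050000u;    /* faulting address      */
	unsigned start = 0x0804f000u;    /* clipped entry->start  */
	unsigned end   = 0x08052000u;    /* entry->end            */

	for (unsigned i = 0; i < sizeof(pageorder) / sizeof(pageorder[0]); i++) {
		unsigned addr = addra + pageorder[i];

		if (addr < start || addr >= end)
			continue;                /* outside the mapping: skip */
		printf("candidate prefault at %#x\n", addr);
	}
	return 0;
}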
*/ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; unsigned src_frame, dst_frame; if (dst_addr != src_addr) return; src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) { return; } dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); invltlb(); } for(addr = src_addr; addr < end_addr; addr = pdnxt) { unsigned *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; vm_offset_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); ptepindex = addr >> PDRSHIFT; srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = (unsigned *) vtopte(addr); dst_pte = (unsigned *) avtopte(addr); while (addr < pdnxt) { unsigned ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_M|PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, (ptetemp & PG_FRAME)); } else { pmap_unwire_pte_hold(dst_pmap, dstmpte); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; ++src_pte; ++dst_pte; } } } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { #ifdef SMP if (*(int *) prv_CMAP3) panic("pmap_zero_page: prv_CMAP3 busy"); *(int *) prv_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME); invltlb_1pg((vm_offset_t) &prv_CPAGE3); bzero(&prv_CPAGE3, PAGE_SIZE); *(int *) prv_CMAP3 = 0; invltlb_1pg((vm_offset_t) &prv_CPAGE3); #else if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME); bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; invltlb_1pg((vm_offset_t) CADDR2); #endif } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
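pmap_zero_page() above borrows a reserved PTE slot (CMAP2/CADDR2, or the per-CPU prv_CMAP3 on SMP) to map the target frame just long enough to bzero() it, panicking if the slot is already busy. The sketch below mimics only the slot discipline with an ordinary buffer; cmap_slot and scratch_page are invented names.

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static unsigned cmap_slot;                       /* 0 means "free"       */
static unsigned char scratch_page[PAGE_SIZE];    /* stands in for CADDR2 */

static void zero_phys_sim(unsigned pa)
{
	assert(cmap_slot == 0);               /* the panic("CMAP busy") analogue */
	cmap_slot = pa;                       /* "map" the frame                 */
	memset(scratch_page, 0, PAGE_SIZE);   /* bzero(CADDR2, PAGE_SIZE)        */
	cmap_slot = 0;                        /* unmap; would invltlb_1pg() here */
}

int main(void)
{
	scratch_page[100] = 0xff;
	zero_phys_sim(0x123000u);
	printf("byte 100 is now %d\n", scratch_page[100]);
	return 0;
}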
*/ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { #ifdef SMP if (*(int *) prv_CMAP1) panic("pmap_copy_page: prv_CMAP1 busy"); if (*(int *) prv_CMAP2) panic("pmap_copy_page: prv_CMAP2 busy"); *(int *) prv_CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) prv_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2); bcopy(&prv_CPAGE1, &prv_CPAGE2, PAGE_SIZE); *(int *) prv_CMAP1 = 0; *(int *) prv_CMAP2 = 0; invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2); #else if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *) CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; invltlb_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); #endif } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; int s; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { unsigned *pte, tpte; pv_table_t *ppv; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif s = splvm(); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = (unsigned *)vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } *pte = 0; ppv = pa_to_pvh(tpte); pv->pv_pmap->pm_stats.resident_count--; /* * Update the vm_page_t clean and reference bits. 
*/ if (tpte & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); --ppv->pv_list_count; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); invltlb(); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; if (!pmap_is_managed(pa)) return FALSE; ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) return FALSE; s = splvm(); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (bit & (PG_A|PG_M)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (*pte & bit) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte; int changed; int s; if (!pmap_is_managed(pa)) return; s = splvm(); changed = 0; ppv = pa_to_pvh(pa); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (setem) { *(int *)pte |= bit; changed = 1; } else { vm_offset_t pbits = *(vm_offset_t *)pte; if (pbits & bit) { changed = 1; if (bit == PG_RW) { if (pbits & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } *(int *)pte = pbits & ~(PG_M|PG_RW); } else { *(int *)pte = pbits & ~bit; } } } } splx(s); if (changed) invltlb(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_offset_t phys, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(phys, PG_RW, FALSE); } else { pmap_remove_all(phys); } } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. * */ int pmap_ts_referenced(vm_offset_t pa) { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; int rtval = 0; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { splx(s); return 0; } /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. 
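pmap_page_protect() above reduces to two cases: if only write permission is being removed it clears PG_RW on every mapping via pmap_changebit(), and if all access is being removed it calls pmap_remove_all(). A compact restatement of that dispatch:

#include <stdio.h>

#define VM_PROT_READ    0x1
#define VM_PROT_WRITE   0x2
#define VM_PROT_EXECUTE 0x4

static const char *page_protect_action(int prot)
{
	if (prot & VM_PROT_WRITE)
		return "nothing to do";                 /* write still allowed */
	if (prot & (VM_PROT_READ | VM_PROT_EXECUTE))
		return "clear PG_RW on all mappings";   /* pmap_changebit()    */
	return "remove all mappings";               /* pmap_remove_all()   */
}

int main(void)
{
	printf("%s\n", page_protect_action(VM_PROT_READ));
	printf("%s\n", page_protect_action(0));
	return 0;
}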
*/ if (!pmap_track_modified(pv->pv_va)) continue; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte == NULL) { continue; } if (*pte & PG_A) { rtval++; *pte &= ~PG_A; } } splx(s); if (rtval) { invltlb(); } return (rtval); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; unsigned *pte; size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } invltlb(); return ((void *) va); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if (pte = *ptep) { vm_offset_t pa; val = MINCORE_INCORE; pa = pte & PG_FRAME; /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (PHYS_TO_VM_PAGE(pa)->dirty || pmap_is_modified(pa)) val |= MINCORE_MODIFIED_OTHER; /* * Referenced by us */ if (pte & PG_U) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((PHYS_TO_VM_PAGE(pa)->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; PHYS_TO_VM_PAGE(pa)->flags |= PG_REFERENCED; } } return val; } void pmap_activate(struct proc *p) { +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif load_cr3(p->p_addr->u_pcb.pcb_cr3 = vtophys(p->p_vmspace->vm_pmap.pm_pdir)); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = 
&p->p_vmspace->vm_pmap; for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } return npte; } pte = pmap_pte_quick( pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { pv_table_t *ppv; register pv_entry_t pv; printf("pa %x", pa); ppv = pa_to_pvh(pa); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/i386/support.s =================================================================== --- head/sys/i386/i386/support.s (revision 31708) +++ head/sys/i386/i386/support.s (revision 31709) @@ -1,1571 +1,1574 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.56 1997/08/09 00:02:44 dyson Exp $ + * $Id: support.s,v 1.57 1997/09/02 20:05:30 bde Exp $ */ #include "npx.h" #include #include #include #include #include "assym.s" #define KDSEL 0x10 /* kernel data selector */ #define KCSEL 0x8 /* kernel code selector */ #define IDXSHIFT 10 .data .globl _bcopy_vector _bcopy_vector: .long _generic_bcopy .globl _bzero _bzero: .long _generic_bzero .globl _copyin_vector _copyin_vector: .long _generic_copyin .globl _copyout_vector _copyout_vector: .long _generic_copyout .globl _ovbcopy_vector _ovbcopy_vector: .long _generic_bcopy #if defined(I586_CPU) && NNPX > 0 kernel_fpu_lock: .byte 0xfe .space 3 #endif .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(generic_bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret #if defined(I486_CPU) ENTRY(i486_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax /* * do 64 byte chunks first * * XXX this is probably over-unrolled at least for DX2's */ 2: cmpl $64,%ecx jb 3f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) movl %eax,16(%edx) movl %eax,20(%edx) movl %eax,24(%edx) movl %eax,28(%edx) movl %eax,32(%edx) movl %eax,36(%edx) movl %eax,40(%edx) movl %eax,44(%edx) movl %eax,48(%edx) movl %eax,52(%edx) movl %eax,56(%edx) movl %eax,60(%edx) addl $64,%edx subl $64,%ecx jnz 2b ret /* * do 16 byte chunks */ SUPERALIGN_TEXT 3: cmpl $16,%ecx jb 4f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) addl $16,%edx subl $16,%ecx jnz 3b ret /* * do 4 byte chunks */ SUPERALIGN_TEXT 4: cmpl $4,%ecx jb 5f movl %eax,(%edx) addl $4,%edx subl $4,%ecx jnz 4b ret /* * do 1 byte chunks * a jump table seems to be faster than a loop or more range reductions * * XXX need a const section for non-text */ .data jtab: .long do0 .long do1 .long do2 .long do3 .text SUPERALIGN_TEXT 5: jmp jtab(,%ecx,4) SUPERALIGN_TEXT do3: movw %ax,(%edx) movb %al,2(%edx) ret SUPERALIGN_TEXT do2: movw %ax,(%edx) ret SUPERALIGN_TEXT do1: movb %al,(%edx) ret SUPERALIGN_TEXT do0: ret #endif #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx /* * The FPU register method is twice as fast as the integer register * method unless the target is in the L1 cache and we pre-allocate a * cache line for it (then the integer register method is 4-5 times * faster). However, we never pre-allocate cache lines, since that * would make the integer method 25% or more slower for the common * case when the target isn't in either the L1 cache or the L2 cache. * Thus we normally use the FPU register method unless the overhead * would be too large. */ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ jb intreg_i586_bzero /* * The FPU registers may belong to an application or to fastmove() * or to another invocation of bcopy() or ourself in a higher level * interrupt or trap handler. 
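generic_bzero above clears len/4 dwords with rep stosl and then the remaining len&3 bytes with rep stosb. The same word-then-byte split in portable C, for illustration only (bzero_sim is an invented name):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void bzero_sim(void *buf, size_t len)
{
	uint32_t *wp = buf;
	size_t nwords = len >> 2;                  /* shrl $2,%ecx ; rep stosl */
	unsigned char *bp;

	for (size_t i = 0; i < nwords; i++)
		wp[i] = 0;
	bp = (unsigned char *)buf + (nwords << 2);
	for (size_t i = 0; i < (len & 3); i++)     /* andl $3,%ecx ; rep stosb */
		bp[i] = 0;
}

int main(void)
{
	union { uint32_t align; unsigned char b[11]; } u;

	memset(u.b, 0xaa, sizeof(u.b));
	bzero_sim(u.b, sizeof(u.b));
	printf("last byte: %d\n", u.b[10]);        /* prints 0 */
	return 0;
}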
Preserving the registers is * complicated since we avoid it if possible at all levels. We * want to localize the complications even when that increases them. * Here the extra work involves preserving CR0_TS in TS. * `npxproc != NULL' is supposed to be the condition that all the * FPU resources belong to an application, but npxproc and CR0_TS * aren't set atomically enough for this condition to work in * interrupt handlers. * * Case 1: FPU registers belong to the application: we must preserve * the registers if we use them, so we only use the FPU register * method if the target size is large enough to amortize the extra * overhead for preserving them. CR0_TS must be preserved although * it is very likely to end up as set. * * Case 2: FPU registers belong to fastmove(): fastmove() currently * makes the registers look like they belong to an application so * that cpu_switch() and savectx() don't have to know about it, so * this case reduces to case 1. * * Case 3: FPU registers belong to the kernel: don't use the FPU * register method. This case is unlikely, and supporting it would * be more complicated and might take too much stack. * * Case 4: FPU registers don't belong to anyone: the FPU registers * don't need to be preserved, so we always use the FPU register * method. CR0_TS must be preserved although it is very likely to * always end up as clear. */ cmpl $0,_npxproc je i586_bz1 cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ jb intreg_i586_bzero sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts subl $108,%esp fnsave 0(%esp) jmp i586_bz2 i586_bz1: sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts fninit /* XXX should avoid needing this */ i586_bz2: fldz /* * Align to an 8 byte boundary (misalignment in the main loop would * cost a factor of >= 2). Avoid jumps (at little cost if it is * already aligned) by always zeroing 8 bytes and using the part up * to the _next_ alignment position. */ fstl 0(%edx) addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ addl $8,%edx andl $~7,%edx subl %edx,%ecx /* * Similarly align `len' to a multiple of 8. */ fstl -8(%edx,%ecx) decl %ecx andl $~7,%ecx /* * This wouldn't be any faster if it were unrolled, since the loop * control instructions are much faster than the fstl and/or done * in parallel with it so their overhead is insignificant. */ fpureg_i586_bzero_loop: fstl 0(%edx) addl $8,%edx subl $8,%ecx cmpl $8,%ecx jae fpureg_i586_bzero_loop cmpl $0,_npxproc je i586_bz3 frstor 0(%esp) addl $108,%esp lmsw %ax movb $0xfe,kernel_fpu_lock ret i586_bz3: fstpl %st(0) lmsw %ax movb $0xfe,kernel_fpu_lock ret intreg_i586_bzero: /* * `rep stos' seems to be the best method in practice for small * counts. Fancy methods usually take too long to start up due * to cache and BTB misses. */ pushl %edi movl %edx,%edi xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx jne 1f popl %edi ret 1: rep stosb popl %edi ret #endif /* I586_CPU && NNPX > 0 */ /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx cld rep stosw popl %edi ret ENTRY(bcopyb) bcopyb: pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cld /* nope, copy forwards */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards. 
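bcopyb above (and generic_bcopy, which follows) decide the copy direction with a single unsigned compare: if dst - src is less than the length, the buffers overlap with the source below the destination, so the copy must run backwards. A portable restatement:

#include <stdint.h>
#include <stdio.h>

static void bcopy_sim(const unsigned char *src, unsigned char *dst, size_t len)
{
	if ((uintptr_t)(dst - src) < (uintptr_t)len) {
		for (size_t i = len; i-- > 0; )     /* overlapping, src below dst */
			dst[i] = src[i];
	} else {
		for (size_t i = 0; i < len; i++)    /* safe to copy forwards */
			dst[i] = src[i];
	}
}

int main(void)
{
	unsigned char buf[8] = "abcdef";

	bcopy_sim(buf, buf + 2, 6);             /* shift "abcdef" right by 2 */
	buf[0] = ' ';
	buf[1] = ' ';
	printf("[%.8s]\n", (char *)buf);        /* prints "[  abcdef]" */
	return 0;
}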
*/ addl %ecx,%esi decl %edi decl %esi std rep movsb popl %edi popl %esi cld ret ENTRY(bcopy) MEXITCOUNT jmp *_bcopy_vector ENTRY(ovbcopy) MEXITCOUNT jmp *_ovbcopy_vector /* * generic_bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(generic_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 20(%esp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cmpl $1024,%ecx jb small_i586_bcopy sarb $1,kernel_fpu_lock jc small_i586_bcopy cmpl $0,_npxproc je i586_bc1 smsw %dx clts subl $108,%esp fnsave 0(%esp) jmp 4f i586_bc1: smsw %dx clts fninit /* XXX should avoid needing this */ ALIGN_TEXT 4: pushl %ecx #define DCACHE_SIZE 8192 cmpl $(DCACHE_SIZE-512)/2,%ecx jbe 2f movl $(DCACHE_SIZE-512)/2,%ecx 2: subl %ecx,0(%esp) cmpl $256,%ecx jb 5f /* XXX should prefetch if %ecx >= 32 */ pushl %esi pushl %ecx ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b popl %ecx popl %esi 5: ALIGN_TEXT large_i586_bcopy_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $64,%esi addl $64,%edi subl $64,%ecx cmpl $64,%ecx jae large_i586_bcopy_loop popl %eax addl %eax,%ecx cmpl $64,%ecx jae 4b cmpl $0,_npxproc je i586_bc2 frstor 0(%esp) addl $108,%esp i586_bc2: lmsw %dx movb $0xfe,kernel_fpu_lock /* * This is a duplicate of the main part of generic_bcopy. See the comments * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and * would mess up high resolution profiling. */ ALIGN_TEXT small_i586_bcopy: shrl $2,%ecx cld rep movsl movl 20(%esp),%ecx andl $3,%ecx rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi addl %ecx,%esi decl %edi decl %esi andl $3,%ecx std rep movsb movl 20(%esp),%ecx shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #endif /* I586_CPU && NNPX > 0 */ /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines and possibly * the math- and DOS emulators should be the only places that do this. 
* * We have to access the memory with user's permissions, so use a segment * selector with RPL 3. For writes to user space we have to additionally * check the PTE for write permission, because the 386 does not check * write permissions when we are executing with EPL 0. The 486 does check * this if the WP bit is set in CR0, so we can use a simpler version here. * * These routines set curpcb->onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->onfault instead of the function. */ /* copyout(from_kernel, to_user, len) */ ENTRY(copyout) MEXITCOUNT jmp *_copyout_vector ENTRY(generic_copyout) movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 3f #endif /* * We have to check each PTE for user write permission. * The checking may cause a page fault, so it is important to set * up everything for return via copyout_fault before here. */ /* compute number of pages */ movl %edi,%ecx andl $PAGE_MASK,%ecx addl %ebx,%ecx decl %ecx shrl $IDXSHIFT+2,%ecx incl %ecx /* compute PTE offset for start address */ movl %edi,%edx shrl $IDXSHIFT,%edx andb $0xfc,%dl 1: /* check PTE for each page */ leal _PTmap(%edx),%eax shrl $IDXSHIFT,%eax andb $0xfc,%al testb $PG_V,_PTmap(%eax) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%al andb $PG_V|PG_RW|PG_U,%al /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%al je 2f 4: /* simulate a trap */ pushl %edx pushl %ecx shll $IDXSHIFT,%edx pushl %edx call _trapwrite /* trapwrite(addr) */ popl %edx popl %ecx popl %edx testl %eax,%eax /* if not ok, return EFAULT */ jnz copyout_fault 2: addl $4,%edx decl %ecx jnz 1b /* check next page */ #endif /* I386_CPU */ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyout: #endif shrl $2,%ecx cld rep movsl movb %bl,%cl andb $3,%cl rep movsb done_copyout: popl %ebx popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyout_fault: popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyout) /* * Duplicated from generic_copyout. Could be done a bit better. */ movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. 
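generic_copyout above validates the user range with two checks before touching anything: the address addition must not wrap (the jc to copyout_fault) and the end address must not exceed VM_MAXUSER_ADDRESS (the ja to copyout_fault). The same logic in C; the constant is an illustrative i386-era value and the function name is invented.

#include <stdio.h>

#define VM_MAXUSER_ADDRESS 0xefbfe000u   /* illustrative value only */
#define EFAULT 14

static int check_user_range(unsigned uaddr, unsigned len)
{
	unsigned end = uaddr + len;

	if (end < uaddr)                  /* 32-bit wraparound */
		return EFAULT;
	if (end > VM_MAXUSER_ADDRESS)     /* runs past user space */
		return EFAULT;
	return 0;
}

int main(void)
{
	printf("%d\n", check_user_range(0x08048000u, 4096));      /* 0      */
	printf("%d\n", check_user_range(0xfffff000u, 0x2000));    /* EFAULT */
	return 0;
}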
*/ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyout pushl %ecx call _fastmove addl $4,%esp jmp done_copyout #endif /* I586_CPU && NNPX > 0 */ /* copyin(from_user, to_kernel, len) */ ENTRY(copyin) MEXITCOUNT jmp *_copyin_vector ENTRY(generic_copyin) movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyin: #endif movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT done_copyin: #endif popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyin_fault: popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyin) /* * Duplicated from generic_copyin. Could be done a bit better. */ movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyin pushl %ebx /* XXX prepare for fastmove_fault */ pushl %ecx call _fastmove addl $8,%esp jmp done_copyin #endif /* I586_CPU && NNPX > 0 */ #if defined(I586_CPU) && NNPX > 0 /* fastmove(src, dst, len) src in %esi dst in %edi len in %ecx XXX changed to on stack for profiling uses %eax and %edx for tmp. storage */ /* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */ ENTRY(fastmove) pushl %ebp movl %esp,%ebp subl $PCB_SAVEFPU_SIZE+3*4,%esp movl 8(%ebp),%ecx cmpl $63,%ecx jbe fastmove_tail testl $7,%esi /* check if src addr is multiple of 8 */ jnz fastmove_tail testl $7,%edi /* check if dst addr is multiple of 8 */ jnz fastmove_tail /* if (npxproc != NULL) { */ cmpl $0,_npxproc je 6f /* fnsave(&curpcb->pcb_savefpu); */ movl _curpcb,%eax fnsave PCB_SAVEFPU(%eax) /* npxproc = NULL; */ movl $0,_npxproc /* } */ 6: /* now we own the FPU. */ /* * The process' FP state is saved in the pcb, but if we get * switched, the cpu_switch() will store our FP state in the * pcb. It should be possible to avoid all the copying for * this, e.g., by setting a flag to tell cpu_switch() to * save the state somewhere else. 
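i586_copyout and i586_copyin only divert to _fastmove for transfers of at least 1024 bytes, and fastmove bails out to its plain tail copy at entry unless both addresses are 8-byte aligned, because saving and restoring FPU state has a fixed cost that small or misaligned copies cannot recover. A small C predicate capturing those gates; the name and types are illustrative only:

#include <stddef.h>
#include <stdint.h>

/*
 * Distilled from the cmpl $1024,%ecx tests in i586_copyin/i586_copyout
 * and the testl $7 alignment checks at the head of fastmove(): the FPU
 * path is only worth taking for large, 8-byte-aligned transfers.
 */
static int
use_fpu_fastmove(const void *src, const void *dst, size_t len)
{
	if (len < 1024)				/* too small to amortize fnsave/frstor */
		return (0);
	if (((uintptr_t)src & 7) != 0)		/* source not 8-byte aligned */
		return (0);
	if (((uintptr_t)dst & 7) != 0)		/* destination not 8-byte aligned */
		return (0);
	return (1);
}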
*/ /* tmp = curpcb->pcb_savefpu; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl %esp,%edi movl _curpcb,%esi addl $PCB_SAVEFPU,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* stop_emulating(); */ clts /* npxproc = curproc; */ movl _curproc,%eax movl %eax,_npxproc movl _curpcb,%eax movl $fastmove_fault,PCB_ONFAULT(%eax) 4: movl %ecx,-12(%ebp) cmpl $1792,%ecx jbe 2f movl $1792,%ecx 2: subl %ecx,-12(%ebp) cmpl $256,%ecx jb 5f movl %ecx,-8(%ebp) movl %esi,-4(%ebp) ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b movl -8(%ebp),%ecx movl -4(%ebp),%esi 5: ALIGN_TEXT fastmove_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $-64,%ecx addl $64,%esi addl $64,%edi cmpl $63,%ecx ja fastmove_loop movl -12(%ebp),%eax addl %eax,%ecx cmpl $64,%ecx jae 4b /* curpcb->pcb_savefpu = tmp; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* start_emulating(); */ smsw %ax orb $CR0_TS,%al lmsw %ax /* npxproc = NULL; */ movl $0,_npxproc ALIGN_TEXT fastmove_tail: movl _curpcb,%eax movl $fastmove_tail_fault,PCB_ONFAULT(%eax) movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb movl %ebp,%esp popl %ebp ret ALIGN_TEXT fastmove_fault: movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl smsw %ax orb $CR0_TS,%al lmsw %ax movl $0,_npxproc fastmove_tail_fault: movl %ebp,%esp popl %ebp addl $8,%esp popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #endif /* I586_CPU && NNPX > 0 */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory */ ENTRY(fuword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* from */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault movl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret /* * These two routines are called from the profiling code, potentially * at interrupt time. If they fail, that's okay, good things will * happen later. Fail all the time for now - until the trap code is * able to deal with this. 
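The body of fastmove just shown processes the buffer in rounds of at most 1792 bytes: when a round is at least 256 bytes it first touches the source once per 32 bytes to pull it into the cache, then moves 64 bytes per iteration through the eight fildq/fistpq pairs, and finally lets fastmove_tail mop up whatever is left with ordinary string moves. A rough standalone C rendering of that structure, assuming nothing about the real FPU path (memcpy stands in for the FPU transfers and a volatile read models the cache-priming loads):

#include <stddef.h>
#include <string.h>

#define FASTMOVE_CHUNK 1792	/* per-round limit, as in cmpl $1792,%ecx */

/*
 * Structural sketch only: chunk the copy, prime the cache for large
 * chunks, move 64 bytes at a time, then finish the remainder.
 */
static void
fastmove_sketch(const char *src, char *dst, size_t len)
{
	while (len >= 64) {
		size_t chunk = len > FASTMOVE_CHUNK ? FASTMOVE_CHUNK : len;
		size_t n;

		if (chunk >= 256) {			/* cache-priming pass */
			volatile char sink = 0;

			for (n = 0; n + 256 <= chunk; n += 32)
				sink = src[n];		/* one read per cache line */
		}
		for (n = 0; n + 64 <= chunk; n += 64)	/* 64-byte move loop */
			memcpy(dst + n, src + n, 64);

		src += n;
		dst += n;
		len -= n;
	}
	if (len > 0)
		memcpy(dst, src, len);	/* fastmove_tail: plain copy of the remainder */
}

What the sketch leaves out is the bookkeeping that makes the real routine safe: the pcb_savefpu image is parked on the stack around the copy and restored afterwards, pcb_onfault is pointed at fastmove_fault for the duration, and CR0_TS is set again at the end so the next FPU user traps and reloads its own state.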
*/ ALTENTRY(suswintr) ENTRY(fuswintr) movl $-1,%eax ret ENTRY(fusword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx ja fusufault movzwl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ENTRY(fubyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx ja fusufault movzbl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ALIGN_TEXT fusufault: movl _curpcb,%ecx xorl %eax,%eax movl %eax,PCB_ONFAULT(%ecx) decl %eax ret /* * su{byte,sword,word}: write a byte (word, longword) to user memory */ ENTRY(suword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f /* we only have to set the right segment selector */ #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ ja fusufault movl 8(%esp),%eax movl %eax,(%edx) xorl %eax,%eax movl _curpcb,%ecx movl %eax,PCB_ONFAULT(%ecx) ret ENTRY(susword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ ja fusufault movw 8(%esp),%ax movw %ax,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret ALTENTRY(suibyte) ENTRY(subyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ ja fusufault movb 8(%esp),%al movb %al,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret /* * copyinstr(from, to, maxlen, int *lencopied) * copy a string from from to to, 
stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ ENTRY(copyinstr) pushl %esi pushl %edi movl _curpcb,%ecx movl $cpystrflt,PCB_ONFAULT(%ecx) movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ movl $VM_MAXUSER_ADDRESS,%eax /* make sure 'from' is within bounds */ subl %esi,%eax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpl %edx,%eax jae 1f movl %eax,%edx movl %eax,20(%esp) 1: incl %edx cld 2: decl %edx jz 3f lodsb stosb orb %al,%al jnz 2b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp cpystrflt_x 3: /* edx is zero - return ENAMETOOLONG or EFAULT */ cmpl $VM_MAXUSER_ADDRESS,%esi jae cpystrflt 4: movl $ENAMETOOLONG,%eax jmp cpystrflt_x cpystrflt: movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ movl _curpcb,%ecx movl $0,PCB_ONFAULT(%ecx) movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 1f movl %ecx,(%edx) 1: popl %edi popl %esi ret /* * copystr(from, to, maxlen, int *lencopied) */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx cld 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx xorl %eax,%eax movl %edx,%ecx shrl $2,%ecx cld /* compare forwards */ repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb je 2f 1: incl %eax 2: popl %esi popl %edi ret /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %ax,%ds movl %ax,%es movl %ax,%fs movl %ax,%gs movl %ax,%ss /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) lret /* * void lidt(struct region_descriptor *rdp); */ ENTRY(lidt) movl 4(%esp),%eax lidt (%eax) ret /* * void lldt(u_short sel) */ ENTRY(lldt) lldt 4(%esp) ret /* * void ltr(u_short sel) */ ENTRY(ltr) ltr 4(%esp) ret /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret /* load_cr0(cr0) */ ENTRY(load_cr0) movl 4(%esp),%eax movl %eax,%cr0 ret /* rcr0() */ ENTRY(rcr0) movl %cr0,%eax ret /* rcr3() */ ENTRY(rcr3) movl %cr3,%eax ret /* void load_cr3(caddr_t cr3) */ ENTRY(load_cr3) +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl 4(%esp),%eax movl %eax,%cr3 ret /* rcr4() */ ENTRY(rcr4) movl %cr4,%eax ret /* void load_cr4(caddr_t cr4) */ ENTRY(load_cr4) movl 4(%esp),%eax movl %eax,%cr4 ret /*****************************************************************************/ /* setjump, longjump */ 
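copystr and copyinstr above copy at most maxlen bytes, stop after the terminating NUL, return ENAMETOOLONG when the string does not fit (copyinstr additionally returns EFAULT on a bad user address), and report the number of bytes actually copied through the optional lencopied pointer. A kernel-independent C model of copystr's contract; the name is made up and standard errno values stand in for the kernel's:

#include <errno.h>
#include <stddef.h>

/*
 * Model of copystr(): the count written back through lencopied includes
 * the NUL on success and equals the number of bytes written on failure,
 * matching the maxlen - %edx arithmetic in the assembly.
 */
static int
copystr_model(const char *from, char *to, size_t maxlen, size_t *lencopied)
{
	size_t copied = 0;
	int error = ENAMETOOLONG;

	while (copied < maxlen) {
		char c = from[copied];

		to[copied++] = c;
		if (c == '\0') {
			error = 0;		/* terminator reached in time */
			break;
		}
	}
	if (lencopied != NULL)
		*lencopied = copied;
	return (error);
}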
/*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp */ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret /* * Here for doing BB-profiling (gcc -a). * We rely on the "bbset" instead, but need a dummy function. */ NON_GPROF_ENTRY(__bb_init_func) movl 4(%esp),%eax movl $1,(%eax) .byte 0xc3 /* avoid macro for `ret' */ Index: head/sys/i386/i386/swtch.s =================================================================== --- head/sys/i386/i386/swtch.s (revision 31708) +++ head/sys/i386/i386/swtch.s (revision 31709) @@ -1,778 +1,815 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: swtch.s,v 1.63 1997/09/21 15:03:58 peter Exp $ + * $Id: swtch.s,v 1.64 1997/10/10 09:44:06 peter Exp $ */ #include "npx.h" #include "opt_user_ldt.h" #include "opt_vm86.h" #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ /* * The following primitives manipulate the run queues. 
* _whichqs tells which of the 32 queues _qs * have processes in them. setrunqueue puts processes into queues, Remrq * removes them from queues. The running process is on no queue, * other processes are on a queue related to p->p_priority, divided by 4 * actually to shrink the 0-127 range of priorities into the 32 available * queues. */ .data #ifndef SMP .globl _curpcb _curpcb: .long 0 /* pointer to curproc's PCB area */ #endif /* !SMP */ .globl _whichqs, _whichrtqs, _whichidqs _whichqs: .long 0 /* which run queues have data */ _whichrtqs: .long 0 /* which realtime run qs have data */ _whichidqs: .long 0 /* which idletime run qs have data */ .globl _hlt_vector _hlt_vector: .long _default_halt /* pointer to halt routine */ .globl _qs,_cnt,_panic .globl _want_resched _want_resched: .long 0 /* we need to re-run the scheduler */ +#if defined(SWTCH_OPTIM_STATS) + .globl _swtch_optim_stats, _tlb_flush_count +_swtch_optim_stats: .long 0 /* number of _swtch_optims */ +_tlb_flush_count: .long 0 +#endif .text /* * setrunqueue(p) * * Call should be made at spl6(), and p->p_stat should be SRUN */ ENTRY(setrunqueue) movl 4(%esp),%eax #ifdef DIAGNOSTIC cmpb $SRUN,P_STAT(%eax) je set1 pushl $set2 call _panic set1: #endif cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je set_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* realtime priority? */ jne set_id /* must be idle priority */ set_rt: btsl %edx,_whichrtqs /* set q full bit */ shll $3,%edx addl $_rtqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_id: btsl %edx,_whichidqs /* set q full bit */ shll $3,%edx addl $_idqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_nort: /* Normal (RTOFF) code */ movzbl P_PRI(%eax),%edx shrl $2,%edx btsl %edx,_whichqs /* set q full bit */ shll $3,%edx addl $_qs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set2: .asciz "setrunqueue" /* * Remrq(p) * * Call should be made at spl6(). */ ENTRY(remrq) movl 4(%esp),%eax cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je rem_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* normal priority process? */ jne rem_id btrl %edx,_whichrtqs /* clear full bit, panic if clear already */ jb rem1rt pushl $rem3rt call _panic rem1rt: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_rtqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2rt shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichrtqs rem2rt: ret rem_id: btrl %edx,_whichidqs /* clear full bit, panic if clear already */ jb rem1id pushl $rem3id call _panic rem1id: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_idqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? 
*/ je rem2id shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichidqs rem2id: ret rem_nort: movzbl P_PRI(%eax),%edx shrl $2,%edx btrl %edx,_whichqs /* clear full bit, panic if clear already */ jb rem1 pushl $rem3 call _panic rem1: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_qs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2 shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichqs rem2: ret rem3: .asciz "remrq" rem3rt: .asciz "remrq.rt" rem3id: .asciz "remrq.id" /* * When no processes are on the runq, cpu_switch() branches to _idle * to wait for something to come ready. */ ALIGN_TEXT _idle: #ifdef SMP /* when called, we have the mplock, intr disabled */ xorl %ebp,%ebp /* use our idleproc's "context" */ movl _my_idlePTD,%ecx movl %ecx,%cr3 +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl $_idlestack_top,%ecx movl %ecx,%esp /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %ecx, TSS_ESP0(%eax) + movl %ecx, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 cli /* * _REALLY_ free the lock, no matter how deep the prior nesting. * We will recover the nesting on the way out when we have a new * proc to load. * * XXX: we had damn well better be sure we had it before doing this! */ movl $FREE_LOCK, %eax movl %eax, _mp_lock /* do NOT have lock, intrs disabled */ .globl idle_loop idle_loop: +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl %cr3,%eax /* ouch! */ movl %eax,%cr3 cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f jmp 2f 1: cmpl $0,_whichrtqs /* real-time queue */ jne 3f cmpl $0,_whichqs /* normal queue */ jne 3f cmpl $0,_whichidqs /* 'idle' queue */ jne 3f cmpl $0,_do_page_zero_idle je 2f /* XXX appears to cause panics */ /* * Inside zero_idle we enable interrupts and grab the mplock * as needed. It needs to be careful about entry/exit mutexes. 
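The idle loop in the SMP block above (and its simpler uniprocessor twin further down) follows one fixed pattern: re-check the three run-queue bitmaps, let vm_page_zero_idle pre-zero a free page when there is nothing to run, and only halt the CPU when neither produced work. A compact standalone C restatement of that control flow; the helpers are stand-ins and the mp_lock/interrupt handling of the real code is omitted:

#include <stdbool.h>

/* Stand-ins for the scheduler state and helpers the real loop uses. */
static unsigned whichrtqs, whichqs, whichidqs;

static bool
queue_has_work(void)
{
	return (whichrtqs != 0 || whichqs != 0 || whichidqs != 0);
}

static bool zero_one_idle_page(void) { return (false); }	/* vm_page_zero_idle() stand-in */
static void halt_until_interrupt(void) { }			/* sti; hlt stand-in */

static void
idle_loop_sketch(void)
{
	for (;;) {
		if (queue_has_work())
			return;			/* cpu_switch() picks the next process */
		if (zero_one_idle_page())
			continue;		/* did useful work; look at the queues again */
		halt_until_interrupt();		/* nothing to do until an interrupt arrives */
	}
}

This revision also adds a tlb_flush_count bump next to the explicit movl %cr3,%eax / movl %eax,%cr3 pair at the top of the SMP loop, so the cost of that per-iteration flush shows up in the new statistics.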
*/ call _vm_page_zero_idle /* internal locking */ testl %eax, %eax jnz idle_loop 2: /* enable intrs for a halt */ #ifdef SMP movl $0, lapic_tpr /* 1st candidate for an INT */ #endif sti call *_hlt_vector /* wait for interrupt */ cli jmp idle_loop 3: #ifdef SMP movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ #endif call _get_mplock cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _rel_mplock jmp idle_loop #else xorl %ebp,%ebp movl $HIDENAME(tmpstk),%esp - movl _IdlePTD,%ecx - movl %ecx,%cr3 +#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif + movl _IdlePTD, %ecx + movl %cr3, %eax + cmpl %ecx, %eax + je 2f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl %ecx, %cr3 +2: +#endif /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %esp, TSS_ESP0(%eax) + movl %esp, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 ALIGN_TEXT idle_loop: cli cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _vm_page_zero_idle testl %eax, %eax jnz idle_loop sti call *_hlt_vector /* wait for interrupt */ jmp idle_loop #endif CROSSJUMPTARGET(_idle) ENTRY(default_halt) #ifndef SMP hlt /* XXX: until a wakeup IPI */ #endif ret /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. first, save context as needed */ movl _curproc,%ecx /* if no process to save, don't bother */ testl %ecx,%ecx je sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ #endif /* SMP */ movl P_ADDR(%ecx),%ecx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #ifdef SMP movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ #ifdef DIAGNOSTIC cmpl $FREE_LOCK, %eax /* is it free? */ je badsw4 /* yes, bad medicine! */ #endif /* DIAGNOSTIC */ andl $COUNT_FIELD, %eax /* clear CPU portion */ movl %eax, PCB_MPNEST(%ecx) /* store it */ #endif /* SMP */ #if NNPX > 0 /* have we used fp, and need a save? */ movl _curproc,%eax cmpl %eax,_npxproc jne 1f addl $PCB_SAVEFPU,%ecx /* h/w bugs make saving complicated */ pushl %ecx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* NNPX > 0 */ movl $0,_curproc /* out of process */ /* save is done, now choose a new process or idle */ sw1: cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f CROSSJUMP(je, _idle, jne) /* wind down */ 1: #endif sw1a: movl _whichrtqs,%edi /* pick next p. 
from rtqs */ testl %edi,%edi jz nortqr /* no realtime procs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz nortqr /* no proc on rt q - try normal ... */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _rtqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je rt3 btsl %ebx,%edi /* nope, set to indicate not empty */ rt3: movl %edi,_whichrtqs /* update q status */ jmp swtch_com /* old sw1a */ /* Normal process priority's */ nortqr: movl _whichqs,%edi 2: /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz idqr /* if none, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _qs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je 3f btsl %ebx,%edi /* nope, set to indicate not empty */ 3: movl %edi,_whichqs /* update q status */ jmp swtch_com idqr: /* was sw1a */ movl _whichidqs,%edi /* pick next p. from idqs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ CROSSJUMP(je, _idle, jne) /* if no proc, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _idqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je id3 btsl %ebx,%edi /* nope, set to indicate not empty */ id3: movl %edi,_whichidqs /* update q status */ swtch_com: movl $0,%eax movl %eax,_want_resched #ifdef DIAGNOSTIC cmpl %eax,P_WCHAN(%ecx) jne badsw1 cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl %eax,P_BACK(%ecx) /* isolate process to run */ movl P_ADDR(%ecx),%edx - movl PCB_CR3(%edx),%ebx #ifdef SMP + movl PCB_CR3(%edx),%ebx /* Grab the private PT pointer from the outgoing process's PTD */ movl $_PTD, %esi movl 4*MPPTDI(%esi), %eax /* fetch cpu's prv pt */ -#endif /* SMP */ - +#else +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif /* switch address space */ + movl %cr3,%ebx + cmpl PCB_CR3(%edx),%ebx + je 4f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl PCB_CR3(%edx),%ebx +#endif /* SMP */ movl %ebx,%cr3 +4: #ifdef SMP /* Copy the private PT to the new process's PTD */ /* XXX yuck, the _PTD changes when we switch, so we have to * reload %cr3 after changing the address space. * We need to fix this by storing a pointer to the virtual * location of the per-process PTD in the PCB or something quick. * Dereferencing proc->vm_map->pmap->p_pdir[] is painful in asm. */ movl %eax, 4*MPPTDI(%esi) /* restore cpu's prv page */ +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif /* XXX: we have just changed the page tables.. reload.. */ movl %ebx, %cr3 #endif /* SMP */ #ifdef VM86 movl _my_tr, %esi cmpl $0, PCB_EXT(%edx) /* has pcb extension? 
*/ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: #endif /* update common_tss.tss_esp0 pointer */ movl $_common_tss, %eax movl %edx, %ebx /* pcb */ #ifdef VM86 addl $(UPAGES * PAGE_SIZE - 16), %ebx #else addl $(UPAGES * PAGE_SIZE), %ebx #endif /* VM86 */ movl %ebx, TSS_ESP0(%eax) #ifdef VM86 btrl %esi, _private_tss je 3f movl $_common_tssd, %edi 2: /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: #endif /* VM86 */ /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, lapic_tpr #else andl $~APIC_TPR_PRIO, lapic_tpr #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ movl _cpuid,%eax movb %al, P_ONCPU(%ecx) #endif /* SMP */ movl %edx, _curpcb movl %ecx, _curproc /* into next process */ #ifdef SMP movl _cpu_lockid, %eax orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl _currentldt,%eax je 2f lldt __default_ldt movl %eax,_currentldt jmp 2f 1: pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_fs cpu_switch_load_fs: movl PCB_FS(%edx),%fs .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs sti ret CROSSJUMPTARGET(idqr) CROSSJUMPTARGET(nortqr) CROSSJUMPTARGET(sw1a) #ifdef DIAGNOSTIC badsw1: pushl $sw0_1 call _panic sw0_1: .asciz "cpu_switch: has wchan" badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" #endif #if defined(SMP) && defined(DIAGNOSTIC) badsw4: pushl $sw0_4 call _panic sw0_4: .asciz "cpu_switch: do not have lock" #endif /* SMP && DIAGNOSTIC */ /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #if NNPX > 0 /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
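The interesting change threaded through cpu_switch above is the conditional address-space switch: instead of always loading PCB_CR3 into %cr3, the non-SMP path now compares it with the %cr3 already in place and skips the reload, and with it the implicit TLB flush, when the two match, counting the outcome in swtch_optim_stats and tlb_flush_count when SWTCH_OPTIM_STATS is enabled. A C paraphrase of that decision; the register accessors are stubs, not the real cpufunc.h inlines:

#include <stdint.h>

#if defined(SWTCH_OPTIM_STATS)
static int swtch_optim_stats;	/* switches that kept the old TLB */
static int tlb_flush_count;	/* switches that reloaded %cr3 and flushed */
#endif

/* Stand-ins for rcr3()/load_cr3(). */
static uint32_t fake_cr3;
static uint32_t rcr3_stub(void)           { return (fake_cr3); }
static void     load_cr3_stub(uint32_t v) { fake_cr3 = v; }

/*
 * Mirror of the movl %cr3,%ebx; cmpl PCB_CR3(%edx),%ebx; je 4f sequence:
 * the optimistic increment followed by a decrement on the slow path is
 * exactly how the assembly maintains the counters.
 */
static void
switch_address_space(uint32_t new_pcb_cr3)
{
#if defined(SWTCH_OPTIM_STATS)
	swtch_optim_stats++;		/* assume we can skip the reload */
#endif
	if (rcr3_stub() == new_pcb_cr3)
		return;			/* same page directory: keep the TLB contents */
#if defined(SWTCH_OPTIM_STATS)
	swtch_optim_stats--;		/* could not skip it after all */
	tlb_flush_count++;
#endif
	load_cr3_stub(new_pcb_cr3);
}

The SMP path cannot take this shortcut yet because it rewrites the per-CPU private page-table slot in the new PTD and must reload %cr3 afterwards anyway, which is exactly what the XXX comment in that branch complains about.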
*/ movl _npxproc,%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* NNPX > 0 */ 1: ret Index: head/sys/i386/include/cpufunc.h =================================================================== --- head/sys/i386/include/cpufunc.h (revision 31708) +++ head/sys/i386/include/cpufunc.h (revision 31709) @@ -1,430 +1,436 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: cpufunc.h,v 1.3 1997/09/05 20:20:31 smp Exp smp $ + * $Id: cpufunc.h,v 1.72 1997/09/07 22:01:27 fsmp Exp $ */ /* * Functions to provide access to special i386 instructions. */ #ifndef _MACHINE_CPUFUNC_H_ #define _MACHINE_CPUFUNC_H_ #include #include #include +#if defined(SWTCH_OPTIM_STATS) +extern int tlb_flush_count; +#endif #ifdef __GNUC__ static __inline void breakpoint(void) { __asm __volatile("int $3"); } static __inline void disable_intr(void) { __asm __volatile("cli" : : : "memory"); MPINTR_LOCK(); } static __inline void enable_intr(void) { MPINTR_UNLOCK(); __asm __volatile("sti"); } #define HAVE_INLINE_FFS static __inline int ffs(int mask) { int result; /* * bsfl turns out to be not all that slow on 486's. It can beaten * using a binary search to reduce to 4 bits and then a table lookup, * but only if the code is inlined and in the cache, and the code * is quite large so inlining it probably busts the cache. * * Note that gcc-2's builtin ffs would be used if we didn't declare * this inline or turn off the builtin. The builtin is faster but * broken in gcc-2.4.5 and slower but working in gcc-2.5 and 2.6. 
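savectx() ends by handling the FPU: if some process currently owns the numeric unit (npxproc != NULL), the live register state is first dumped into that process's pcb via npxsave and then copied into the pcb being snapshot, as the long comment above explains. A standalone C sketch of that branch; every type and helper here is a stand-in, not the real proc/pcb layout:

#include <string.h>

/* Illustrative types; the real ones are struct proc/pcb and the fnsave area. */
struct toy_fpu_state { unsigned char image[108]; };
struct toy_pcb  { struct toy_fpu_state pcb_savefpu; };
struct toy_proc { struct toy_pcb *p_addr; };

static struct toy_proc *npxproc;			/* stand-in for _npxproc */

/* Stand-in for npxsave(): dumps the live FPU registers into 'where'. */
static void npxsave_stub(struct toy_fpu_state *where) { (void)where; }

/*
 * The NNPX part of savectx(): save the hardware state into the owning
 * process's pcb first, then copy it into the pcb being snapshot, just
 * as the call _npxsave + call _bcopy sequence above does.
 */
static void
savectx_fpu_sketch(struct toy_pcb *pcb)
{
	if (npxproc != NULL) {
		npxsave_stub(&npxproc->p_addr->pcb_savefpu);
		memcpy(&pcb->pcb_savefpu, &npxproc->p_addr->pcb_savefpu,
		    sizeof(pcb->pcb_savefpu));
	}
}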
*/ __asm __volatile("testl %0,%0; je 1f; bsfl %0,%0; incl %0; 1:" : "=r" (result) : "0" (mask)); return (result); } #define HAVE_INLINE_FLS static __inline int fls(int mask) { int result; __asm __volatile("testl %0,%0; je 1f; bsrl %0,%0; incl %0; 1:" : "=r" (result) : "0" (mask)); return (result); } #if __GNUC__ < 2 #define inb(port) inbv(port) #define outb(port, data) outbv(port, data) #else /* __GNUC >= 2 */ /* * The following complications are to get around gcc not having a * constraint letter for the range 0..255. We still put "d" in the * constraint because "i" isn't a valid constraint when the port * isn't constant. This only matters for -O0 because otherwise * the non-working version gets optimized away. * * Use an expression-statement instead of a conditional expression * because gcc-2.6.0 would promote the operands of the conditional * and produce poor code for "if ((inb(var) & const1) == const2)". * * The unnecessary test `(port) < 0x10000' is to generate a warning if * the `port' has type u_short or smaller. Such types are pessimal. * This actually only works for signed types. The range check is * careful to avoid generating warnings. */ #define inb(port) __extension__ ({ \ u_char _data; \ if (__builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ && (port) < 0x10000) \ _data = inbc(port); \ else \ _data = inbv(port); \ _data; }) #define outb(port, data) ( \ __builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ && (port) < 0x10000 \ ? outbc(port, data) : outbv(port, data)) static __inline u_char inbc(u_int port) { u_char data; __asm __volatile("inb %1,%0" : "=a" (data) : "id" ((u_short)(port))); return (data); } static __inline void outbc(u_int port, u_char data) { __asm __volatile("outb %0,%1" : : "a" (data), "id" ((u_short)(port))); } #endif /* __GNUC <= 2 */ static __inline u_char inbv(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline u_long inl(u_int port) { u_long data; __asm __volatile("inl %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline void insb(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insb" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void insw(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insw" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void insl(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insl" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void invd(void) { __asm __volatile("invd"); } #ifdef KERNEL #ifdef SMP /* * When using APIC IPI's, the inlining cost is prohibitive since the call * executes into the IPI transmission system. */ void invlpg __P((u_int addr)); void invltlb __P((void)); #else /* !SMP */ static __inline void invlpg(u_int addr) { __asm __volatile("invlpg (%0)" : : "r" (addr) : "memory"); } static __inline void invltlb(void) { u_long temp; /* * This should be implemented as load_cr3(rcr3()) when load_cr3() * is inlined. 
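invltlb() flushes the whole TLB by writing %cr3 back to itself, its comment notes it should simply be load_cr3(rcr3()) once load_cr3() is inlined, and this revision also makes it bump tlb_flush_count when SWTCH_OPTIM_STATS is configured. A hedged sketch of that intended shape, again with stub register accessors and a local counter standing in for the one declared in cpufunc.h:

#include <stdint.h>

#if defined(SWTCH_OPTIM_STATS)
static int tlb_flush_count;		/* stand-in for the counter in cpufunc.h */
#endif

/* Stand-ins for the cr3 accessors; the real invltlb() is one asm statement. */
static uint32_t fake_cr3_reg;
static uint32_t rcr3_stub(void)           { return (fake_cr3_reg); }
static void     load_cr3_stub(uint32_t v) { fake_cr3_reg = v; }

/* Flush the TLB by rewriting %cr3 with its current value, and count it. */
static void
invltlb_sketch(void)
{
	load_cr3_stub(rcr3_stub());
#if defined(SWTCH_OPTIM_STATS)
	++tlb_flush_count;
#endif
}

Counting flushes here, in load_cr3, and in cpu_switch is what makes the swtch_optim_stats numbers meaningful: every avoided %cr3 reload is a flush that never reaches this path.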
*/ __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp) : : "memory"); +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif } #endif /* SMP */ #endif /* KERNEL */ static __inline u_short inw(u_int port) { u_short data; __asm __volatile("inw %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline u_int loadandclear(u_int *addr) { u_int result; __asm __volatile("xorl %0,%0; xchgl %1,%0" : "=&r" (result) : "m" (*addr)); return (result); } static __inline void outbv(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } static __inline void outl(u_int port, u_long data) { /* * outl() and outw() aren't used much so we haven't looked at * possible micro-optimizations such as the unnecessary * assignment for them. */ __asm __volatile("outl %0,%%dx" : : "a" (data), "d" (port)); } static __inline void outsb(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsb" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outsw(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsw" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outsl(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsl" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outw(u_int port, u_short data) { __asm __volatile("outw %0,%%dx" : : "a" (data), "d" (port)); } static __inline u_long rcr2(void) { u_long data; __asm __volatile("movl %%cr2,%0" : "=r" (data)); return (data); } static __inline u_long read_eflags(void) { u_long ef; __asm __volatile("pushfl; popl %0" : "=r" (ef)); return (ef); } static __inline quad_t rdmsr(u_int msr) { quad_t rv; __asm __volatile(".byte 0x0f, 0x32" : "=A" (rv) : "c" (msr)); return (rv); } static __inline quad_t rdpmc(u_int pmc) { quad_t rv; __asm __volatile(".byte 0x0f, 0x33" : "=A" (rv) : "c" (pmc)); return (rv); } static __inline quad_t rdtsc(void) { quad_t rv; __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); return (rv); } static __inline void setbits(volatile unsigned *addr, u_int bits) { __asm __volatile( #ifdef SMP "lock; " #endif "orl %1,%0" : "=m" (*addr) : "ir" (bits)); } static __inline void wbinvd(void) { __asm __volatile("wbinvd"); } static __inline void write_eflags(u_long ef) { __asm __volatile("pushl %0; popfl" : : "r" (ef)); } static __inline void wrmsr(u_int msr, quad_t newval) { __asm __volatile(".byte 0x0f, 0x30" : : "A" (newval), "c" (msr)); } #else /* !__GNUC__ */ int breakpoint __P((void)); void disable_intr __P((void)); void enable_intr __P((void)); u_char inb __P((u_int port)); u_long inl __P((u_int port)); void insb __P((u_int port, void *addr, size_t cnt)); void insl __P((u_int port, void *addr, size_t cnt)); void insw __P((u_int port, void *addr, size_t cnt)); void invd __P((void)); void invlpg __P((u_int addr)); void invltlb __P((void)); u_short inw __P((u_int port)); u_int loadandclear __P((u_int *addr)); void outb __P((u_int port, u_char data)); void outl __P((u_int port, u_long data)); void outsb __P((u_int port, void *addr, size_t cnt)); void outsl __P((u_int port, void *addr, size_t cnt)); void outsw __P((u_int port, void *addr, size_t cnt)); void outw __P((u_int port, u_short data)); u_long rcr2 __P((void)); quad_t 
rdmsr __P((u_int msr)); quad_t rdpmc __P((u_int pmc)); quad_t rdtsc __P((void)); u_long read_eflags __P((void)); void setbits __P((volatile unsigned *addr, u_int bits)); void wbinvd __P((void)); void write_eflags __P((u_long ef)); void wrmsr __P((u_int msr, quad_t newval)); #endif /* __GNUC__ */ void load_cr0 __P((u_long cr0)); void load_cr3 __P((u_long cr3)); void load_cr4 __P((u_long cr4)); void ltr __P((u_short sel)); u_int rcr0 __P((void)); u_long rcr3 __P((void)); u_long rcr4 __P((void)); #endif /* !_MACHINE_CPUFUNC_H_ */ Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 31708) +++ head/sys/kern/init_main.c (revision 31709) @@ -1,647 +1,647 @@ /* * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 - * $Id: init_main.c,v 1.77 1997/12/06 04:11:09 sef Exp $ + * $Id: init_main.c,v 1.78 1997/12/12 04:00:57 dyson Exp $ */ #include "opt_devfs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct linker_set sysinit_set; /* XXX */ extern void __main __P((void)); extern void main __P((void *framep)); /* Components of the first process -- never freed. 
*/ static struct session session0; static struct pgrp pgrp0; struct proc proc0; static struct pcred cred0; static struct filedesc0 filedesc0; static struct plimit limit0; static struct vmspace vmspace0; #ifndef SMP /* per-cpu on smp */ struct proc *curproc = &proc0; #endif struct proc *initproc; int cmask = CMASK; extern struct user *proc0paddr; struct vnode *rootvp; int boothowto = 0; /* initialized so that it can be patched */ struct timeval boottime; SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD, &boottime, timeval, ""); static int shutdowntimeout = 120; SYSCTL_INT(_kern, OID_AUTO, shutdown_timeout, CTLFLAG_RW, &shutdowntimeout, 0, ""); #ifndef SMP /* per-cpu on smp */ struct timeval runtime; #endif /* * Promiscuous argument pass for start_init() * * This is a kludge because we use a return from main() rather than a call * to a new routine in locore.s to kick the kernel alive from locore.s. */ static void *init_framep; #if __GNUC__ >= 2 void __main() {} #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY,SI_ORDER_ANY, NULL, NULL) /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads", like an LFS * cleaner. */ void main(framep) void *framep; { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ /* * Copy the locore.s frame pointer for proc0, this is forked into * all other processes. */ init_framep = framep; #ifdef SMP /* * XXX curproc is per-cpu in SMP, and exists in freshly mapped pages, * so its value must be initialized now before it is used by various * initialization routines. proc0_init() isn't soon enough: * among other things, configure() is called first, and may call * setdumpdev(), which panics without a valid curproc. */ curproc = &proc0; #endif /* SMP */ /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the * operation which ensures continued function. */ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { for( xipp = sipp + 1; *xipp; xipp++) { if( (*sipp)->subsystem < (*xipp)->subsystem || ( (*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order < (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. * * The last item on the list is expected to be the scheduler, * which will not return. 
*/ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { if( (*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ switch( (*sipp)->type) { case SI_TYPE_DEFAULT: /* no special processing*/ (*((*sipp)->func))( (*sipp)->udata); break; case SI_TYPE_KTHREAD: #if !defined(SMP) /* kernel thread*/ if (fork1(&proc0, RFMEM|RFFDG|RFPROC)) panic("fork kernel thread"); cpu_set_fork_handler(pfind(proc0.p_retval[0]), (*sipp)->func, (*sipp)->udata); break; #endif case SI_TYPE_KPROCESS: if (fork1(&proc0, RFFDG|RFPROC)) panic("fork kernel process"); cpu_set_fork_handler(pfind(proc0.p_retval[0]), (*sipp)->func, (*sipp)->udata); break; default: panic( "init_main: unrecognized init type"); } } panic("Shouldn't get here!"); /* NOTREACHED*/ } /* * Start a kernel process. This is called after a fork() call in * main() in the file kern/init_main.c. * * This function is used to start "internal" daemons. */ /* ARGSUSED*/ void kproc_start(udata) void *udata; { struct kproc_desc *kp = udata; struct proc *p = curproc; #ifdef DIAGNOSTIC printf("Start pid=%d <%s>\n",p->p_pid, kp->arg0); #endif /* save a global descriptor, if desired*/ if( kp->global_procpp != NULL) *kp->global_procpp = p; /* this is a non-swapped system process*/ p->p_flag |= P_INMEM | P_SYSTEM; /* set up arg0 for 'ps', et al*/ strcpy( p->p_comm, kp->arg0); /* call the processes' main()...*/ (*kp->func)(); /* NOTREACHED */ panic("kproc_start: %s", kp->arg0); } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. **** *************************************************************************** */ #ifdef OMIT /* * Handled by vfs_mountroot (bad idea) at this time... should be * done the same as 4.4Lite2. */ SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL) #endif /* OMIT*/ static void print_caddr_t __P((void *data)); static void print_caddr_t(data) void *data; { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) /* *************************************************************************** **** **** The two following SYSINT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init __P((void *dummy)); static void proc0_init(dummy) void *dummy; { register struct proc *p; register struct filedesc0 *fdp; register unsigned i; /* * Initialize the current process pointer (curproc) before * any possible traps/probes to simplify trap processing. */ p = &proc0; curproc = p; /* XXX redundant*/ /* * Initialize process and pgrp structures. */ procinit(); /* * Initialize sleep queue hash table */ sleepinit(); /* * additional VM structures */ vm_init2(); /* * Create process 0 (the swapper). 
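main() above orders the linker set of sysinit records with a bubble sort keyed on the subsystem (primary) and order (secondary) fields, then walks the sorted list and dispatches each record by type. A standalone sketch of just the ordering rule on a stripped-down record; the structure and the sample key values are illustrative:

#include <stdio.h>

/* Miniature of struct sysinit: only the two sort keys. */
struct mini_sysinit {
	unsigned subsystem;	/* primary key */
	unsigned order;		/* secondary key */
};

/*
 * Nonzero when a may stay ahead of b, i.e. the condition under which
 * the bubble sort in main() leaves the pair alone.
 */
static int
sysinit_before(const struct mini_sysinit *a, const struct mini_sysinit *b)
{
	return (a->subsystem < b->subsystem ||
	    (a->subsystem == b->subsystem && a->order < b->order));
}

int
main(void)
{
	struct mini_sysinit announce = { 1, 0 }, p0init = { 2, 1 };	/* made-up keys */

	printf("%d\n", sysinit_before(&announce, &p0init));	/* prints 1 */
	return (0);
}

The sort runs once at boot over a short list, so its quadratic cost is irrelevant; the point is simply that initialization functions execute in subsystem/order sequence regardless of link order, with SI_SUB_DUMMY placeholders skipped by the dispatch loop.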
*/ LIST_INSERT_HEAD(&allproc, p, p_list); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; session0.s_count = 1; session0.s_leader = p; p->p_sysent = &aout_sysvec; p->p_flag = P_INMEM | P_SYSTEM; p->p_stat = SRUN; p->p_nice = NZERO; p->p_rtprio.type = RTP_PRIO_NORMAL; p->p_rtprio.prio = 0; /* * Link for kernel based threads */ p->p_peers = 0; p->p_leader = p; bcopy("swapper", p->p_comm, sizeof ("swapper")); /* Create credentials. */ cred0.p_refcnt = 1; p->p_cred = &cred0; p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ /* Create the file descriptor table. */ fdp = &filedesc0; p->p_fd = &fdp->fd_fd; fdp->fd_fd.fd_refcnt = 1; fdp->fd_fd.fd_cmask = cmask; fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; fdp->fd_fd.fd_nfiles = NDFILE; /* Create the limits structures. */ p->p_limit = &limit0; for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; limit0.p_refcnt = 1; /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; - pmap_pinit(&vmspace0.vm_pmap); + pmap_pinit0(&vmspace0.vm_pmap); vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), trunc_page(VM_MAXUSER_ADDRESS), TRUE); vmspace0.vm_map.pmap = &vmspace0.vm_pmap; p->p_addr = proc0paddr; /* XXX */ #define INCOMPAT_LITES2 #ifdef INCOMPAT_LITES2 /* * proc0 needs to have a coherent frame base in it's stack. */ cpu_set_init_frame(p, init_framep); /* XXX! */ #endif /* INCOMPAT_LITES2*/ /* * We continue to place resource usage info and signal * actions in the user struct so they're pageable. */ p->p_stats = &p->p_addr->u_stats; p->p_sigacts = &p->p_addr->u_sigacts; /* * Charge root for one process. */ (void)chgproccnt(0, 1); /* * Initialize the procfs flags (to 0, of course) */ p->p_stops = p->p_stype = p->p_step = 0; } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) /* ARGSUSED*/ static void proc0_post __P((void *dummy)); static void proc0_post(dummy) void *dummy; { struct timeval tv; /* * Now can look at time, having had a chance to verify the time * from the file system. Reset p->p_rtime as it may have been * munched in mi_switch() after the time got set. */ gettime(&boottime); proc0.p_stats->p_start = runtime = mono_time = boottime; proc0.p_rtime.tv_sec = proc0.p_rtime.tv_usec = 0; /* * Give the ``random'' number generator a thump. */ microtime(&tv); srandom(tv.tv_sec ^ tv.tv_usec); /* Initialize signal state for process 0. */ siginit(&proc0); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. 
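proc0_init above seeds proc0's resource limits directly from the VM counters: everything defaults to RLIM_INFINITY, the file and process limits come from maxfiles/maxproc, and the RSS and memory-lock maxima are derived from the number of free pages, with the soft memory-lock limit set to a third of the hard one. A tiny runnable illustration of that last computation; the page size and free-page count here are made up:

#include <stdio.h>

#define FAKE_PAGE_SIZE 4096ul		/* illustrative stand-in for ptoa()'s page size */

int
main(void)
{
	unsigned long v_free_count = 8192;			/* hypothetical free page count */
	unsigned long bytes = v_free_count * FAKE_PAGE_SIZE;	/* i = ptoa(cnt.v_free_count) */

	printf("RLIMIT_RSS max     = %lu\n", bytes);
	printf("RLIMIT_MEMLOCK max = %lu\n", bytes);
	printf("RLIMIT_MEMLOCK cur = %lu\n", bytes / 3);
	return (0);
}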
**** *************************************************************************** */ /* ARGSUSED */ static void root_conf __P((void *dummy)); static void root_conf(dummy) void *dummy; { cpu_rootconf(); } SYSINIT(root_conf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, root_conf, NULL) /* ARGSUSED*/ static void xxx_vfs_root_fdtab __P((void *dummy)); static void xxx_vfs_root_fdtab(dummy) void *dummy; { register struct filedesc0 *fdp = &filedesc0; /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ if (VFS_ROOT(mountlist.cqh_first, &rootvnode)) panic("cannot find root vnode"); fdp->fd_fd.fd_cdir = rootvnode; VREF(fdp->fd_fd.fd_cdir); VOP_UNLOCK(rootvnode, 0, &proc0); fdp->fd_fd.fd_rdir = NULL; } SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL) /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. It is here for two reasons only: **** **** 1) This code returns to startup the system; this is **** abnormal for a kernel thread. **** 2) This code promiscuously uses init_frame **** *************************************************************************** */ static void kthread_init __P((void *dummy)); SYSINIT_KP(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL) extern void prepare_usermode __P((void)); static void start_init __P((struct proc *p)); /* ARGSUSED*/ static void kthread_init(dummy) void *dummy; { /* Create process 1 (init(8)). */ start_init(curproc); prepare_usermode(); /* * This returns to the fork trampoline, then to user mode. */ return; } /* * List of paths to try when searching for "init". */ static char *initpaths[] = { "/sbin/init", "/sbin/oinit", "/sbin/init.bak", "/stand/sysinstall", NULL, }; /* * Start the initial user process; try exec'ing each pathname in "initpaths". * The program is invoked with one argument containing the boot flags. */ static void start_init(p) struct proc *p; { vm_offset_t addr; struct execve_args args; int options, i, error; char **pathp, *path, *ucp, **uap, *arg0, *arg1; initproc = p; /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = trunc_page(VM_MAXUSER_ADDRESS - PAGE_SIZE); if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { /* * Move out the boot flag argument. */ options = 0; ucp = (char *)USRSTACK; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif #if defined(DEVFS) && defined(DEVFS_ROOT) (void)subyte(--ucp, 'd'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ for (i = strlen(path) + 1; i >= 0; i--) (void)subyte(--ucp, path[i]); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((int)ucp & ~(NBPW-1)); (void)suword((caddr_t)--uap, 0); /* terminator */ (void)suword((caddr_t)--uap, (int)arg1); (void)suword((caddr_t)--uap, (int)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. 
/*
 * Start the initial user process; try exec'ing each pathname in "initpaths".
 * The program is invoked with one argument containing the boot flags.
 */
static void
start_init(p)
        struct proc *p;
{
        vm_offset_t addr;
        struct execve_args args;
        int options, i, error;
        char **pathp, *path, *ucp, **uap, *arg0, *arg1;

        initproc = p;

        /*
         * Need just enough stack to hold the faked-up "execve()" arguments.
         */
        addr = trunc_page(VM_MAXUSER_ADDRESS - PAGE_SIZE);
        if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
            FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
                panic("init: couldn't allocate argument space");
        p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
        p->p_vmspace->vm_ssize = 1;

        for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) {
                /*
                 * Move out the boot flag argument.
                 */
                options = 0;
                ucp = (char *)USRSTACK;
                (void)subyte(--ucp, 0);         /* trailing zero */
                if (boothowto & RB_SINGLE) {
                        (void)subyte(--ucp, 's');
                        options = 1;
                }
#ifdef notyet
                if (boothowto & RB_FASTBOOT) {
                        (void)subyte(--ucp, 'f');
                        options = 1;
                }
#endif
#ifdef BOOTCDROM
                (void)subyte(--ucp, 'C');
                options = 1;
#endif
#if defined(DEVFS) && defined(DEVFS_ROOT)
                (void)subyte(--ucp, 'd');
                options = 1;
#endif
                if (options == 0)
                        (void)subyte(--ucp, '-');
                (void)subyte(--ucp, '-');       /* leading hyphen */
                arg1 = ucp;

                /*
                 * Move out the file name (also arg 0).
                 */
                for (i = strlen(path) + 1; i >= 0; i--)
                        (void)subyte(--ucp, path[i]);
                arg0 = ucp;

                /*
                 * Move out the arg pointers.
                 */
                uap = (char **)((int)ucp & ~(NBPW-1));
                (void)suword((caddr_t)--uap, 0);        /* terminator */
                (void)suword((caddr_t)--uap, (int)arg1);
                (void)suword((caddr_t)--uap, (int)arg0);

                /*
                 * Point at the arguments.
                 */
                args.fname = arg0;
                args.argv = uap;
                args.envv = NULL;

                /*
                 * Now try to exec the program.  If it can't be exec'd for
                 * any reason other than that it doesn't exist, complain.
                 *
                 * Otherwise return to main() which returns to btext
                 * which completes the system startup.
                 */
                if ((error = execve(p, &args)) == 0)
                        return;
                if (error != ENOENT)
                        printf("exec %s: error %d\n", path, error);
        }
        printf("init: not found\n");
        panic("no init");
}
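start_init() has no real argv to pass along, so it fabricates one directly in the new process's address space: subyte() and suword() copy out a byte or a word at a time, the flag string and the pathname are pushed downward from USRSTACK, and the argv vector is laid down word-aligned just below the strings. The userland sketch below rebuilds that layout in an ordinary buffer so it can be compiled and inspected; push_byte() and WORDSZ are stand-ins for subyte() and NBPW, and the direct pointer stores take the place of suword() -- none of this is the kernel interface itself.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define STACKSZ 256                     /* stand-in for the page at USRSTACK */
#define WORDSZ  sizeof(char *)          /* stand-in for NBPW */

static unsigned char stack[STACKSZ];

/* Copy one byte just below *top and move *top down (cf. subyte(--ucp, c)). */
static void
push_byte(unsigned char **top, unsigned char c)
{
        *--(*top) = c;
}

int
main(void)
{
        const char *path = "/sbin/init";
        unsigned char *ucp = stack + STACKSZ;   /* like USRSTACK */
        char *arg0, *arg1, **uap;
        int i;

        /* Boot-flag argument "-s", pushed last character first. */
        push_byte(&ucp, 0);
        push_byte(&ucp, 's');
        push_byte(&ucp, '-');
        arg1 = (char *)ucp;

        /* File name (also arg 0), including its trailing NUL. */
        for (i = strlen(path); i >= 0; i--)
                push_byte(&ucp, (unsigned char)path[i]);
        arg0 = (char *)ucp;

        /* Word-aligned argv vector just below the strings. */
        uap = (char **)((uintptr_t)ucp & ~(WORDSZ - 1));
        *--uap = NULL;          /* terminator */
        *--uap = arg1;
        *--uap = arg0;

        printf("argv[0] = %s\n", uap[0]);
        printf("argv[1] = %s\n", uap[1]);
        return (0);
}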
Index: head/sys/vm/pmap.h
===================================================================
--- head/sys/vm/pmap.h (revision 31708)
+++ head/sys/vm/pmap.h (revision 31709)
@@ -1,138 +1,139 @@
/*
 * Copyright (c) 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)pmap.h 8.1 (Berkeley) 6/11/93
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Avadis Tevanian, Jr.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
- * $Id: pmap.h,v 1.23 1997/08/05 22:07:21 dyson Exp $
+ * $Id: pmap.h,v 1.24 1997/08/05 23:03:24 dyson Exp $
 */

/*
 *      Machine address mapping definitions -- machine-independent
 *      section.  [For machine-dependent section, see "machine/pmap.h".]
 */

#ifndef _PMAP_VM_
#define _PMAP_VM_

/*
 * Each machine dependent implementation is expected to
 * keep certain statistics.  They may do this any way they
 * so choose, but are expected to return the statistics
 * in the following structure.
 */
struct pmap_statistics {
        long resident_count;    /* # of pages mapped (total) */
        long wired_count;       /* # of pages wired */
};
typedef struct pmap_statistics *pmap_statistics_t;

#include <machine/pmap.h>

#ifdef KERNEL
void pmap_change_wiring __P((pmap_t, vm_offset_t, boolean_t));
void pmap_clear_modify __P((vm_offset_t pa));
void pmap_clear_reference __P((vm_offset_t pa));
void pmap_copy __P((pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t));
void pmap_copy_page __P((vm_offset_t, vm_offset_t));
void pmap_destroy __P((pmap_t));
void pmap_enter __P((pmap_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t));
vm_offset_t pmap_extract __P((pmap_t, vm_offset_t));
void pmap_growkernel __P((vm_offset_t));
void pmap_init __P((vm_offset_t, vm_offset_t));
boolean_t pmap_is_modified __P((vm_offset_t pa));
boolean_t pmap_ts_referenced __P((vm_offset_t pa));
void pmap_kenter __P((vm_offset_t, vm_offset_t));
void pmap_kremove __P((vm_offset_t));
vm_offset_t pmap_map __P((vm_offset_t, vm_offset_t, vm_offset_t, int));
void pmap_object_init_pt __P((pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_offset_t size, int pagelimit));
boolean_t pmap_page_exists __P((pmap_t, vm_offset_t));
void pmap_page_protect __P((vm_offset_t, vm_prot_t));
void pmap_pageable __P((pmap_t, vm_offset_t, vm_offset_t, boolean_t));
vm_offset_t pmap_phys_address __P((int));
void pmap_pinit __P((pmap_t));
+void pmap_pinit0 __P((pmap_t));
void pmap_protect __P((pmap_t, vm_offset_t, vm_offset_t, vm_prot_t));
void pmap_qenter __P((vm_offset_t, vm_page_t *, int));
void pmap_qremove __P((vm_offset_t, int));
void pmap_reference __P((pmap_t));
void pmap_release __P((pmap_t));
void pmap_remove __P((pmap_t, vm_offset_t, vm_offset_t));
void pmap_remove_pages __P((pmap_t, vm_offset_t, vm_offset_t));
void pmap_zero_page __P((vm_offset_t));
void pmap_prefault __P((pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, vm_object_t object));
int pmap_mincore __P((pmap_t pmap, vm_offset_t addr));
void pmap_new_proc __P((struct proc *p));
void pmap_dispose_proc __P((struct proc *p));
void pmap_swapout_proc __P((struct proc *p));
void pmap_swapin_proc __P((struct proc *p));
void pmap_activate __P((struct proc *p));
vm_offset_t pmap_addr_hint __P((vm_object_t obj, vm_offset_t addr, vm_size_t size));
void pmap_init2 __P((void));
#endif /* KERNEL */

#endif /* _PMAP_VM_ */
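The functional change in this header is the new pmap_pinit0() prototype, matching the switch in proc0_init() above from pmap_pinit() to pmap_pinit0() for vmspace0. proc0's prototype pmap is set up at SI_SUB_INTRINSIC time, before the VM system is fully up, so the machine-dependent layer presumably needs a variant that can attach the page directory built during bootstrap instead of allocating a fresh one. The skeleton below is only a schematic of that split; md_pmap, bootstrap_pdir and pmap_alloc_pdir() are invented names, not the real machine-dependent implementation.

#include <stdio.h>

/* Schematic only: invented types and helpers, not the real MD pmap code. */
typedef struct md_pmap {
        unsigned long   *pm_pdir;       /* page directory for this pmap */
        int              pm_count;      /* reference count */
} md_pmap_t;

/* Page directory set up by hand during early bootstrap (hypothetical). */
static unsigned long bootstrap_pdir[1024];

/* Stand-in for grabbing a fresh, zeroed directory from the VM system. */
static unsigned long *
pmap_alloc_pdir(void)
{
        static unsigned long fresh_pdir[1024];

        return (fresh_pdir);
}

/*
 * pmap_pinit0(): initialize the pmap for proc0/vmspace0.  Nothing can be
 * allocated this early, so reuse the directory the bootstrap code built.
 */
static void
pmap_pinit0(md_pmap_t *pmap)
{
        pmap->pm_pdir = bootstrap_pdir;
        pmap->pm_count = 1;
}

/*
 * pmap_pinit(): initialize a pmap for an ordinary process, once the VM
 * system is far enough along to hand out a new page directory.
 */
static void
pmap_pinit(md_pmap_t *pmap)
{
        pmap->pm_pdir = pmap_alloc_pdir();
        pmap->pm_count = 1;
}

int
main(void)
{
        md_pmap_t pmap0, pmapN;

        pmap_pinit0(&pmap0);    /* proc0: no allocation */
        pmap_pinit(&pmapN);     /* everyone else: fresh directory */
        printf("proc0 reuses bootstrap pdir: %d\n", pmap0.pm_pdir == bootstrap_pdir);
        printf("other pmaps get their own:   %d\n", pmapN.pm_pdir != bootstrap_pdir);
        return (0);
}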