Index: head/sys/amd64/amd64/cpu_switch.S
===================================================================
--- head/sys/amd64/amd64/cpu_switch.S	(revision 31708)
+++ head/sys/amd64/amd64/cpu_switch.S	(revision 31709)
@@ -1,778 +1,815 @@
/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- *	$Id: swtch.s,v 1.63 1997/09/21 15:03:58 peter Exp $
+ *	$Id: swtch.s,v 1.64 1997/10/10 09:44:06 peter Exp $
 */

#include "npx.h"
#include "opt_user_ldt.h"
#include "opt_vm86.h"

#include
#include

#ifdef SMP
#include
#include
#include			/** GRAB_LOPRIO */
#endif /* SMP */

#include "assym.s"

/*****************************************************************************/
/* Scheduling                                                                */
/*****************************************************************************/

/*
 * The following primitives manipulate the run queues.
 * _whichqs tells which of the 32 queues _qs
 * have processes in them.  setrunqueue puts processes into queues, Remrq
 * removes them from queues.  The running process is on no queue,
 * other processes are on a queue related to p->p_priority, divided by 4
 * actually to shrink the 0-127 range of priorities into the 32 available
 * queues.
*/ .data #ifndef SMP .globl _curpcb _curpcb: .long 0 /* pointer to curproc's PCB area */ #endif /* !SMP */ .globl _whichqs, _whichrtqs, _whichidqs _whichqs: .long 0 /* which run queues have data */ _whichrtqs: .long 0 /* which realtime run qs have data */ _whichidqs: .long 0 /* which idletime run qs have data */ .globl _hlt_vector _hlt_vector: .long _default_halt /* pointer to halt routine */ .globl _qs,_cnt,_panic .globl _want_resched _want_resched: .long 0 /* we need to re-run the scheduler */ +#if defined(SWTCH_OPTIM_STATS) + .globl _swtch_optim_stats, _tlb_flush_count +_swtch_optim_stats: .long 0 /* number of _swtch_optims */ +_tlb_flush_count: .long 0 +#endif .text /* * setrunqueue(p) * * Call should be made at spl6(), and p->p_stat should be SRUN */ ENTRY(setrunqueue) movl 4(%esp),%eax #ifdef DIAGNOSTIC cmpb $SRUN,P_STAT(%eax) je set1 pushl $set2 call _panic set1: #endif cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je set_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* realtime priority? */ jne set_id /* must be idle priority */ set_rt: btsl %edx,_whichrtqs /* set q full bit */ shll $3,%edx addl $_rtqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_id: btsl %edx,_whichidqs /* set q full bit */ shll $3,%edx addl $_idqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_nort: /* Normal (RTOFF) code */ movzbl P_PRI(%eax),%edx shrl $2,%edx btsl %edx,_whichqs /* set q full bit */ shll $3,%edx addl $_qs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set2: .asciz "setrunqueue" /* * Remrq(p) * * Call should be made at spl6(). */ ENTRY(remrq) movl 4(%esp),%eax cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je rem_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* normal priority process? */ jne rem_id btrl %edx,_whichrtqs /* clear full bit, panic if clear already */ jb rem1rt pushl $rem3rt call _panic rem1rt: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_rtqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2rt shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichrtqs rem2rt: ret rem_id: btrl %edx,_whichidqs /* clear full bit, panic if clear already */ jb rem1id pushl $rem3id call _panic rem1id: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_idqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? 
*/ je rem2id shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichidqs rem2id: ret rem_nort: movzbl P_PRI(%eax),%edx shrl $2,%edx btrl %edx,_whichqs /* clear full bit, panic if clear already */ jb rem1 pushl $rem3 call _panic rem1: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_qs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2 shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichqs rem2: ret rem3: .asciz "remrq" rem3rt: .asciz "remrq.rt" rem3id: .asciz "remrq.id" /* * When no processes are on the runq, cpu_switch() branches to _idle * to wait for something to come ready. */ ALIGN_TEXT _idle: #ifdef SMP /* when called, we have the mplock, intr disabled */ xorl %ebp,%ebp /* use our idleproc's "context" */ movl _my_idlePTD,%ecx movl %ecx,%cr3 +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl $_idlestack_top,%ecx movl %ecx,%esp /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %ecx, TSS_ESP0(%eax) + movl %ecx, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 cli /* * _REALLY_ free the lock, no matter how deep the prior nesting. * We will recover the nesting on the way out when we have a new * proc to load. * * XXX: we had damn well better be sure we had it before doing this! */ movl $FREE_LOCK, %eax movl %eax, _mp_lock /* do NOT have lock, intrs disabled */ .globl idle_loop idle_loop: +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl %cr3,%eax /* ouch! */ movl %eax,%cr3 cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f jmp 2f 1: cmpl $0,_whichrtqs /* real-time queue */ jne 3f cmpl $0,_whichqs /* normal queue */ jne 3f cmpl $0,_whichidqs /* 'idle' queue */ jne 3f cmpl $0,_do_page_zero_idle je 2f /* XXX appears to cause panics */ /* * Inside zero_idle we enable interrupts and grab the mplock * as needed. It needs to be careful about entry/exit mutexes. 
*/ call _vm_page_zero_idle /* internal locking */ testl %eax, %eax jnz idle_loop 2: /* enable intrs for a halt */ #ifdef SMP movl $0, lapic_tpr /* 1st candidate for an INT */ #endif sti call *_hlt_vector /* wait for interrupt */ cli jmp idle_loop 3: #ifdef SMP movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ #endif call _get_mplock cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _rel_mplock jmp idle_loop #else xorl %ebp,%ebp movl $HIDENAME(tmpstk),%esp - movl _IdlePTD,%ecx - movl %ecx,%cr3 +#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif + movl _IdlePTD, %ecx + movl %cr3, %eax + cmpl %ecx, %eax + je 2f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl %ecx, %cr3 +2: +#endif /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %esp, TSS_ESP0(%eax) + movl %esp, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 ALIGN_TEXT idle_loop: cli cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _vm_page_zero_idle testl %eax, %eax jnz idle_loop sti call *_hlt_vector /* wait for interrupt */ jmp idle_loop #endif CROSSJUMPTARGET(_idle) ENTRY(default_halt) #ifndef SMP hlt /* XXX: until a wakeup IPI */ #endif ret /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. first, save context as needed */ movl _curproc,%ecx /* if no process to save, don't bother */ testl %ecx,%ecx je sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ #endif /* SMP */ movl P_ADDR(%ecx),%ecx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #ifdef SMP movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ #ifdef DIAGNOSTIC cmpl $FREE_LOCK, %eax /* is it free? */ je badsw4 /* yes, bad medicine! */ #endif /* DIAGNOSTIC */ andl $COUNT_FIELD, %eax /* clear CPU portion */ movl %eax, PCB_MPNEST(%ecx) /* store it */ #endif /* SMP */ #if NNPX > 0 /* have we used fp, and need a save? */ movl _curproc,%eax cmpl %eax,_npxproc jne 1f addl $PCB_SAVEFPU,%ecx /* h/w bugs make saving complicated */ pushl %ecx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* NNPX > 0 */ movl $0,_curproc /* out of process */ /* save is done, now choose a new process or idle */ sw1: cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f CROSSJUMP(je, _idle, jne) /* wind down */ 1: #endif sw1a: movl _whichrtqs,%edi /* pick next p. 
from rtqs */ testl %edi,%edi jz nortqr /* no realtime procs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz nortqr /* no proc on rt q - try normal ... */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _rtqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je rt3 btsl %ebx,%edi /* nope, set to indicate not empty */ rt3: movl %edi,_whichrtqs /* update q status */ jmp swtch_com /* old sw1a */ /* Normal process priority's */ nortqr: movl _whichqs,%edi 2: /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz idqr /* if none, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _qs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je 3f btsl %ebx,%edi /* nope, set to indicate not empty */ 3: movl %edi,_whichqs /* update q status */ jmp swtch_com idqr: /* was sw1a */ movl _whichidqs,%edi /* pick next p. from idqs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ CROSSJUMP(je, _idle, jne) /* if no proc, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _idqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je id3 btsl %ebx,%edi /* nope, set to indicate not empty */ id3: movl %edi,_whichidqs /* update q status */ swtch_com: movl $0,%eax movl %eax,_want_resched #ifdef DIAGNOSTIC cmpl %eax,P_WCHAN(%ecx) jne badsw1 cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl %eax,P_BACK(%ecx) /* isolate process to run */ movl P_ADDR(%ecx),%edx - movl PCB_CR3(%edx),%ebx #ifdef SMP + movl PCB_CR3(%edx),%ebx /* Grab the private PT pointer from the outgoing process's PTD */ movl $_PTD, %esi movl 4*MPPTDI(%esi), %eax /* fetch cpu's prv pt */ -#endif /* SMP */ - +#else +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif /* switch address space */ + movl %cr3,%ebx + cmpl PCB_CR3(%edx),%ebx + je 4f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl PCB_CR3(%edx),%ebx +#endif /* SMP */ movl %ebx,%cr3 +4: #ifdef SMP /* Copy the private PT to the new process's PTD */ /* XXX yuck, the _PTD changes when we switch, so we have to * reload %cr3 after changing the address space. * We need to fix this by storing a pointer to the virtual * location of the per-process PTD in the PCB or something quick. * Dereferencing proc->vm_map->pmap->p_pdir[] is painful in asm. */ movl %eax, 4*MPPTDI(%esi) /* restore cpu's prv page */ +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif /* XXX: we have just changed the page tables.. reload.. */ movl %ebx, %cr3 #endif /* SMP */ #ifdef VM86 movl _my_tr, %esi cmpl $0, PCB_EXT(%edx) /* has pcb extension? 
*/ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: #endif /* update common_tss.tss_esp0 pointer */ movl $_common_tss, %eax movl %edx, %ebx /* pcb */ #ifdef VM86 addl $(UPAGES * PAGE_SIZE - 16), %ebx #else addl $(UPAGES * PAGE_SIZE), %ebx #endif /* VM86 */ movl %ebx, TSS_ESP0(%eax) #ifdef VM86 btrl %esi, _private_tss je 3f movl $_common_tssd, %edi 2: /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: #endif /* VM86 */ /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, lapic_tpr #else andl $~APIC_TPR_PRIO, lapic_tpr #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ movl _cpuid,%eax movb %al, P_ONCPU(%ecx) #endif /* SMP */ movl %edx, _curpcb movl %ecx, _curproc /* into next process */ #ifdef SMP movl _cpu_lockid, %eax orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl _currentldt,%eax je 2f lldt __default_ldt movl %eax,_currentldt jmp 2f 1: pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_fs cpu_switch_load_fs: movl PCB_FS(%edx),%fs .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs sti ret CROSSJUMPTARGET(idqr) CROSSJUMPTARGET(nortqr) CROSSJUMPTARGET(sw1a) #ifdef DIAGNOSTIC badsw1: pushl $sw0_1 call _panic sw0_1: .asciz "cpu_switch: has wchan" badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" #endif #if defined(SMP) && defined(DIAGNOSTIC) badsw4: pushl $sw0_4 call _panic sw0_4: .asciz "cpu_switch: do not have lock" #endif /* SMP && DIAGNOSTIC */ /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #if NNPX > 0 /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
*/ movl _npxproc,%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* NNPX > 0 */ 1: ret Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c (revision 31708) +++ head/sys/amd64/amd64/machdep.c (revision 31709) @@ -1,1798 +1,1806 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.277 1997/12/04 14:35:39 jkh Exp $ + * $Id: machdep.c,v 1.278 1997/12/04 21:21:24 jmg Exp $ */ #include "apm.h" #include "npx.h" #include "opt_bounce.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_maxmem.h" #include "opt_perfmon.h" #include "opt_smp.h" #include "opt_sysvipc.h" #include "opt_userconfig.h" #include "opt_vm86.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if NAPM > 0 #include #endif #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #ifdef SMP #include #endif #ifdef PERFMON #include #endif #include #include #include #include extern void init386 __P((int first)); extern int ptrace_set_pc __P((struct proc *p, unsigned int addr)); extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); #ifdef BOUNCE_BUFFERS #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif #endif /* BOUNCE_BUFFERS */ int msgbufmapped = 0; /* set when safe to use msgbuf */ int _udatasel, _ucodesel; u_int atdevbase; + +#if defined(SWTCH_OPTIM_STATS) +extern int swtch_optim_stats; +SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, + CTLFLAG_RD, &swtch_optim_stats, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, + CTLFLAG_RD, &tlb_flush_count, 0, ""); +#endif int physmem = 0; int cold = 1; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); int bootverbose = 0, Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static void setup_netisrs __P((struct linker_set *)); /* XXX declare elsewhere */ static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; extern struct linker_set netisr_set; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. 
*/ printf(version); earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %d (%dK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Quickly wire in netisrs. */ setup_netisrs(&netisr_set); /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 8, 2048); } nswbuf = max(min(nbuf/4, 128), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifdef BOUNCE_BUFFERS /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) { bouncepages = 64; bouncepages += ((Maxmem - 4096) / 2048) * 32; if (bouncepages > 128) bouncepages = 128; } v = (caddr_t)((vm_offset_t)round_page(v)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); #ifdef BOUNCE_BUFFERS clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + maxbkva + pager_map_size, TRUE); io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); #else clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size, TRUE); #endif buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size, TRUE); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*ARG_MAX), TRUE); u_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (maxproc*UPAGES*PAGE_SIZE), FALSE); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ { vm_offset_t mb_map_size; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size, FALSE); mb_map->system_map = 1; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) #if defined(USERCONFIG_BOOT) if (1) { #else if (boothowto & RB_CONFIG) { #endif userconfig(); cninit(); /* the preferred console may have changed */ } #endif #ifdef BOUNCE_BUFFERS /* * init bounce buffers */ vm_bounce_init(); #endif printf("avail memory = %d (%dK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! 
*/ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } static void setup_netisrs(ls) struct linker_set *ls; { int i; const struct netisrtab *nit; for(i = 0; ls->ls_items[i]; i++) { nit = (const struct netisrtab *)ls->ls_items[i]; register_netisr(nit->nit_num, nit->nit_isr); } } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; u_long code; { register struct proc *p = curproc; register struct trapframe *regs; register struct sigframe *fp; struct sigframe sf; struct sigacts *psp = p->p_sigacts; int oonstack; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SS_ONSTACK; /* * Allocate and validate space for the signal handler context. */ if ((psp->ps_flags & SAS_ALTSTACK) && !oonstack && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_sp + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SS_ONSTACK; } else { fp = (struct sigframe *)regs->tf_esp - 1; } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof(struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) { if (sig < p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[sig]; else sig = p->p_sysent->sv_sigsize + 1; } sf.sf_signum = sig; sf.sf_code = code; sf.sf_scp = &fp->sf_sc; sf.sf_addr = (char *) regs->tf_err; sf.sf_handler = catcher; /* save scratch registers */ sf.sf_sc.sc_eax = regs->tf_eax; sf.sf_sc.sc_ebx = regs->tf_ebx; sf.sf_sc.sc_ecx = regs->tf_ecx; sf.sf_sc.sc_edx = regs->tf_edx; sf.sf_sc.sc_esi = regs->tf_esi; sf.sf_sc.sc_edi = regs->tf_edi; sf.sf_sc.sc_cs = regs->tf_cs; sf.sf_sc.sc_ds = regs->tf_ds; sf.sf_sc.sc_ss = regs->tf_ss; sf.sf_sc.sc_es = regs->tf_es; sf.sf_sc.sc_isp = regs->tf_isp; /* * Build the signal context to be used by sigreturn. */ sf.sf_sc.sc_onstack = oonstack; sf.sf_sc.sc_mask = mask; sf.sf_sc.sc_sp = regs->tf_esp; sf.sf_sc.sc_fp = regs->tf_ebp; sf.sf_sc.sc_pc = regs->tf_eip; sf.sf_sc.sc_ps = regs->tf_eflags; sf.sf_sc.sc_trapno = regs->tf_trapno; sf.sf_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. 
*/ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_sc.sc_gs = tf->tf_vm86_gs; sf.sf_sc.sc_fs = tf->tf_vm86_fs; sf.sf_sc.sc_es = tf->tf_vm86_es; sf.sf_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. */ tf->tf_eflags &= ~(PSL_VM | PSL_T | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = (int)(((char *)PS_STRINGS) - *(p->p_sysent->sv_szsigcode)); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int sigreturn(p, uap) struct proc *p; struct sigreturn_args /* { struct sigcontext *sigcntxp; } */ *uap; { register struct sigcontext *scp; register struct sigframe *fp; register struct trapframe *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs->tf_esp points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), B_WRITE) == 0) return(EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* go back to user mode if both flags are set */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); #define VM_USERCHANGE (PSL_USERCHANGE | PSL_RF) #define VME_USERCHANGE (VM_USERCHANGE | PSL_VIP | PSL_VIF) if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. 
* Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { #ifdef DEBUG printf("sigreturn: eflags = 0x%x\n", eflags); #endif return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(scp->sc_cs)) { #ifdef DEBUG printf("sigreturn: cs = 0x%x\n", scp->sc_cs); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; } /* restore scratch registers */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), B_WRITE) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SS_ONSTACK; p->p_sigmask = scp->sc_mask & ~sigcantmask; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; return(EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Turn the power off. */ void cpu_power_down(void) { #if NAPM > 0 apm_power_off(); #endif } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { struct trapframe *regs = p->p_md.md_regs; #ifdef USER_LDT struct pcb *pcb = &p->p_addr->u_pcb; /* was i386_user_cleanup() in NetBSD */ if (pcb->pcb_ldt) { if (pcb == curpcb) lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ldt, pcb->pcb_ldt_len * sizeof(union descriptor)); pcb->pcb_ldt_len = (int)pcb->pcb_ldt = 0; } #endif bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_cs = _ucodesel; /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #if NNPX > 0 /* Initialize the npx (if any) for the current process. 
*/ npxinit(__INITIAL_NPXCW__); #endif } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int currentldt; int _default_ldt; #ifdef SMP union descriptor gdt[NGDT + NCPU]; /* global descriptor table */ #else union descriptor gdt[NGDT]; /* global descriptor table */ #endif struct gate_descriptor idt[NIDT]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif #ifdef SMP extern struct i386tss common_tss; /* One tss per cpu */ #ifdef VM86 extern struct segment_descriptor common_tssd; extern int private_tss; extern u_int my_tr; #endif /* VM86 */ #else struct i386tss common_tss; #ifdef VM86 struct segment_descriptor common_tssd; u_int private_tss; /* flag indicating private tss */ u_int my_tr; /* which task register setting */ #endif /* VM86 */ #endif #if defined(I586_CPU) && !defined(NO_F00F_HACK) struct gate_descriptor *t_idt; extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[ #ifdef SMP NGDT + NCPU #endif ] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 3 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 4 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 5 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* 
length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { (int) &common_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 7 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 8 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 9 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 10 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void 
setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void init386(first) int first; { int x; unsigned biosbasemem, biosextmem; struct gate_descriptor *gdp; int gsel_tss; struct isa_device *idp; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int pagesinbase, pagesinext; int target_page, pa_indx; int off; int speculative_mprobe; /* * Prevent lowering of the ipl if we call tsleep() early. */ safepri = cpl; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; /* * Initialize the console before we print anything out. */ cninit(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef BDE_DEBUGGER #define NGDT1 8 /* avoid overwriting db entries with APM ones */ #else #define NGDT1 (sizeof gdt_segs / sizeof gdt_segs[0]) #endif for (x = 0; x < NGDT1; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); #ifdef VM86 common_tssd = gdt[GPROC0_SEL].sd; #endif /* VM86 */ #ifdef SMP /* * Spin these up now. init_secondary() grabs them. We could use * #for(x,y,z) / #endfor cpp directives if they existed. */ for (x = 0; x < NCPU; x++) { gdt_segs[NGDT + x] = gdt_segs[GPROC0_SEL]; ssdtosd(&gdt_segs[NGDT + x], &gdt[NGDT + x].sd); } #endif /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); r_gdt.rd_limit = sizeof(gdt) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); r_idt.rd_limit = sizeof(idt) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); currentldt = _default_ldt; #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* Use BIOS values stored in RTC CMOS RAM, since probing * breaks certain 386 AT relics. */ biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8); biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8); /* * If BIOS tells us that it has more than 640k in the basemem, * don't believe it - set it to 640k. */ if (biosbasemem > 640) { printf("Preposterous RTC basemem of %dK, truncating to 640K\n", biosbasemem); biosbasemem = 640; } if (bootinfo.bi_memsizes_valid && bootinfo.bi_basemem > 640) { printf("Preposterous BIOS basemem of %dK, truncating to 640K\n", bootinfo.bi_basemem); bootinfo.bi_basemem = 640; } /* * Warn if the official BIOS interface disagrees with the RTC * interface used above about the amount of base memory or the * amount of extended memory. Prefer the BIOS value for the base * memory. This is necessary for machines that `steal' base * memory for use as BIOS memory, at least if we are going to use * the BIOS for apm. Prefer the RTC value for extended memory. 
* Eventually the hackish interface shouldn't even be looked at. */ if (bootinfo.bi_memsizes_valid) { if (bootinfo.bi_basemem != biosbasemem) { vm_offset_t pa; printf( "BIOS basemem (%ldK) != RTC basemem (%dK), setting to BIOS value\n", bootinfo.bi_basemem, biosbasemem); biosbasemem = bootinfo.bi_basemem; /* * XXX if biosbasemem is now < 640, there is `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from 0 to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(biosbasemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { unsigned *pte; pte = (unsigned *)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } } if (bootinfo.bi_extmem != biosextmem) printf("BIOS extmem (%ldK) != RTC extmem (%dK)\n", bootinfo.bi_extmem, biosextmem); } #ifdef SMP /* make hole for AP bootstrap code */ pagesinbase = mp_bootaddress(biosbasemem) / PAGE_SIZE; #else pagesinbase = biosbasemem * 1024 / PAGE_SIZE; #endif pagesinext = biosextmem * 1024 / PAGE_SIZE; /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. */ /* * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((pagesinext > 3840) && (pagesinext < 4096)) pagesinext = 3840; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = pagesinext + 0x100000/PAGE_SIZE; /* * Indicate that we wish to do a speculative search for memory beyond * the end of the reported size if the indicated amount is 64MB (0x4000 * pages) - which is the largest amount that the BIOS/bootblocks can * currently report. If a specific amount of memory is indicated via * the MAXMEM option or the npx0 "msize", then don't do the speculative * memory probe. */ if (Maxmem >= 0x4000) speculative_mprobe = TRUE; else speculative_mprobe = FALSE; #ifdef MAXMEM Maxmem = MAXMEM/4; speculative_mprobe = FALSE; #endif #if NNPX > 0 idp = find_isadev(isa_devtab_null, &npxdriver, 0); if (idp != NULL && idp->id_msize != 0) { Maxmem = idp->id_msize / 4; speculative_mprobe = FALSE; } #endif #ifdef SMP /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap (first, 0); /* * Size up each available chunk of physical memory. */ /* * We currently don't bother testing base memory. * XXX ...but we probably should. 
*/ pa_indx = 0; if (pagesinbase > 1) { phys_avail[pa_indx++] = PAGE_SIZE; /* skip first page of memory */ phys_avail[pa_indx] = ptoa(pagesinbase);/* memory up to the ISA hole */ physmem = pagesinbase - 1; } else { /* point at first chunk end */ pa_indx++; } for (target_page = avail_start; target_page < ptoa(Maxmem); target_page += PAGE_SIZE) { int tmp, page_bad; page_bad = FALSE; /* * map page into kernel: valid, read/write, non-cacheable */ *(int *)CMAP1 = PG_V | PG_RW | PG_N | target_page; invltlb(); tmp = *(int *)CADDR1; /* * Test for alternating 1's and 0's */ *(volatile int *)CADDR1 = 0xaaaaaaaa; if (*(volatile int *)CADDR1 != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)CADDR1 = 0x55555555; if (*(volatile int *)CADDR1 != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)CADDR1 = 0xffffffff; if (*(volatile int *)CADDR1 != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)CADDR1 = 0x0; if (*(volatile int *)CADDR1 != 0x0) { /* * test of page failed */ page_bad = TRUE; } /* * Restore original value. */ *(int *)CADDR1 = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == FALSE) { /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == target_page) { phys_avail[pa_indx] += PAGE_SIZE; if (speculative_mprobe == TRUE && phys_avail[pa_indx] >= (64*1024*1024)) Maxmem++; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = target_page; /* start */ phys_avail[pa_indx] = target_page + PAGE_SIZE; /* end */ } physmem++; } } *(int *)CMAP1 = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(sizeof(struct msgbuf)) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(sizeof(struct msgbuf)); avail_end = phys_avail[pa_indx]; /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. */ for (off = 0; off < round_page(sizeof(struct msgbuf)); off += PAGE_SIZE) pmap_enter(kernel_pmap, (vm_offset_t)msgbufp + off, avail_end + off, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ #ifdef VM86 common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; #else common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE; #endif /* VM86 */ common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; common_tss.tss_ioopt = (sizeof common_tss) << 16; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); #ifdef VM86 private_tss = 0; my_tr = GPROC0_SEL; #endif dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_fs = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; proc0.p_addr->u_pcb.pcb_mpnest = 1; proc0.p_addr->u_pcb.pcb_ext = 0; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) void f00f_hack(void); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); void f00f_hack(void) { struct region_descriptor r_idt; unsigned char *tmp; int i; if (!has_f00f_bug) return; printf("Intel Pentium F00F detected, installing workaround\n"); r_idt.rd_limit = sizeof(idt) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ t_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, t_idt, sizeof(idt)); r_idt.rd_base = (int)t_idt; lidt(&r_idt); if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned int addr; { p->p_md.md_regs->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; int data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. 
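An illustrative sketch of the F00F layout arithmetic above: with 8-byte descriptors and the table starting 7*8 bytes before the end of the first of two pages, vectors 0-6 land on the page that is then made read-only. The constants assume i386 values; this is only a demonstration of the offset math:

#include <stdio.h>

#define PAGE_SIZE 4096  /* assumption: i386 page size */
#define DESC_SIZE 8     /* assumption: size of one gate descriptor */

int
main(void)
{
        unsigned long base = PAGE_SIZE - 7 * DESC_SIZE; /* offset of idt[0] */
        int vec;

        for (vec = 0; vec < 10; vec++) {
                unsigned long off = base + vec * DESC_SIZE;
                printf("vector %d -> page %lu\n", vec, off / PAGE_SIZE);
        }
        return (0);
}

Running this shows vectors 0 through 6 on page 0 and vector 7 onward spilling to page 1, which is why protecting only the first page blocks the locked-CMPXCHG8B lookup without breaking normal interrupt delivery.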
*/ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_fs = pcb->pcb_fs; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_fs = regs->r_fs; pcb->pcb_gs = regs->r_gs; return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. 
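A sketch of the clamping policy in bounds_check_with_label() above, using plain integers instead of struct buf; the names are hypothetical and 512-byte sectors are assumed:

#include <stdio.h>

#define DEV_BSHIFT 9    /* assumption: 512-byte device blocks */

/*
 * Returns 1 = do the transfer (count may have been truncated),
 * 0 = EOF exactly at the end of the partition, -1 = error.
 */
static int
clamp_transfer(long blkno, long *countp /* bytes */, long part_size)
{
        long sz = (*countp + (1 << DEV_BSHIFT) - 1) >> DEV_BSHIFT;

        if (blkno < 0 || blkno + sz > part_size) {
                if (blkno == part_size)
                        return (0);             /* exactly at end: EOF */
                sz = part_size - blkno;
                if (sz <= 0)
                        return (-1);            /* entirely outside */
                *countp = sz << DEV_BSHIFT;     /* truncate to fit */
        }
        return (1);
}

int
main(void)
{
        long count = 4096;                              /* 8 sectors */
        int r = clamp_transfer(1022, &count, 1024);     /* 1024-sector part. */

        printf("r=%d count=%ld\n", r, count);           /* truncated to 1024 bytes */
        return (0);
}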
* * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 31708) +++ head/sys/amd64/amd64/pmap.c (revision 31709) @@ -1,3374 +1,3399 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.172 1997/11/07 19:58:34 tegge Exp $ + * $Id: pmap.c,v 1.173 1997/11/20 19:30:31 bde Exp $ */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. 
These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define PTPHINT /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; #define pa_index(pa) atop((pa) - vm_first_phys) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) static struct pmap kernel_pmap_store; pmap_t kernel_pmap; extern pd_entry_t my_idlePTD; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ static vm_offset_t vm_first_phys; int pgeflag; /* PG_G or-in */ int pseflag; /* PG_PS or-in */ int pv_npg; int nkpt; vm_offset_t kernel_vm_end; /* * Data for the pv entry allocation mechanism */ vm_zone_t pvzone; struct vm_zone pvzone_store; struct vm_object pvzone_obj; int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; int pmap_pagedaemon_waken = 0; struct pv_entry *pvinit; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *ptmmap; static pv_table_t *pv_table; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp=0; #ifdef SMP extern char prv_CPAGE1[], prv_CPAGE2[], prv_CPAGE3[]; extern pt_entry_t *prv_CMAP1, *prv_CMAP2, *prv_CMAP3; extern pd_entry_t *IdlePTDS[]; extern pt_entry_t SMP_prvpt[]; #endif pt_entry_t *PMAP1 = 0; unsigned *PADDR1 = 0; static PMAP_INLINE void free_pv_entry __P((pv_entry_t pv)); static unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static PMAP_INLINE int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_page_t mpte)); static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv, vm_offset_t va)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa)); static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex)); static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); static vm_page_t pmap_page_alloc __P((vm_object_t object, vm_pindex_t pindex)); static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex)); static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); vm_offset_t pmap_kmem_choose(vm_offset_t addr) ; void pmap_collect(void); #define PDSTACKMAX 6 static vm_offset_t pdstack[PDSTACKMAX]; static int pdstackptr; unsigned pdir4mb; /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. */ PMAP_INLINE unsigned * pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned *pdeaddr; if (pmap) { pdeaddr = (unsigned *) pmap_pde(pmap, va); if (*pdeaddr & PG_PS) return pdeaddr; if (*pdeaddr) { return get_ptbase(pmap) + i386_btop(va); } } return (0); } /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) { newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); } #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. 
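A small sketch of the rounding performed by pmap_kmem_choose() above: the kernel virtual free pointer is aligned up to the next 4MB (NBPDR) boundary so the kernel text/data/bss can be covered by a single PSE page. The helper name is hypothetical:

#include <stdio.h>

#define NBPDR (1UL << 22)       /* bytes mapped by one PDE with PSE (4MB) */

static unsigned long
round_to_pde(unsigned long addr)
{
        return ((addr + (NBPDR - 1)) & ~(NBPDR - 1));
}

int
main(void)
{
        printf("0x%lx -> 0x%lx\n", 0xc0321000UL, round_to_pde(0xc0321000UL));
        return (0);
}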
* [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; int i, j; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); kernel_pmap->pm_count = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(sizeof(struct msgbuf)))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = 0; *(int *) PTD = 0; pgeflag = 0; #if !defined(SMP) if (cpu_feature & CPUID_PGE) { pgeflag = PG_G; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. */ pdir4mb = 0; #if !defined(DISABLE_PSE) if (cpu_feature & CPUID_PSE) { unsigned ptditmp; /* * Enable the PSE mode */ load_cr4(rcr4() | CR4_PSE); /* * Note that we have enabled PSE mode */ pseflag = PG_PS; ptditmp = (unsigned) kernel_pmap->pm_pdir[KPTDI]; ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; /* * We can do the mapping here for the single processor * case. We simply ignore the old page table page from * now on. */ #if !defined(SMP) PTD[KPTDI] = (pd_entry_t) ptditmp; kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; invltlb(); #endif } #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic!"); /* 0 = private page */ /* 1 = page table page */ /* 2 = local apic */ /* 16-31 = io apics */ SMP_prvpt[2] = (pt_entry_t)(PG_V | PG_RW | pgeflag | ((u_long)cpu_apic_address & PG_FRAME)); for (i = 0; i < mp_napics; i++) { for (j = 0; j < 16; j++) { /* same page frame as a previous IO apic? 
*/ if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) == ((u_long)io_apic_address[0] & PG_FRAME)) { ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } /* use this slot if available */ if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) == 0) { SMP_prvpt[j + 16] = (pt_entry_t)(PG_V | PG_RW | pgeflag | ((u_long)io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } } if (j == 16) panic("no space to map IO apic %d!", i); } /* BSP does this itself, AP's get it pre-set */ prv_CMAP1 = (pt_entry_t *)&SMP_prvpt[3 + UPAGES]; prv_CMAP2 = (pt_entry_t *)&SMP_prvpt[4 + UPAGES]; prv_CMAP3 = (pt_entry_t *)&SMP_prvpt[5 + UPAGES]; #endif invltlb(); } /* * Set 4mb pdir for mp startup, and global flags */ void pmap_set_opt(unsigned *pdir) { int i; if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); if (pdir4mb) { (unsigned) pdir[KPTDI] = pdir4mb; } } if (pgeflag && (cpu_feature & CPUID_PGE)) { load_cr4(rcr4() | CR4_PGE); for(i = KPTDI; i < KPTDI + nkpt; i++) { if (pdir[i]) { pdir[i] |= PG_G; } } } } /* * Setup the PTD for the boot processor */ void pmap_set_opt_bsp(void) { pmap_set_opt((unsigned *)kernel_pmap->pm_pdir); pmap_set_opt((unsigned *)PTD); invltlb(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t s; int i; int initial_pvs; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); pv_npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ s = (vm_size_t) (sizeof(pv_table_t) * pv_npg); s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_table_t *) addr; for(i = 0; i < pv_npg; i++) { vm_offset_t pa; TAILQ_INIT(&pv_table[i].pv_list); pv_table[i].pv_list_count = 0; pa = vm_first_phys + i * PAGE_SIZE; pv_table[i].pv_vm_page = PHYS_TO_VM_PAGE(pa); } /* * init the pv free list */ initial_pvs = pv_npg; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = &pvzone_store; pvinit = (struct pv_entry *) kmem_alloc(kernel_map, initial_pvs * sizeof (struct pv_entry)); zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, pv_npg); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg; pv_entry_high_water = 9 * (pv_entry_max / 10); zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. 
* This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified( vm_offset_t va) { if ((va < clean_sva) || (va >= clean_eva)) return 1; else return 0; } static PMAP_INLINE void invltlb_1pg( vm_offset_t va) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va); } } static PMAP_INLINE void invltlb_2pg( vm_offset_t va1, vm_offset_t va2) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va1); invlpg(va2); } } static unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); invltlb(); } return (unsigned *) APTmap; } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. */ static unsigned * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned pde, newpf; if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; unsigned index = i386_btop(va); /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == (((unsigned) PTDpde) & PG_FRAME))) { return (unsigned *) PTmap + index; } newpf = pde & PG_FRAME; if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; invltlb_1pg((vm_offset_t) PADDR1); } return PADDR1 + ((unsigned) index & (NPTEPG - 1)); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t rtval; vm_offset_t pdirindex; pdirindex = va >> PDRSHIFT; if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) { unsigned *pte; if ((rtval & PG_PS) != 0) { rtval &= ~(NBPDR - 1); rtval |= va & (NBPDR - 1); return rtval; } pte = get_ptbase(pmap) + i386_btop(va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /* * determine if a page is managed (memory vs. device) */ static PMAP_INLINE int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa < phys_avail[i + 1] && pa >= phys_avail[i]) return 1; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. 
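An illustrative sketch of the address composition done by pmap_extract() above: a 4MB (PG_PS) directory entry supplies the upper bits and the low 22 bits come from the virtual address, while a 4K PTE supplies a frame and the low 12 bits come from the VA. Constants are the usual i386 values; the helpers are hypothetical:

#include <stdio.h>

#define PAGE_MASK 0xfffUL       /* low 12 bits of a 4K page */
#define NBPDR     (1UL << 22)   /* bytes mapped by one PDE (4MB) */
#define PG_FRAME  0xfffff000UL  /* frame portion of a PTE */

static unsigned long
extract_4m(unsigned long pde, unsigned long va)
{
        return ((pde & ~(NBPDR - 1)) | (va & (NBPDR - 1)));
}

static unsigned long
extract_4k(unsigned long pte, unsigned long va)
{
        return ((pte & PG_FRAME) | (va & PAGE_MASK));
}

int
main(void)
{
        printf("4M: 0x%lx\n", extract_4m(0x00800083UL, 0x00923456UL));
        printf("4K: 0x%lx\n", extract_4k(0x00123067UL, 0xc0000abcUL));
        return (0);
}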
*/ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | pgeflag; unsigned opte; pte = (unsigned *)vtopte(tva); opte = *pte; *pte = npte; if (opte) invltlb_1pg(tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ PMAP_INLINE void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V | pgeflag; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; if (opte) invltlb_1pg(va); } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); } static vm_page_t pmap_page_alloc(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; m = vm_page_alloc(object, pindex, VM_ALLOC_ZERO); if (m == NULL) { VM_WAIT; } return m; } static vm_page_t pmap_page_lookup(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m) { if (m->flags & PG_BUSY) { m->flags |= PG_WANTED; tsleep(m, PVM, "pplookp", 0); goto retry; } } return m; } /* * Create the UPAGES for a new process. * This routine directly affects the fork perf for a process. */ void pmap_new_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; struct user *up; unsigned *ptek; /* * allocate object for the upages */ upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES); p->p_upages_obj = upobj; /* get a kernel virtual address for the UPAGES for this proc */ up = (struct user *) kmem_alloc_pageable(u_map, UPAGES * PAGE_SIZE); if (up == NULL) panic("pmap_new_proc: u_map allocation failed"); ptek = (unsigned *) vtopte((vm_offset_t) up); for(i=0;iwire_count++; ++cnt.v_wire_count; /* * Enter the page into the kernel address space. */ *(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; m->flags &= ~(PG_ZERO|PG_BUSY); m->flags |= PG_MAPPED|PG_WRITEABLE; m->valid = VM_PAGE_BITS_ALL; } p->p_addr = up; } /* * Dispose the UPAGES for a process that has exited. * This routine directly impacts the exit perf of a process. */ void pmap_dispose_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; unsigned *ptek; ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr); upobj = p->p_upages_obj; for(i=0;ip_addr + i * PAGE_SIZE); vm_page_unwire(m); vm_page_free(m); } vm_object_deallocate(upobj); kmem_free(u_map, (vm_offset_t)p->p_addr, ctob(UPAGES)); } /* * Allow the UPAGES for a process to be prejudicially paged out. */ void pmap_swapout_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; /* * let the upages be paged */ for(i=0;idirty = VM_PAGE_BITS_ALL; vm_page_unwire(m); vm_page_deactivate(m); pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i); } } /* * Bring the UPAGES for a specified process back in. 
*/ void pmap_swapin_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; for(i=0;iflags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "swinuw",0); goto retry; } m->flags |= PG_BUSY; } vm_page_wire(m); splx(s); pmap_kenter(((vm_offset_t) p->p_addr) + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { int rv; rv = vm_pager_get_pages(upobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid); m->valid = VM_PAGE_BITS_ALL; } PAGE_WAKEUP(m); m->flags |= PG_MAPPED|PG_WRITEABLE; } } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { int s; if (m->flags & PG_BUSY) { s = splvm(); while (m->flags & PG_BUSY) { m->flags |= PG_WANTED; tsleep(m, PVM, "pmuwpt", 0); } splx(s); } if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); invltlb_1pg(pteva); } #if defined(PTPHINT) if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; #endif /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { if (m->flags & PG_WANTED) { m->flags &= ~PG_WANTED; wakeup(m); } vm_page_free_zero(m); --cnt.v_wire_count; } return 1; } return 0; } __inline static int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap, va, mpte) pmap_t pmap; vm_offset_t va; vm_page_t mpte; { unsigned ptepindex; if (va >= UPT_MIN_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } #else mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif } return pmap_unwire_pte_hold(pmap, mpte); } +#if !defined(SMP) +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + pmap->pm_pdir = + (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); + pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); + pmap->pm_flags = 0; + pmap->pm_count = 1; + pmap->pm_ptphint = NULL; + TAILQ_INIT(&pmap->pm_pvlist); +} +#else +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + pmap_pinit(pmap); +} +#endif + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
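A sketch of the hold/wire accounting behind pmap_unwire_pte_hold()/_pmap_unwire_pte_hold() above: releasing the last hold on a page-table page also drops its wire reference, and the page is freed when the wire count reaches zero. The struct and function here are simplified stand-ins, not the kernel's vm_page:

#include <stdio.h>

struct ptpage {
        int hold_count;         /* transient references from mappings */
        int wire_count;         /* keeps the page resident */
};

static int
unwire_pte_hold(struct ptpage *m)
{
        if (--m->hold_count > 0)
                return (0);
        /* last hold gone: the PDE for this page would be cleared here */
        if (--m->wire_count == 0) {
                printf("page table page freed\n");
                return (1);
        }
        return (0);
}

int
main(void)
{
        struct ptpage m = { 2, 1 };

        unwire_pte_hold(&m);    /* one hold left, nothing happens */
        unwire_pte_hold(&m);    /* last hold: wire dropped, page freed */
        return (0);
}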
*/ if (pdstackptr > 0) { --pdstackptr; pmap->pm_pdir = (pd_entry_t *)pdstack[pdstackptr]; } else { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); } /* * allocate object for the ptes */ pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ retry: ptdpg = pmap_page_alloc( pmap->pm_pteobj, PTDPTDI); if (ptdpg == NULL) goto retry; ptdpg->wire_count = 1; ++cnt.v_wire_count; ptdpg->flags &= ~(PG_MAPPED|PG_BUSY); /* not mapped normally */ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); /* wire in kernel global address entries */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW; pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { int s; unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ s = splvm(); if (p->flags & PG_BUSY) { p->flags |= PG_WANTED; tsleep(p, PVM, "pmaprl", 0); splx(s); return 0; } if (p->flags & PG_WANTED) { p->flags &= ~PG_WANTED; wakeup(p); } /* * Remove the page table page from the processes address space. */ pde[p->pindex] = 0; --pmap->pm_stats.resident_count; if (p->hold_count) { panic("pmap_release: freeing held page table page"); } /* * Page directory pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); #ifdef SMP pde[MPPTDI] = 0; #endif pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) pmap->pm_ptphint = NULL; #endif vm_page_free_zero(p); splx(s); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_offset_t pteva, ptepa; vm_page_t m; int needszero = 0; /* * Find or fabricate a new pagetable page */ retry: m = vm_page_lookup(pmap->pm_pteobj, ptepindex); if (m == NULL) { m = pmap_page_alloc(pmap->pm_pteobj, ptepindex); if (m == NULL) goto retry; if ((m->flags & PG_ZERO) == 0) needszero = 1; m->flags &= ~(PG_ZERO|PG_BUSY); m->valid = VM_PAGE_BITS_ALL; } else { if ((m->flags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "ptewai", 0); goto retry; } } if (m->queue != PQ_NONE) { int s = splvm(); vm_page_unqueue(m); splx(s); } if (m->wire_count == 0) ++cnt.v_wire_count; ++m->wire_count; /* * Increment the hold count for the page table page * (denoting a new mapping.) */ ++m->hold_count; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V); #if defined(PTPHINT) /* * Set the page table hint */ pmap->pm_ptphint = m; #endif /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. 
*/ if (needszero) { if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(ptepa); } } m->valid = VM_PAGE_BITS_ALL; m->flags |= PG_MAPPED; return m; } static vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { unsigned ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; invltlb(); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { #if defined(PTPHINT) /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = m; } #else m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif ++m->hold_count; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif ptdpg = NULL; retry: for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } if (!pmap_release_free_page(pmap, p)) goto retry; } if (ptdpg && !pmap_release_free_page(pmap, ptdpg)) goto retry; vm_object_deallocate(object); if (pdstackptr < PDSTACKMAX) { pdstack[pdstackptr] = (vm_offset_t) pmap->pm_pdir; ++pdstackptr; } else { int pdstmp = pdstackptr - 1; kmem_free(kernel_map, pdstack[pdstmp], PAGE_SIZE); pdstack[pdstmp] = (vm_offset_t) pmap->pm_pdir; } pmap->pm_pdir = 0; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; vm_offset_t ptpkva, ptppaddr; vm_page_t nkpg; #ifdef SMP int i; #endif pd_entry_t newpdir; vm_pindex_t ptpidx; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; ptpkva = (vm_offset_t) vtopte(addr); ptpidx = (ptpkva >> PAGE_SHIFT); /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(kernel_object, ptpidx, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(ptppaddr); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW); pdir_pde(PTD, kernel_vm_end) = newpdir; #ifdef 
SMP for (i = 0; i < mp_ncpus; i++) { if (IdlePTDS[i]) pdir_pde(IdlePTDS[i], kernel_vm_end) = newpdir; } #endif for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = newpdir; } } *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); panic("destroying a pmap is not yet implemented"); /* free((caddr_t) pmap, M_VMPMAP); */ } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static inline void free_pv_entry(pv) pv_entry_t pv; { pv_entry_count--; zfreei(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return zalloci(pvzone); } /* * This routine is very drastic, but can save the system * in a pinch. */ void pmap_collect() { pv_table_t *ppv; int i; vm_offset_t pa; vm_page_t m; static int warningdone=0; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } for(i = 0; i < pv_npg; i++) { if ((ppv = &pv_table[i]) == 0) continue; m = ppv->pv_vm_page; if ((pa = VM_PAGE_TO_PHYS(m)) == 0) continue; if (m->wire_count || m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; pmap_remove_all(pa); } pmap_pagedaemon_waken = 0; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap, ppv, va) struct pmap *pmap; pv_table_t *ppv; vm_offset_t va; { pv_entry_t pv; int rtval; int s; s = splvm(); if (ppv->pv_list_count < pmap->pm_stats.resident_count) { for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = TAILQ_NEXT(pv, pv_plist)) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); --ppv->pv_list_count; if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); } TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). 
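A sketch of the high-water check in get_pv_entry() above; the limit itself is set to 90% of pv_entry_max in pmap_init2() earlier. The sizes passed in main() are hypothetical, and the wakeup is reduced to a printf:

#include <stdio.h>

static int pv_entry_count, pv_entry_max, pv_entry_high_water;
static int pagedaemon_waken;

static void
init2_sketch(int shpgperproc, int maxproc, int pv_npg)
{
        pv_entry_max = shpgperproc * maxproc + pv_npg;
        pv_entry_high_water = 9 * (pv_entry_max / 10);
}

static void
get_pv_entry_sketch(void)
{
        pv_entry_count++;
        if (pv_entry_high_water &&
            pv_entry_count > pv_entry_high_water && pagedaemon_waken == 0) {
                pagedaemon_waken = 1;   /* would wakeup() the pagedaemon once */
                printf("high water (%d) crossed at %d entries\n",
                    pv_entry_high_water, pv_entry_count);
        }
}

int
main(void)
{
        int i;

        init2_sketch(200, 64, 4096);    /* hypothetical sizes */
        for (i = 0; i < pv_entry_max; i++)
                get_pv_entry_sketch();
        return (0);
}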
*/ static void pmap_insert_entry(pmap, va, mpte, pa) pmap_t pmap; vm_offset_t va; vm_page_t mpte; vm_offset_t pa; { int s; pv_entry_t pv; pv_table_t *ppv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); ppv = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); ++ppv->pv_list_count; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; pv_table_t *ppv; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) invlpg(va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { ppv = pa_to_pvh(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf("pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, (int) oldpte); } #endif if (pmap_track_modified(va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! */ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); invltlb_1pg(va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (((sva + PAGE_SIZE) == eva) && (((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; /* * Get a local virtual address for the mappings that are being * worked with. */ ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); if (pmap->pm_stats.resident_count == 0) break; pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
*/ if (pdnxt > eindex) { pdnxt = eindex; } for ( ;sindex != pdnxt; sindex++) { vm_offset_t va; if (ptbase[sindex] == 0) { continue; } va = i386_ptob(sindex); anyvalid++; if (pmap_remove_pte(pmap, ptbase + sindex, va)) break; } } if (anyvalid) { invltlb(); } } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte, tpte; int nmodify; int update_needed; int s; nmodify = 0; update_needed = 0; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_is_managed(pa)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", pa); } #endif s = splvm(); ppv = pa_to_pvh(pa); while ((pv = TAILQ_FIRST(&ppv->pv_list)) != NULL) { pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); pv->pv_pmap->pm_stats.resident_count--; tpte = *pte; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf("pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (!update_needed && ((!curproc || (&curproc->p_vmspace->vm_pmap == pv->pv_pmap)) || (pv->pv_pmap == kernel_pmap))) { update_needed = 1; } TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); --ppv->pv_list_count; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); if (update_needed) invltlb(); splx(s); return; } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } anychanged = 0; ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { (unsigned) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. 
*/ if (ptpaddr == 0) continue; if (pdnxt > eindex) { pdnxt = eindex; } for (; sindex != pdnxt; sindex++) { unsigned pbits = ptbase[sindex]; if (prot & VM_PROT_WRITE) { if ((pbits & (PG_RW|PG_V)) == PG_V) { if (pbits & PG_MANAGED) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->flags |= PG_WRITEABLE; m->object->flags |= OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY; } ptbase[sindex] = pbits | PG_RW; anychanged = 1; } } else if (pbits & PG_RW) { if (pbits & PG_M) { vm_offset_t sva1 = i386_ptob(sindex); if ((pbits & PG_MANAGED) && pmap_track_modified(sva1)) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->dirty = VM_PAGE_BITS_ALL; } } ptbase[sindex] = pbits & ~(PG_M|PG_RW); anychanged = 1; } } } if (anychanged) invltlb(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_prot_t prot, boolean_t wired) { register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va); if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } if (smp_active) { pdeaddr = (vm_offset_t *) IdlePTDS[cpuid]; if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) { if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr)) printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr); printf("cpuid: %d, pdeaddr: 0x%x\n", cpuid, pdeaddr); panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], newpte, origpte, va); } } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); } origpte = *(vm_offset_t *)pte; pa &= PG_FRAME; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf("pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, origpte); } #endif /* * We might be turning off write access to the page, * so we go ahead and sense modify status. 
*/ if (origpte & PG_MANAGED) { vm_page_t m; if (origpte & PG_M) { if (pmap_track_modified(va)) { m = PHYS_TO_VM_PAGE(pa); m->dirty = VM_PAGE_BITS_ALL; } } pa |= PG_MANAGED; } if (mpte) --mpte->hold_count; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { pmap_insert_entry(pmap, va, mpte, pa); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte; if (origpte) invltlb_1pg(va); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap, va, pa, mpte) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_page_t mpte; { register unsigned *pte; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { unsigned ptepindex; vm_offset_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { ++mpte->hold_count; } else { retry: /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } #else mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif if (mpte == NULL) goto retry; ++mpte->hold_count; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); return 0; } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pmap_insert_entry(pmap, va, mpte, pa); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. 
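An illustrative sketch of the final test in pmap_enter() above: the PTE is rewritten (and the stale TLB entry flushed) only if something other than the hardware-maintained accessed/modified bits differs. The flag values are the i386 ones; the helper is hypothetical:

#include <stdio.h>

#define PG_V  0x001UL
#define PG_RW 0x002UL
#define PG_A  0x020UL
#define PG_M  0x040UL

static int
pte_needs_update(unsigned long origpte, unsigned long newpte)
{
        return ((origpte & ~(PG_M | PG_A)) != newpte);
}

int
main(void)
{
        unsigned long frame = 0x00123000UL;

        /* same mapping, only PG_A/PG_M set by hardware: no update (0) */
        printf("%d\n", pte_needs_update(frame | PG_V | PG_RW | PG_A | PG_M,
            frame | PG_V | PG_RW));
        /* write permission added: update required (1) */
        printf("%d\n", pte_needs_update(frame | PG_V,
            frame | PG_V | PG_RW));
        return (0);
}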
*/ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; int objpgs; if (!pmap) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0) ) { int i; int s; vm_page_t m[1]; unsigned int ptepindex; int npdes; vm_offset_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) return; s = splhigh(); retry: p = vm_page_lookup(object, pindex); if (p && (p->flags & PG_BUSY)) { tsleep(p, PVM, "init4p", 0); goto retry; } splx(s); if (p == NULL) { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { PAGE_WAKEUP(p); vm_page_free(p); return; } p = vm_page_lookup(object, pindex); PAGE_WAKEUP(p); } ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i=0;ipm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } p->flags |= PG_MAPPED; invltlb(); return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) psize = object->size - pindex; mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. 
*/ #define PFBAK 2 #define PFFOR 2 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry, object) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; vm_object_t object; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; if (entry->object.vm_object != object) return; if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) return; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } m->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); m->flags |= PG_MAPPED; PAGE_WAKEUP(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
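 * When it does copy, only managed PTEs are duplicated (with PG_M and PG_A * cleared in the copy), and 4MB (PG_PS) page directory entries are shared * directly into an empty destination slot.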
*/ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; unsigned src_frame, dst_frame; if (dst_addr != src_addr) return; src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) { return; } dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); invltlb(); } for(addr = src_addr; addr < end_addr; addr = pdnxt) { unsigned *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; vm_offset_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); ptepindex = addr >> PDRSHIFT; srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = (unsigned *) vtopte(addr); dst_pte = (unsigned *) avtopte(addr); while (addr < pdnxt) { unsigned ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_M|PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, (ptetemp & PG_FRAME)); } else { pmap_unwire_pte_hold(dst_pmap, dstmpte); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; ++src_pte; ++dst_pte; } } } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { #ifdef SMP if (*(int *) prv_CMAP3) panic("pmap_zero_page: prv_CMAP3 busy"); *(int *) prv_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME); invltlb_1pg((vm_offset_t) &prv_CPAGE3); bzero(&prv_CPAGE3, PAGE_SIZE); *(int *) prv_CMAP3 = 0; invltlb_1pg((vm_offset_t) &prv_CPAGE3); #else if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME); bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; invltlb_1pg((vm_offset_t) CADDR2); #endif } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
*/ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { #ifdef SMP if (*(int *) prv_CMAP1) panic("pmap_copy_page: prv_CMAP1 busy"); if (*(int *) prv_CMAP2) panic("pmap_copy_page: prv_CMAP2 busy"); *(int *) prv_CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) prv_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2); bcopy(&prv_CPAGE1, &prv_CPAGE2, PAGE_SIZE); *(int *) prv_CMAP1 = 0; *(int *) prv_CMAP2 = 0; invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2); #else if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *) CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; invltlb_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); #endif } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; int s; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { unsigned *pte, tpte; pv_table_t *ppv; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif s = splvm(); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = (unsigned *)vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } *pte = 0; ppv = pa_to_pvh(tpte); pv->pv_pmap->pm_stats.resident_count--; /* * Update the vm_page_t clean and reference bits. 
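 * A PTE with PG_M set means the page was written through this mapping, so the * whole page is marked dirty here before the pv entry is unlinked and freed.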
*/ if (tpte & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); --ppv->pv_list_count; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); invltlb(); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; if (!pmap_is_managed(pa)) return FALSE; ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) return FALSE; s = splvm(); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (bit & (PG_A|PG_M)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (*pte & bit) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte; int changed; int s; if (!pmap_is_managed(pa)) return; s = splvm(); changed = 0; ppv = pa_to_pvh(pa); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (setem) { *(int *)pte |= bit; changed = 1; } else { vm_offset_t pbits = *(vm_offset_t *)pte; if (pbits & bit) { changed = 1; if (bit == PG_RW) { if (pbits & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } *(int *)pte = pbits & ~(PG_M|PG_RW); } else { *(int *)pte = pbits & ~bit; } } } } splx(s); if (changed) invltlb(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_offset_t phys, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(phys, PG_RW, FALSE); } else { pmap_remove_all(phys); } } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. * */ int pmap_ts_referenced(vm_offset_t pa) { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; int rtval = 0; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { splx(s); return 0; } /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. 
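 * (For pmap_ts_referenced only mappings in ranges where modifications are * tracked are examined; each referenced mapping bumps the count and has its * PG_A bit cleared.)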
*/ if (!pmap_track_modified(pv->pv_va)) continue; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte == NULL) { continue; } if (*pte & PG_A) { rtval++; *pte &= ~PG_A; } } splx(s); if (rtval) { invltlb(); } return (rtval); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; unsigned *pte; size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } invltlb(); return ((void *) va); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if (pte = *ptep) { vm_offset_t pa; val = MINCORE_INCORE; pa = pte & PG_FRAME; /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (PHYS_TO_VM_PAGE(pa)->dirty || pmap_is_modified(pa)) val |= MINCORE_MODIFIED_OTHER; /* * Referenced by us */ if (pte & PG_U) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((PHYS_TO_VM_PAGE(pa)->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; PHYS_TO_VM_PAGE(pa)->flags |= PG_REFERENCED; } } return val; } void pmap_activate(struct proc *p) { +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif load_cr3(p->p_addr->u_pcb.pcb_cr3 = vtophys(p->p_vmspace->vm_pmap.pm_pdir)); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = 
&p->p_vmspace->vm_pmap; for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } return npte; } pte = pmap_pte_quick( pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { pv_table_t *ppv; register pv_entry_t pv; printf("pa %x", pa); ppv = pa_to_pvh(pa); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/amd64/amd64/support.S =================================================================== --- head/sys/amd64/amd64/support.S (revision 31708) +++ head/sys/amd64/amd64/support.S (revision 31709) @@ -1,1571 +1,1574 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.56 1997/08/09 00:02:44 dyson Exp $ + * $Id: support.s,v 1.57 1997/09/02 20:05:30 bde Exp $ */ #include "npx.h" #include #include #include #include #include "assym.s" #define KDSEL 0x10 /* kernel data selector */ #define KCSEL 0x8 /* kernel code selector */ #define IDXSHIFT 10 .data .globl _bcopy_vector _bcopy_vector: .long _generic_bcopy .globl _bzero _bzero: .long _generic_bzero .globl _copyin_vector _copyin_vector: .long _generic_copyin .globl _copyout_vector _copyout_vector: .long _generic_copyout .globl _ovbcopy_vector _ovbcopy_vector: .long _generic_bcopy #if defined(I586_CPU) && NNPX > 0 kernel_fpu_lock: .byte 0xfe .space 3 #endif .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(generic_bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret #if defined(I486_CPU) ENTRY(i486_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax /* * do 64 byte chunks first * * XXX this is probably over-unrolled at least for DX2's */ 2: cmpl $64,%ecx jb 3f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) movl %eax,16(%edx) movl %eax,20(%edx) movl %eax,24(%edx) movl %eax,28(%edx) movl %eax,32(%edx) movl %eax,36(%edx) movl %eax,40(%edx) movl %eax,44(%edx) movl %eax,48(%edx) movl %eax,52(%edx) movl %eax,56(%edx) movl %eax,60(%edx) addl $64,%edx subl $64,%ecx jnz 2b ret /* * do 16 byte chunks */ SUPERALIGN_TEXT 3: cmpl $16,%ecx jb 4f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) addl $16,%edx subl $16,%ecx jnz 3b ret /* * do 4 byte chunks */ SUPERALIGN_TEXT 4: cmpl $4,%ecx jb 5f movl %eax,(%edx) addl $4,%edx subl $4,%ecx jnz 4b ret /* * do 1 byte chunks * a jump table seems to be faster than a loop or more range reductions * * XXX need a const section for non-text */ .data jtab: .long do0 .long do1 .long do2 .long do3 .text SUPERALIGN_TEXT 5: jmp jtab(,%ecx,4) SUPERALIGN_TEXT do3: movw %ax,(%edx) movb %al,2(%edx) ret SUPERALIGN_TEXT do2: movw %ax,(%edx) ret SUPERALIGN_TEXT do1: movb %al,(%edx) ret SUPERALIGN_TEXT do0: ret #endif #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx /* * The FPU register method is twice as fast as the integer register * method unless the target is in the L1 cache and we pre-allocate a * cache line for it (then the integer register method is 4-5 times * faster). However, we never pre-allocate cache lines, since that * would make the integer method 25% or more slower for the common * case when the target isn't in either the L1 cache or the L2 cache. * Thus we normally use the FPU register method unless the overhead * would be too large. */ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ jb intreg_i586_bzero /* * The FPU registers may belong to an application or to fastmove() * or to another invocation of bcopy() or ourself in a higher level * interrupt or trap handler. 
Preserving the registers is * complicated since we avoid it if possible at all levels. We * want to localize the complications even when that increases them. * Here the extra work involves preserving CR0_TS in TS. * `npxproc != NULL' is supposed to be the condition that all the * FPU resources belong to an application, but npxproc and CR0_TS * aren't set atomically enough for this condition to work in * interrupt handlers. * * Case 1: FPU registers belong to the application: we must preserve * the registers if we use them, so we only use the FPU register * method if the target size is large enough to amortize the extra * overhead for preserving them. CR0_TS must be preserved although * it is very likely to end up as set. * * Case 2: FPU registers belong to fastmove(): fastmove() currently * makes the registers look like they belong to an application so * that cpu_switch() and savectx() don't have to know about it, so * this case reduces to case 1. * * Case 3: FPU registers belong to the kernel: don't use the FPU * register method. This case is unlikely, and supporting it would * be more complicated and might take too much stack. * * Case 4: FPU registers don't belong to anyone: the FPU registers * don't need to be preserved, so we always use the FPU register * method. CR0_TS must be preserved although it is very likely to * always end up as clear. */ cmpl $0,_npxproc je i586_bz1 cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ jb intreg_i586_bzero sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts subl $108,%esp fnsave 0(%esp) jmp i586_bz2 i586_bz1: sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts fninit /* XXX should avoid needing this */ i586_bz2: fldz /* * Align to an 8 byte boundary (misalignment in the main loop would * cost a factor of >= 2). Avoid jumps (at little cost if it is * already aligned) by always zeroing 8 bytes and using the part up * to the _next_ alignment position. */ fstl 0(%edx) addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ addl $8,%edx andl $~7,%edx subl %edx,%ecx /* * Similarly align `len' to a multiple of 8. */ fstl -8(%edx,%ecx) decl %ecx andl $~7,%ecx /* * This wouldn't be any faster if it were unrolled, since the loop * control instructions are much faster than the fstl and/or done * in parallel with it so their overhead is insignificant. */ fpureg_i586_bzero_loop: fstl 0(%edx) addl $8,%edx subl $8,%ecx cmpl $8,%ecx jae fpureg_i586_bzero_loop cmpl $0,_npxproc je i586_bz3 frstor 0(%esp) addl $108,%esp lmsw %ax movb $0xfe,kernel_fpu_lock ret i586_bz3: fstpl %st(0) lmsw %ax movb $0xfe,kernel_fpu_lock ret intreg_i586_bzero: /* * `rep stos' seems to be the best method in practice for small * counts. Fancy methods usually take too long to start up due * to cache and BTB misses. */ pushl %edi movl %edx,%edi xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx jne 1f popl %edi ret 1: rep stosb popl %edi ret #endif /* I586_CPU && NNPX > 0 */ /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx cld rep stosw popl %edi ret ENTRY(bcopyb) bcopyb: pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cld /* nope, copy forwards */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards. 
*/ addl %ecx,%esi decl %edi decl %esi std rep movsb popl %edi popl %esi cld ret ENTRY(bcopy) MEXITCOUNT jmp *_bcopy_vector ENTRY(ovbcopy) MEXITCOUNT jmp *_ovbcopy_vector /* * generic_bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(generic_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 20(%esp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cmpl $1024,%ecx jb small_i586_bcopy sarb $1,kernel_fpu_lock jc small_i586_bcopy cmpl $0,_npxproc je i586_bc1 smsw %dx clts subl $108,%esp fnsave 0(%esp) jmp 4f i586_bc1: smsw %dx clts fninit /* XXX should avoid needing this */ ALIGN_TEXT 4: pushl %ecx #define DCACHE_SIZE 8192 cmpl $(DCACHE_SIZE-512)/2,%ecx jbe 2f movl $(DCACHE_SIZE-512)/2,%ecx 2: subl %ecx,0(%esp) cmpl $256,%ecx jb 5f /* XXX should prefetch if %ecx >= 32 */ pushl %esi pushl %ecx ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b popl %ecx popl %esi 5: ALIGN_TEXT large_i586_bcopy_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $64,%esi addl $64,%edi subl $64,%ecx cmpl $64,%ecx jae large_i586_bcopy_loop popl %eax addl %eax,%ecx cmpl $64,%ecx jae 4b cmpl $0,_npxproc je i586_bc2 frstor 0(%esp) addl $108,%esp i586_bc2: lmsw %dx movb $0xfe,kernel_fpu_lock /* * This is a duplicate of the main part of generic_bcopy. See the comments * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and * would mess up high resolution profiling. */ ALIGN_TEXT small_i586_bcopy: shrl $2,%ecx cld rep movsl movl 20(%esp),%ecx andl $3,%ecx rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi addl %ecx,%esi decl %edi decl %esi andl $3,%ecx std rep movsb movl 20(%esp),%ecx shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #endif /* I586_CPU && NNPX > 0 */ /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines and possibly * the math- and DOS emulators should be the only places that do this. 
* * We have to access the memory with user's permissions, so use a segment * selector with RPL 3. For writes to user space we have to additionally * check the PTE for write permission, because the 386 does not check * write permissions when we are executing with EPL 0. The 486 does check * this if the WP bit is set in CR0, so we can use a simpler version here. * * These routines set curpcb->onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->onfault instead of the function. */ /* copyout(from_kernel, to_user, len) */ ENTRY(copyout) MEXITCOUNT jmp *_copyout_vector ENTRY(generic_copyout) movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 3f #endif /* * We have to check each PTE for user write permission. * The checking may cause a page fault, so it is important to set * up everything for return via copyout_fault before here. */ /* compute number of pages */ movl %edi,%ecx andl $PAGE_MASK,%ecx addl %ebx,%ecx decl %ecx shrl $IDXSHIFT+2,%ecx incl %ecx /* compute PTE offset for start address */ movl %edi,%edx shrl $IDXSHIFT,%edx andb $0xfc,%dl 1: /* check PTE for each page */ leal _PTmap(%edx),%eax shrl $IDXSHIFT,%eax andb $0xfc,%al testb $PG_V,_PTmap(%eax) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%al andb $PG_V|PG_RW|PG_U,%al /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%al je 2f 4: /* simulate a trap */ pushl %edx pushl %ecx shll $IDXSHIFT,%edx pushl %edx call _trapwrite /* trapwrite(addr) */ popl %edx popl %ecx popl %edx testl %eax,%eax /* if not ok, return EFAULT */ jnz copyout_fault 2: addl $4,%edx decl %ecx jnz 1b /* check next page */ #endif /* I386_CPU */ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyout: #endif shrl $2,%ecx cld rep movsl movb %bl,%cl andb $3,%cl rep movsb done_copyout: popl %ebx popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyout_fault: popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyout) /* * Duplicated from generic_copyout. Could be done a bit better. */ movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. 
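 * The addl sets carry if the destination address plus length wraps past 2^32, * which can never be a valid user range, so a carry goes straight to * copyout_fault.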
*/ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyout pushl %ecx call _fastmove addl $4,%esp jmp done_copyout #endif /* I586_CPU && NNPX > 0 */ /* copyin(from_user, to_kernel, len) */ ENTRY(copyin) MEXITCOUNT jmp *_copyin_vector ENTRY(generic_copyin) movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyin: #endif movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT done_copyin: #endif popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyin_fault: popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyin) /* * Duplicated from generic_copyin. Could be done a bit better. */ movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyin pushl %ebx /* XXX prepare for fastmove_fault */ pushl %ecx call _fastmove addl $8,%esp jmp done_copyin #endif /* I586_CPU && NNPX > 0 */ #if defined(I586_CPU) && NNPX > 0 /* fastmove(src, dst, len) src in %esi dst in %edi len in %ecx XXX changed to on stack for profiling uses %eax and %edx for tmp. storage */ /* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */ ENTRY(fastmove) pushl %ebp movl %esp,%ebp subl $PCB_SAVEFPU_SIZE+3*4,%esp movl 8(%ebp),%ecx cmpl $63,%ecx jbe fastmove_tail testl $7,%esi /* check if src addr is multiple of 8 */ jnz fastmove_tail testl $7,%edi /* check if dst addr is multiple of 8 */ jnz fastmove_tail /* if (npxproc != NULL) { */ cmpl $0,_npxproc je 6f /* fnsave(&curpcb->pcb_savefpu); */ movl _curpcb,%eax fnsave PCB_SAVEFPU(%eax) /* npxproc = NULL; */ movl $0,_npxproc /* } */ 6: /* now we own the FPU. */ /* * The process' FP state is saved in the pcb, but if we get * switched, the cpu_switch() will store our FP state in the * pcb. It should be possible to avoid all the copying for * this, e.g., by setting a flag to tell cpu_switch() to * save the state somewhere else. 
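 * For now the code below copies pcb_savefpu into the local stack frame, runs * the 64-byte FPU copy loop, copies pcb_savefpu back, and then sets CR0_TS * again (start_emulating) before falling into fastmove_tail.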
*/ /* tmp = curpcb->pcb_savefpu; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl %esp,%edi movl _curpcb,%esi addl $PCB_SAVEFPU,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* stop_emulating(); */ clts /* npxproc = curproc; */ movl _curproc,%eax movl %eax,_npxproc movl _curpcb,%eax movl $fastmove_fault,PCB_ONFAULT(%eax) 4: movl %ecx,-12(%ebp) cmpl $1792,%ecx jbe 2f movl $1792,%ecx 2: subl %ecx,-12(%ebp) cmpl $256,%ecx jb 5f movl %ecx,-8(%ebp) movl %esi,-4(%ebp) ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b movl -8(%ebp),%ecx movl -4(%ebp),%esi 5: ALIGN_TEXT fastmove_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $-64,%ecx addl $64,%esi addl $64,%edi cmpl $63,%ecx ja fastmove_loop movl -12(%ebp),%eax addl %eax,%ecx cmpl $64,%ecx jae 4b /* curpcb->pcb_savefpu = tmp; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* start_emulating(); */ smsw %ax orb $CR0_TS,%al lmsw %ax /* npxproc = NULL; */ movl $0,_npxproc ALIGN_TEXT fastmove_tail: movl _curpcb,%eax movl $fastmove_tail_fault,PCB_ONFAULT(%eax) movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb movl %ebp,%esp popl %ebp ret ALIGN_TEXT fastmove_fault: movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl smsw %ax orb $CR0_TS,%al lmsw %ax movl $0,_npxproc fastmove_tail_fault: movl %ebp,%esp popl %ebp addl $8,%esp popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #endif /* I586_CPU && NNPX > 0 */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory */ ENTRY(fuword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* from */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault movl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret /* * These two routines are called from the profiling code, potentially * at interrupt time. If they fail, that's okay, good things will * happen later. Fail all the time for now - until the trap code is * able to deal with this. 
*/ ALTENTRY(suswintr) ENTRY(fuswintr) movl $-1,%eax ret ENTRY(fusword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx ja fusufault movzwl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ENTRY(fubyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx ja fusufault movzbl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ALIGN_TEXT fusufault: movl _curpcb,%ecx xorl %eax,%eax movl %eax,PCB_ONFAULT(%ecx) decl %eax ret /* * su{byte,sword,word}: write a byte (word, longword) to user memory */ ENTRY(suword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f /* we only have to set the right segment selector */ #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ ja fusufault movl 8(%esp),%eax movl %eax,(%edx) xorl %eax,%eax movl _curpcb,%ecx movl %eax,PCB_ONFAULT(%ecx) ret ENTRY(susword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ ja fusufault movw 8(%esp),%ax movw %ax,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret ALTENTRY(suibyte) ENTRY(subyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ ja fusufault movb 8(%esp),%al movb %al,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret /* * copyinstr(from, to, maxlen, int *lencopied) * copy a string from from to to, 
stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ ENTRY(copyinstr) pushl %esi pushl %edi movl _curpcb,%ecx movl $cpystrflt,PCB_ONFAULT(%ecx) movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ movl $VM_MAXUSER_ADDRESS,%eax /* make sure 'from' is within bounds */ subl %esi,%eax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpl %edx,%eax jae 1f movl %eax,%edx movl %eax,20(%esp) 1: incl %edx cld 2: decl %edx jz 3f lodsb stosb orb %al,%al jnz 2b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp cpystrflt_x 3: /* edx is zero - return ENAMETOOLONG or EFAULT */ cmpl $VM_MAXUSER_ADDRESS,%esi jae cpystrflt 4: movl $ENAMETOOLONG,%eax jmp cpystrflt_x cpystrflt: movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ movl _curpcb,%ecx movl $0,PCB_ONFAULT(%ecx) movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 1f movl %ecx,(%edx) 1: popl %edi popl %esi ret /* * copystr(from, to, maxlen, int *lencopied) */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx cld 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx xorl %eax,%eax movl %edx,%ecx shrl $2,%ecx cld /* compare forwards */ repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb je 2f 1: incl %eax 2: popl %esi popl %edi ret /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %ax,%ds movl %ax,%es movl %ax,%fs movl %ax,%gs movl %ax,%ss /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) lret /* * void lidt(struct region_descriptor *rdp); */ ENTRY(lidt) movl 4(%esp),%eax lidt (%eax) ret /* * void lldt(u_short sel) */ ENTRY(lldt) lldt 4(%esp) ret /* * void ltr(u_short sel) */ ENTRY(ltr) ltr 4(%esp) ret /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret /* load_cr0(cr0) */ ENTRY(load_cr0) movl 4(%esp),%eax movl %eax,%cr0 ret /* rcr0() */ ENTRY(rcr0) movl %cr0,%eax ret /* rcr3() */ ENTRY(rcr3) movl %cr3,%eax ret /* void load_cr3(caddr_t cr3) */ ENTRY(load_cr3) +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl 4(%esp),%eax movl %eax,%cr3 ret /* rcr4() */ ENTRY(rcr4) movl %cr4,%eax ret /* void load_cr4(caddr_t cr4) */ ENTRY(load_cr4) movl 4(%esp),%eax movl %eax,%cr4 ret /*****************************************************************************/ /* setjump, longjump */ 
/*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp */ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret /* * Here for doing BB-profiling (gcc -a). * We rely on the "bbset" instead, but need a dummy function. */ NON_GPROF_ENTRY(__bb_init_func) movl 4(%esp),%eax movl $1,(%eax) .byte 0xc3 /* avoid macro for `ret' */ Index: head/sys/amd64/amd64/support.s =================================================================== --- head/sys/amd64/amd64/support.s (revision 31708) +++ head/sys/amd64/amd64/support.s (revision 31709) @@ -1,1571 +1,1574 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: support.s,v 1.56 1997/08/09 00:02:44 dyson Exp $ + * $Id: support.s,v 1.57 1997/09/02 20:05:30 bde Exp $ */ #include "npx.h" #include #include #include #include #include "assym.s" #define KDSEL 0x10 /* kernel data selector */ #define KCSEL 0x8 /* kernel code selector */ #define IDXSHIFT 10 .data .globl _bcopy_vector _bcopy_vector: .long _generic_bcopy .globl _bzero _bzero: .long _generic_bzero .globl _copyin_vector _copyin_vector: .long _generic_copyin .globl _copyout_vector _copyout_vector: .long _generic_copyout .globl _ovbcopy_vector _ovbcopy_vector: .long _generic_bcopy #if defined(I586_CPU) && NNPX > 0 kernel_fpu_lock: .byte 0xfe .space 3 #endif .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(generic_bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret #if defined(I486_CPU) ENTRY(i486_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax /* * do 64 byte chunks first * * XXX this is probably over-unrolled at least for DX2's */ 2: cmpl $64,%ecx jb 3f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) movl %eax,16(%edx) movl %eax,20(%edx) movl %eax,24(%edx) movl %eax,28(%edx) movl %eax,32(%edx) movl %eax,36(%edx) movl %eax,40(%edx) movl %eax,44(%edx) movl %eax,48(%edx) movl %eax,52(%edx) movl %eax,56(%edx) movl %eax,60(%edx) addl $64,%edx subl $64,%ecx jnz 2b ret /* * do 16 byte chunks */ SUPERALIGN_TEXT 3: cmpl $16,%ecx jb 4f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) addl $16,%edx subl $16,%ecx jnz 3b ret /* * do 4 byte chunks */ SUPERALIGN_TEXT 4: cmpl $4,%ecx jb 5f movl %eax,(%edx) addl $4,%edx subl $4,%ecx jnz 4b ret /* * do 1 byte chunks * a jump table seems to be faster than a loop or more range reductions * * XXX need a const section for non-text */ .data jtab: .long do0 .long do1 .long do2 .long do3 .text SUPERALIGN_TEXT 5: jmp jtab(,%ecx,4) SUPERALIGN_TEXT do3: movw %ax,(%edx) movb %al,2(%edx) ret SUPERALIGN_TEXT do2: movw %ax,(%edx) ret SUPERALIGN_TEXT do1: movb %al,(%edx) ret SUPERALIGN_TEXT do0: ret #endif #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx /* * The FPU register method is twice as fast as the integer register * method unless the target is in the L1 cache and we pre-allocate a * cache line for it (then the integer register method is 4-5 times * faster). However, we never pre-allocate cache lines, since that * would make the integer method 25% or more slower for the common * case when the target isn't in either the L1 cache or the L2 cache. * Thus we normally use the FPU register method unless the overhead * would be too large. */ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ jb intreg_i586_bzero /* * The FPU registers may belong to an application or to fastmove() * or to another invocation of bcopy() or ourself in a higher level * interrupt or trap handler. Preserving the registers is * complicated since we avoid it if possible at all levels. We * want to localize the complications even when that increases them. * Here the extra work involves preserving CR0_TS in TS. * `npxproc != NULL' is supposed to be the condition that all the * FPU resources belong to an application, but npxproc and CR0_TS * aren't set atomically enough for this condition to work in * interrupt handlers. 
* * Case 1: FPU registers belong to the application: we must preserve * the registers if we use them, so we only use the FPU register * method if the target size is large enough to amortize the extra * overhead for preserving them. CR0_TS must be preserved although * it is very likely to end up as set. * * Case 2: FPU registers belong to fastmove(): fastmove() currently * makes the registers look like they belong to an application so * that cpu_switch() and savectx() don't have to know about it, so * this case reduces to case 1. * * Case 3: FPU registers belong to the kernel: don't use the FPU * register method. This case is unlikely, and supporting it would * be more complicated and might take too much stack. * * Case 4: FPU registers don't belong to anyone: the FPU registers * don't need to be preserved, so we always use the FPU register * method. CR0_TS must be preserved although it is very likely to * always end up as clear. */ cmpl $0,_npxproc je i586_bz1 cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ jb intreg_i586_bzero sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts subl $108,%esp fnsave 0(%esp) jmp i586_bz2 i586_bz1: sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts fninit /* XXX should avoid needing this */ i586_bz2: fldz /* * Align to an 8 byte boundary (misalignment in the main loop would * cost a factor of >= 2). Avoid jumps (at little cost if it is * already aligned) by always zeroing 8 bytes and using the part up * to the _next_ alignment position. */ fstl 0(%edx) addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ addl $8,%edx andl $~7,%edx subl %edx,%ecx /* * Similarly align `len' to a multiple of 8. */ fstl -8(%edx,%ecx) decl %ecx andl $~7,%ecx /* * This wouldn't be any faster if it were unrolled, since the loop * control instructions are much faster than the fstl and/or done * in parallel with it so their overhead is insignificant. */ fpureg_i586_bzero_loop: fstl 0(%edx) addl $8,%edx subl $8,%ecx cmpl $8,%ecx jae fpureg_i586_bzero_loop cmpl $0,_npxproc je i586_bz3 frstor 0(%esp) addl $108,%esp lmsw %ax movb $0xfe,kernel_fpu_lock ret i586_bz3: fstpl %st(0) lmsw %ax movb $0xfe,kernel_fpu_lock ret intreg_i586_bzero: /* * `rep stos' seems to be the best method in practice for small * counts. Fancy methods usually take too long to start up due * to cache and BTB misses. */ pushl %edi movl %edx,%edi xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx jne 1f popl %edi ret 1: rep stosb popl %edi ret #endif /* I586_CPU && NNPX > 0 */ /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx cld rep stosw popl %edi ret ENTRY(bcopyb) bcopyb: pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cld /* nope, copy forwards */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards. */ addl %ecx,%esi decl %edi decl %esi std rep movsb popl %edi popl %esi cld ret ENTRY(bcopy) MEXITCOUNT jmp *_bcopy_vector ENTRY(ovbcopy) MEXITCOUNT jmp *_ovbcopy_vector /* * generic_bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(generic_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? 
*/ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 20(%esp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cmpl $1024,%ecx jb small_i586_bcopy sarb $1,kernel_fpu_lock jc small_i586_bcopy cmpl $0,_npxproc je i586_bc1 smsw %dx clts subl $108,%esp fnsave 0(%esp) jmp 4f i586_bc1: smsw %dx clts fninit /* XXX should avoid needing this */ ALIGN_TEXT 4: pushl %ecx #define DCACHE_SIZE 8192 cmpl $(DCACHE_SIZE-512)/2,%ecx jbe 2f movl $(DCACHE_SIZE-512)/2,%ecx 2: subl %ecx,0(%esp) cmpl $256,%ecx jb 5f /* XXX should prefetch if %ecx >= 32 */ pushl %esi pushl %ecx ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b popl %ecx popl %esi 5: ALIGN_TEXT large_i586_bcopy_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $64,%esi addl $64,%edi subl $64,%ecx cmpl $64,%ecx jae large_i586_bcopy_loop popl %eax addl %eax,%ecx cmpl $64,%ecx jae 4b cmpl $0,_npxproc je i586_bc2 frstor 0(%esp) addl $108,%esp i586_bc2: lmsw %dx movb $0xfe,kernel_fpu_lock /* * This is a duplicate of the main part of generic_bcopy. See the comments * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and * would mess up high resolution profiling. */ ALIGN_TEXT small_i586_bcopy: shrl $2,%ecx cld rep movsl movl 20(%esp),%ecx andl $3,%ecx rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi addl %ecx,%esi decl %edi decl %esi andl $3,%ecx std rep movsb movl 20(%esp),%ecx shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #endif /* I586_CPU && NNPX > 0 */ /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines and possibly * the math- and DOS emulators should be the only places that do this. * * We have to access the memory with user's permissions, so use a segment * selector with RPL 3. For writes to user space we have to additionally * check the PTE for write permission, because the 386 does not check * write permissions when we are executing with EPL 0. The 486 does check * this if the WP bit is set in CR0, so we can use a simpler version here. * * These routines set curpcb->onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->onfault instead of the function. 
*/ /* copyout(from_kernel, to_user, len) */ ENTRY(copyout) MEXITCOUNT jmp *_copyout_vector ENTRY(generic_copyout) movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 3f #endif /* * We have to check each PTE for user write permission. * The checking may cause a page fault, so it is important to set * up everything for return via copyout_fault before here. */ /* compute number of pages */ movl %edi,%ecx andl $PAGE_MASK,%ecx addl %ebx,%ecx decl %ecx shrl $IDXSHIFT+2,%ecx incl %ecx /* compute PTE offset for start address */ movl %edi,%edx shrl $IDXSHIFT,%edx andb $0xfc,%dl 1: /* check PTE for each page */ leal _PTmap(%edx),%eax shrl $IDXSHIFT,%eax andb $0xfc,%al testb $PG_V,_PTmap(%eax) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%al andb $PG_V|PG_RW|PG_U,%al /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%al je 2f 4: /* simulate a trap */ pushl %edx pushl %ecx shll $IDXSHIFT,%edx pushl %edx call _trapwrite /* trapwrite(addr) */ popl %edx popl %ecx popl %edx testl %eax,%eax /* if not ok, return EFAULT */ jnz copyout_fault 2: addl $4,%edx decl %ecx jnz 1b /* check next page */ #endif /* I386_CPU */ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyout: #endif shrl $2,%ecx cld rep movsl movb %bl,%cl andb $3,%cl rep movsb done_copyout: popl %ebx popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyout_fault: popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyout) /* * Duplicated from generic_copyout. Could be done a bit better. */ movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx /* * End of duplicated code. 
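Before the per-page PTE checks, generic_copyout() (and i586_copyout(), which duplicates it) validates the user range with two cheap tests. As a C sketch, passing VM_MAXUSER_ADDRESS in as a parameter rather than hard-coding a value:

        #include <stdbool.h>
        #include <stddef.h>
        #include <stdint.h>

        static bool
        user_range_ok(uintptr_t uaddr, size_t len, uintptr_t vm_maxuser_address)
        {
                uintptr_t end = uaddr + len;

                if (end < uaddr)        /* wrapped: "addl %ebx,%eax; jc copyout_fault" */
                        return (false);
                /* VM_MAXUSER_ADDRESS is an end address, hence <= rather than < */
                return (end <= vm_maxuser_address);     /* "cmpl ...; ja copyout_fault" */
        }
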
*/ cmpl $1024,%ecx jb slow_copyout pushl %ecx call _fastmove addl $4,%esp jmp done_copyout #endif /* I586_CPU && NNPX > 0 */ /* copyin(from_user, to_kernel, len) */ ENTRY(copyin) MEXITCOUNT jmp *_copyin_vector ENTRY(generic_copyin) movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyin: #endif movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT done_copyin: #endif popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyin_fault: popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyin) /* * Duplicated from generic_copyin. Could be done a bit better. */ movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyin pushl %ebx /* XXX prepare for fastmove_fault */ pushl %ecx call _fastmove addl $8,%esp jmp done_copyin #endif /* I586_CPU && NNPX > 0 */ #if defined(I586_CPU) && NNPX > 0 /* fastmove(src, dst, len) src in %esi dst in %edi len in %ecx XXX changed to on stack for profiling uses %eax and %edx for tmp. storage */ /* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */ ENTRY(fastmove) pushl %ebp movl %esp,%ebp subl $PCB_SAVEFPU_SIZE+3*4,%esp movl 8(%ebp),%ecx cmpl $63,%ecx jbe fastmove_tail testl $7,%esi /* check if src addr is multiple of 8 */ jnz fastmove_tail testl $7,%edi /* check if dst addr is multiple of 8 */ jnz fastmove_tail /* if (npxproc != NULL) { */ cmpl $0,_npxproc je 6f /* fnsave(&curpcb->pcb_savefpu); */ movl _curpcb,%eax fnsave PCB_SAVEFPU(%eax) /* npxproc = NULL; */ movl $0,_npxproc /* } */ 6: /* now we own the FPU. */ /* * The process' FP state is saved in the pcb, but if we get * switched, the cpu_switch() will store our FP state in the * pcb. It should be possible to avoid all the copying for * this, e.g., by setting a flag to tell cpu_switch() to * save the state somewhere else. 
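Taken together, the Pentium user-copy paths reduce to a simple dispatch: transfers under 1024 bytes stay on the rep movs code, anything larger goes through fastmove(). A C sketch; fastmove_model here is only a stand-in (the real fastmove() below streams 64-bit fildq/fistpq pairs through the FPU):

        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>

        static int
        fastmove_model(const void *src, void *dst, size_t len)
        {
                memcpy(dst, src, len);          /* real code: FPU 64-bit loads/stores */
                return (0);
        }

        static int
        i586_copy_model(const void *src, void *dst, size_t len)
        {
                if (len >= 1024)                /* "cmpl $1024,%ecx; jb slow_copyin" */
                        return (fastmove_model(src, dst, len));

                /* slow path: len/4 longwords ("rep movsl"), then len%4 bytes */
                size_t nwords = len >> 2, tail = len & 3;

                memcpy(dst, src, nwords * 4);
                memcpy((uint8_t *)dst + nwords * 4,
                    (const uint8_t *)src + nwords * 4, tail);
                return (0);
        }
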
*/ /* tmp = curpcb->pcb_savefpu; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl %esp,%edi movl _curpcb,%esi addl $PCB_SAVEFPU,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* stop_emulating(); */ clts /* npxproc = curproc; */ movl _curproc,%eax movl %eax,_npxproc movl _curpcb,%eax movl $fastmove_fault,PCB_ONFAULT(%eax) 4: movl %ecx,-12(%ebp) cmpl $1792,%ecx jbe 2f movl $1792,%ecx 2: subl %ecx,-12(%ebp) cmpl $256,%ecx jb 5f movl %ecx,-8(%ebp) movl %esi,-4(%ebp) ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b movl -8(%ebp),%ecx movl -4(%ebp),%esi 5: ALIGN_TEXT fastmove_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $-64,%ecx addl $64,%esi addl $64,%edi cmpl $63,%ecx ja fastmove_loop movl -12(%ebp),%eax addl %eax,%ecx cmpl $64,%ecx jae 4b /* curpcb->pcb_savefpu = tmp; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* start_emulating(); */ smsw %ax orb $CR0_TS,%al lmsw %ax /* npxproc = NULL; */ movl $0,_npxproc ALIGN_TEXT fastmove_tail: movl _curpcb,%eax movl $fastmove_tail_fault,PCB_ONFAULT(%eax) movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb movl %ebp,%esp popl %ebp ret ALIGN_TEXT fastmove_fault: movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl smsw %ax orb $CR0_TS,%al lmsw %ax movl $0,_npxproc fastmove_tail_fault: movl %ebp,%esp popl %ebp addl $8,%esp popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #endif /* I586_CPU && NNPX > 0 */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory */ ENTRY(fuword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* from */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault movl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret /* * These two routines are called from the profiling code, potentially * at interrupt time. If they fail, that's okay, good things will * happen later. Fail all the time for now - until the trap code is * able to deal with this. 
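fuword() above and the fusword()/fubyte() variants that follow share one convention: the access must fit entirely below VM_MAXUSER_ADDRESS (hence the -4/-2/-1 in the compares), and any failure, bad address or fault, is reported as -1, which is also why fuswintr()/suswintr() can "fail" by simply returning -1. A C model; the fetch callback is a stand-in for the guarded access plus the fusufault recovery:

        #include <stddef.h>
        #include <stdint.h>

        static long
        fuword_model(uintptr_t addr, uintptr_t vm_maxuser_address,
            int (*fetch)(uintptr_t addr, uint32_t *val))
        {
                uint32_t val;

                if (addr > vm_maxuser_address - sizeof(uint32_t))
                        return (-1);            /* out of range */
                if (fetch(addr, &val) != 0)
                        return (-1);            /* faulted: fusufault path */
                return ((long)val);
        }
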
*/ ALTENTRY(suswintr) ENTRY(fuswintr) movl $-1,%eax ret ENTRY(fusword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx ja fusufault movzwl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ENTRY(fubyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx ja fusufault movzbl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ALIGN_TEXT fusufault: movl _curpcb,%ecx xorl %eax,%eax movl %eax,PCB_ONFAULT(%ecx) decl %eax ret /* * su{byte,sword,word}: write a byte (word, longword) to user memory */ ENTRY(suword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f /* we only have to set the right segment selector */ #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ ja fusufault movl 8(%esp),%eax movl %eax,(%edx) xorl %eax,%eax movl _curpcb,%ecx movl %eax,PCB_ONFAULT(%ecx) ret ENTRY(susword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ ja fusufault movw 8(%esp),%ax movw %ax,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret ALTENTRY(suibyte) ENTRY(subyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ ja fusufault movb 8(%esp),%al movb %al,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret /* * copyinstr(from, to, maxlen, int *lencopied) * copy a string from from to to, 
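The su*() stores above (and generic_copyout() earlier) carry an extra test for the 386, which ignores write protection while running in ring 0: each target page's PTE is checked by hand and trapwrite() is called to simulate the fault the hardware will not raise. The core test in C, with the usual i386 PTE bit values assumed for the sketch:

        #include <stdint.h>

        #define PG_V_BIT        0x001u          /* valid */
        #define PG_RW_BIT       0x002u          /* writable */
        #define PG_U_BIT        0x004u          /* user accessible */

        static int
        pte_user_writable(uint32_t pte)
        {
                const uint32_t need = PG_V_BIT | PG_RW_BIT | PG_U_BIT;

                /* "andb $PG_V|PG_RW|PG_U,%al; cmpb $PG_V|PG_RW|PG_U,%al" */
                return ((pte & need) == need);
        }
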
stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ ENTRY(copyinstr) pushl %esi pushl %edi movl _curpcb,%ecx movl $cpystrflt,PCB_ONFAULT(%ecx) movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ movl $VM_MAXUSER_ADDRESS,%eax /* make sure 'from' is within bounds */ subl %esi,%eax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpl %edx,%eax jae 1f movl %eax,%edx movl %eax,20(%esp) 1: incl %edx cld 2: decl %edx jz 3f lodsb stosb orb %al,%al jnz 2b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp cpystrflt_x 3: /* edx is zero - return ENAMETOOLONG or EFAULT */ cmpl $VM_MAXUSER_ADDRESS,%esi jae cpystrflt 4: movl $ENAMETOOLONG,%eax jmp cpystrflt_x cpystrflt: movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ movl _curpcb,%ecx movl $0,PCB_ONFAULT(%ecx) movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 1f movl %ecx,(%edx) 1: popl %edi popl %esi ret /* * copystr(from, to, maxlen, int *lencopied) */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx cld 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx xorl %eax,%eax movl %edx,%ecx shrl $2,%ecx cld /* compare forwards */ repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb je 2f 1: incl %eax 2: popl %esi popl %edi ret /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %ax,%ds movl %ax,%es movl %ax,%fs movl %ax,%gs movl %ax,%ss /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) lret /* * void lidt(struct region_descriptor *rdp); */ ENTRY(lidt) movl 4(%esp),%eax lidt (%eax) ret /* * void lldt(u_short sel) */ ENTRY(lldt) lldt 4(%esp) ret /* * void ltr(u_short sel) */ ENTRY(ltr) ltr 4(%esp) ret /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret /* load_cr0(cr0) */ ENTRY(load_cr0) movl 4(%esp),%eax movl %eax,%cr0 ret /* rcr0() */ ENTRY(rcr0) movl %cr0,%eax ret /* rcr3() */ ENTRY(rcr3) movl %cr3,%eax ret /* void load_cr3(caddr_t cr3) */ ENTRY(load_cr3) +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl 4(%esp),%eax movl %eax,%cr3 ret /* rcr4() */ ENTRY(rcr4) movl %cr4,%eax ret /* void load_cr4(caddr_t cr4) */ ENTRY(load_cr4) movl 4(%esp),%eax movl %eax,%cr4 ret /*****************************************************************************/ /* setjump, longjump */ 
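copystr() and copyinstr() above implement the usual bounded string copy: at most maxlen bytes including the terminating NUL are copied, ENAMETOOLONG is returned when the string does not fit, and the byte count (NUL included) is reported through lencopied; copyinstr() additionally clamps maxlen to what fits below VM_MAXUSER_ADDRESS so it can tell EFAULT apart from ENAMETOOLONG. The core loop as a C model:

        #include <errno.h>
        #include <stddef.h>

        static int
        copystr_model(const char *from, char *to, size_t maxlen, size_t *lencopied)
        {
                size_t i;
                int error = ENAMETOOLONG;

                for (i = 0; i < maxlen; i++) {
                        to[i] = from[i];
                        if (from[i] == '\0') {
                                i++;            /* the NUL counts toward lencopied */
                                error = 0;
                                break;
                        }
                }
                if (lencopied != NULL)
                        *lencopied = i;
                return (error);
        }
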
/*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp */ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret /* * Here for doing BB-profiling (gcc -a). * We rely on the "bbset" instead, but need a dummy function. */ NON_GPROF_ENTRY(__bb_init_func) movl 4(%esp),%eax movl $1,(%eax) .byte 0xc3 /* avoid macro for `ret' */ Index: head/sys/amd64/amd64/swtch.s =================================================================== --- head/sys/amd64/amd64/swtch.s (revision 31708) +++ head/sys/amd64/amd64/swtch.s (revision 31709) @@ -1,778 +1,815 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: swtch.s,v 1.63 1997/09/21 15:03:58 peter Exp $ + * $Id: swtch.s,v 1.64 1997/10/10 09:44:06 peter Exp $ */ #include "npx.h" #include "opt_user_ldt.h" #include "opt_vm86.h" #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ /* * The following primitives manipulate the run queues. 
* _whichqs tells which of the 32 queues _qs * have processes in them. setrunqueue puts processes into queues, Remrq * removes them from queues. The running process is on no queue, * other processes are on a queue related to p->p_priority, divided by 4 * actually to shrink the 0-127 range of priorities into the 32 available * queues. */ .data #ifndef SMP .globl _curpcb _curpcb: .long 0 /* pointer to curproc's PCB area */ #endif /* !SMP */ .globl _whichqs, _whichrtqs, _whichidqs _whichqs: .long 0 /* which run queues have data */ _whichrtqs: .long 0 /* which realtime run qs have data */ _whichidqs: .long 0 /* which idletime run qs have data */ .globl _hlt_vector _hlt_vector: .long _default_halt /* pointer to halt routine */ .globl _qs,_cnt,_panic .globl _want_resched _want_resched: .long 0 /* we need to re-run the scheduler */ +#if defined(SWTCH_OPTIM_STATS) + .globl _swtch_optim_stats, _tlb_flush_count +_swtch_optim_stats: .long 0 /* number of _swtch_optims */ +_tlb_flush_count: .long 0 +#endif .text /* * setrunqueue(p) * * Call should be made at spl6(), and p->p_stat should be SRUN */ ENTRY(setrunqueue) movl 4(%esp),%eax #ifdef DIAGNOSTIC cmpb $SRUN,P_STAT(%eax) je set1 pushl $set2 call _panic set1: #endif cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je set_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* realtime priority? */ jne set_id /* must be idle priority */ set_rt: btsl %edx,_whichrtqs /* set q full bit */ shll $3,%edx addl $_rtqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_id: btsl %edx,_whichidqs /* set q full bit */ shll $3,%edx addl $_idqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_nort: /* Normal (RTOFF) code */ movzbl P_PRI(%eax),%edx shrl $2,%edx btsl %edx,_whichqs /* set q full bit */ shll $3,%edx addl $_qs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set2: .asciz "setrunqueue" /* * Remrq(p) * * Call should be made at spl6(). */ ENTRY(remrq) movl 4(%esp),%eax cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je rem_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* normal priority process? */ jne rem_id btrl %edx,_whichrtqs /* clear full bit, panic if clear already */ jb rem1rt pushl $rem3rt call _panic rem1rt: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_rtqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2rt shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichrtqs rem2rt: ret rem_id: btrl %edx,_whichidqs /* clear full bit, panic if clear already */ jb rem1id pushl $rem3id call _panic rem1id: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_idqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? 
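The bookkeeping that setrunqueue()/remrq() maintain above boils down to a 32-bit summary word per queue class: a 0-127 priority is folded to a queue index by dividing by four, and one bit per queue records whether that queue is non-empty, so the switch code can find the best runnable queue with a single bsfl. A self-contained sketch (simplified: the real queues are doubly linked lists headed at _qs/_rtqs/_idqs, one summary word per class):

        #include <stdint.h>

        static uint32_t whichqs_model;  /* bit i set => run queue i is non-empty */

        static int
        queue_index(int priority)       /* p_priority is 0..127; lower is better */
        {
                return (priority >> 2); /* "shrl $2,%edx": 32 queues */
        }

        static void
        setrunqueue_mark(int priority)
        {
                whichqs_model |= 1u << queue_index(priority);   /* "btsl %edx,_whichqs" */
        }

        static void
        remrq_update(int priority, int queue_now_empty)
        {
                /* the asm clears the bit with btrl and re-sets it if the queue
                 * still has entries; only the net effect is modelled here */
                if (queue_now_empty)
                        whichqs_model &= ~(1u << queue_index(priority));
        }

        static int
        pick_next_queue(void)           /* "bsfl %edi,%ebx": lowest set bit wins */
        {
                int i;

                if (whichqs_model == 0)
                        return (-1);    /* nothing runnable: cpu_switch heads to _idle */
                for (i = 0; (whichqs_model & (1u << i)) == 0; i++)
                        ;
                return (i);
        }
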
*/ je rem2id shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichidqs rem2id: ret rem_nort: movzbl P_PRI(%eax),%edx shrl $2,%edx btrl %edx,_whichqs /* clear full bit, panic if clear already */ jb rem1 pushl $rem3 call _panic rem1: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_qs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2 shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichqs rem2: ret rem3: .asciz "remrq" rem3rt: .asciz "remrq.rt" rem3id: .asciz "remrq.id" /* * When no processes are on the runq, cpu_switch() branches to _idle * to wait for something to come ready. */ ALIGN_TEXT _idle: #ifdef SMP /* when called, we have the mplock, intr disabled */ xorl %ebp,%ebp /* use our idleproc's "context" */ movl _my_idlePTD,%ecx movl %ecx,%cr3 +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl $_idlestack_top,%ecx movl %ecx,%esp /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %ecx, TSS_ESP0(%eax) + movl %ecx, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 cli /* * _REALLY_ free the lock, no matter how deep the prior nesting. * We will recover the nesting on the way out when we have a new * proc to load. * * XXX: we had damn well better be sure we had it before doing this! */ movl $FREE_LOCK, %eax movl %eax, _mp_lock /* do NOT have lock, intrs disabled */ .globl idle_loop idle_loop: +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl %cr3,%eax /* ouch! */ movl %eax,%cr3 cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f jmp 2f 1: cmpl $0,_whichrtqs /* real-time queue */ jne 3f cmpl $0,_whichqs /* normal queue */ jne 3f cmpl $0,_whichidqs /* 'idle' queue */ jne 3f cmpl $0,_do_page_zero_idle je 2f /* XXX appears to cause panics */ /* * Inside zero_idle we enable interrupts and grab the mplock * as needed. It needs to be careful about entry/exit mutexes. 
*/ call _vm_page_zero_idle /* internal locking */ testl %eax, %eax jnz idle_loop 2: /* enable intrs for a halt */ #ifdef SMP movl $0, lapic_tpr /* 1st candidate for an INT */ #endif sti call *_hlt_vector /* wait for interrupt */ cli jmp idle_loop 3: #ifdef SMP movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ #endif call _get_mplock cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _rel_mplock jmp idle_loop #else xorl %ebp,%ebp movl $HIDENAME(tmpstk),%esp - movl _IdlePTD,%ecx - movl %ecx,%cr3 +#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif + movl _IdlePTD, %ecx + movl %cr3, %eax + cmpl %ecx, %eax + je 2f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl %ecx, %cr3 +2: +#endif /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %esp, TSS_ESP0(%eax) + movl %esp, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 ALIGN_TEXT idle_loop: cli cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _vm_page_zero_idle testl %eax, %eax jnz idle_loop sti call *_hlt_vector /* wait for interrupt */ jmp idle_loop #endif CROSSJUMPTARGET(_idle) ENTRY(default_halt) #ifndef SMP hlt /* XXX: until a wakeup IPI */ #endif ret /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. first, save context as needed */ movl _curproc,%ecx /* if no process to save, don't bother */ testl %ecx,%ecx je sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ #endif /* SMP */ movl P_ADDR(%ecx),%ecx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #ifdef SMP movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ #ifdef DIAGNOSTIC cmpl $FREE_LOCK, %eax /* is it free? */ je badsw4 /* yes, bad medicine! */ #endif /* DIAGNOSTIC */ andl $COUNT_FIELD, %eax /* clear CPU portion */ movl %eax, PCB_MPNEST(%ecx) /* store it */ #endif /* SMP */ #if NNPX > 0 /* have we used fp, and need a save? */ movl _curproc,%eax cmpl %eax,_npxproc jne 1f addl $PCB_SAVEFPU,%ecx /* h/w bugs make saving complicated */ pushl %ecx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* NNPX > 0 */ movl $0,_curproc /* out of process */ /* save is done, now choose a new process or idle */ sw1: cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f CROSSJUMP(je, _idle, jne) /* wind down */ 1: #endif sw1a: movl _whichrtqs,%edi /* pick next p. 
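The recurring pattern this revision adds, in the idle path above and again in cpu_switch() a little further down, is to compare %cr3 against the page directory about to be loaded and skip the reload (and therefore the TLB flush) when they already match, with two optional counters to measure the win. A C sketch of that pattern only; cr3_model stands in for the real register:

        #include <stdint.h>

        static uint32_t cr3_model;                      /* stands in for %cr3 */
        #if defined(SWTCH_OPTIM_STATS)
        static int swtch_optim_stats, tlb_flush_count;  /* the counters the diff adds */
        #endif

        static void
        switch_address_space(uint32_t new_pdir)
        {
        #if defined(SWTCH_OPTIM_STATS)
                swtch_optim_stats++;            /* optimistically count a skipped reload */
        #endif
                if (cr3_model == new_pdir)
                        return;                 /* same page directory: keep the TLB */
        #if defined(SWTCH_OPTIM_STATS)
                swtch_optim_stats--;            /* no luck: undo, count a real flush */
                tlb_flush_count++;
        #endif
                cr3_model = new_pdir;           /* "movl ...,%cr3": reload + TLB flush */
        }
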
from rtqs */ testl %edi,%edi jz nortqr /* no realtime procs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz nortqr /* no proc on rt q - try normal ... */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _rtqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je rt3 btsl %ebx,%edi /* nope, set to indicate not empty */ rt3: movl %edi,_whichrtqs /* update q status */ jmp swtch_com /* old sw1a */ /* Normal process priority's */ nortqr: movl _whichqs,%edi 2: /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz idqr /* if none, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _qs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je 3f btsl %ebx,%edi /* nope, set to indicate not empty */ 3: movl %edi,_whichqs /* update q status */ jmp swtch_com idqr: /* was sw1a */ movl _whichidqs,%edi /* pick next p. from idqs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ CROSSJUMP(je, _idle, jne) /* if no proc, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _idqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je id3 btsl %ebx,%edi /* nope, set to indicate not empty */ id3: movl %edi,_whichidqs /* update q status */ swtch_com: movl $0,%eax movl %eax,_want_resched #ifdef DIAGNOSTIC cmpl %eax,P_WCHAN(%ecx) jne badsw1 cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl %eax,P_BACK(%ecx) /* isolate process to run */ movl P_ADDR(%ecx),%edx - movl PCB_CR3(%edx),%ebx #ifdef SMP + movl PCB_CR3(%edx),%ebx /* Grab the private PT pointer from the outgoing process's PTD */ movl $_PTD, %esi movl 4*MPPTDI(%esi), %eax /* fetch cpu's prv pt */ -#endif /* SMP */ - +#else +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif /* switch address space */ + movl %cr3,%ebx + cmpl PCB_CR3(%edx),%ebx + je 4f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl PCB_CR3(%edx),%ebx +#endif /* SMP */ movl %ebx,%cr3 +4: #ifdef SMP /* Copy the private PT to the new process's PTD */ /* XXX yuck, the _PTD changes when we switch, so we have to * reload %cr3 after changing the address space. * We need to fix this by storing a pointer to the virtual * location of the per-process PTD in the PCB or something quick. * Dereferencing proc->vm_map->pmap->p_pdir[] is painful in asm. */ movl %eax, 4*MPPTDI(%esi) /* restore cpu's prv page */ +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif /* XXX: we have just changed the page tables.. reload.. */ movl %ebx, %cr3 #endif /* SMP */ #ifdef VM86 movl _my_tr, %esi cmpl $0, PCB_EXT(%edx) /* has pcb extension? 
*/ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: #endif /* update common_tss.tss_esp0 pointer */ movl $_common_tss, %eax movl %edx, %ebx /* pcb */ #ifdef VM86 addl $(UPAGES * PAGE_SIZE - 16), %ebx #else addl $(UPAGES * PAGE_SIZE), %ebx #endif /* VM86 */ movl %ebx, TSS_ESP0(%eax) #ifdef VM86 btrl %esi, _private_tss je 3f movl $_common_tssd, %edi 2: /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: #endif /* VM86 */ /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, lapic_tpr #else andl $~APIC_TPR_PRIO, lapic_tpr #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ movl _cpuid,%eax movb %al, P_ONCPU(%ecx) #endif /* SMP */ movl %edx, _curpcb movl %ecx, _curproc /* into next process */ #ifdef SMP movl _cpu_lockid, %eax orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl _currentldt,%eax je 2f lldt __default_ldt movl %eax,_currentldt jmp 2f 1: pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_fs cpu_switch_load_fs: movl PCB_FS(%edx),%fs .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs sti ret CROSSJUMPTARGET(idqr) CROSSJUMPTARGET(nortqr) CROSSJUMPTARGET(sw1a) #ifdef DIAGNOSTIC badsw1: pushl $sw0_1 call _panic sw0_1: .asciz "cpu_switch: has wchan" badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" #endif #if defined(SMP) && defined(DIAGNOSTIC) badsw4: pushl $sw0_4 call _panic sw0_4: .asciz "cpu_switch: do not have lock" #endif /* SMP && DIAGNOSTIC */ /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #if NNPX > 0 /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
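For reference, the per-process state that cpu_switch() and savectx() above store and later reload can be pictured as a plain struct. This is only a sketch keyed to the PCB_* offsets used in the assembly, not the actual layout of struct pcb:

        #include <stdint.h>

        struct pcb_sketch {
                uint32_t pcb_cr3;       /* page directory; compared before reloading %cr3 */
                uint32_t pcb_eip;       /* return address of the switched-out cpu_switch() call */
                uint32_t pcb_ebx, pcb_esp, pcb_ebp, pcb_esi, pcb_edi;
                uint32_t pcb_fs, pcb_gs;        /* loaded only after any user LDT switch */
                uint8_t  pcb_savefpu[108];      /* fnsave/frstor area handled by npxsave() */
                uint32_t pcb_onfault;           /* recovery address for user-access faults */
                /* plus PCB_MPNEST (SMP) and PCB_EXT (VM86) in the real structure */
        };
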
*/ movl _npxproc,%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* NNPX > 0 */ 1: ret Index: head/sys/amd64/include/cpufunc.h =================================================================== --- head/sys/amd64/include/cpufunc.h (revision 31708) +++ head/sys/amd64/include/cpufunc.h (revision 31709) @@ -1,430 +1,436 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: cpufunc.h,v 1.3 1997/09/05 20:20:31 smp Exp smp $ + * $Id: cpufunc.h,v 1.72 1997/09/07 22:01:27 fsmp Exp $ */ /* * Functions to provide access to special i386 instructions. */ #ifndef _MACHINE_CPUFUNC_H_ #define _MACHINE_CPUFUNC_H_ #include #include #include +#if defined(SWTCH_OPTIM_STATS) +extern int tlb_flush_count; +#endif #ifdef __GNUC__ static __inline void breakpoint(void) { __asm __volatile("int $3"); } static __inline void disable_intr(void) { __asm __volatile("cli" : : : "memory"); MPINTR_LOCK(); } static __inline void enable_intr(void) { MPINTR_UNLOCK(); __asm __volatile("sti"); } #define HAVE_INLINE_FFS static __inline int ffs(int mask) { int result; /* * bsfl turns out to be not all that slow on 486's. It can beaten * using a binary search to reduce to 4 bits and then a table lookup, * but only if the code is inlined and in the cache, and the code * is quite large so inlining it probably busts the cache. * * Note that gcc-2's builtin ffs would be used if we didn't declare * this inline or turn off the builtin. The builtin is faster but * broken in gcc-2.4.5 and slower but working in gcc-2.5 and 2.6. 
*/ __asm __volatile("testl %0,%0; je 1f; bsfl %0,%0; incl %0; 1:" : "=r" (result) : "0" (mask)); return (result); } #define HAVE_INLINE_FLS static __inline int fls(int mask) { int result; __asm __volatile("testl %0,%0; je 1f; bsrl %0,%0; incl %0; 1:" : "=r" (result) : "0" (mask)); return (result); } #if __GNUC__ < 2 #define inb(port) inbv(port) #define outb(port, data) outbv(port, data) #else /* __GNUC >= 2 */ /* * The following complications are to get around gcc not having a * constraint letter for the range 0..255. We still put "d" in the * constraint because "i" isn't a valid constraint when the port * isn't constant. This only matters for -O0 because otherwise * the non-working version gets optimized away. * * Use an expression-statement instead of a conditional expression * because gcc-2.6.0 would promote the operands of the conditional * and produce poor code for "if ((inb(var) & const1) == const2)". * * The unnecessary test `(port) < 0x10000' is to generate a warning if * the `port' has type u_short or smaller. Such types are pessimal. * This actually only works for signed types. The range check is * careful to avoid generating warnings. */ #define inb(port) __extension__ ({ \ u_char _data; \ if (__builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ && (port) < 0x10000) \ _data = inbc(port); \ else \ _data = inbv(port); \ _data; }) #define outb(port, data) ( \ __builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ && (port) < 0x10000 \ ? outbc(port, data) : outbv(port, data)) static __inline u_char inbc(u_int port) { u_char data; __asm __volatile("inb %1,%0" : "=a" (data) : "id" ((u_short)(port))); return (data); } static __inline void outbc(u_int port, u_char data) { __asm __volatile("outb %0,%1" : : "a" (data), "id" ((u_short)(port))); } #endif /* __GNUC <= 2 */ static __inline u_char inbv(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline u_long inl(u_int port) { u_long data; __asm __volatile("inl %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline void insb(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insb" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void insw(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insw" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void insl(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insl" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void invd(void) { __asm __volatile("invd"); } #ifdef KERNEL #ifdef SMP /* * When using APIC IPI's, the inlining cost is prohibitive since the call * executes into the IPI transmission system. */ void invlpg __P((u_int addr)); void invltlb __P((void)); #else /* !SMP */ static __inline void invlpg(u_int addr) { __asm __volatile("invlpg (%0)" : : "r" (addr) : "memory"); } static __inline void invltlb(void) { u_long temp; /* * This should be implemented as load_cr3(rcr3()) when load_cr3() * is inlined. 
*/ __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp) : : "memory"); +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif } #endif /* SMP */ #endif /* KERNEL */ static __inline u_short inw(u_int port) { u_short data; __asm __volatile("inw %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline u_int loadandclear(u_int *addr) { u_int result; __asm __volatile("xorl %0,%0; xchgl %1,%0" : "=&r" (result) : "m" (*addr)); return (result); } static __inline void outbv(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } static __inline void outl(u_int port, u_long data) { /* * outl() and outw() aren't used much so we haven't looked at * possible micro-optimizations such as the unnecessary * assignment for them. */ __asm __volatile("outl %0,%%dx" : : "a" (data), "d" (port)); } static __inline void outsb(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsb" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outsw(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsw" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outsl(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsl" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outw(u_int port, u_short data) { __asm __volatile("outw %0,%%dx" : : "a" (data), "d" (port)); } static __inline u_long rcr2(void) { u_long data; __asm __volatile("movl %%cr2,%0" : "=r" (data)); return (data); } static __inline u_long read_eflags(void) { u_long ef; __asm __volatile("pushfl; popl %0" : "=r" (ef)); return (ef); } static __inline quad_t rdmsr(u_int msr) { quad_t rv; __asm __volatile(".byte 0x0f, 0x32" : "=A" (rv) : "c" (msr)); return (rv); } static __inline quad_t rdpmc(u_int pmc) { quad_t rv; __asm __volatile(".byte 0x0f, 0x33" : "=A" (rv) : "c" (pmc)); return (rv); } static __inline quad_t rdtsc(void) { quad_t rv; __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); return (rv); } static __inline void setbits(volatile unsigned *addr, u_int bits) { __asm __volatile( #ifdef SMP "lock; " #endif "orl %1,%0" : "=m" (*addr) : "ir" (bits)); } static __inline void wbinvd(void) { __asm __volatile("wbinvd"); } static __inline void write_eflags(u_long ef) { __asm __volatile("pushl %0; popfl" : : "r" (ef)); } static __inline void wrmsr(u_int msr, quad_t newval) { __asm __volatile(".byte 0x0f, 0x30" : : "A" (newval), "c" (msr)); } #else /* !__GNUC__ */ int breakpoint __P((void)); void disable_intr __P((void)); void enable_intr __P((void)); u_char inb __P((u_int port)); u_long inl __P((u_int port)); void insb __P((u_int port, void *addr, size_t cnt)); void insl __P((u_int port, void *addr, size_t cnt)); void insw __P((u_int port, void *addr, size_t cnt)); void invd __P((void)); void invlpg __P((u_int addr)); void invltlb __P((void)); u_short inw __P((u_int port)); u_int loadandclear __P((u_int *addr)); void outb __P((u_int port, u_char data)); void outl __P((u_int port, u_long data)); void outsb __P((u_int port, void *addr, size_t cnt)); void outsl __P((u_int port, void *addr, size_t cnt)); void outsw __P((u_int port, void *addr, size_t cnt)); void outw __P((u_int port, u_short data)); u_long rcr2 __P((void)); quad_t 
rdmsr __P((u_int msr)); quad_t rdpmc __P((u_int pmc)); quad_t rdtsc __P((void)); u_long read_eflags __P((void)); void setbits __P((volatile unsigned *addr, u_int bits)); void wbinvd __P((void)); void write_eflags __P((u_long ef)); void wrmsr __P((u_int msr, quad_t newval)); #endif /* __GNUC__ */ void load_cr0 __P((u_long cr0)); void load_cr3 __P((u_long cr3)); void load_cr4 __P((u_long cr4)); void ltr __P((u_short sel)); u_int rcr0 __P((void)); u_long rcr3 __P((void)); u_long rcr4 __P((void)); #endif /* !_MACHINE_CPUFUNC_H_ */ Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 31708) +++ head/sys/i386/i386/machdep.c (revision 31709) @@ -1,1798 +1,1806 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.277 1997/12/04 14:35:39 jkh Exp $ + * $Id: machdep.c,v 1.278 1997/12/04 21:21:24 jmg Exp $ */ #include "apm.h" #include "npx.h" #include "opt_bounce.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_maxmem.h" #include "opt_perfmon.h" #include "opt_smp.h" #include "opt_sysvipc.h" #include "opt_userconfig.h" #include "opt_vm86.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if NAPM > 0 #include #endif #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #ifdef SMP #include #endif #ifdef PERFMON #include #endif #include #include #include #include extern void init386 __P((int first)); extern int ptrace_set_pc __P((struct proc *p, unsigned int addr)); extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); #ifdef BOUNCE_BUFFERS #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif #endif /* BOUNCE_BUFFERS */ int msgbufmapped = 0; /* set when safe to use msgbuf */ int _udatasel, _ucodesel; u_int atdevbase; + +#if defined(SWTCH_OPTIM_STATS) +extern int swtch_optim_stats; +SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, + CTLFLAG_RD, &swtch_optim_stats, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, + CTLFLAG_RD, &tlb_flush_count, 0, ""); +#endif int physmem = 0; int cold = 1; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); int bootverbose = 0, Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static void setup_netisrs __P((struct linker_set *)); /* XXX declare elsewhere */ static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; extern struct linker_set netisr_set; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. 
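With a kernel built with options SWTCH_OPTIM_STATS, the counters wired up by the SYSCTL_INT() lines above show up under the debug tree. A small userland sketch for watching them, using sysctlbyname(3); if the option is not compiled in, the OIDs simply do not exist and the calls fail:

        #include <sys/types.h>
        #include <sys/sysctl.h>
        #include <stdio.h>

        int
        main(void)
        {
                int optim = 0, flushes = 0;
                size_t len;

                len = sizeof(optim);
                if (sysctlbyname("debug.swtch_optim_stats", &optim, &len, NULL, 0) == -1)
                        perror("debug.swtch_optim_stats");
                len = sizeof(flushes);
                if (sysctlbyname("debug.tlb_flush_count", &flushes, &len, NULL, 0) == -1)
                        perror("debug.tlb_flush_count");
                printf("cr3 reloads avoided: %d, TLB flushes: %d\n", optim, flushes);
                return (0);
        }
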
*/ printf(version); earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %d (%dK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Quickly wire in netisrs. */ setup_netisrs(&netisr_set); /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 8, 2048); } nswbuf = max(min(nbuf/4, 128), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifdef BOUNCE_BUFFERS /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) { bouncepages = 64; bouncepages += ((Maxmem - 4096) / 2048) * 32; if (bouncepages > 128) bouncepages = 128; } v = (caddr_t)((vm_offset_t)round_page(v)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); #ifdef BOUNCE_BUFFERS clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + maxbkva + pager_map_size, TRUE); io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); #else clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size, TRUE); #endif buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size, TRUE); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*ARG_MAX), TRUE); u_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (maxproc*UPAGES*PAGE_SIZE), FALSE); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ { vm_offset_t mb_map_size; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size, FALSE); mb_map->system_map = 1; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) #if defined(USERCONFIG_BOOT) if (1) { #else if (boothowto & RB_CONFIG) { #endif userconfig(); cninit(); /* the preferred console may have changed */ } #endif #ifdef BOUNCE_BUFFERS /* * init bounce buffers */ vm_bounce_init(); #endif printf("avail memory = %d (%dK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! 
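The valloc() bookkeeping above is the classic two-pass sizing idiom: the same sequence of valloc() calls is run once against a zero base purely to measure the total, the block is allocated in one go, and the sequence is replayed to hand out real addresses. A userland-compilable sketch of the idea, with malloc standing in for kmem_alloc():

        #include <stdint.h>
        #include <stdlib.h>

        static void *
        valloc_sketch(uintptr_t *cursor, size_t nbytes)
        {
                void *p = (void *)*cursor;      /* like: (name) = (type *)v */

                *cursor += nbytes;              /* v = (caddr_t)((name) + (num)) */
                return (p);
        }

        static int
        two_pass_alloc(void)
        {
                uintptr_t v = 0;
                size_t size;
                void *base, *tbl_a, *tbl_b;

                (void)valloc_sketch(&v, 100);   /* pass 1: measure only */
                (void)valloc_sketch(&v, 200);
                size = (size_t)v;

                base = malloc(size);            /* kmem_alloc(kernel_map, ...) in the kernel */
                if (base == NULL)
                        return (-1);            /* panic("startup: no room for tables") */

                v = (uintptr_t)base;            /* pass 2: same calls, real addresses */
                tbl_a = valloc_sketch(&v, 100);
                tbl_b = valloc_sketch(&v, 200);
                (void)tbl_a; (void)tbl_b;
                return (0);
        }
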
*/ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } static void setup_netisrs(ls) struct linker_set *ls; { int i; const struct netisrtab *nit; for(i = 0; ls->ls_items[i]; i++) { nit = (const struct netisrtab *)ls->ls_items[i]; register_netisr(nit->nit_num, nit->nit_isr); } } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; u_long code; { register struct proc *p = curproc; register struct trapframe *regs; register struct sigframe *fp; struct sigframe sf; struct sigacts *psp = p->p_sigacts; int oonstack; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SS_ONSTACK; /* * Allocate and validate space for the signal handler context. */ if ((psp->ps_flags & SAS_ALTSTACK) && !oonstack && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_sp + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SS_ONSTACK; } else { fp = (struct sigframe *)regs->tf_esp - 1; } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof(struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) { if (sig < p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[sig]; else sig = p->p_sysent->sv_sigsize + 1; } sf.sf_signum = sig; sf.sf_code = code; sf.sf_scp = &fp->sf_sc; sf.sf_addr = (char *) regs->tf_err; sf.sf_handler = catcher; /* save scratch registers */ sf.sf_sc.sc_eax = regs->tf_eax; sf.sf_sc.sc_ebx = regs->tf_ebx; sf.sf_sc.sc_ecx = regs->tf_ecx; sf.sf_sc.sc_edx = regs->tf_edx; sf.sf_sc.sc_esi = regs->tf_esi; sf.sf_sc.sc_edi = regs->tf_edi; sf.sf_sc.sc_cs = regs->tf_cs; sf.sf_sc.sc_ds = regs->tf_ds; sf.sf_sc.sc_ss = regs->tf_ss; sf.sf_sc.sc_es = regs->tf_es; sf.sf_sc.sc_isp = regs->tf_isp; /* * Build the signal context to be used by sigreturn. */ sf.sf_sc.sc_onstack = oonstack; sf.sf_sc.sc_mask = mask; sf.sf_sc.sc_sp = regs->tf_esp; sf.sf_sc.sc_fp = regs->tf_ebp; sf.sf_sc.sc_pc = regs->tf_eip; sf.sf_sc.sc_ps = regs->tf_eflags; sf.sf_sc.sc_trapno = regs->tf_trapno; sf.sf_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. 
*/ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_sc.sc_gs = tf->tf_vm86_gs; sf.sf_sc.sc_fs = tf->tf_vm86_fs; sf.sf_sc.sc_es = tf->tf_vm86_es; sf.sf_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. */ tf->tf_eflags &= ~(PSL_VM | PSL_T | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = (int)(((char *)PS_STRINGS) - *(p->p_sysent->sv_szsigcode)); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int sigreturn(p, uap) struct proc *p; struct sigreturn_args /* { struct sigcontext *sigcntxp; } */ *uap; { register struct sigcontext *scp; register struct sigframe *fp; register struct trapframe *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs->tf_esp points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), B_WRITE) == 0) return(EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* go back to user mode if both flags are set */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); #define VM_USERCHANGE (PSL_USERCHANGE | PSL_RF) #define VME_USERCHANGE (VM_USERCHANGE | PSL_VIP | PSL_VIF) if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. 
* Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { #ifdef DEBUG printf("sigreturn: eflags = 0x%x\n", eflags); #endif return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(scp->sc_cs)) { #ifdef DEBUG printf("sigreturn: cs = 0x%x\n", scp->sc_cs); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; } /* restore scratch registers */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), B_WRITE) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SS_ONSTACK; p->p_sigmask = scp->sc_mask & ~sigcantmask; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; return(EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Turn the power off. */ void cpu_power_down(void) { #if NAPM > 0 apm_power_off(); #endif } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { struct trapframe *regs = p->p_md.md_regs; #ifdef USER_LDT struct pcb *pcb = &p->p_addr->u_pcb; /* was i386_user_cleanup() in NetBSD */ if (pcb->pcb_ldt) { if (pcb == curpcb) lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ldt, pcb->pcb_ldt_len * sizeof(union descriptor)); pcb->pcb_ldt_len = (int)pcb->pcb_ldt = 0; } #endif bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_cs = _ucodesel; /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #if NNPX > 0 /* Initialize the npx (if any) for the current process. 
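 * (Illustrative note, not part of the source: the load_cr0() above sets
 *  CR0_TS together with CR0_MP, so the next floating point or fwait
 *  instruction takes a "device not available" fault; that fault is the
 *  hook that the lazy npx/emulator initialization described above
 *  relies on.)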
*/ npxinit(__INITIAL_NPXCW__); #endif } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int currentldt; int _default_ldt; #ifdef SMP union descriptor gdt[NGDT + NCPU]; /* global descriptor table */ #else union descriptor gdt[NGDT]; /* global descriptor table */ #endif struct gate_descriptor idt[NIDT]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif #ifdef SMP extern struct i386tss common_tss; /* One tss per cpu */ #ifdef VM86 extern struct segment_descriptor common_tssd; extern int private_tss; extern u_int my_tr; #endif /* VM86 */ #else struct i386tss common_tss; #ifdef VM86 struct segment_descriptor common_tssd; u_int private_tss; /* flag indicating private tss */ u_int my_tr; /* which task register setting */ #endif /* VM86 */ #endif #if defined(I586_CPU) && !defined(NO_F00F_HACK) struct gate_descriptor *t_idt; extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[ #ifdef SMP NGDT + NCPU #endif ] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 3 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 4 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 5 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* 
length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { (int) &common_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 7 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 8 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 9 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 10 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void 
setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void init386(first) int first; { int x; unsigned biosbasemem, biosextmem; struct gate_descriptor *gdp; int gsel_tss; struct isa_device *idp; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int pagesinbase, pagesinext; int target_page, pa_indx; int off; int speculative_mprobe; /* * Prevent lowering of the ipl if we call tsleep() early. */ safepri = cpl; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; /* * Initialize the console before we print anything out. */ cninit(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef BDE_DEBUGGER #define NGDT1 8 /* avoid overwriting db entries with APM ones */ #else #define NGDT1 (sizeof gdt_segs / sizeof gdt_segs[0]) #endif for (x = 0; x < NGDT1; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); #ifdef VM86 common_tssd = gdt[GPROC0_SEL].sd; #endif /* VM86 */ #ifdef SMP /* * Spin these up now. init_secondary() grabs them. We could use * #for(x,y,z) / #endfor cpp directives if they existed. */ for (x = 0; x < NCPU; x++) { gdt_segs[NGDT + x] = gdt_segs[GPROC0_SEL]; ssdtosd(&gdt_segs[NGDT + x], &gdt[NGDT + x].sd); } #endif /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
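 * (Illustrative arithmetic, not part of the source: with page
 *  granularity a descriptor limit is an inclusive page count, so the
 *  i386_btop(addr) - 1 values assigned below cover bytes 0 .. addr-1,
 *  i.e. everything strictly below addr.)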
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); r_gdt.rd_limit = sizeof(gdt) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); r_idt.rd_limit = sizeof(idt) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); currentldt = _default_ldt; #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* Use BIOS values stored in RTC CMOS RAM, since probing * breaks certain 386 AT relics. */ biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8); biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8); /* * If BIOS tells us that it has more than 640k in the basemem, * don't believe it - set it to 640k. */ if (biosbasemem > 640) { printf("Preposterous RTC basemem of %dK, truncating to 640K\n", biosbasemem); biosbasemem = 640; } if (bootinfo.bi_memsizes_valid && bootinfo.bi_basemem > 640) { printf("Preposterous BIOS basemem of %dK, truncating to 640K\n", bootinfo.bi_basemem); bootinfo.bi_basemem = 640; } /* * Warn if the official BIOS interface disagrees with the RTC * interface used above about the amount of base memory or the * amount of extended memory. Prefer the BIOS value for the base * memory. This is necessary for machines that `steal' base * memory for use as BIOS memory, at least if we are going to use * the BIOS for apm. Prefer the RTC value for extended memory. 
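 * (Illustrative arithmetic, not part of the source: the RTC values read
 *  above arrive as separate low/high bytes, so for example
 *  RTC_BASELO == 0x80 and RTC_BASEHI == 0x02 combine as
 *  0x80 + (0x02 << 8) == 0x280 == 640K of base memory.)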
* Eventually the hackish interface shouldn't even be looked at. */ if (bootinfo.bi_memsizes_valid) { if (bootinfo.bi_basemem != biosbasemem) { vm_offset_t pa; printf( "BIOS basemem (%ldK) != RTC basemem (%dK), setting to BIOS value\n", bootinfo.bi_basemem, biosbasemem); biosbasemem = bootinfo.bi_basemem; /* * XXX if biosbasemem is now < 640, there is `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from 0 to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(biosbasemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { unsigned *pte; pte = (unsigned *)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } } if (bootinfo.bi_extmem != biosextmem) printf("BIOS extmem (%ldK) != RTC extmem (%dK)\n", bootinfo.bi_extmem, biosextmem); } #ifdef SMP /* make hole for AP bootstrap code */ pagesinbase = mp_bootaddress(biosbasemem) / PAGE_SIZE; #else pagesinbase = biosbasemem * 1024 / PAGE_SIZE; #endif pagesinext = biosextmem * 1024 / PAGE_SIZE; /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. */ /* * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((pagesinext > 3840) && (pagesinext < 4096)) pagesinext = 3840; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = pagesinext + 0x100000/PAGE_SIZE; /* * Indicate that we wish to do a speculative search for memory beyond * the end of the reported size if the indicated amount is 64MB (0x4000 * pages) - which is the largest amount that the BIOS/bootblocks can * currently report. If a specific amount of memory is indicated via * the MAXMEM option or the npx0 "msize", then don't do the speculative * memory probe. */ if (Maxmem >= 0x4000) speculative_mprobe = TRUE; else speculative_mprobe = FALSE; #ifdef MAXMEM Maxmem = MAXMEM/4; speculative_mprobe = FALSE; #endif #if NNPX > 0 idp = find_isadev(isa_devtab_null, &npxdriver, 0); if (idp != NULL && idp->id_msize != 0) { Maxmem = idp->id_msize / 4; speculative_mprobe = FALSE; } #endif #ifdef SMP /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap (first, 0); /* * Size up each available chunk of physical memory. */ /* * We currently don't bother testing base memory. * XXX ...but we probably should. 
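 * (Illustrative sketch, not part of the source: the memory test loop
 *  below boils down to, for each candidate page mapped at CADDR1,
 *
 *      static const int patterns[] =
 *          { 0xaaaaaaaa, 0x55555555, 0xffffffff, 0x0 };
 *      int i, page_bad = FALSE;
 *      for (i = 0; i < 4; i++) {
 *          *(volatile int *)CADDR1 = patterns[i];
 *          if (*(volatile int *)CADDR1 != patterns[i])
 *              page_bad = TRUE;
 *      }
 *
 *  with the original word saved beforehand and restored afterwards, and
 *  phys_avail[] / Maxmem updated according to the result.)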
*/ pa_indx = 0; if (pagesinbase > 1) { phys_avail[pa_indx++] = PAGE_SIZE; /* skip first page of memory */ phys_avail[pa_indx] = ptoa(pagesinbase);/* memory up to the ISA hole */ physmem = pagesinbase - 1; } else { /* point at first chunk end */ pa_indx++; } for (target_page = avail_start; target_page < ptoa(Maxmem); target_page += PAGE_SIZE) { int tmp, page_bad; page_bad = FALSE; /* * map page into kernel: valid, read/write, non-cacheable */ *(int *)CMAP1 = PG_V | PG_RW | PG_N | target_page; invltlb(); tmp = *(int *)CADDR1; /* * Test for alternating 1's and 0's */ *(volatile int *)CADDR1 = 0xaaaaaaaa; if (*(volatile int *)CADDR1 != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)CADDR1 = 0x55555555; if (*(volatile int *)CADDR1 != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)CADDR1 = 0xffffffff; if (*(volatile int *)CADDR1 != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)CADDR1 = 0x0; if (*(volatile int *)CADDR1 != 0x0) { /* * test of page failed */ page_bad = TRUE; } /* * Restore original value. */ *(int *)CADDR1 = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == FALSE) { /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == target_page) { phys_avail[pa_indx] += PAGE_SIZE; if (speculative_mprobe == TRUE && phys_avail[pa_indx] >= (64*1024*1024)) Maxmem++; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = target_page; /* start */ phys_avail[pa_indx] = target_page + PAGE_SIZE; /* end */ } physmem++; } } *(int *)CMAP1 = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(sizeof(struct msgbuf)) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(sizeof(struct msgbuf)); avail_end = phys_avail[pa_indx]; /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. */ for (off = 0; off < round_page(sizeof(struct msgbuf)); off += PAGE_SIZE) pmap_enter(kernel_pmap, (vm_offset_t)msgbufp + off, avail_end + off, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* make an initial tss so cpu can get interrupt stack on syscall! 
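 * (Illustrative note, not part of the source: tss_esp0 set below is the
 *  kernel stack pointer the CPU loads on a user-to-kernel transition,
 *  here the top of proc0's UPAGES; the extra 16 bytes reserved in the
 *  VM86 case are presumably room for the four additional segment
 *  registers carried by struct trapframe_vm86. That last point is an
 *  inference, not something the source states.)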
*/ #ifdef VM86 common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; #else common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE; #endif /* VM86 */ common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; common_tss.tss_ioopt = (sizeof common_tss) << 16; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); #ifdef VM86 private_tss = 0; my_tr = GPROC0_SEL; #endif dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_fs = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; proc0.p_addr->u_pcb.pcb_mpnest = 1; proc0.p_addr->u_pcb.pcb_ext = 0; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) void f00f_hack(void); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); void f00f_hack(void) { struct region_descriptor r_idt; unsigned char *tmp; int i; if (!has_f00f_bug) return; printf("Intel Pentium F00F detected, installing workaround\n"); r_idt.rd_limit = sizeof(idt) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ t_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, t_idt, sizeof(idt)); r_idt.rd_base = (int)t_idt; lidt(&r_idt); if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned int addr; { p->p_md.md_regs->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; int data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. 
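 * (Illustrative note, not part of the source: the stores below are
 *  int-sized, so the last acceptable offset into a region of size S
 *  starting at `min' is min + S - sizeof(int); the check
 *      off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)
 *  therefore guarantees the 4-byte write stays entirely inside the
 *  trapframe, and likewise for the pcb_savefpu area.)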
*/ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_fs = pcb->pcb_fs; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_fs = regs->r_fs; pcb->pcb_gs = regs->r_gs; return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. 
* * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 31708) +++ head/sys/i386/i386/pmap.c (revision 31709) @@ -1,3374 +1,3399 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.172 1997/11/07 19:58:34 tegge Exp $ + * $Id: pmap.c,v 1.173 1997/11/20 19:30:31 bde Exp $ */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. 
These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define PTPHINT /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; #define pa_index(pa) atop((pa) - vm_first_phys) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) static struct pmap kernel_pmap_store; pmap_t kernel_pmap; extern pd_entry_t my_idlePTD; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
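 * (Illustrative note, not part of the source: vm_first_phys, declared
 *  just below, is the base that the pa_index()/pa_to_pvh() macros above
 *  subtract off; pmap_init() later sets it to phys_avail[0], so
 *  pv_table[0] corresponds to the first page of managed memory.)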
*/ static vm_offset_t vm_first_phys; int pgeflag; /* PG_G or-in */ int pseflag; /* PG_PS or-in */ int pv_npg; int nkpt; vm_offset_t kernel_vm_end; /* * Data for the pv entry allocation mechanism */ vm_zone_t pvzone; struct vm_zone pvzone_store; struct vm_object pvzone_obj; int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; int pmap_pagedaemon_waken = 0; struct pv_entry *pvinit; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *ptmmap; static pv_table_t *pv_table; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp=0; #ifdef SMP extern char prv_CPAGE1[], prv_CPAGE2[], prv_CPAGE3[]; extern pt_entry_t *prv_CMAP1, *prv_CMAP2, *prv_CMAP3; extern pd_entry_t *IdlePTDS[]; extern pt_entry_t SMP_prvpt[]; #endif pt_entry_t *PMAP1 = 0; unsigned *PADDR1 = 0; static PMAP_INLINE void free_pv_entry __P((pv_entry_t pv)); static unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static PMAP_INLINE int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_page_t mpte)); static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv, vm_offset_t va)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa)); static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex)); static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); static vm_page_t pmap_page_alloc __P((vm_object_t object, vm_pindex_t pindex)); static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex)); static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); vm_offset_t pmap_kmem_choose(vm_offset_t addr) ; void pmap_collect(void); #define PDSTACKMAX 6 static vm_offset_t pdstack[PDSTACKMAX]; static int pdstackptr; unsigned pdir4mb; /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. */ PMAP_INLINE unsigned * pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned *pdeaddr; if (pmap) { pdeaddr = (unsigned *) pmap_pde(pmap, va); if (*pdeaddr & PG_PS) return pdeaddr; if (*pdeaddr) { return get_ptbase(pmap) + i386_btop(va); } } return (0); } /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) { newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); } #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. 
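 * (Illustrative arithmetic, not part of the source: pmap_kmem_choose()
 *  above rounds up to the next 4MB superpage boundary with
 *      newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
 *  e.g. with NBPDR == 4MB, addr == 0xc0123456 becomes 0xc0400000.)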
* [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; int i, j; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); kernel_pmap->pm_count = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(sizeof(struct msgbuf)))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = 0; *(int *) PTD = 0; pgeflag = 0; #if !defined(SMP) if (cpu_feature & CPUID_PGE) { pgeflag = PG_G; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. */ pdir4mb = 0; #if !defined(DISABLE_PSE) if (cpu_feature & CPUID_PSE) { unsigned ptditmp; /* * Enable the PSE mode */ load_cr4(rcr4() | CR4_PSE); /* * Note that we have enabled PSE mode */ pseflag = PG_PS; ptditmp = (unsigned) kernel_pmap->pm_pdir[KPTDI]; ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; /* * We can do the mapping here for the single processor * case. We simply ignore the old page table page from * now on. */ #if !defined(SMP) PTD[KPTDI] = (pd_entry_t) ptditmp; kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; invltlb(); #endif } #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic!"); /* 0 = private page */ /* 1 = page table page */ /* 2 = local apic */ /* 16-31 = io apics */ SMP_prvpt[2] = (pt_entry_t)(PG_V | PG_RW | pgeflag | ((u_long)cpu_apic_address & PG_FRAME)); for (i = 0; i < mp_napics; i++) { for (j = 0; j < 16; j++) { /* same page frame as a previous IO apic? 
*/ if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) == ((u_long)io_apic_address[0] & PG_FRAME)) { ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } /* use this slot if available */ if (((u_long)SMP_prvpt[j + 16] & PG_FRAME) == 0) { SMP_prvpt[j + 16] = (pt_entry_t)(PG_V | PG_RW | pgeflag | ((u_long)io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } } if (j == 16) panic("no space to map IO apic %d!", i); } /* BSP does this itself, AP's get it pre-set */ prv_CMAP1 = (pt_entry_t *)&SMP_prvpt[3 + UPAGES]; prv_CMAP2 = (pt_entry_t *)&SMP_prvpt[4 + UPAGES]; prv_CMAP3 = (pt_entry_t *)&SMP_prvpt[5 + UPAGES]; #endif invltlb(); } /* * Set 4mb pdir for mp startup, and global flags */ void pmap_set_opt(unsigned *pdir) { int i; if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); if (pdir4mb) { (unsigned) pdir[KPTDI] = pdir4mb; } } if (pgeflag && (cpu_feature & CPUID_PGE)) { load_cr4(rcr4() | CR4_PGE); for(i = KPTDI; i < KPTDI + nkpt; i++) { if (pdir[i]) { pdir[i] |= PG_G; } } } } /* * Setup the PTD for the boot processor */ void pmap_set_opt_bsp(void) { pmap_set_opt((unsigned *)kernel_pmap->pm_pdir); pmap_set_opt((unsigned *)PTD); invltlb(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t s; int i; int initial_pvs; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); pv_npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ s = (vm_size_t) (sizeof(pv_table_t) * pv_npg); s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_table_t *) addr; for(i = 0; i < pv_npg; i++) { vm_offset_t pa; TAILQ_INIT(&pv_table[i].pv_list); pv_table[i].pv_list_count = 0; pa = vm_first_phys + i * PAGE_SIZE; pv_table[i].pv_vm_page = PHYS_TO_VM_PAGE(pa); } /* * init the pv free list */ initial_pvs = pv_npg; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = &pvzone_store; pvinit = (struct pv_entry *) kmem_alloc(kernel_map, initial_pvs * sizeof (struct pv_entry)); zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, pv_npg); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg; pv_entry_high_water = 9 * (pv_entry_max / 10); zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. 
* This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified( vm_offset_t va) { if ((va < clean_sva) || (va >= clean_eva)) return 1; else return 0; } static PMAP_INLINE void invltlb_1pg( vm_offset_t va) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va); } } static PMAP_INLINE void invltlb_2pg( vm_offset_t va1, vm_offset_t va2) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va1); invlpg(va2); } } static unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); invltlb(); } return (unsigned *) APTmap; } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. */ static unsigned * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned pde, newpf; if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; unsigned index = i386_btop(va); /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == (((unsigned) PTDpde) & PG_FRAME))) { return (unsigned *) PTmap + index; } newpf = pde & PG_FRAME; if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; invltlb_1pg((vm_offset_t) PADDR1); } return PADDR1 + ((unsigned) index & (NPTEPG - 1)); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t rtval; vm_offset_t pdirindex; pdirindex = va >> PDRSHIFT; if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) { unsigned *pte; if ((rtval & PG_PS) != 0) { rtval &= ~(NBPDR - 1); rtval |= va & (NBPDR - 1); return rtval; } pte = get_ptbase(pmap) + i386_btop(va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /* * determine if a page is managed (memory vs. device) */ static PMAP_INLINE int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa < phys_avail[i + 1] && pa >= phys_avail[i]) return 1; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. 
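 * (Illustrative usage, not part of the source: a caller holding an
 *  array of wired pages and a reserved kva range would typically do
 *
 *      pmap_qenter(kva, m, npages);
 *      ... access the pages through kva ...
 *      pmap_qremove(kva, npages);
 *
 *  where kva, m and npages are placeholder names.)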
*/ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | pgeflag; unsigned opte; pte = (unsigned *)vtopte(tva); opte = *pte; *pte = npte; if (opte) invltlb_1pg(tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ PMAP_INLINE void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V | pgeflag; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; if (opte) invltlb_1pg(va); } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); } static vm_page_t pmap_page_alloc(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; m = vm_page_alloc(object, pindex, VM_ALLOC_ZERO); if (m == NULL) { VM_WAIT; } return m; } static vm_page_t pmap_page_lookup(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m) { if (m->flags & PG_BUSY) { m->flags |= PG_WANTED; tsleep(m, PVM, "pplookp", 0); goto retry; } } return m; } /* * Create the UPAGES for a new process. * This routine directly affects the fork perf for a process. */ void pmap_new_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; struct user *up; unsigned *ptek; /* * allocate object for the upages */ upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES); p->p_upages_obj = upobj; /* get a kernel virtual address for the UPAGES for this proc */ up = (struct user *) kmem_alloc_pageable(u_map, UPAGES * PAGE_SIZE); if (up == NULL) panic("pmap_new_proc: u_map allocation failed"); ptek = (unsigned *) vtopte((vm_offset_t) up); for(i=0;iwire_count++; ++cnt.v_wire_count; /* * Enter the page into the kernel address space. */ *(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; m->flags &= ~(PG_ZERO|PG_BUSY); m->flags |= PG_MAPPED|PG_WRITEABLE; m->valid = VM_PAGE_BITS_ALL; } p->p_addr = up; } /* * Dispose the UPAGES for a process that has exited. * This routine directly impacts the exit perf of a process. */ void pmap_dispose_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; unsigned *ptek; ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr); upobj = p->p_upages_obj; for(i=0;ip_addr + i * PAGE_SIZE); vm_page_unwire(m); vm_page_free(m); } vm_object_deallocate(upobj); kmem_free(u_map, (vm_offset_t)p->p_addr, ctob(UPAGES)); } /* * Allow the UPAGES for a process to be prejudicially paged out. */ void pmap_swapout_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; /* * let the upages be paged */ for(i=0;idirty = VM_PAGE_BITS_ALL; vm_page_unwire(m); vm_page_deactivate(m); pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i); } } /* * Bring the UPAGES for a specified process back in. 
*/ void pmap_swapin_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; for(i=0;iflags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "swinuw",0); goto retry; } m->flags |= PG_BUSY; } vm_page_wire(m); splx(s); pmap_kenter(((vm_offset_t) p->p_addr) + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { int rv; rv = vm_pager_get_pages(upobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid); m->valid = VM_PAGE_BITS_ALL; } PAGE_WAKEUP(m); m->flags |= PG_MAPPED|PG_WRITEABLE; } } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { int s; if (m->flags & PG_BUSY) { s = splvm(); while (m->flags & PG_BUSY) { m->flags |= PG_WANTED; tsleep(m, PVM, "pmuwpt", 0); } splx(s); } if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); invltlb_1pg(pteva); } #if defined(PTPHINT) if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; #endif /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { if (m->flags & PG_WANTED) { m->flags &= ~PG_WANTED; wakeup(m); } vm_page_free_zero(m); --cnt.v_wire_count; } return 1; } return 0; } __inline static int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap, va, mpte) pmap_t pmap; vm_offset_t va; vm_page_t mpte; { unsigned ptepindex; if (va >= UPT_MIN_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } #else mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif } return pmap_unwire_pte_hold(pmap, mpte); } +#if !defined(SMP) +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + pmap->pm_pdir = + (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); + pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); + pmap->pm_flags = 0; + pmap->pm_count = 1; + pmap->pm_ptphint = NULL; + TAILQ_INIT(&pmap->pm_pvlist); +} +#else +void +pmap_pinit0(pmap) + struct pmap *pmap; +{ + pmap_pinit(pmap); +} +#endif + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
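 * (Illustrative note, not part of the source: pdstack[] acts as a small
 *  LIFO cache of up to PDSTACKMAX page directory KVAs; pmap_release()
 *  pushes pm_pdir back onto it, so most pmap_pinit()/pmap_release()
 *  pairs avoid a kmem_alloc_pageable()/kmem_free() round trip.)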
*/ if (pdstackptr > 0) { --pdstackptr; pmap->pm_pdir = (pd_entry_t *)pdstack[pdstackptr]; } else { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); } /* * allocate object for the ptes */ pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ retry: ptdpg = pmap_page_alloc( pmap->pm_pteobj, PTDPTDI); if (ptdpg == NULL) goto retry; ptdpg->wire_count = 1; ++cnt.v_wire_count; ptdpg->flags &= ~(PG_MAPPED|PG_BUSY); /* not mapped normally */ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); /* wire in kernel global address entries */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW; pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { int s; unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ s = splvm(); if (p->flags & PG_BUSY) { p->flags |= PG_WANTED; tsleep(p, PVM, "pmaprl", 0); splx(s); return 0; } if (p->flags & PG_WANTED) { p->flags &= ~PG_WANTED; wakeup(p); } /* * Remove the page table page from the processes address space. */ pde[p->pindex] = 0; --pmap->pm_stats.resident_count; if (p->hold_count) { panic("pmap_release: freeing held page table page"); } /* * Page directory pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); #ifdef SMP pde[MPPTDI] = 0; #endif pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) pmap->pm_ptphint = NULL; #endif vm_page_free_zero(p); splx(s); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_offset_t pteva, ptepa; vm_page_t m; int needszero = 0; /* * Find or fabricate a new pagetable page */ retry: m = vm_page_lookup(pmap->pm_pteobj, ptepindex); if (m == NULL) { m = pmap_page_alloc(pmap->pm_pteobj, ptepindex); if (m == NULL) goto retry; if ((m->flags & PG_ZERO) == 0) needszero = 1; m->flags &= ~(PG_ZERO|PG_BUSY); m->valid = VM_PAGE_BITS_ALL; } else { if ((m->flags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "ptewai", 0); goto retry; } } if (m->queue != PQ_NONE) { int s = splvm(); vm_page_unqueue(m); splx(s); } if (m->wire_count == 0) ++cnt.v_wire_count; ++m->wire_count; /* * Increment the hold count for the page table page * (denoting a new mapping.) */ ++m->hold_count; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V); #if defined(PTPHINT) /* * Set the page table hint */ pmap->pm_ptphint = m; #endif /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. 
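pmap_pinit() above installs a self-referential entry at slot PTDPTDI of the new page directory. The practical effect, roughly, is that all of the pmap's PTEs become visible through a fixed 4 MB virtual window, which is what vtopte() exploits elsewhere in this file. The arithmetic below is a standalone illustration; SELF_SLOT and PTMAP_BASE are example values, not the real constants.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PDRSHIFT   22
#define SELF_SLOT  0x3bfu                    /* example self-map slot index */
#define PTMAP_BASE (SELF_SLOT << PDRSHIFT)   /* VA where the PTEs show up */

static unsigned vtopte_sim(unsigned va)
{
	/* one 4-byte PTE per 4 KB page, laid out linearly inside the window */
	return PTMAP_BASE + (va >> PAGE_SHIFT) * 4;
}

int main(void)
{
	printf("PTE for 0x00400000 sits at 0x%08x\n", vtopte_sim(0x00400000u));
	printf("PTE for 0x00401000 sits at 0x%08x\n", vtopte_sim(0x00401000u));
	return 0;
}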
*/ if (needszero) { if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(ptepa); } } m->valid = VM_PAGE_BITS_ALL; m->flags |= PG_MAPPED; return m; } static vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { unsigned ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; invltlb(); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { #if defined(PTPHINT) /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = m; } #else m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif ++m->hold_count; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif ptdpg = NULL; retry: for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } if (!pmap_release_free_page(pmap, p)) goto retry; } if (ptdpg && !pmap_release_free_page(pmap, ptdpg)) goto retry; vm_object_deallocate(object); if (pdstackptr < PDSTACKMAX) { pdstack[pdstackptr] = (vm_offset_t) pmap->pm_pdir; ++pdstackptr; } else { int pdstmp = pdstackptr - 1; kmem_free(kernel_map, pdstack[pdstmp], PAGE_SIZE); pdstack[pdstmp] = (vm_offset_t) pmap->pm_pdir; } pmap->pm_pdir = 0; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; vm_offset_t ptpkva, ptppaddr; vm_page_t nkpg; #ifdef SMP int i; #endif pd_entry_t newpdir; vm_pindex_t ptpidx; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; ptpkva = (vm_offset_t) vtopte(addr); ptpidx = (ptpkva >> PAGE_SHIFT); /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(kernel_object, ptpidx, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(ptppaddr); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW); pdir_pde(PTD, kernel_vm_end) = newpdir; #ifdef 
SMP for (i = 0; i < mp_ncpus; i++) { if (IdlePTDS[i]) pdir_pde(IdlePTDS[i], kernel_vm_end) = newpdir; } #endif for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = newpdir; } } *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); panic("destroying a pmap is not yet implemented"); /* free((caddr_t) pmap, M_VMPMAP); */ } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static inline void free_pv_entry(pv) pv_entry_t pv; { pv_entry_count--; zfreei(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return zalloci(pvzone); } /* * This routine is very drastic, but can save the system * in a pinch. */ void pmap_collect() { pv_table_t *ppv; int i; vm_offset_t pa; vm_page_t m; static int warningdone=0; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } for(i = 0; i < pv_npg; i++) { if ((ppv = &pv_table[i]) == 0) continue; m = ppv->pv_vm_page; if ((pa = VM_PAGE_TO_PHYS(m)) == 0) continue; if (m->wire_count || m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; pmap_remove_all(pa); } pmap_pagedaemon_waken = 0; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap, ppv, va) struct pmap *pmap; pv_table_t *ppv; vm_offset_t va; { pv_entry_t pv; int rtval; int s; s = splvm(); if (ppv->pv_list_count < pmap->pm_stats.resident_count) { for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = TAILQ_NEXT(pv, pv_plist)) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); --ppv->pv_list_count; if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); } TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). 
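pmap_remove_entry() above searches whichever pv list is likely to be shorter: the per-page list when the page has few mappings, or the per-pmap list when the pmap has few resident pages. A toy restatement of that choice (the structure is invented for the example):

#include <stdio.h>

struct lookup_stats {
	int per_page_len;   /* mappings of this physical page (pv_list_count) */
	int per_pmap_len;   /* resident pages of this pmap (resident_count)   */
};

static const char *pick_list(const struct lookup_stats *s)
{
	return (s->per_page_len < s->per_pmap_len) ? "per-page pv list"
	                                           : "per-pmap pv list";
}

int main(void)
{
	struct lookup_stats shared_pg  = { .per_page_len = 200, .per_pmap_len = 30 };
	struct lookup_stats private_pg = { .per_page_len = 1,   .per_pmap_len = 5000 };

	printf("heavily shared page: search the %s\n", pick_list(&shared_pg));
	printf("private page:        search the %s\n", pick_list(&private_pg));
	return 0;
}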
*/ static void pmap_insert_entry(pmap, va, mpte, pa) pmap_t pmap; vm_offset_t va; vm_page_t mpte; vm_offset_t pa; { int s; pv_entry_t pv; pv_table_t *ppv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); ppv = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); ++ppv->pv_list_count; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; pv_table_t *ppv; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) invlpg(va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { ppv = pa_to_pvh(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf("pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, (int) oldpte); } #endif if (pmap_track_modified(va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! */ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); invltlb_1pg(va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (((sva + PAGE_SIZE) == eva) && (((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; /* * Get a local virtual address for the mappings that are being * worked with. */ ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); if (pmap->pm_stats.resident_count == 0) break; pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
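The outer loop of pmap_remove() above walks the range one page-table page at a time: pdnxt rounds the current page index up to the next 1024-entry boundary and is then clipped to the end of the range. A standalone illustration of that stepping:

#include <stdio.h>

#define NPTEPG 1024u

int main(void)
{
	unsigned sindex = 3000;   /* first page index in the range */
	unsigned eindex = 9000;   /* one past the last page index  */

	while (sindex < eindex) {
		unsigned pdnxt = (sindex + NPTEPG) & ~(NPTEPG - 1);

		if (pdnxt > eindex)
			pdnxt = eindex;
		printf("scan PTEs [%u, %u) in PT page %u\n",
		    sindex, pdnxt, sindex / NPTEPG);
		sindex = pdnxt;
	}
	return 0;
}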
*/ if (pdnxt > eindex) { pdnxt = eindex; } for ( ;sindex != pdnxt; sindex++) { vm_offset_t va; if (ptbase[sindex] == 0) { continue; } va = i386_ptob(sindex); anyvalid++; if (pmap_remove_pte(pmap, ptbase + sindex, va)) break; } } if (anyvalid) { invltlb(); } } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte, tpte; int nmodify; int update_needed; int s; nmodify = 0; update_needed = 0; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_is_managed(pa)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", pa); } #endif s = splvm(); ppv = pa_to_pvh(pa); while ((pv = TAILQ_FIRST(&ppv->pv_list)) != NULL) { pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); pv->pv_pmap->pm_stats.resident_count--; tpte = *pte; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf("pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (!update_needed && ((!curproc || (&curproc->p_vmspace->vm_pmap == pv->pv_pmap)) || (pv->pv_pmap == kernel_pmap))) { update_needed = 1; } TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); --ppv->pv_list_count; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); if (update_needed) invltlb(); splx(s); return; } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } anychanged = 0; ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { (unsigned) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. 
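Both pmap_remove() and pmap_remove_all() above batch their TLB work: they note whether any entries were actually changed (anyvalid, update_needed) and issue one global invltlb() at the end rather than a flush per PTE. A toy model of that batching:

#include <stdio.h>

static int flushes;                    /* simulated invltlb() counter */

static void invltlb_sim(void) { flushes++; }

static void clear_range(int *ptes, int n)
{
	int anyvalid = 0;

	for (int i = 0; i < n; i++) {
		if (ptes[i] == 0)
			continue;              /* nothing mapped here */
		ptes[i] = 0;
		anyvalid++;
	}
	if (anyvalid)                      /* one flush covers every change */
		invltlb_sim();
}

int main(void)
{
	int ptes[8] = { 1, 0, 1, 1, 0, 0, 1, 0 };

	clear_range(ptes, 8);
	printf("TLB flushes issued: %d\n", flushes);   /* prints 1 */
	return 0;
}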
*/ if (ptpaddr == 0) continue; if (pdnxt > eindex) { pdnxt = eindex; } for (; sindex != pdnxt; sindex++) { unsigned pbits = ptbase[sindex]; if (prot & VM_PROT_WRITE) { if ((pbits & (PG_RW|PG_V)) == PG_V) { if (pbits & PG_MANAGED) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->flags |= PG_WRITEABLE; m->object->flags |= OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY; } ptbase[sindex] = pbits | PG_RW; anychanged = 1; } } else if (pbits & PG_RW) { if (pbits & PG_M) { vm_offset_t sva1 = i386_ptob(sindex); if ((pbits & PG_MANAGED) && pmap_track_modified(sva1)) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->dirty = VM_PAGE_BITS_ALL; } } ptbase[sindex] = pbits & ~(PG_M|PG_RW); anychanged = 1; } } } if (anychanged) invltlb(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_prot_t prot, boolean_t wired) { register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va); if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } if (smp_active) { pdeaddr = (vm_offset_t *) IdlePTDS[cpuid]; if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) { if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr)) printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr); printf("cpuid: %d, pdeaddr: 0x%x\n", cpuid, pdeaddr); panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], newpte, origpte, va); } } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); } origpte = *(vm_offset_t *)pte; pa &= PG_FRAME; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf("pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, origpte); } #endif /* * We might be turning off write access to the page, * so we go ahead and sense modify status. 
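When pmap_protect() above revokes write access, a set PG_M bit is first folded into the page's dirty state so the modification is not lost, and PG_M and PG_RW are then cleared together. The same rule in a standalone sketch (the dirty flag stands in for m->dirty):

#include <stdio.h>

#define PG_RW 0x002u
#define PG_M  0x040u

static unsigned write_protect(unsigned pte, int *dirty)
{
	if (pte & PG_RW) {
		if (pte & PG_M)
			*dirty = 1;              /* remember the page was modified */
		pte &= ~(PG_M | PG_RW);      /* drop write access and the mod bit */
	}
	return pte;
}

int main(void)
{
	int dirty = 0;
	unsigned pte = 0x123000u | PG_RW | PG_M;

	pte = write_protect(pte, &dirty);
	printf("pte=%#x dirty=%d\n", pte, dirty);   /* write bits gone, dirty=1 */
	return 0;
}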
*/ if (origpte & PG_MANAGED) { vm_page_t m; if (origpte & PG_M) { if (pmap_track_modified(va)) { m = PHYS_TO_VM_PAGE(pa); m->dirty = VM_PAGE_BITS_ALL; } } pa |= PG_MANAGED; } if (mpte) --mpte->hold_count; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { pmap_insert_entry(pmap, va, mpte, pa); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte; if (origpte) invltlb_1pg(va); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap, va, pa, mpte) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_page_t mpte; { register unsigned *pte; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { unsigned ptepindex; vm_offset_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { ++mpte->hold_count; } else { retry: /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); #if defined(PTPHINT) if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } #else mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); #endif if (mpte == NULL) goto retry; ++mpte->hold_count; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); return 0; } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pmap_insert_entry(pmap, va, mpte, pa); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. 
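The validate: path of pmap_enter() above rebuilds the PTE from scratch and only rewrites it when something other than the hardware-maintained PG_A/PG_M bits differs, which avoids a needless TLB invalidation for pure wiring or protection no-ops. A sketch of that comparison; the helper and its flags are invented:

#include <stdio.h>

#define PG_V  0x001u
#define PG_RW 0x002u
#define PG_U  0x004u
#define PG_A  0x020u
#define PG_M  0x040u
#define PG_G  0x100u    /* what pgeflag contributes for kernel mappings */
#define PG_W  0x200u    /* software "wired" bit */

static unsigned build_pte(unsigned pa, unsigned prot, int wired,
                          int is_user, int is_kernel)
{
	unsigned pte = pa | prot | PG_V;

	if (wired)     pte |= PG_W;
	if (is_user)   pte |= PG_U;
	if (is_kernel) pte |= PG_G;
	return pte;
}

int main(void)
{
	unsigned origpte = 0x123000u | PG_RW | PG_V | PG_U | PG_A | PG_M;
	unsigned newpte  = build_pte(0x123000u, PG_RW, 0, 1, 0);

	/* identical apart from PG_A/PG_M: no PTE write, no TLB flush needed */
	printf("update needed: %d\n", (origpte & ~(PG_M | PG_A)) != newpte);
	return 0;
}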
*/ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; int objpgs; if (!pmap) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0) ) { int i; int s; vm_page_t m[1]; unsigned int ptepindex; int npdes; vm_offset_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) return; s = splhigh(); retry: p = vm_page_lookup(object, pindex); if (p && (p->flags & PG_BUSY)) { tsleep(p, PVM, "init4p", 0); goto retry; } splx(s); if (p == NULL) { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { PAGE_WAKEUP(p); vm_page_free(p); return; } p = vm_page_lookup(object, pindex); PAGE_WAKEUP(p); } ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i=0;ipm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } p->flags |= PG_MAPPED; invltlb(); return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) psize = object->size - pindex; mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. 
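For large, properly aligned device objects, pmap_object_init_pt() above installs 4 MB PG_PS page-directory entries instead of individual 4 KB mappings. The eligibility arithmetic, shown standalone with example numbers:

#include <stdio.h>

#define PDRSHIFT 22
#define NBPDR    (1u << PDRSHIFT)    /* 4 MB */

int main(void)
{
	unsigned size  = 64u << 20;      /* say, a 64 MB device object  */
	unsigned ptepa = 0x40000000u;    /* physical base of the object */

	if (ptepa & (NBPDR - 1)) {       /* must be 4 MB aligned for PG_PS */
		printf("not superpage aligned, fall back to 4 KB mappings\n");
		return 0;
	}
	printf("map with %u PG_PS entries of %u MB each\n",
	    size >> PDRSHIFT, NBPDR >> 20);
	return 0;
}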
*/ #define PFBAK 2 #define PFFOR 2 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry, object) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; vm_object_t object; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; if (entry->object.vm_object != object) return; if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) return; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } m->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); m->flags |= PG_MAPPED; PAGE_WAKEUP(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
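pmap_prefault() above probes neighbours of the faulting address in the fixed order -1, +1, -2, +2 pages and skips candidates that fall outside the map entry. The candidate walk, restated as a small standalone program:

#include <stdio.h>

#define PAGE_SIZE 4096

static const int pageorder[] = {
	-PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE
};

int main(void)
{
	unsigned addra = 0x08050000u;    /* faulting address      */
	unsigned start = 0x0804f000u;    /* clipped entry->start  */
	unsigned end   = 0x08052000u;    /* entry->end            */

	for (unsigned i = 0; i < sizeof(pageorder) / sizeof(pageorder[0]); i++) {
		unsigned addr = addra + pageorder[i];

		if (addr < start || addr >= end)
			continue;                /* outside the mapping: skip */
		printf("candidate prefault at %#x\n", addr);
	}
	return 0;
}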
*/ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; unsigned src_frame, dst_frame; if (dst_addr != src_addr) return; src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) { return; } dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); invltlb(); } for(addr = src_addr; addr < end_addr; addr = pdnxt) { unsigned *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; vm_offset_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); ptepindex = addr >> PDRSHIFT; srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = (unsigned *) vtopte(addr); dst_pte = (unsigned *) avtopte(addr); while (addr < pdnxt) { unsigned ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_M|PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, (ptetemp & PG_FRAME)); } else { pmap_unwire_pte_hold(dst_pmap, dstmpte); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; ++src_pte; ++dst_pte; } } } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { #ifdef SMP if (*(int *) prv_CMAP3) panic("pmap_zero_page: prv_CMAP3 busy"); *(int *) prv_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME); invltlb_1pg((vm_offset_t) &prv_CPAGE3); bzero(&prv_CPAGE3, PAGE_SIZE); *(int *) prv_CMAP3 = 0; invltlb_1pg((vm_offset_t) &prv_CPAGE3); #else if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME); bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; invltlb_1pg((vm_offset_t) CADDR2); #endif } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
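pmap_zero_page() above borrows a reserved PTE slot (CMAP2/CADDR2, or the per-CPU prv_CMAP3 on SMP) to map the target frame just long enough to bzero() it, panicking if the slot is already busy. The sketch below mimics only the slot discipline with an ordinary buffer; cmap_slot and scratch_page are invented names.

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static unsigned cmap_slot;                       /* 0 means "free"       */
static unsigned char scratch_page[PAGE_SIZE];    /* stands in for CADDR2 */

static void zero_phys_sim(unsigned pa)
{
	assert(cmap_slot == 0);               /* the panic("CMAP busy") analogue */
	cmap_slot = pa;                       /* "map" the frame                 */
	memset(scratch_page, 0, PAGE_SIZE);   /* bzero(CADDR2, PAGE_SIZE)        */
	cmap_slot = 0;                        /* unmap; would invltlb_1pg() here */
}

int main(void)
{
	scratch_page[100] = 0xff;
	zero_phys_sim(0x123000u);
	printf("byte 100 is now %d\n", scratch_page[100]);
	return 0;
}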
*/ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { #ifdef SMP if (*(int *) prv_CMAP1) panic("pmap_copy_page: prv_CMAP1 busy"); if (*(int *) prv_CMAP2) panic("pmap_copy_page: prv_CMAP2 busy"); *(int *) prv_CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) prv_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2); bcopy(&prv_CPAGE1, &prv_CPAGE2, PAGE_SIZE); *(int *) prv_CMAP1 = 0; *(int *) prv_CMAP2 = 0; invltlb_2pg( (vm_offset_t) &prv_CPAGE1, (vm_offset_t) &prv_CPAGE2); #else if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *) CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; invltlb_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); #endif } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; int s; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { unsigned *pte, tpte; pv_table_t *ppv; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif s = splvm(); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = (unsigned *)vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } *pte = 0; ppv = pa_to_pvh(tpte); pv->pv_pmap->pm_stats.resident_count--; /* * Update the vm_page_t clean and reference bits. 
*/ if (tpte & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); --ppv->pv_list_count; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED|PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); invltlb(); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; if (!pmap_is_managed(pa)) return FALSE; ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) return FALSE; s = splvm(); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (bit & (PG_A|PG_M)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (*pte & bit) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte; int changed; int s; if (!pmap_is_managed(pa)) return; s = splvm(); changed = 0; ppv = pa_to_pvh(pa); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (setem) { *(int *)pte |= bit; changed = 1; } else { vm_offset_t pbits = *(vm_offset_t *)pte; if (pbits & bit) { changed = 1; if (bit == PG_RW) { if (pbits & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } *(int *)pte = pbits & ~(PG_M|PG_RW); } else { *(int *)pte = pbits & ~bit; } } } } splx(s); if (changed) invltlb(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_offset_t phys, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(phys, PG_RW, FALSE); } else { pmap_remove_all(phys); } } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. * */ int pmap_ts_referenced(vm_offset_t pa) { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; int rtval = 0; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { splx(s); return 0; } /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. 
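pmap_page_protect() above reduces to two cases: if only write permission is being removed it clears PG_RW on every mapping via pmap_changebit(), and if all access is being removed it calls pmap_remove_all(). A compact restatement of that dispatch:

#include <stdio.h>

#define VM_PROT_READ    0x1
#define VM_PROT_WRITE   0x2
#define VM_PROT_EXECUTE 0x4

static const char *page_protect_action(int prot)
{
	if (prot & VM_PROT_WRITE)
		return "nothing to do";                 /* write still allowed */
	if (prot & (VM_PROT_READ | VM_PROT_EXECUTE))
		return "clear PG_RW on all mappings";   /* pmap_changebit()    */
	return "remove all mappings";               /* pmap_remove_all()   */
}

int main(void)
{
	printf("%s\n", page_protect_action(VM_PROT_READ));
	printf("%s\n", page_protect_action(0));
	return 0;
}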
*/ if (!pmap_track_modified(pv->pv_va)) continue; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte == NULL) { continue; } if (*pte & PG_A) { rtval++; *pte &= ~PG_A; } } splx(s); if (rtval) { invltlb(); } return (rtval); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; unsigned *pte; size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } invltlb(); return ((void *) va); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if (pte = *ptep) { vm_offset_t pa; val = MINCORE_INCORE; pa = pte & PG_FRAME; /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (PHYS_TO_VM_PAGE(pa)->dirty || pmap_is_modified(pa)) val |= MINCORE_MODIFIED_OTHER; /* * Referenced by us */ if (pte & PG_U) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((PHYS_TO_VM_PAGE(pa)->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; PHYS_TO_VM_PAGE(pa)->flags |= PG_REFERENCED; } } return val; } void pmap_activate(struct proc *p) { +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif load_cr3(p->p_addr->u_pcb.pcb_cr3 = vtophys(p->p_vmspace->vm_pmap.pm_pdir)); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = 
&p->p_vmspace->vm_pmap; for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } return npte; } pte = pmap_pte_quick( pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { pv_table_t *ppv; register pv_entry_t pv; printf("pa %x", pa); ppv = pa_to_pvh(pa); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/i386/support.s =================================================================== --- head/sys/i386/i386/support.s (revision 31708) +++ head/sys/i386/i386/support.s (revision 31709) @@ -1,1571 +1,1574 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.56 1997/08/09 00:02:44 dyson Exp $ + * $Id: support.s,v 1.57 1997/09/02 20:05:30 bde Exp $ */ #include "npx.h" #include #include #include #include #include "assym.s" #define KDSEL 0x10 /* kernel data selector */ #define KCSEL 0x8 /* kernel code selector */ #define IDXSHIFT 10 .data .globl _bcopy_vector _bcopy_vector: .long _generic_bcopy .globl _bzero _bzero: .long _generic_bzero .globl _copyin_vector _copyin_vector: .long _generic_copyin .globl _copyout_vector _copyout_vector: .long _generic_copyout .globl _ovbcopy_vector _ovbcopy_vector: .long _generic_bcopy #if defined(I586_CPU) && NNPX > 0 kernel_fpu_lock: .byte 0xfe .space 3 #endif .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(generic_bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret #if defined(I486_CPU) ENTRY(i486_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax /* * do 64 byte chunks first * * XXX this is probably over-unrolled at least for DX2's */ 2: cmpl $64,%ecx jb 3f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) movl %eax,16(%edx) movl %eax,20(%edx) movl %eax,24(%edx) movl %eax,28(%edx) movl %eax,32(%edx) movl %eax,36(%edx) movl %eax,40(%edx) movl %eax,44(%edx) movl %eax,48(%edx) movl %eax,52(%edx) movl %eax,56(%edx) movl %eax,60(%edx) addl $64,%edx subl $64,%ecx jnz 2b ret /* * do 16 byte chunks */ SUPERALIGN_TEXT 3: cmpl $16,%ecx jb 4f movl %eax,(%edx) movl %eax,4(%edx) movl %eax,8(%edx) movl %eax,12(%edx) addl $16,%edx subl $16,%ecx jnz 3b ret /* * do 4 byte chunks */ SUPERALIGN_TEXT 4: cmpl $4,%ecx jb 5f movl %eax,(%edx) addl $4,%edx subl $4,%ecx jnz 4b ret /* * do 1 byte chunks * a jump table seems to be faster than a loop or more range reductions * * XXX need a const section for non-text */ .data jtab: .long do0 .long do1 .long do2 .long do3 .text SUPERALIGN_TEXT 5: jmp jtab(,%ecx,4) SUPERALIGN_TEXT do3: movw %ax,(%edx) movb %al,2(%edx) ret SUPERALIGN_TEXT do2: movw %ax,(%edx) ret SUPERALIGN_TEXT do1: movb %al,(%edx) ret SUPERALIGN_TEXT do0: ret #endif #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bzero) movl 4(%esp),%edx movl 8(%esp),%ecx /* * The FPU register method is twice as fast as the integer register * method unless the target is in the L1 cache and we pre-allocate a * cache line for it (then the integer register method is 4-5 times * faster). However, we never pre-allocate cache lines, since that * would make the integer method 25% or more slower for the common * case when the target isn't in either the L1 cache or the L2 cache. * Thus we normally use the FPU register method unless the overhead * would be too large. */ cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ jb intreg_i586_bzero /* * The FPU registers may belong to an application or to fastmove() * or to another invocation of bcopy() or ourself in a higher level * interrupt or trap handler. 
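generic_bzero above clears len/4 dwords with rep stosl and then the remaining len&3 bytes with rep stosb. The same word-then-byte split in portable C, for illustration only (bzero_sim is an invented name):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void bzero_sim(void *buf, size_t len)
{
	uint32_t *wp = buf;
	size_t nwords = len >> 2;                  /* shrl $2,%ecx ; rep stosl */
	unsigned char *bp;

	for (size_t i = 0; i < nwords; i++)
		wp[i] = 0;
	bp = (unsigned char *)buf + (nwords << 2);
	for (size_t i = 0; i < (len & 3); i++)     /* andl $3,%ecx ; rep stosb */
		bp[i] = 0;
}

int main(void)
{
	union { uint32_t align; unsigned char b[11]; } u;

	memset(u.b, 0xaa, sizeof(u.b));
	bzero_sim(u.b, sizeof(u.b));
	printf("last byte: %d\n", u.b[10]);        /* prints 0 */
	return 0;
}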
Preserving the registers is * complicated since we avoid it if possible at all levels. We * want to localize the complications even when that increases them. * Here the extra work involves preserving CR0_TS in TS. * `npxproc != NULL' is supposed to be the condition that all the * FPU resources belong to an application, but npxproc and CR0_TS * aren't set atomically enough for this condition to work in * interrupt handlers. * * Case 1: FPU registers belong to the application: we must preserve * the registers if we use them, so we only use the FPU register * method if the target size is large enough to amortize the extra * overhead for preserving them. CR0_TS must be preserved although * it is very likely to end up as set. * * Case 2: FPU registers belong to fastmove(): fastmove() currently * makes the registers look like they belong to an application so * that cpu_switch() and savectx() don't have to know about it, so * this case reduces to case 1. * * Case 3: FPU registers belong to the kernel: don't use the FPU * register method. This case is unlikely, and supporting it would * be more complicated and might take too much stack. * * Case 4: FPU registers don't belong to anyone: the FPU registers * don't need to be preserved, so we always use the FPU register * method. CR0_TS must be preserved although it is very likely to * always end up as clear. */ cmpl $0,_npxproc je i586_bz1 cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ jb intreg_i586_bzero sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts subl $108,%esp fnsave 0(%esp) jmp i586_bz2 i586_bz1: sarb $1,kernel_fpu_lock jc intreg_i586_bzero smsw %ax clts fninit /* XXX should avoid needing this */ i586_bz2: fldz /* * Align to an 8 byte boundary (misalignment in the main loop would * cost a factor of >= 2). Avoid jumps (at little cost if it is * already aligned) by always zeroing 8 bytes and using the part up * to the _next_ alignment position. */ fstl 0(%edx) addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ addl $8,%edx andl $~7,%edx subl %edx,%ecx /* * Similarly align `len' to a multiple of 8. */ fstl -8(%edx,%ecx) decl %ecx andl $~7,%ecx /* * This wouldn't be any faster if it were unrolled, since the loop * control instructions are much faster than the fstl and/or done * in parallel with it so their overhead is insignificant. */ fpureg_i586_bzero_loop: fstl 0(%edx) addl $8,%edx subl $8,%ecx cmpl $8,%ecx jae fpureg_i586_bzero_loop cmpl $0,_npxproc je i586_bz3 frstor 0(%esp) addl $108,%esp lmsw %ax movb $0xfe,kernel_fpu_lock ret i586_bz3: fstpl %st(0) lmsw %ax movb $0xfe,kernel_fpu_lock ret intreg_i586_bzero: /* * `rep stos' seems to be the best method in practice for small * counts. Fancy methods usually take too long to start up due * to cache and BTB misses. */ pushl %edi movl %edx,%edi xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx jne 1f popl %edi ret 1: rep stosb popl %edi ret #endif /* I586_CPU && NNPX > 0 */ /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx cld rep stosw popl %edi ret ENTRY(bcopyb) bcopyb: pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cld /* nope, copy forwards */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards. 
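bcopyb above (and generic_bcopy, which follows) decide the copy direction with a single unsigned compare: if dst - src is less than the length, the buffers overlap with the source below the destination, so the copy must run backwards. A portable restatement:

#include <stdint.h>
#include <stdio.h>

static void bcopy_sim(const unsigned char *src, unsigned char *dst, size_t len)
{
	if ((uintptr_t)(dst - src) < (uintptr_t)len) {
		for (size_t i = len; i-- > 0; )     /* overlapping, src below dst */
			dst[i] = src[i];
	} else {
		for (size_t i = 0; i < len; i++)    /* safe to copy forwards */
			dst[i] = src[i];
	}
}

int main(void)
{
	unsigned char buf[8] = "abcdef";

	bcopy_sim(buf, buf + 2, 6);             /* shift "abcdef" right by 2 */
	buf[0] = ' ';
	buf[1] = ' ';
	printf("[%.8s]\n", (char *)buf);        /* prints "[  abcdef]" */
	return 0;
}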
*/ addl %ecx,%esi decl %edi decl %esi std rep movsb popl %edi popl %esi cld ret ENTRY(bcopy) MEXITCOUNT jmp *_bcopy_vector ENTRY(ovbcopy) MEXITCOUNT jmp *_ovbcopy_vector /* * generic_bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(generic_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 20(%esp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_bcopy) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cmpl $1024,%ecx jb small_i586_bcopy sarb $1,kernel_fpu_lock jc small_i586_bcopy cmpl $0,_npxproc je i586_bc1 smsw %dx clts subl $108,%esp fnsave 0(%esp) jmp 4f i586_bc1: smsw %dx clts fninit /* XXX should avoid needing this */ ALIGN_TEXT 4: pushl %ecx #define DCACHE_SIZE 8192 cmpl $(DCACHE_SIZE-512)/2,%ecx jbe 2f movl $(DCACHE_SIZE-512)/2,%ecx 2: subl %ecx,0(%esp) cmpl $256,%ecx jb 5f /* XXX should prefetch if %ecx >= 32 */ pushl %esi pushl %ecx ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b popl %ecx popl %esi 5: ALIGN_TEXT large_i586_bcopy_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $64,%esi addl $64,%edi subl $64,%ecx cmpl $64,%ecx jae large_i586_bcopy_loop popl %eax addl %eax,%ecx cmpl $64,%ecx jae 4b cmpl $0,_npxproc je i586_bc2 frstor 0(%esp) addl $108,%esp i586_bc2: lmsw %dx movb $0xfe,kernel_fpu_lock /* * This is a duplicate of the main part of generic_bcopy. See the comments * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and * would mess up high resolution profiling. */ ALIGN_TEXT small_i586_bcopy: shrl $2,%ecx cld rep movsl movl 20(%esp),%ecx andl $3,%ecx rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi addl %ecx,%esi decl %edi decl %esi andl $3,%ecx std rep movsb movl 20(%esp),%ecx shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld ret #endif /* I586_CPU && NNPX > 0 */ /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines and possibly * the math- and DOS emulators should be the only places that do this. 
* * We have to access the memory with user's permissions, so use a segment * selector with RPL 3. For writes to user space we have to additionally * check the PTE for write permission, because the 386 does not check * write permissions when we are executing with EPL 0. The 486 does check * this if the WP bit is set in CR0, so we can use a simpler version here. * * These routines set curpcb->onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->onfault instead of the function. */ /* copyout(from_kernel, to_user, len) */ ENTRY(copyout) MEXITCOUNT jmp *_copyout_vector ENTRY(generic_copyout) movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 3f #endif /* * We have to check each PTE for user write permission. * The checking may cause a page fault, so it is important to set * up everything for return via copyout_fault before here. */ /* compute number of pages */ movl %edi,%ecx andl $PAGE_MASK,%ecx addl %ebx,%ecx decl %ecx shrl $IDXSHIFT+2,%ecx incl %ecx /* compute PTE offset for start address */ movl %edi,%edx shrl $IDXSHIFT,%edx andb $0xfc,%dl 1: /* check PTE for each page */ leal _PTmap(%edx),%eax shrl $IDXSHIFT,%eax andb $0xfc,%al testb $PG_V,_PTmap(%eax) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%al andb $PG_V|PG_RW|PG_U,%al /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%al je 2f 4: /* simulate a trap */ pushl %edx pushl %ecx shll $IDXSHIFT,%edx pushl %edx call _trapwrite /* trapwrite(addr) */ popl %edx popl %ecx popl %edx testl %eax,%eax /* if not ok, return EFAULT */ jnz copyout_fault 2: addl $4,%edx decl %ecx jnz 1b /* check next page */ #endif /* I386_CPU */ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyout: #endif shrl $2,%ecx cld rep movsl movb %bl,%cl andb $3,%cl rep movsb done_copyout: popl %ebx popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyout_fault: popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyout) /* * Duplicated from generic_copyout. Could be done a bit better. */ movl _curpcb,%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. 
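generic_copyout above validates the user range with two checks before touching anything: the address addition must not wrap (the jc to copyout_fault) and the end address must not exceed VM_MAXUSER_ADDRESS (the ja to copyout_fault). The same logic in C; the constant is an illustrative i386-era value and the function name is invented.

#include <stdio.h>

#define VM_MAXUSER_ADDRESS 0xefbfe000u   /* illustrative value only */
#define EFAULT 14

static int check_user_range(unsigned uaddr, unsigned len)
{
	unsigned end = uaddr + len;

	if (end < uaddr)                  /* 32-bit wraparound */
		return EFAULT;
	if (end > VM_MAXUSER_ADDRESS)     /* runs past user space */
		return EFAULT;
	return 0;
}

int main(void)
{
	printf("%d\n", check_user_range(0x08048000u, 4096));      /* 0      */
	printf("%d\n", check_user_range(0xfffff000u, 0x2000));    /* EFAULT */
	return 0;
}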
*/ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyout pushl %ecx call _fastmove addl $4,%esp jmp done_copyout #endif /* I586_CPU && NNPX > 0 */ /* copyin(from_user, to_kernel, len) */ ENTRY(copyin) MEXITCOUNT jmp *_copyin_vector ENTRY(generic_copyin) movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT slow_copyin: #endif movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb #if defined(I586_CPU) && NNPX > 0 ALIGN_TEXT done_copyin: #endif popl %edi popl %esi xorl %eax,%eax movl _curpcb,%edx movl %eax,PCB_ONFAULT(%edx) ret ALIGN_TEXT copyin_fault: popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #if defined(I586_CPU) && NNPX > 0 ENTRY(i586_copyin) /* * Duplicated from generic_copyin. Could be done a bit better. */ movl _curpcb,%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault /* * End of duplicated code. */ cmpl $1024,%ecx jb slow_copyin pushl %ebx /* XXX prepare for fastmove_fault */ pushl %ecx call _fastmove addl $8,%esp jmp done_copyin #endif /* I586_CPU && NNPX > 0 */ #if defined(I586_CPU) && NNPX > 0 /* fastmove(src, dst, len) src in %esi dst in %edi len in %ecx XXX changed to on stack for profiling uses %eax and %edx for tmp. storage */ /* XXX use ENTRY() to get profiling. fastmove() is actually a non-entry. */ ENTRY(fastmove) pushl %ebp movl %esp,%ebp subl $PCB_SAVEFPU_SIZE+3*4,%esp movl 8(%ebp),%ecx cmpl $63,%ecx jbe fastmove_tail testl $7,%esi /* check if src addr is multiple of 8 */ jnz fastmove_tail testl $7,%edi /* check if dst addr is multiple of 8 */ jnz fastmove_tail /* if (npxproc != NULL) { */ cmpl $0,_npxproc je 6f /* fnsave(&curpcb->pcb_savefpu); */ movl _curpcb,%eax fnsave PCB_SAVEFPU(%eax) /* npxproc = NULL; */ movl $0,_npxproc /* } */ 6: /* now we own the FPU. */ /* * The process' FP state is saved in the pcb, but if we get * switched, the cpu_switch() will store our FP state in the * pcb. It should be possible to avoid all the copying for * this, e.g., by setting a flag to tell cpu_switch() to * save the state somewhere else. 
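i586_copyout and i586_copyin only divert to _fastmove for transfers of at least 1024 bytes, and fastmove bails out to its plain tail copy at entry unless both addresses are 8-byte aligned, because saving and restoring FPU state has a fixed cost that small or misaligned copies cannot recover. A small C predicate capturing those gates; the name and types are illustrative only:

#include <stddef.h>
#include <stdint.h>

/*
 * Distilled from the cmpl $1024,%ecx tests in i586_copyin/i586_copyout
 * and the testl $7 alignment checks at the head of fastmove(): the FPU
 * path is only worth taking for large, 8-byte-aligned transfers.
 */
static int
use_fpu_fastmove(const void *src, const void *dst, size_t len)
{
	if (len < 1024)				/* too small to amortize fnsave/frstor */
		return (0);
	if (((uintptr_t)src & 7) != 0)		/* source not 8-byte aligned */
		return (0);
	if (((uintptr_t)dst & 7) != 0)		/* destination not 8-byte aligned */
		return (0);
	return (1);
}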
*/ /* tmp = curpcb->pcb_savefpu; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl %esp,%edi movl _curpcb,%esi addl $PCB_SAVEFPU,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* stop_emulating(); */ clts /* npxproc = curproc; */ movl _curproc,%eax movl %eax,_npxproc movl _curpcb,%eax movl $fastmove_fault,PCB_ONFAULT(%eax) 4: movl %ecx,-12(%ebp) cmpl $1792,%ecx jbe 2f movl $1792,%ecx 2: subl %ecx,-12(%ebp) cmpl $256,%ecx jb 5f movl %ecx,-8(%ebp) movl %esi,-4(%ebp) ALIGN_TEXT 3: movl 0(%esi),%eax movl 32(%esi),%eax movl 64(%esi),%eax movl 96(%esi),%eax movl 128(%esi),%eax movl 160(%esi),%eax movl 192(%esi),%eax movl 224(%esi),%eax addl $256,%esi subl $256,%ecx cmpl $256,%ecx jae 3b movl -8(%ebp),%ecx movl -4(%ebp),%esi 5: ALIGN_TEXT fastmove_loop: fildq 0(%esi) fildq 8(%esi) fildq 16(%esi) fildq 24(%esi) fildq 32(%esi) fildq 40(%esi) fildq 48(%esi) fildq 56(%esi) fistpq 56(%edi) fistpq 48(%edi) fistpq 40(%edi) fistpq 32(%edi) fistpq 24(%edi) fistpq 16(%edi) fistpq 8(%edi) fistpq 0(%edi) addl $-64,%ecx addl $64,%esi addl $64,%edi cmpl $63,%ecx ja fastmove_loop movl -12(%ebp),%eax addl %eax,%ecx cmpl $64,%ecx jae 4b /* curpcb->pcb_savefpu = tmp; */ movl %ecx,-12(%ebp) movl %esi,-8(%ebp) movl %edi,-4(%ebp) movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl movl -12(%ebp),%ecx movl -8(%ebp),%esi movl -4(%ebp),%edi /* start_emulating(); */ smsw %ax orb $CR0_TS,%al lmsw %ax /* npxproc = NULL; */ movl $0,_npxproc ALIGN_TEXT fastmove_tail: movl _curpcb,%eax movl $fastmove_tail_fault,PCB_ONFAULT(%eax) movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb movl %ebp,%esp popl %ebp ret ALIGN_TEXT fastmove_fault: movl _curpcb,%edi addl $PCB_SAVEFPU,%edi movl %esp,%esi cld movl $PCB_SAVEFPU_SIZE>>2,%ecx rep movsl smsw %ax orb $CR0_TS,%al lmsw %ax movl $0,_npxproc fastmove_tail_fault: movl %ebp,%esp popl %ebp addl $8,%esp popl %ebx popl %edi popl %esi movl _curpcb,%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret #endif /* I586_CPU && NNPX > 0 */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory */ ENTRY(fuword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* from */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault movl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret /* * These two routines are called from the profiling code, potentially * at interrupt time. If they fail, that's okay, good things will * happen later. Fail all the time for now - until the trap code is * able to deal with this. 
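The body of fastmove just shown processes the buffer in rounds of at most 1792 bytes: when a round is at least 256 bytes it first touches the source once per 32 bytes to pull it into the cache, then moves 64 bytes per iteration through the eight fildq/fistpq pairs, and finally lets fastmove_tail mop up whatever is left with ordinary string moves. A rough standalone C rendering of that structure, assuming nothing about the real FPU path (memcpy stands in for the FPU transfers and a volatile read models the cache-priming loads):

#include <stddef.h>
#include <string.h>

#define FASTMOVE_CHUNK 1792	/* per-round limit, as in cmpl $1792,%ecx */

/*
 * Structural sketch only: chunk the copy, prime the cache for large
 * chunks, move 64 bytes at a time, then finish the remainder.
 */
static void
fastmove_sketch(const char *src, char *dst, size_t len)
{
	while (len >= 64) {
		size_t chunk = len > FASTMOVE_CHUNK ? FASTMOVE_CHUNK : len;
		size_t n;

		if (chunk >= 256) {			/* cache-priming pass */
			volatile char sink = 0;

			for (n = 0; n + 256 <= chunk; n += 32)
				sink = src[n];		/* one read per cache line */
		}
		for (n = 0; n + 64 <= chunk; n += 64)	/* 64-byte move loop */
			memcpy(dst + n, src + n, 64);

		src += n;
		dst += n;
		len -= n;
	}
	if (len > 0)
		memcpy(dst, src, len);	/* fastmove_tail: plain copy of the remainder */
}

What the sketch leaves out is the bookkeeping that makes the real routine safe: the pcb_savefpu image is parked on the stack around the copy and restored afterwards, pcb_onfault is pointed at fastmove_fault for the duration, and CR0_TS is set again at the end so the next FPU user traps and reloads its own state.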
*/ ALTENTRY(suswintr) ENTRY(fuswintr) movl $-1,%eax ret ENTRY(fusword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx ja fusufault movzwl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ENTRY(fubyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx ja fusufault movzbl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret ALIGN_TEXT fusufault: movl _curpcb,%ecx xorl %eax,%eax movl %eax,PCB_ONFAULT(%ecx) decl %eax ret /* * su{byte,sword,word}: write a byte (word, longword) to user memory */ ENTRY(suword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f /* we only have to set the right segment selector */ #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ ja fusufault movl 8(%esp),%eax movl %eax,(%edx) xorl %eax,%eax movl _curpcb,%ecx movl %eax,PCB_ONFAULT(%ecx) ret ENTRY(susword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ /* XXX - page boundary crossing is still not handled */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ ja fusufault movw 8(%esp),%ax movw %ax,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret ALTENTRY(suibyte) ENTRY(subyte) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx #if defined(I386_CPU) #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) cmpl $CPUCLASS_386,_cpu_class jne 2f #endif /* I486_CPU || I586_CPU || I686_CPU */ movl %edx,%eax shrl $IDXSHIFT,%edx andb $0xfc,%dl leal _PTmap(%edx),%ecx shrl $IDXSHIFT,%ecx andb $0xfc,%cl testb $PG_V,_PTmap(%ecx) /* PTE page must be valid */ je 4f movb _PTmap(%edx),%dl andb $PG_V|PG_RW|PG_U,%dl /* page must be valid and user writable */ cmpb $PG_V|PG_RW|PG_U,%dl je 1f 4: /* simulate a trap */ pushl %eax call _trapwrite popl %edx /* remove junk parameter from stack */ testl %eax,%eax jnz fusufault 1: movl 4(%esp),%edx #endif 2: cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ ja fusufault movb 8(%esp),%al movb %al,(%edx) xorl %eax,%eax movl _curpcb,%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret /* * copyinstr(from, to, maxlen, int *lencopied) * copy a string from from to to, 
stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ ENTRY(copyinstr) pushl %esi pushl %edi movl _curpcb,%ecx movl $cpystrflt,PCB_ONFAULT(%ecx) movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ movl $VM_MAXUSER_ADDRESS,%eax /* make sure 'from' is within bounds */ subl %esi,%eax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpl %edx,%eax jae 1f movl %eax,%edx movl %eax,20(%esp) 1: incl %edx cld 2: decl %edx jz 3f lodsb stosb orb %al,%al jnz 2b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp cpystrflt_x 3: /* edx is zero - return ENAMETOOLONG or EFAULT */ cmpl $VM_MAXUSER_ADDRESS,%esi jae cpystrflt 4: movl $ENAMETOOLONG,%eax jmp cpystrflt_x cpystrflt: movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ movl _curpcb,%ecx movl $0,PCB_ONFAULT(%ecx) movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 1f movl %ecx,(%edx) 1: popl %edi popl %esi ret /* * copystr(from, to, maxlen, int *lencopied) */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx cld 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx xorl %eax,%eax movl %edx,%ecx shrl $2,%ecx cld /* compare forwards */ repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb je 2f 1: incl %eax 2: popl %esi popl %edi ret /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %ax,%ds movl %ax,%es movl %ax,%fs movl %ax,%gs movl %ax,%ss /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) lret /* * void lidt(struct region_descriptor *rdp); */ ENTRY(lidt) movl 4(%esp),%eax lidt (%eax) ret /* * void lldt(u_short sel) */ ENTRY(lldt) lldt 4(%esp) ret /* * void ltr(u_short sel) */ ENTRY(ltr) ltr 4(%esp) ret /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret /* load_cr0(cr0) */ ENTRY(load_cr0) movl 4(%esp),%eax movl %eax,%cr0 ret /* rcr0() */ ENTRY(rcr0) movl %cr0,%eax ret /* rcr3() */ ENTRY(rcr3) movl %cr3,%eax ret /* void load_cr3(caddr_t cr3) */ ENTRY(load_cr3) +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl 4(%esp),%eax movl %eax,%cr3 ret /* rcr4() */ ENTRY(rcr4) movl %cr4,%eax ret /* void load_cr4(caddr_t cr4) */ ENTRY(load_cr4) movl 4(%esp),%eax movl %eax,%cr4 ret /*****************************************************************************/ /* setjump, longjump */ 
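copystr and copyinstr above copy at most maxlen bytes, stop after the terminating NUL, return ENAMETOOLONG when the string does not fit (copyinstr additionally returns EFAULT on a bad user address), and report the number of bytes actually copied through the optional lencopied pointer. A kernel-independent C model of copystr's contract; the name is made up and standard errno values stand in for the kernel's:

#include <errno.h>
#include <stddef.h>

/*
 * Model of copystr(): the count written back through lencopied includes
 * the NUL on success and equals the number of bytes written on failure,
 * matching the maxlen - %edx arithmetic in the assembly.
 */
static int
copystr_model(const char *from, char *to, size_t maxlen, size_t *lencopied)
{
	size_t copied = 0;
	int error = ENAMETOOLONG;

	while (copied < maxlen) {
		char c = from[copied];

		to[copied++] = c;
		if (c == '\0') {
			error = 0;		/* terminator reached in time */
			break;
		}
	}
	if (lencopied != NULL)
		*lencopied = copied;
	return (error);
}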
/*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp */ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret /* * Here for doing BB-profiling (gcc -a). * We rely on the "bbset" instead, but need a dummy function. */ NON_GPROF_ENTRY(__bb_init_func) movl 4(%esp),%eax movl $1,(%eax) .byte 0xc3 /* avoid macro for `ret' */ Index: head/sys/i386/i386/swtch.s =================================================================== --- head/sys/i386/i386/swtch.s (revision 31708) +++ head/sys/i386/i386/swtch.s (revision 31709) @@ -1,778 +1,815 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: swtch.s,v 1.63 1997/09/21 15:03:58 peter Exp $ + * $Id: swtch.s,v 1.64 1997/10/10 09:44:06 peter Exp $ */ #include "npx.h" #include "opt_user_ldt.h" #include "opt_vm86.h" #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ /* * The following primitives manipulate the run queues. 
* _whichqs tells which of the 32 queues _qs * have processes in them. setrunqueue puts processes into queues, Remrq * removes them from queues. The running process is on no queue, * other processes are on a queue related to p->p_priority, divided by 4 * actually to shrink the 0-127 range of priorities into the 32 available * queues. */ .data #ifndef SMP .globl _curpcb _curpcb: .long 0 /* pointer to curproc's PCB area */ #endif /* !SMP */ .globl _whichqs, _whichrtqs, _whichidqs _whichqs: .long 0 /* which run queues have data */ _whichrtqs: .long 0 /* which realtime run qs have data */ _whichidqs: .long 0 /* which idletime run qs have data */ .globl _hlt_vector _hlt_vector: .long _default_halt /* pointer to halt routine */ .globl _qs,_cnt,_panic .globl _want_resched _want_resched: .long 0 /* we need to re-run the scheduler */ +#if defined(SWTCH_OPTIM_STATS) + .globl _swtch_optim_stats, _tlb_flush_count +_swtch_optim_stats: .long 0 /* number of _swtch_optims */ +_tlb_flush_count: .long 0 +#endif .text /* * setrunqueue(p) * * Call should be made at spl6(), and p->p_stat should be SRUN */ ENTRY(setrunqueue) movl 4(%esp),%eax #ifdef DIAGNOSTIC cmpb $SRUN,P_STAT(%eax) je set1 pushl $set2 call _panic set1: #endif cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je set_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* realtime priority? */ jne set_id /* must be idle priority */ set_rt: btsl %edx,_whichrtqs /* set q full bit */ shll $3,%edx addl $_rtqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_id: btsl %edx,_whichidqs /* set q full bit */ shll $3,%edx addl $_idqs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set_nort: /* Normal (RTOFF) code */ movzbl P_PRI(%eax),%edx shrl $2,%edx btsl %edx,_whichqs /* set q full bit */ shll $3,%edx addl $_qs,%edx /* locate q hdr */ movl %edx,P_FORW(%eax) /* link process on tail of q */ movl P_BACK(%edx),%ecx movl %ecx,P_BACK(%eax) movl %eax,P_BACK(%edx) movl %eax,P_FORW(%ecx) ret set2: .asciz "setrunqueue" /* * Remrq(p) * * Call should be made at spl6(). */ ENTRY(remrq) movl 4(%esp),%eax cmpw $RTP_PRIO_NORMAL,P_RTPRIO_TYPE(%eax) /* normal priority process? */ je rem_nort movzwl P_RTPRIO_PRIO(%eax),%edx cmpw $RTP_PRIO_REALTIME,P_RTPRIO_TYPE(%eax) /* normal priority process? */ jne rem_id btrl %edx,_whichrtqs /* clear full bit, panic if clear already */ jb rem1rt pushl $rem3rt call _panic rem1rt: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_rtqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2rt shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichrtqs rem2rt: ret rem_id: btrl %edx,_whichidqs /* clear full bit, panic if clear already */ jb rem1id pushl $rem3id call _panic rem1id: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_idqs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? 
*/ je rem2id shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichidqs rem2id: ret rem_nort: movzbl P_PRI(%eax),%edx shrl $2,%edx btrl %edx,_whichqs /* clear full bit, panic if clear already */ jb rem1 pushl $rem3 call _panic rem1: pushl %edx movl P_FORW(%eax),%ecx /* unlink process */ movl P_BACK(%eax),%edx movl %edx,P_BACK(%ecx) movl P_BACK(%eax),%ecx movl P_FORW(%eax),%edx movl %edx,P_FORW(%ecx) popl %edx movl $_qs,%ecx shll $3,%edx addl %edx,%ecx cmpl P_FORW(%ecx),%ecx /* q still has something? */ je rem2 shrl $3,%edx /* yes, set bit as still full */ btsl %edx,_whichqs rem2: ret rem3: .asciz "remrq" rem3rt: .asciz "remrq.rt" rem3id: .asciz "remrq.id" /* * When no processes are on the runq, cpu_switch() branches to _idle * to wait for something to come ready. */ ALIGN_TEXT _idle: #ifdef SMP /* when called, we have the mplock, intr disabled */ xorl %ebp,%ebp /* use our idleproc's "context" */ movl _my_idlePTD,%ecx movl %ecx,%cr3 +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl $_idlestack_top,%ecx movl %ecx,%esp /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %ecx, TSS_ESP0(%eax) + movl %ecx, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 cli /* * _REALLY_ free the lock, no matter how deep the prior nesting. * We will recover the nesting on the way out when we have a new * proc to load. * * XXX: we had damn well better be sure we had it before doing this! */ movl $FREE_LOCK, %eax movl %eax, _mp_lock /* do NOT have lock, intrs disabled */ .globl idle_loop idle_loop: +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif movl %cr3,%eax /* ouch! */ movl %eax,%cr3 cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f jmp 2f 1: cmpl $0,_whichrtqs /* real-time queue */ jne 3f cmpl $0,_whichqs /* normal queue */ jne 3f cmpl $0,_whichidqs /* 'idle' queue */ jne 3f cmpl $0,_do_page_zero_idle je 2f /* XXX appears to cause panics */ /* * Inside zero_idle we enable interrupts and grab the mplock * as needed. It needs to be careful about entry/exit mutexes. 
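The idle loop in the SMP block above (and its simpler uniprocessor twin further down) follows one fixed pattern: re-check the three run-queue bitmaps, let vm_page_zero_idle pre-zero a free page when there is nothing to run, and only halt the CPU when neither produced work. A compact standalone C restatement of that control flow; the helpers are stand-ins and the mp_lock/interrupt handling of the real code is omitted:

#include <stdbool.h>

/* Stand-ins for the scheduler state and helpers the real loop uses. */
static unsigned whichrtqs, whichqs, whichidqs;

static bool
queue_has_work(void)
{
	return (whichrtqs != 0 || whichqs != 0 || whichidqs != 0);
}

static bool zero_one_idle_page(void) { return (false); }	/* vm_page_zero_idle() stand-in */
static void halt_until_interrupt(void) { }			/* sti; hlt stand-in */

static void
idle_loop_sketch(void)
{
	for (;;) {
		if (queue_has_work())
			return;			/* cpu_switch() picks the next process */
		if (zero_one_idle_page())
			continue;		/* did useful work; look at the queues again */
		halt_until_interrupt();		/* nothing to do until an interrupt arrives */
	}
}

This revision also adds a tlb_flush_count bump next to the explicit movl %cr3,%eax / movl %eax,%cr3 pair at the top of the SMP loop, so the cost of that per-iteration flush shows up in the new statistics.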
*/ call _vm_page_zero_idle /* internal locking */ testl %eax, %eax jnz idle_loop 2: /* enable intrs for a halt */ #ifdef SMP movl $0, lapic_tpr /* 1st candidate for an INT */ #endif sti call *_hlt_vector /* wait for interrupt */ cli jmp idle_loop 3: #ifdef SMP movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ #endif call _get_mplock cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _rel_mplock jmp idle_loop #else xorl %ebp,%ebp movl $HIDENAME(tmpstk),%esp - movl _IdlePTD,%ecx - movl %ecx,%cr3 +#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif + movl _IdlePTD, %ecx + movl %cr3, %eax + cmpl %ecx, %eax + je 2f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl %ecx, %cr3 +2: +#endif /* update common_tss.tss_esp0 pointer */ #ifdef VM86 movl _my_tr, %esi #endif /* VM86 */ - movl $_common_tss, %eax - movl %esp, TSS_ESP0(%eax) + movl %esp, _common_tss + TSS_ESP0 #ifdef VM86 btrl %esi, _private_tss je 1f movl $_common_tssd, %edi /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 1: #endif /* VM86 */ sti /* * XXX callers of cpu_switch() do a bogus splclock(). Locking should * be left to cpu_switch(). */ call _spl0 ALIGN_TEXT idle_loop: cli cmpl $0,_whichrtqs /* real-time queue */ CROSSJUMP(jne, sw1a, je) cmpl $0,_whichqs /* normal queue */ CROSSJUMP(jne, nortqr, je) cmpl $0,_whichidqs /* 'idle' queue */ CROSSJUMP(jne, idqr, je) call _vm_page_zero_idle testl %eax, %eax jnz idle_loop sti call *_hlt_vector /* wait for interrupt */ jmp idle_loop #endif CROSSJUMPTARGET(_idle) ENTRY(default_halt) #ifndef SMP hlt /* XXX: until a wakeup IPI */ #endif ret /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. first, save context as needed */ movl _curproc,%ecx /* if no process to save, don't bother */ testl %ecx,%ecx je sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ #endif /* SMP */ movl P_ADDR(%ecx),%ecx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #ifdef SMP movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ #ifdef DIAGNOSTIC cmpl $FREE_LOCK, %eax /* is it free? */ je badsw4 /* yes, bad medicine! */ #endif /* DIAGNOSTIC */ andl $COUNT_FIELD, %eax /* clear CPU portion */ movl %eax, PCB_MPNEST(%ecx) /* store it */ #endif /* SMP */ #if NNPX > 0 /* have we used fp, and need a save? */ movl _curproc,%eax cmpl %eax,_npxproc jne 1f addl $PCB_SAVEFPU,%ecx /* h/w bugs make saving complicated */ pushl %ecx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* NNPX > 0 */ movl $0,_curproc /* out of process */ /* save is done, now choose a new process or idle */ sw1: cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid je 1f CROSSJUMP(je, _idle, jne) /* wind down */ 1: #endif sw1a: movl _whichrtqs,%edi /* pick next p. 
from rtqs */ testl %edi,%edi jz nortqr /* no realtime procs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz nortqr /* no proc on rt q - try normal ... */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _rtqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je rt3 btsl %ebx,%edi /* nope, set to indicate not empty */ rt3: movl %edi,_whichrtqs /* update q status */ jmp swtch_com /* old sw1a */ /* Normal process priority's */ nortqr: movl _whichqs,%edi 2: /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ jz idqr /* if none, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _qs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je 3f btsl %ebx,%edi /* nope, set to indicate not empty */ 3: movl %edi,_whichqs /* update q status */ jmp swtch_com idqr: /* was sw1a */ movl _whichidqs,%edi /* pick next p. from idqs */ /* XXX - bsf is sloow */ bsfl %edi,%ebx /* find a full q */ CROSSJUMP(je, _idle, jne) /* if no proc, idle */ /* XX update whichqs? */ btrl %ebx,%edi /* clear q full status */ leal _idqs(,%ebx,8),%eax /* select q */ movl %eax,%esi movl P_FORW(%eax),%ecx /* unlink from front of process q */ movl P_FORW(%ecx),%edx movl %edx,P_FORW(%eax) movl P_BACK(%ecx),%eax movl %eax,P_BACK(%edx) cmpl P_FORW(%ecx),%esi /* q empty */ je id3 btsl %ebx,%edi /* nope, set to indicate not empty */ id3: movl %edi,_whichidqs /* update q status */ swtch_com: movl $0,%eax movl %eax,_want_resched #ifdef DIAGNOSTIC cmpl %eax,P_WCHAN(%ecx) jne badsw1 cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl %eax,P_BACK(%ecx) /* isolate process to run */ movl P_ADDR(%ecx),%edx - movl PCB_CR3(%edx),%ebx #ifdef SMP + movl PCB_CR3(%edx),%ebx /* Grab the private PT pointer from the outgoing process's PTD */ movl $_PTD, %esi movl 4*MPPTDI(%esi), %eax /* fetch cpu's prv pt */ -#endif /* SMP */ - +#else +#if defined(SWTCH_OPTIM_STATS) + incl _swtch_optim_stats +#endif /* switch address space */ + movl %cr3,%ebx + cmpl PCB_CR3(%edx),%ebx + je 4f +#if defined(SWTCH_OPTIM_STATS) + decl _swtch_optim_stats + incl _tlb_flush_count +#endif + movl PCB_CR3(%edx),%ebx +#endif /* SMP */ movl %ebx,%cr3 +4: #ifdef SMP /* Copy the private PT to the new process's PTD */ /* XXX yuck, the _PTD changes when we switch, so we have to * reload %cr3 after changing the address space. * We need to fix this by storing a pointer to the virtual * location of the per-process PTD in the PCB or something quick. * Dereferencing proc->vm_map->pmap->p_pdir[] is painful in asm. */ movl %eax, 4*MPPTDI(%esi) /* restore cpu's prv page */ +#if defined(SWTCH_OPTIM_STATS) + incl _tlb_flush_count +#endif /* XXX: we have just changed the page tables.. reload.. */ movl %ebx, %cr3 #endif /* SMP */ #ifdef VM86 movl _my_tr, %esi cmpl $0, PCB_EXT(%edx) /* has pcb extension? 
*/ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: #endif /* update common_tss.tss_esp0 pointer */ movl $_common_tss, %eax movl %edx, %ebx /* pcb */ #ifdef VM86 addl $(UPAGES * PAGE_SIZE - 16), %ebx #else addl $(UPAGES * PAGE_SIZE), %ebx #endif /* VM86 */ movl %ebx, TSS_ESP0(%eax) #ifdef VM86 btrl %esi, _private_tss je 3f movl $_common_tssd, %edi 2: /* move correct tss descriptor into GDT slot, then reload tr */ leal _gdt(,%esi,8), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) shll $3, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: #endif /* VM86 */ /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, lapic_tpr #else andl $~APIC_TPR_PRIO, lapic_tpr #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ movl _cpuid,%eax movb %al, P_ONCPU(%ecx) #endif /* SMP */ movl %edx, _curpcb movl %ecx, _curproc /* into next process */ #ifdef SMP movl _cpu_lockid, %eax orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl _currentldt,%eax je 2f lldt __default_ldt movl %eax,_currentldt jmp 2f 1: pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_fs cpu_switch_load_fs: movl PCB_FS(%edx),%fs .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs sti ret CROSSJUMPTARGET(idqr) CROSSJUMPTARGET(nortqr) CROSSJUMPTARGET(sw1a) #ifdef DIAGNOSTIC badsw1: pushl $sw0_1 call _panic sw0_1: .asciz "cpu_switch: has wchan" badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" #endif #if defined(SMP) && defined(DIAGNOSTIC) badsw4: pushl $sw0_4 call _panic sw0_4: .asciz "cpu_switch: do not have lock" #endif /* SMP && DIAGNOSTIC */ /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %fs,PCB_FS(%ecx) movl %gs,PCB_GS(%ecx) #if NNPX > 0 /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
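The interesting change threaded through cpu_switch above is the conditional address-space switch: instead of always loading PCB_CR3 into %cr3, the non-SMP path now compares it with the %cr3 already in place and skips the reload, and with it the implicit TLB flush, when the two match, counting the outcome in swtch_optim_stats and tlb_flush_count when SWTCH_OPTIM_STATS is enabled. A C paraphrase of that decision; the register accessors are stubs, not the real cpufunc.h inlines:

#include <stdint.h>

#if defined(SWTCH_OPTIM_STATS)
static int swtch_optim_stats;	/* switches that kept the old TLB */
static int tlb_flush_count;	/* switches that reloaded %cr3 and flushed */
#endif

/* Stand-ins for rcr3()/load_cr3(). */
static uint32_t fake_cr3;
static uint32_t rcr3_stub(void)           { return (fake_cr3); }
static void     load_cr3_stub(uint32_t v) { fake_cr3 = v; }

/*
 * Mirror of the movl %cr3,%ebx; cmpl PCB_CR3(%edx),%ebx; je 4f sequence:
 * the optimistic increment followed by a decrement on the slow path is
 * exactly how the assembly maintains the counters.
 */
static void
switch_address_space(uint32_t new_pcb_cr3)
{
#if defined(SWTCH_OPTIM_STATS)
	swtch_optim_stats++;		/* assume we can skip the reload */
#endif
	if (rcr3_stub() == new_pcb_cr3)
		return;			/* same page directory: keep the TLB contents */
#if defined(SWTCH_OPTIM_STATS)
	swtch_optim_stats--;		/* could not skip it after all */
	tlb_flush_count++;
#endif
	load_cr3_stub(new_pcb_cr3);
}

The SMP path cannot take this shortcut yet because it rewrites the per-CPU private page-table slot in the new PTD and must reload %cr3 afterwards anyway, which is exactly what the XXX comment in that branch complains about.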
*/ movl _npxproc,%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* NNPX > 0 */ 1: ret Index: head/sys/i386/include/cpufunc.h =================================================================== --- head/sys/i386/include/cpufunc.h (revision 31708) +++ head/sys/i386/include/cpufunc.h (revision 31709) @@ -1,430 +1,436 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: cpufunc.h,v 1.3 1997/09/05 20:20:31 smp Exp smp $ + * $Id: cpufunc.h,v 1.72 1997/09/07 22:01:27 fsmp Exp $ */ /* * Functions to provide access to special i386 instructions. */ #ifndef _MACHINE_CPUFUNC_H_ #define _MACHINE_CPUFUNC_H_ #include #include #include +#if defined(SWTCH_OPTIM_STATS) +extern int tlb_flush_count; +#endif #ifdef __GNUC__ static __inline void breakpoint(void) { __asm __volatile("int $3"); } static __inline void disable_intr(void) { __asm __volatile("cli" : : : "memory"); MPINTR_LOCK(); } static __inline void enable_intr(void) { MPINTR_UNLOCK(); __asm __volatile("sti"); } #define HAVE_INLINE_FFS static __inline int ffs(int mask) { int result; /* * bsfl turns out to be not all that slow on 486's. It can beaten * using a binary search to reduce to 4 bits and then a table lookup, * but only if the code is inlined and in the cache, and the code * is quite large so inlining it probably busts the cache. * * Note that gcc-2's builtin ffs would be used if we didn't declare * this inline or turn off the builtin. The builtin is faster but * broken in gcc-2.4.5 and slower but working in gcc-2.5 and 2.6. 
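savectx() ends by handling the FPU: if some process currently owns the numeric unit (npxproc != NULL), the live register state is first dumped into that process's pcb via npxsave and then copied into the pcb being snapshot, as the long comment above explains. A standalone C sketch of that branch; every type and helper here is a stand-in, not the real proc/pcb layout:

#include <string.h>

/* Illustrative types; the real ones are struct proc/pcb and the fnsave area. */
struct toy_fpu_state { unsigned char image[108]; };
struct toy_pcb  { struct toy_fpu_state pcb_savefpu; };
struct toy_proc { struct toy_pcb *p_addr; };

static struct toy_proc *npxproc;			/* stand-in for _npxproc */

/* Stand-in for npxsave(): dumps the live FPU registers into 'where'. */
static void npxsave_stub(struct toy_fpu_state *where) { (void)where; }

/*
 * The NNPX part of savectx(): save the hardware state into the owning
 * process's pcb first, then copy it into the pcb being snapshot, just
 * as the call _npxsave + call _bcopy sequence above does.
 */
static void
savectx_fpu_sketch(struct toy_pcb *pcb)
{
	if (npxproc != NULL) {
		npxsave_stub(&npxproc->p_addr->pcb_savefpu);
		memcpy(&pcb->pcb_savefpu, &npxproc->p_addr->pcb_savefpu,
		    sizeof(pcb->pcb_savefpu));
	}
}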
*/ __asm __volatile("testl %0,%0; je 1f; bsfl %0,%0; incl %0; 1:" : "=r" (result) : "0" (mask)); return (result); } #define HAVE_INLINE_FLS static __inline int fls(int mask) { int result; __asm __volatile("testl %0,%0; je 1f; bsrl %0,%0; incl %0; 1:" : "=r" (result) : "0" (mask)); return (result); } #if __GNUC__ < 2 #define inb(port) inbv(port) #define outb(port, data) outbv(port, data) #else /* __GNUC >= 2 */ /* * The following complications are to get around gcc not having a * constraint letter for the range 0..255. We still put "d" in the * constraint because "i" isn't a valid constraint when the port * isn't constant. This only matters for -O0 because otherwise * the non-working version gets optimized away. * * Use an expression-statement instead of a conditional expression * because gcc-2.6.0 would promote the operands of the conditional * and produce poor code for "if ((inb(var) & const1) == const2)". * * The unnecessary test `(port) < 0x10000' is to generate a warning if * the `port' has type u_short or smaller. Such types are pessimal. * This actually only works for signed types. The range check is * careful to avoid generating warnings. */ #define inb(port) __extension__ ({ \ u_char _data; \ if (__builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ && (port) < 0x10000) \ _data = inbc(port); \ else \ _data = inbv(port); \ _data; }) #define outb(port, data) ( \ __builtin_constant_p(port) && ((port) & 0xffff) < 0x100 \ && (port) < 0x10000 \ ? outbc(port, data) : outbv(port, data)) static __inline u_char inbc(u_int port) { u_char data; __asm __volatile("inb %1,%0" : "=a" (data) : "id" ((u_short)(port))); return (data); } static __inline void outbc(u_int port, u_char data) { __asm __volatile("outb %0,%1" : : "a" (data), "id" ((u_short)(port))); } #endif /* __GNUC <= 2 */ static __inline u_char inbv(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline u_long inl(u_int port) { u_long data; __asm __volatile("inl %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline void insb(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insb" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void insw(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insw" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void insl(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; insl" : : "d" (port), "D" (addr), "c" (cnt) : "di", "cx", "memory"); } static __inline void invd(void) { __asm __volatile("invd"); } #ifdef KERNEL #ifdef SMP /* * When using APIC IPI's, the inlining cost is prohibitive since the call * executes into the IPI transmission system. */ void invlpg __P((u_int addr)); void invltlb __P((void)); #else /* !SMP */ static __inline void invlpg(u_int addr) { __asm __volatile("invlpg (%0)" : : "r" (addr) : "memory"); } static __inline void invltlb(void) { u_long temp; /* * This should be implemented as load_cr3(rcr3()) when load_cr3() * is inlined. 
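invltlb() flushes the whole TLB by writing %cr3 back to itself, its comment notes it should simply be load_cr3(rcr3()) once load_cr3() is inlined, and this revision also makes it bump tlb_flush_count when SWTCH_OPTIM_STATS is configured. A hedged sketch of that intended shape, again with stub register accessors and a local counter standing in for the one declared in cpufunc.h:

#include <stdint.h>

#if defined(SWTCH_OPTIM_STATS)
static int tlb_flush_count;		/* stand-in for the counter in cpufunc.h */
#endif

/* Stand-ins for the cr3 accessors; the real invltlb() is one asm statement. */
static uint32_t fake_cr3_reg;
static uint32_t rcr3_stub(void)           { return (fake_cr3_reg); }
static void     load_cr3_stub(uint32_t v) { fake_cr3_reg = v; }

/* Flush the TLB by rewriting %cr3 with its current value, and count it. */
static void
invltlb_sketch(void)
{
	load_cr3_stub(rcr3_stub());
#if defined(SWTCH_OPTIM_STATS)
	++tlb_flush_count;
#endif
}

Counting flushes here, in load_cr3, and in cpu_switch is what makes the swtch_optim_stats numbers meaningful: every avoided %cr3 reload is a flush that never reaches this path.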
*/ __asm __volatile("movl %%cr3, %0; movl %0, %%cr3" : "=r" (temp) : : "memory"); +#if defined(SWTCH_OPTIM_STATS) + ++tlb_flush_count; +#endif } #endif /* SMP */ #endif /* KERNEL */ static __inline u_short inw(u_int port) { u_short data; __asm __volatile("inw %%dx,%0" : "=a" (data) : "d" (port)); return (data); } static __inline u_int loadandclear(u_int *addr) { u_int result; __asm __volatile("xorl %0,%0; xchgl %1,%0" : "=&r" (result) : "m" (*addr)); return (result); } static __inline void outbv(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } static __inline void outl(u_int port, u_long data) { /* * outl() and outw() aren't used much so we haven't looked at * possible micro-optimizations such as the unnecessary * assignment for them. */ __asm __volatile("outl %0,%%dx" : : "a" (data), "d" (port)); } static __inline void outsb(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsb" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outsw(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsw" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outsl(u_int port, void *addr, size_t cnt) { __asm __volatile("cld; rep; outsl" : : "d" (port), "S" (addr), "c" (cnt) : "si", "cx"); } static __inline void outw(u_int port, u_short data) { __asm __volatile("outw %0,%%dx" : : "a" (data), "d" (port)); } static __inline u_long rcr2(void) { u_long data; __asm __volatile("movl %%cr2,%0" : "=r" (data)); return (data); } static __inline u_long read_eflags(void) { u_long ef; __asm __volatile("pushfl; popl %0" : "=r" (ef)); return (ef); } static __inline quad_t rdmsr(u_int msr) { quad_t rv; __asm __volatile(".byte 0x0f, 0x32" : "=A" (rv) : "c" (msr)); return (rv); } static __inline quad_t rdpmc(u_int pmc) { quad_t rv; __asm __volatile(".byte 0x0f, 0x33" : "=A" (rv) : "c" (pmc)); return (rv); } static __inline quad_t rdtsc(void) { quad_t rv; __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); return (rv); } static __inline void setbits(volatile unsigned *addr, u_int bits) { __asm __volatile( #ifdef SMP "lock; " #endif "orl %1,%0" : "=m" (*addr) : "ir" (bits)); } static __inline void wbinvd(void) { __asm __volatile("wbinvd"); } static __inline void write_eflags(u_long ef) { __asm __volatile("pushl %0; popfl" : : "r" (ef)); } static __inline void wrmsr(u_int msr, quad_t newval) { __asm __volatile(".byte 0x0f, 0x30" : : "A" (newval), "c" (msr)); } #else /* !__GNUC__ */ int breakpoint __P((void)); void disable_intr __P((void)); void enable_intr __P((void)); u_char inb __P((u_int port)); u_long inl __P((u_int port)); void insb __P((u_int port, void *addr, size_t cnt)); void insl __P((u_int port, void *addr, size_t cnt)); void insw __P((u_int port, void *addr, size_t cnt)); void invd __P((void)); void invlpg __P((u_int addr)); void invltlb __P((void)); u_short inw __P((u_int port)); u_int loadandclear __P((u_int *addr)); void outb __P((u_int port, u_char data)); void outl __P((u_int port, u_long data)); void outsb __P((u_int port, void *addr, size_t cnt)); void outsl __P((u_int port, void *addr, size_t cnt)); void outsw __P((u_int port, void *addr, size_t cnt)); void outw __P((u_int port, u_short data)); u_long rcr2 __P((void)); quad_t 
rdmsr __P((u_int msr)); quad_t rdpmc __P((u_int pmc)); quad_t rdtsc __P((void)); u_long read_eflags __P((void)); void setbits __P((volatile unsigned *addr, u_int bits)); void wbinvd __P((void)); void write_eflags __P((u_long ef)); void wrmsr __P((u_int msr, quad_t newval)); #endif /* __GNUC__ */ void load_cr0 __P((u_long cr0)); void load_cr3 __P((u_long cr3)); void load_cr4 __P((u_long cr4)); void ltr __P((u_short sel)); u_int rcr0 __P((void)); u_long rcr3 __P((void)); u_long rcr4 __P((void)); #endif /* !_MACHINE_CPUFUNC_H_ */ Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 31708) +++ head/sys/kern/init_main.c (revision 31709) @@ -1,647 +1,647 @@ /* * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 - * $Id: init_main.c,v 1.77 1997/12/06 04:11:09 sef Exp $ + * $Id: init_main.c,v 1.78 1997/12/12 04:00:57 dyson Exp $ */ #include "opt_devfs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct linker_set sysinit_set; /* XXX */ extern void __main __P((void)); extern void main __P((void *framep)); /* Components of the first process -- never freed. 
*/ static struct session session0; static struct pgrp pgrp0; struct proc proc0; static struct pcred cred0; static struct filedesc0 filedesc0; static struct plimit limit0; static struct vmspace vmspace0; #ifndef SMP /* per-cpu on smp */ struct proc *curproc = &proc0; #endif struct proc *initproc; int cmask = CMASK; extern struct user *proc0paddr; struct vnode *rootvp; int boothowto = 0; /* initialized so that it can be patched */ struct timeval boottime; SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RD, &boottime, timeval, ""); static int shutdowntimeout = 120; SYSCTL_INT(_kern, OID_AUTO, shutdown_timeout, CTLFLAG_RW, &shutdowntimeout, 0, ""); #ifndef SMP /* per-cpu on smp */ struct timeval runtime; #endif /* * Promiscuous argument pass for start_init() * * This is a kludge because we use a return from main() rather than a call * to a new routine in locore.s to kick the kernel alive from locore.s. */ static void *init_framep; #if __GNUC__ >= 2 void __main() {} #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY,SI_ORDER_ANY, NULL, NULL) /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads", like an LFS * cleaner. */ void main(framep) void *framep; { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ /* * Copy the locore.s frame pointer for proc0, this is forked into * all other processes. */ init_framep = framep; #ifdef SMP /* * XXX curproc is per-cpu in SMP, and exists in freshly mapped pages, * so its value must be initialized now before it is used by various * initialization routines. proc0_init() isn't soon enough: * among other things, configure() is called first, and may call * setdumpdev(), which panics without a valid curproc. */ curproc = &proc0; #endif /* SMP */ /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the * operation which ensures continued function. */ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { for( xipp = sipp + 1; *xipp; xipp++) { if( (*sipp)->subsystem < (*xipp)->subsystem || ( (*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order < (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. * * The last item on the list is expected to be the scheduler, * which will not return. 
*/ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { if( (*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ switch( (*sipp)->type) { case SI_TYPE_DEFAULT: /* no special processing*/ (*((*sipp)->func))( (*sipp)->udata); break; case SI_TYPE_KTHREAD: #if !defined(SMP) /* kernel thread*/ if (fork1(&proc0, RFMEM|RFFDG|RFPROC)) panic("fork kernel thread"); cpu_set_fork_handler(pfind(proc0.p_retval[0]), (*sipp)->func, (*sipp)->udata); break; #endif case SI_TYPE_KPROCESS: if (fork1(&proc0, RFFDG|RFPROC)) panic("fork kernel process"); cpu_set_fork_handler(pfind(proc0.p_retval[0]), (*sipp)->func, (*sipp)->udata); break; default: panic( "init_main: unrecognized init type"); } } panic("Shouldn't get here!"); /* NOTREACHED*/ } /* * Start a kernel process. This is called after a fork() call in * main() in the file kern/init_main.c. * * This function is used to start "internal" daemons. */ /* ARGSUSED*/ void kproc_start(udata) void *udata; { struct kproc_desc *kp = udata; struct proc *p = curproc; #ifdef DIAGNOSTIC printf("Start pid=%d <%s>\n",p->p_pid, kp->arg0); #endif /* save a global descriptor, if desired*/ if( kp->global_procpp != NULL) *kp->global_procpp = p; /* this is a non-swapped system process*/ p->p_flag |= P_INMEM | P_SYSTEM; /* set up arg0 for 'ps', et al*/ strcpy( p->p_comm, kp->arg0); /* call the processes' main()...*/ (*kp->func)(); /* NOTREACHED */ panic("kproc_start: %s", kp->arg0); } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. **** *************************************************************************** */ #ifdef OMIT /* * Handled by vfs_mountroot (bad idea) at this time... should be * done the same as 4.4Lite2. */ SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL) #endif /* OMIT*/ static void print_caddr_t __P((void *data)); static void print_caddr_t(data) void *data; { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) /* *************************************************************************** **** **** The two following SYSINT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init __P((void *dummy)); static void proc0_init(dummy) void *dummy; { register struct proc *p; register struct filedesc0 *fdp; register unsigned i; /* * Initialize the current process pointer (curproc) before * any possible traps/probes to simplify trap processing. */ p = &proc0; curproc = p; /* XXX redundant*/ /* * Initialize process and pgrp structures. */ procinit(); /* * Initialize sleep queue hash table */ sleepinit(); /* * additional VM structures */ vm_init2(); /* * Create process 0 (the swapper). 
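main() above orders the linker set of sysinit records with a bubble sort keyed on the subsystem (primary) and order (secondary) fields, then walks the sorted list and dispatches each record by type. A standalone sketch of just the ordering rule on a stripped-down record; the structure and the sample key values are illustrative:

#include <stdio.h>

/* Miniature of struct sysinit: only the two sort keys. */
struct mini_sysinit {
	unsigned subsystem;	/* primary key */
	unsigned order;		/* secondary key */
};

/*
 * Nonzero when a may stay ahead of b, i.e. the condition under which
 * the bubble sort in main() leaves the pair alone.
 */
static int
sysinit_before(const struct mini_sysinit *a, const struct mini_sysinit *b)
{
	return (a->subsystem < b->subsystem ||
	    (a->subsystem == b->subsystem && a->order < b->order));
}

int
main(void)
{
	struct mini_sysinit announce = { 1, 0 }, p0init = { 2, 1 };	/* made-up keys */

	printf("%d\n", sysinit_before(&announce, &p0init));	/* prints 1 */
	return (0);
}

The sort runs once at boot over a short list, so its quadratic cost is irrelevant; the point is simply that initialization functions execute in subsystem/order sequence regardless of link order, with SI_SUB_DUMMY placeholders skipped by the dispatch loop.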
*/ LIST_INSERT_HEAD(&allproc, p, p_list); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; session0.s_count = 1; session0.s_leader = p; p->p_sysent = &aout_sysvec; p->p_flag = P_INMEM | P_SYSTEM; p->p_stat = SRUN; p->p_nice = NZERO; p->p_rtprio.type = RTP_PRIO_NORMAL; p->p_rtprio.prio = 0; /* * Link for kernel based threads */ p->p_peers = 0; p->p_leader = p; bcopy("swapper", p->p_comm, sizeof ("swapper")); /* Create credentials. */ cred0.p_refcnt = 1; p->p_cred = &cred0; p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ /* Create the file descriptor table. */ fdp = &filedesc0; p->p_fd = &fdp->fd_fd; fdp->fd_fd.fd_refcnt = 1; fdp->fd_fd.fd_cmask = cmask; fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; fdp->fd_fd.fd_nfiles = NDFILE; /* Create the limits structures. */ p->p_limit = &limit0; for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; limit0.p_refcnt = 1; /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; - pmap_pinit(&vmspace0.vm_pmap); + pmap_pinit0(&vmspace0.vm_pmap); vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), trunc_page(VM_MAXUSER_ADDRESS), TRUE); vmspace0.vm_map.pmap = &vmspace0.vm_pmap; p->p_addr = proc0paddr; /* XXX */ #define INCOMPAT_LITES2 #ifdef INCOMPAT_LITES2 /* * proc0 needs to have a coherent frame base in it's stack. */ cpu_set_init_frame(p, init_framep); /* XXX! */ #endif /* INCOMPAT_LITES2*/ /* * We continue to place resource usage info and signal * actions in the user struct so they're pageable. */ p->p_stats = &p->p_addr->u_stats; p->p_sigacts = &p->p_addr->u_sigacts; /* * Charge root for one process. */ (void)chgproccnt(0, 1); /* * Initialize the procfs flags (to 0, of course) */ p->p_stops = p->p_stype = p->p_step = 0; } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) /* ARGSUSED*/ static void proc0_post __P((void *dummy)); static void proc0_post(dummy) void *dummy; { struct timeval tv; /* * Now can look at time, having had a chance to verify the time * from the file system. Reset p->p_rtime as it may have been * munched in mi_switch() after the time got set. */ gettime(&boottime); proc0.p_stats->p_start = runtime = mono_time = boottime; proc0.p_rtime.tv_sec = proc0.p_rtime.tv_usec = 0; /* * Give the ``random'' number generator a thump. */ microtime(&tv); srandom(tv.tv_sec ^ tv.tv_usec); /* Initialize signal state for process 0. */ siginit(&proc0); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. 
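proc0_init above seeds proc0's resource limits directly from the VM counters: everything defaults to RLIM_INFINITY, the file and process limits come from maxfiles/maxproc, and the RSS and memory-lock maxima are derived from the number of free pages, with the soft memory-lock limit set to a third of the hard one. A tiny runnable illustration of that last computation; the page size and free-page count here are made up:

#include <stdio.h>

#define FAKE_PAGE_SIZE 4096ul		/* illustrative stand-in for ptoa()'s page size */

int
main(void)
{
	unsigned long v_free_count = 8192;			/* hypothetical free page count */
	unsigned long bytes = v_free_count * FAKE_PAGE_SIZE;	/* i = ptoa(cnt.v_free_count) */

	printf("RLIMIT_RSS max     = %lu\n", bytes);
	printf("RLIMIT_MEMLOCK max = %lu\n", bytes);
	printf("RLIMIT_MEMLOCK cur = %lu\n", bytes / 3);
	return (0);
}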
**** *************************************************************************** */ /* ARGSUSED */ static void root_conf __P((void *dummy)); static void root_conf(dummy) void *dummy; { cpu_rootconf(); } SYSINIT(root_conf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, root_conf, NULL) /* ARGSUSED*/ static void xxx_vfs_root_fdtab __P((void *dummy)); static void xxx_vfs_root_fdtab(dummy) void *dummy; { register struct filedesc0 *fdp = &filedesc0; /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ if (VFS_ROOT(mountlist.cqh_first, &rootvnode)) panic("cannot find root vnode"); fdp->fd_fd.fd_cdir = rootvnode; VREF(fdp->fd_fd.fd_cdir); VOP_UNLOCK(rootvnode, 0, &proc0); fdp->fd_fd.fd_rdir = NULL; } SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL) /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. It is here for two reasons only: **** **** 1) This code returns to startup the system; this is **** abnormal for a kernel thread. **** 2) This code promiscuously uses init_frame **** *************************************************************************** */ static void kthread_init __P((void *dummy)); SYSINIT_KP(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL) extern void prepare_usermode __P((void)); static void start_init __P((struct proc *p)); /* ARGSUSED*/ static void kthread_init(dummy) void *dummy; { /* Create process 1 (init(8)). */ start_init(curproc); prepare_usermode(); /* * This returns to the fork trampoline, then to user mode. */ return; } /* * List of paths to try when searching for "init". */ static char *initpaths[] = { "/sbin/init", "/sbin/oinit", "/sbin/init.bak", "/stand/sysinstall", NULL, }; /* * Start the initial user process; try exec'ing each pathname in "initpaths". * The program is invoked with one argument containing the boot flags. */ static void start_init(p) struct proc *p; { vm_offset_t addr; struct execve_args args; int options, i, error; char **pathp, *path, *ucp, **uap, *arg0, *arg1; initproc = p; /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = trunc_page(VM_MAXUSER_ADDRESS - PAGE_SIZE); if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { /* * Move out the boot flag argument. */ options = 0; ucp = (char *)USRSTACK; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif #if defined(DEVFS) && defined(DEVFS_ROOT) (void)subyte(--ucp, 'd'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ for (i = strlen(path) + 1; i >= 0; i--) (void)subyte(--ucp, path[i]); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((int)ucp & ~(NBPW-1)); (void)suword((caddr_t)--uap, 0); /* terminator */ (void)suword((caddr_t)--uap, (int)arg1); (void)suword((caddr_t)--uap, (int)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. 
/*
 * Start the initial user process; try exec'ing each pathname in "initpaths".
 * The program is invoked with one argument containing the boot flags.
 */
static void
start_init(p)
        struct proc *p;
{
        vm_offset_t addr;
        struct execve_args args;
        int options, i, error;
        char **pathp, *path, *ucp, **uap, *arg0, *arg1;

        initproc = p;

        /*
         * Need just enough stack to hold the faked-up "execve()" arguments.
         */
        addr = trunc_page(VM_MAXUSER_ADDRESS - PAGE_SIZE);
        if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
            FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
                panic("init: couldn't allocate argument space");
        p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
        p->p_vmspace->vm_ssize = 1;

        for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) {
                /*
                 * Move out the boot flag argument.
                 */
                options = 0;
                ucp = (char *)USRSTACK;
                (void)subyte(--ucp, 0);         /* trailing zero */
                if (boothowto & RB_SINGLE) {
                        (void)subyte(--ucp, 's');
                        options = 1;
                }
#ifdef notyet
                if (boothowto & RB_FASTBOOT) {
                        (void)subyte(--ucp, 'f');
                        options = 1;
                }
#endif
#ifdef BOOTCDROM
                (void)subyte(--ucp, 'C');
                options = 1;
#endif
#if defined(DEVFS) && defined(DEVFS_ROOT)
                (void)subyte(--ucp, 'd');
                options = 1;
#endif
                if (options == 0)
                        (void)subyte(--ucp, '-');
                (void)subyte(--ucp, '-');       /* leading hyphen */
                arg1 = ucp;

                /*
                 * Move out the file name (also arg 0).
                 */
                for (i = strlen(path) + 1; i >= 0; i--)
                        (void)subyte(--ucp, path[i]);
                arg0 = ucp;

                /*
                 * Move out the arg pointers.
                 */
                uap = (char **)((int)ucp & ~(NBPW-1));
                (void)suword((caddr_t)--uap, 0);        /* terminator */
                (void)suword((caddr_t)--uap, (int)arg1);
                (void)suword((caddr_t)--uap, (int)arg0);

                /*
                 * Point at the arguments.
                 */
                args.fname = arg0;
                args.argv = uap;
                args.envv = NULL;

                /*
                 * Now try to exec the program.  If it can't be exec'd for
                 * any reason other than that it doesn't exist, complain.
                 *
                 * Otherwise return to main() which returns to btext
                 * which completes the system startup.
                 */
                if ((error = execve(p, &args)) == 0)
                        return;
                if (error != ENOENT)
                        printf("exec %s: error %d\n", path, error);
        }
        printf("init: not found\n");
        panic("no init");
}
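start_init() has no real argv to pass along, so it fabricates one directly in the new process's address space: subyte() and suword() copy out a byte or a word at a time, the flag string and the pathname are pushed downward from USRSTACK, and the argv vector is laid down word-aligned just below the strings. The userland sketch below rebuilds that layout in an ordinary buffer so it can be compiled and inspected; push_byte() and WORDSZ are stand-ins for subyte() and NBPW, and the direct pointer stores take the place of suword() -- none of this is the kernel interface itself.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define STACKSZ 256                     /* stand-in for the page at USRSTACK */
#define WORDSZ  sizeof(char *)          /* stand-in for NBPW */

static unsigned char stack[STACKSZ];

/* Copy one byte just below *top and move *top down (cf. subyte(--ucp, c)). */
static void
push_byte(unsigned char **top, unsigned char c)
{
        *--(*top) = c;
}

int
main(void)
{
        const char *path = "/sbin/init";
        unsigned char *ucp = stack + STACKSZ;   /* like USRSTACK */
        char *arg0, *arg1, **uap;
        int i;

        /* Boot-flag argument "-s", pushed last character first. */
        push_byte(&ucp, 0);
        push_byte(&ucp, 's');
        push_byte(&ucp, '-');
        arg1 = (char *)ucp;

        /* File name (also arg 0), including its trailing NUL. */
        for (i = strlen(path); i >= 0; i--)
                push_byte(&ucp, (unsigned char)path[i]);
        arg0 = (char *)ucp;

        /* Word-aligned argv vector just below the strings. */
        uap = (char **)((uintptr_t)ucp & ~(WORDSZ - 1));
        *--uap = NULL;          /* terminator */
        *--uap = arg1;
        *--uap = arg0;

        printf("argv[0] = %s\n", uap[0]);
        printf("argv[1] = %s\n", uap[1]);
        return (0);
}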
Index: head/sys/vm/pmap.h
===================================================================
--- head/sys/vm/pmap.h (revision 31708)
+++ head/sys/vm/pmap.h (revision 31709)
@@ -1,138 +1,139 @@
/*
 * Copyright (c) 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)pmap.h 8.1 (Berkeley) 6/11/93
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Avadis Tevanian, Jr.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
- * $Id: pmap.h,v 1.23 1997/08/05 22:07:21 dyson Exp $
+ * $Id: pmap.h,v 1.24 1997/08/05 23:03:24 dyson Exp $
 */

/*
 *      Machine address mapping definitions -- machine-independent
 *      section.  [For machine-dependent section, see "machine/pmap.h".]
 */

#ifndef _PMAP_VM_
#define _PMAP_VM_

/*
 * Each machine dependent implementation is expected to
 * keep certain statistics.  They may do this any way they
 * so choose, but are expected to return the statistics
 * in the following structure.
 */
struct pmap_statistics {
        long resident_count;    /* # of pages mapped (total) */
        long wired_count;       /* # of pages wired */
};
typedef struct pmap_statistics *pmap_statistics_t;

#include <machine/pmap.h>

#ifdef KERNEL
void pmap_change_wiring __P((pmap_t, vm_offset_t, boolean_t));
void pmap_clear_modify __P((vm_offset_t pa));
void pmap_clear_reference __P((vm_offset_t pa));
void pmap_copy __P((pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t));
void pmap_copy_page __P((vm_offset_t, vm_offset_t));
void pmap_destroy __P((pmap_t));
void pmap_enter __P((pmap_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t));
vm_offset_t pmap_extract __P((pmap_t, vm_offset_t));
void pmap_growkernel __P((vm_offset_t));
void pmap_init __P((vm_offset_t, vm_offset_t));
boolean_t pmap_is_modified __P((vm_offset_t pa));
boolean_t pmap_ts_referenced __P((vm_offset_t pa));
void pmap_kenter __P((vm_offset_t, vm_offset_t));
void pmap_kremove __P((vm_offset_t));
vm_offset_t pmap_map __P((vm_offset_t, vm_offset_t, vm_offset_t, int));
void pmap_object_init_pt __P((pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_offset_t size, int pagelimit));
boolean_t pmap_page_exists __P((pmap_t, vm_offset_t));
void pmap_page_protect __P((vm_offset_t, vm_prot_t));
void pmap_pageable __P((pmap_t, vm_offset_t, vm_offset_t, boolean_t));
vm_offset_t pmap_phys_address __P((int));
void pmap_pinit __P((pmap_t));
+void pmap_pinit0 __P((pmap_t));
void pmap_protect __P((pmap_t, vm_offset_t, vm_offset_t, vm_prot_t));
void pmap_qenter __P((vm_offset_t, vm_page_t *, int));
void pmap_qremove __P((vm_offset_t, int));
void pmap_reference __P((pmap_t));
void pmap_release __P((pmap_t));
void pmap_remove __P((pmap_t, vm_offset_t, vm_offset_t));
void pmap_remove_pages __P((pmap_t, vm_offset_t, vm_offset_t));
void pmap_zero_page __P((vm_offset_t));
void pmap_prefault __P((pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, vm_object_t object));
int pmap_mincore __P((pmap_t pmap, vm_offset_t addr));
void pmap_new_proc __P((struct proc *p));
void pmap_dispose_proc __P((struct proc *p));
void pmap_swapout_proc __P((struct proc *p));
void pmap_swapin_proc __P((struct proc *p));
void pmap_activate __P((struct proc *p));
vm_offset_t pmap_addr_hint __P((vm_object_t obj, vm_offset_t addr, vm_size_t size));
void pmap_init2 __P((void));
#endif /* KERNEL */

#endif /* _PMAP_VM_ */
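The functional change in this header is the new pmap_pinit0() prototype, matching the switch in proc0_init() above from pmap_pinit() to pmap_pinit0() for vmspace0. proc0's prototype pmap is set up at SI_SUB_INTRINSIC time, before the VM system is fully up, so the machine-dependent layer presumably needs a variant that can attach the page directory built during bootstrap instead of allocating a fresh one. The skeleton below is only a schematic of that split; md_pmap, bootstrap_pdir and pmap_alloc_pdir() are invented names, not the real machine-dependent implementation.

#include <stdio.h>

/* Schematic only: invented types and helpers, not the real MD pmap code. */
typedef struct md_pmap {
        unsigned long   *pm_pdir;       /* page directory for this pmap */
        int              pm_count;      /* reference count */
} md_pmap_t;

/* Page directory set up by hand during early bootstrap (hypothetical). */
static unsigned long bootstrap_pdir[1024];

/* Stand-in for grabbing a fresh, zeroed directory from the VM system. */
static unsigned long *
pmap_alloc_pdir(void)
{
        static unsigned long fresh_pdir[1024];

        return (fresh_pdir);
}

/*
 * pmap_pinit0(): initialize the pmap for proc0/vmspace0.  Nothing can be
 * allocated this early, so reuse the directory the bootstrap code built.
 */
static void
pmap_pinit0(md_pmap_t *pmap)
{
        pmap->pm_pdir = bootstrap_pdir;
        pmap->pm_count = 1;
}

/*
 * pmap_pinit(): initialize a pmap for an ordinary process, once the VM
 * system is far enough along to hand out a new page directory.
 */
static void
pmap_pinit(md_pmap_t *pmap)
{
        pmap->pm_pdir = pmap_alloc_pdir();
        pmap->pm_count = 1;
}

int
main(void)
{
        md_pmap_t pmap0, pmapN;

        pmap_pinit0(&pmap0);    /* proc0: no allocation */
        pmap_pinit(&pmapN);     /* everyone else: fresh directory */
        printf("proc0 reuses bootstrap pdir: %d\n", pmap0.pm_pdir == bootstrap_pdir);
        printf("other pmaps get their own:   %d\n", pmapN.pm_pdir != bootstrap_pdir);
        return (0);
}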