Index: head/sys/i386/i386/apic_vector.s
===================================================================
--- head/sys/i386/i386/apic_vector.s	(revision 153140)
+++ head/sys/i386/i386/apic_vector.s	(revision 153141)
@@ -1,354 +1,313 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: vector.s, 386BSD 0.1 unknown origin
  * $FreeBSD$
  */
 
 /*
  * Interrupt entry points for external interrupts triggered by I/O APICs
  * as well as IPI handlers.
  */
 
 #include "opt_smp.h"
 
 #include <machine/asmacros.h>
 #include <machine/apicreg.h>
 
 #include "assym.s"
 
 /*
  * I/O Interrupt Entry Point.  Rather than having one entry point for
  * each interrupt source, we use one entry point for each 32-bit word
  * in the ISR.  The handler determines the highest bit set in the ISR,
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
 #define	ISR_VEC(index, vec_name)					\
 	.text ;								\
 	SUPERALIGN_TEXT ;						\
 IDTVEC(vec_name) ;							\
 	PUSH_FRAME ;							\
 	SET_KERNEL_SREGS ;						\
 	FAKE_MCOUNT(TF_EIP(%esp)) ;					\
 	movl	lapic, %edx ;	/* pointer to local APIC */		\
 	movl	LA_ISR + 16 * (index)(%edx), %eax ;	/* load ISR */	\
 	bsrl	%eax, %eax ;	/* index of highset set bit in ISR */	\
 	jz	2f ;							\
 	addl	$(32 * index),%eax ;					\
 1: ;									\
 	pushl	%eax ;		/* pass the IRQ */			\
 	call	lapic_handle_intr ;					\
 	addl	$4, %esp ;	/* discard parameter */			\
 	MEXITCOUNT ;							\
 	jmp	doreti ;						\
 2:	movl	$-1, %eax ;	/* send a vector of -1 */		\
 	jmp	1b
 
 /*
  * Handle "spurious INTerrupts".
  * Notes:
  *  This is different than the "spurious INTerrupt" generated by an
  *   8259 PIC for missing INTs.  See the APIC documentation for details.
  *  This routine should NOT do an 'EOI' cycle.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
 
 	/* No EOI cycle used here */
 
 	iret
 
 	ISR_VEC(1, apic_isr1)
 	ISR_VEC(2, apic_isr2)
 	ISR_VEC(3, apic_isr3)
 	ISR_VEC(4, apic_isr4)
 	ISR_VEC(5, apic_isr5)
 	ISR_VEC(6, apic_isr6)
 	ISR_VEC(7, apic_isr7)
 
 /*
  * Local APIC periodic timer handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(timerint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
-
-	movl	lapic, %edx
-	movl	$0, LA_EOI(%edx)	/* End Of Interrupt to APIC */
-	
 	FAKE_MCOUNT(TF_EIP(%esp))
 
 	pushl	$0		/* XXX convert trapframe to clockframe */
 	call	lapic_handle_timer
 	addl	$4, %esp	/* XXX convert clockframe to trapframe */
 	MEXITCOUNT
 	jmp	doreti
 
 #ifdef SMP
 /*
  * Global address space TLB shootdown.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invltlb)
 	pushl	%eax
 	pushl	%ds
 	movl	$KDSEL, %eax		/* Kernel data selector */
 	movl	%eax, %ds
 
 #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
 	pushl	%fs
 	movl	$KPSEL, %eax		/* Private space selector */
 	movl	%eax, %fs
 	movl	PCPU(CPUID), %eax
 	popl	%fs
 #ifdef COUNT_XINVLTLB_HITS
 	incl	xhits_gbl(,%eax,4)
 #endif
 #ifdef COUNT_IPIS
 	movl	ipi_invltlb_counts(,%eax,4),%eax
 	incl	(%eax)
 #endif
 #endif
 
 	movl	%cr3, %eax		/* invalidate the TLB */
 	movl	%eax, %cr3
 
 	movl	lapic, %eax
 	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
 
 	lock
 	incl	smp_tlb_wait
 
 	popl	%ds
 	popl	%eax
 	iret
 
 /*
  * Single page TLB shootdown
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlpg)
 	pushl	%eax
 	pushl	%ds
 	movl	$KDSEL, %eax		/* Kernel data selector */
 	movl	%eax, %ds
 
 #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
 	pushl	%fs
 	movl	$KPSEL, %eax		/* Private space selector */
 	movl	%eax, %fs
 	movl	PCPU(CPUID), %eax
 	popl	%fs
 #ifdef COUNT_XINVLTLB_HITS
 	incl	xhits_pg(,%eax,4)
 #endif
 #ifdef COUNT_IPIS
 	movl	ipi_invlpg_counts(,%eax,4),%eax
 	incl	(%eax)
 #endif
 #endif
 
 	movl	smp_tlb_addr1, %eax
 	invlpg	(%eax)			/* invalidate single page */
 
 	movl	lapic, %eax
 	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
 
 	lock
 	incl	smp_tlb_wait
 
 	popl	%ds
 	popl	%eax
 	iret
 
 /*
  * Page range TLB shootdown.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlrng)
 	pushl	%eax
 	pushl	%edx
 	pushl	%ds
 	movl	$KDSEL, %eax		/* Kernel data selector */
 	movl	%eax, %ds
 
 #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
 	pushl	%fs
 	movl	$KPSEL, %eax		/* Private space selector */
 	movl	%eax, %fs
 	movl	PCPU(CPUID), %eax
 	popl	%fs
 #ifdef COUNT_XINVLTLB_HITS
 	incl	xhits_rng(,%eax,4)
 #endif
 #ifdef COUNT_IPIS
 	movl	ipi_invlrng_counts(,%eax,4),%eax
 	incl	(%eax)
 #endif
 #endif
 
 	movl	smp_tlb_addr1, %edx
 	movl	smp_tlb_addr2, %eax
 1:	invlpg	(%edx)			/* invalidate single page */
 	addl	$PAGE_SIZE, %edx
 	cmpl	%eax, %edx
 	jb	1b
 
 	movl	lapic, %eax
 	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
 
 	lock
 	incl	smp_tlb_wait
 
 	popl	%ds
 	popl	%edx
 	popl	%eax
 	iret
 
 /*
  * Forward hardclock to another CPU.  Pushes a clockframe and calls
  * forwarded_hardclock().
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(ipi_intr_bitmap_handler)	
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 
 	movl	lapic, %edx
 	movl	$0, LA_EOI(%edx)	/* End Of Interrupt to APIC */
 	
 	FAKE_MCOUNT(TF_EIP(%esp))
 
 	pushl	$0		/* XXX convert trapframe to clockframe */
 	call	ipi_bitmap_handler
 	addl	$4, %esp	/* XXX convert clockframe to trapframe */
 	MEXITCOUNT
 	jmp	doreti
 
 /*
- * Executed by a CPU when it receives an Xcpustop IPI from another CPU,
- *
- *  - Signals its receipt.
- *  - Waits for permission to restart.
- *  - Signals its restart.
+ * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cpustop)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 
 	movl	lapic, %eax
 	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
 
-	movl	PCPU(CPUID), %eax
-	imull	$PCB_SIZE, %eax
-	leal	CNAME(stoppcbs)(%eax), %eax
-	pushl	%eax
-	call	CNAME(savectx)		/* Save process context */
-	addl	$4, %esp
-		
-	movl	PCPU(CPUID), %eax
+	call	cpustop_handler
 
-	lock
-	btsl	%eax, CNAME(stopped_cpus) /* stopped_cpus |= (1<<id) */
-1:
-	btl	%eax, CNAME(started_cpus) /* while (!(started_cpus & (1<<id))) */
-	jnc	1b
-
-	lock
-	btrl	%eax, CNAME(started_cpus) /* started_cpus &= ~(1<<id) */
-	lock
-	btrl	%eax, CNAME(stopped_cpus) /* stopped_cpus &= ~(1<<id) */
-
-	test	%eax, %eax
-	jnz	2f
-
-	movl	CNAME(cpustop_restartfunc), %eax
-	test	%eax, %eax
-	jz	2f
-	movl	$0, CNAME(cpustop_restartfunc)	/* One-shot */
-
-	call	*%eax
-2:
 	POP_FRAME
 	iret
 
 /*
  * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
  *
  * - Calls the generic rendezvous action function.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(rendezvous)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movl	ipi_rendezvous_counts(,%eax,4), %eax
 	incl	(%eax)
 #endif
 	call	smp_rendezvous_action
 
 	movl	lapic, %eax
 	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
 	POP_FRAME
 	iret
 	
 /*
  * Clean up when we lose out on the lazy context switch optimization.
  * ie: when we are about to release a PTD but a cpu is still borrowing it.
  */
 	SUPERALIGN_TEXT
 IDTVEC(lazypmap)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 
-#ifdef COUNT_IPIS
-	movl	PCPU(CPUID), %eax
-	movl	ipi_lazypmap_counts(,%eax,4), %eax
-	incl	(%eax)
-#endif
 	call	pmap_lazyfix_action
 
 	movl	lapic, %eax
 	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
 	POP_FRAME
 	iret
 #endif /* SMP */
Index: head/sys/i386/i386/local_apic.c
===================================================================
--- head/sys/i386/i386/local_apic.c	(revision 153140)
+++ head/sys/i386/i386/local_apic.c	(revision 153141)
@@ -1,1046 +1,1050 @@
 /*-
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  * Copyright (c) 1996, by Steve Passe
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Local APIC support on Pentium and later processors.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/smp.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/apicreg.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <machine/apicvar.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 
 #ifdef DDB
 #include <sys/interrupt.h>
 #include <ddb/ddb.h>
 #endif
 
 /*
  * We can handle up to 60 APICs via our logical cluster IDs, but currently
  * the physical IDs on Intel processors up to the Pentium 4 are limited to
  * 16.
  */
 #define	MAX_APICID	16
 
 /* Sanity checks on IDT vectors. */
 CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT);
 CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS);
 CTASSERT(APIC_LOCAL_INTS == 240);
 CTASSERT(IPI_STOP < APIC_SPURIOUS_INT);
 
 #define	LAPIC_TIMER_HZ_DIVIDER		2
 #define	LAPIC_TIMER_STATHZ_DIVIDER	15
 #define	LAPIC_TIMER_PROFHZ_DIVIDER	3
 
 /* Magic IRQ values for the timer and syscalls. */
 #define	IRQ_TIMER	(NUM_IO_INTS + 1)
 #define	IRQ_SYSCALL	(NUM_IO_INTS + 2)
 
 /*
  * Support for local APICs.  Local APICs manage interrupts on each
  * individual processor as opposed to I/O APICs which receive interrupts
  * from I/O devices and then forward them on to the local APICs.
  *
  * Local APICs can also send interrupts to each other thus providing the
  * mechanism for IPIs.
  */
 
 struct lvt {
 	u_int lvt_edgetrigger:1;
 	u_int lvt_activehi:1;
 	u_int lvt_masked:1;
 	u_int lvt_active:1;
 	u_int lvt_mode:16;
 	u_int lvt_vector:8;
 };
 
 struct lapic {
 	struct lvt la_lvts[LVT_MAX + 1];
 	u_int la_id:8;
 	u_int la_cluster:4;
 	u_int la_cluster_id:2;
 	u_int la_present:1;
 	u_long *la_timer_count;
 	u_long la_hard_ticks;
 	u_long la_stat_ticks;
 	u_long la_prof_ticks;
 } static lapics[MAX_APICID];
 
 /* XXX: should thermal be an NMI? */
 
 /* Global defaults for local APIC LVT entries. */
 static struct lvt lvts[LVT_MAX + 1] = {
 	{ 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 },	/* LINT0: masked ExtINT */
 	{ 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 },	/* LINT1: NMI */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT },	/* Timer */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },	/* Error */
 	{ 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 },	/* PMC */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },	/* Thermal */
 };
 
 static inthand_t *ioint_handlers[] = {
 	NULL,			/* 0 - 31 */
 	IDTVEC(apic_isr1),	/* 32 - 63 */
 	IDTVEC(apic_isr2),	/* 64 - 95 */
 	IDTVEC(apic_isr3),	/* 96 - 127 */
 	IDTVEC(apic_isr4),	/* 128 - 159 */
 	IDTVEC(apic_isr5),	/* 160 - 191 */
 	IDTVEC(apic_isr6),	/* 192 - 223 */
 	IDTVEC(apic_isr7),	/* 224 - 255 */
 };
 
 /* Include IDT_SYSCALL to make indexing easier. */
 static u_int ioint_irqs[APIC_NUM_IOINTS + 1];
 
 static u_int32_t lapic_timer_divisors[] = { 
 	APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
 	APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
 };
 
 volatile lapic_t *lapic;
 static u_long lapic_timer_divisor, lapic_timer_period, lapic_timer_hz;
 
 static void	lapic_enable(void);
 static void	lapic_timer_enable_intr(void);
 static void	lapic_timer_oneshot(u_int count);
 static void	lapic_timer_periodic(u_int count);
 static void	lapic_timer_set_divisor(u_int divisor);
 static uint32_t	lvt_mode(struct lapic *la, u_int pin, uint32_t value);
 
 static uint32_t
 lvt_mode(struct lapic *la, u_int pin, uint32_t value)
 {
 	struct lvt *lvt;
 
 	KASSERT(pin <= LVT_MAX, ("%s: pin %u out of range", __func__, pin));
 	if (la->la_lvts[pin].lvt_active)
 		lvt = &la->la_lvts[pin];
 	else
 		lvt = &lvts[pin];
 
 	value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM |
 	    APIC_LVT_VECTOR);
 	if (lvt->lvt_edgetrigger == 0)
 		value |= APIC_LVT_TM;
 	if (lvt->lvt_activehi == 0)
 		value |= APIC_LVT_IIPP_INTALO;
 	if (lvt->lvt_masked)
 		value |= APIC_LVT_M;
 	value |= lvt->lvt_mode;
 	switch (lvt->lvt_mode) {
 	case APIC_LVT_DM_NMI:
 	case APIC_LVT_DM_SMI:
 	case APIC_LVT_DM_INIT:
 	case APIC_LVT_DM_EXTINT:
 		if (!lvt->lvt_edgetrigger) {
 			printf("lapic%u: Forcing LINT%u to edge trigger\n",
 			    la->la_id, pin);
 			value |= APIC_LVT_TM;
 		}
 		/* Use a vector of 0. */
 		break;
 	case APIC_LVT_DM_FIXED:
 		value |= lvt->lvt_vector;
 		break;
 	default:
 		panic("bad APIC LVT delivery mode: %#x\n", value);
 	}
 	return (value);
 }
 
 /*
  * Map the local APIC and setup necessary interrupt vectors.
  */
 void
 lapic_init(uintptr_t addr)
 {
 
 	/* Map the local APIC and setup the spurious interrupt handler. */
 	KASSERT(trunc_page(addr) == addr,
 	    ("local APIC not aligned on a page boundary"));
 	lapic = (lapic_t *)pmap_mapdev(addr, sizeof(lapic_t));
 	setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Perform basic initialization of the BSP's local APIC. */
 	lapic_enable();
 	ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL;
 
 	/* Set BSP's per-CPU local APIC ID. */
 	PCPU_SET(apic_id, lapic_id());
 
 	/* Local APIC timer interrupt. */
 	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] = IRQ_TIMER;
 
 	/* XXX: error/thermal interrupts */
 }
 
 /*
  * Create a local APIC instance.
  */
 void
 lapic_create(u_int apic_id, int boot_cpu)
 {
 	int i;
 
 	if (apic_id >= MAX_APICID) {
 		printf("APIC: Ignoring local APIC with ID %d\n", apic_id);
 		if (boot_cpu)
 			panic("Can't ignore BSP");
 		return;
 	}
 	KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u",
 	    apic_id));
 
 	/*
 	 * Assume no local LVT overrides and a cluster of 0 and
 	 * intra-cluster ID of 0.
 	 */
 	lapics[apic_id].la_present = 1;
 	lapics[apic_id].la_id = apic_id;
 	for (i = 0; i < LVT_MAX; i++) {
 		lapics[apic_id].la_lvts[i] = lvts[i];
 		lapics[apic_id].la_lvts[i].lvt_active = 0;
 	}
 
 #ifdef SMP
 	cpu_add(apic_id, boot_cpu);
 #endif
 }
 
 /*
  * Dump contents of local APIC registers
  */
 void
 lapic_dump(const char* str)
 {
 
 	printf("cpu%d %s:\n", PCPU_GET(cpuid), str);
 	printf("     ID: 0x%08x   VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n",
 	    lapic->id, lapic->version, lapic->ldr, lapic->dfr);
 	printf("  lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
 	    lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr);
 	printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x pcm: 0x%08x\n",
 	    lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error,
 	    lapic->lvt_pcint);
 }
 
 void
 lapic_setup(void)
 {
 	struct lapic *la;
 	u_int32_t value, maxlvt;
 	register_t eflags;
 	char buf[MAXCOMLEN + 1];
 
 	la = &lapics[lapic_id()];
 	KASSERT(la->la_present, ("missing APIC structure"));
 	eflags = intr_disable();
 	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 
 	/* Initialize the TPR to allow all interrupts. */
 	lapic_set_tpr(0);
 
 	/* Use the cluster model for logical IDs. */
 	value = lapic->dfr;
 	value &= ~APIC_DFR_MODEL_MASK;
 	value |= APIC_DFR_MODEL_CLUSTER;
 	lapic->dfr = value;
 
 	/* Set this APIC's logical ID. */
 	value = lapic->ldr;
 	value &= ~APIC_ID_MASK;
 	value |= (la->la_cluster << APIC_ID_CLUSTER_SHIFT |
 	    1 << la->la_cluster_id) << APIC_ID_SHIFT;
 	lapic->ldr = value;
 
 	/* Setup spurious vector and enable the local APIC. */
 	lapic_enable();
 
 	/* Program LINT[01] LVT entries. */
 	lapic->lvt_lint0 = lvt_mode(la, LVT_LINT0, lapic->lvt_lint0);
 	lapic->lvt_lint1 = lvt_mode(la, LVT_LINT1, lapic->lvt_lint1);
 #ifdef	HWPMC_HOOKS
 	/* Program the PMC LVT entry if present. */
 	if (maxlvt >= LVT_PMC)
 		lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint);
 #endif
 
 	/* Program timer LVT and setup handler. */
 	lapic->lvt_timer = lvt_mode(la, LVT_TIMER, lapic->lvt_timer);
 	snprintf(buf, sizeof(buf), "cpu%d: timer", PCPU_GET(cpuid));
 	intrcnt_add(buf, &la->la_timer_count);
 	if (PCPU_GET(cpuid) != 0) {
 		KASSERT(lapic_timer_period != 0, ("lapic%u: zero divisor",
 		    lapic_id()));
 		lapic_timer_set_divisor(lapic_timer_divisor);
 		lapic_timer_periodic(lapic_timer_period);
 		lapic_timer_enable_intr();
 	}
 
 	/* XXX: Error and thermal LVTs */
 
 	intr_restore(eflags);
 }
 
 /*
  * Called by cpu_initclocks() on the BSP to setup the local APIC timer so
  * that it can drive hardclock, statclock, and profclock.  This function
  * returns true if it is able to use the local APIC timer to drive the
  * clocks and false if it is not able.
  */
 int
 lapic_setup_clock(void)
 {
 	u_long value;
 
 	/* Can't drive the timer without a local APIC. */
 	if (lapic == NULL)
 		return (0);
 
 	/* Start off with a divisor of 2 (power on reset default). */
 	lapic_timer_divisor = 2;
 
 	/* Try to calibrate the local APIC timer. */
 	do {
 		lapic_timer_set_divisor(lapic_timer_divisor);
 		lapic_timer_oneshot(APIC_TIMER_MAX_COUNT);
 		DELAY(2000000);
 		value = APIC_TIMER_MAX_COUNT - lapic->ccr_timer;
 		if (value != APIC_TIMER_MAX_COUNT)
 			break;
 		lapic_timer_divisor <<= 1;
 	} while (lapic_timer_divisor <= 128);
 	if (lapic_timer_divisor > 128)
 		panic("lapic: Divisor too big");
 	value /= 2;
 	if (bootverbose)
 		printf("lapic: Divisor %lu, Frequency %lu hz\n",
 		    lapic_timer_divisor, value);
 
 	/*
 	 * We will drive the timer at a small multiple of hz and drive
 	 * both of the other timers with similarly small but relatively
 	 * prime divisors.
 	 */
 	lapic_timer_hz = hz * LAPIC_TIMER_HZ_DIVIDER;
 	stathz = lapic_timer_hz / LAPIC_TIMER_STATHZ_DIVIDER;
 	profhz = lapic_timer_hz / LAPIC_TIMER_PROFHZ_DIVIDER;
 	lapic_timer_period = value / lapic_timer_hz;
 
 	/*
 	 * Start up the timer on the BSP.  The APs will kick off their
 	 * timer during lapic_setup().
 	 */
 	lapic_timer_periodic(lapic_timer_period);
 	lapic_timer_enable_intr();
 	return (1);
 }
 
 void
 lapic_disable(void)
 {
 	uint32_t value;
 
 	/* Software disable the local APIC. */
 	value = lapic->svr;
 	value &= ~APIC_SVR_SWEN;
 	lapic->svr = value;
 }
 
 static void
 lapic_enable(void)
 {
 	u_int32_t value;
 
 	/* Program the spurious vector to enable the local APIC. */
 	value = lapic->svr;
 	value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS);
 	value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT);
 	lapic->svr = value;
 }
 
 int
 lapic_id(void)
 {
 
 	KASSERT(lapic != NULL, ("local APIC is not mapped"));
 	return (lapic->id >> APIC_ID_SHIFT);
 }
 
 int
 lapic_intr_pending(u_int vector)
 {
 	volatile u_int32_t *irr;
 
 	/*
 	 * The IRR registers are an array of 128-bit registers each of
 	 * which only describes 32 interrupts in the low 32 bits..  Thus,
 	 * we divide the vector by 32 to get the 128-bit index.  We then
 	 * multiply that index by 4 to get the equivalent index from
 	 * treating the IRR as an array of 32-bit registers.  Finally, we
 	 * modulus the vector by 32 to determine the individual bit to
 	 * test.
 	 */
 	irr = &lapic->irr0;
 	return (irr[(vector / 32) * 4] & 1 << (vector % 32));
 }
 
 void
 lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 	struct lapic *la;
 
 	KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist",
 	    __func__, apic_id));
 	KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big",
 	    __func__, cluster));
 	KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID,
 	    ("%s: intra cluster id %u too big", __func__, cluster_id));
 	la = &lapics[apic_id];
 	la->la_cluster = cluster;
 	la->la_cluster_id = cluster_id;
 }
 
 int
 lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
 {
 
 	if (pin > LVT_MAX)
 		return (EINVAL);
 	if (apic_id == APIC_ID_ALL) {
 		lvts[pin].lvt_masked = masked;
 		if (bootverbose)
 			printf("lapic:");
 	} else {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		lapics[apic_id].la_lvts[pin].lvt_masked = masked;
 		lapics[apic_id].la_lvts[pin].lvt_active = 1;
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	}
 	if (bootverbose)
 		printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked");
 	return (0);
 }
 
 int
 lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
 {
 	struct lvt *lvt;
 
 	if (pin > LVT_MAX)
 		return (EINVAL);
 	if (apic_id == APIC_ID_ALL) {
 		lvt = &lvts[pin];
 		if (bootverbose)
 			printf("lapic:");
 	} else {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		lvt = &lapics[apic_id].la_lvts[pin];
 		lvt->lvt_active = 1;
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	}
 	lvt->lvt_mode = mode;
 	switch (mode) {
 	case APIC_LVT_DM_NMI:
 	case APIC_LVT_DM_SMI:
 	case APIC_LVT_DM_INIT:
 	case APIC_LVT_DM_EXTINT:
 		lvt->lvt_edgetrigger = 1;
 		lvt->lvt_activehi = 1;
 		if (mode == APIC_LVT_DM_EXTINT)
 			lvt->lvt_masked = 1;
 		else
 			lvt->lvt_masked = 0;
 		break;
 	default:
 		panic("Unsupported delivery mode: 0x%x\n", mode);
 	}
 	if (bootverbose) {
 		printf(" Routing ");
 		switch (mode) {
 		case APIC_LVT_DM_NMI:
 			printf("NMI");
 			break;
 		case APIC_LVT_DM_SMI:
 			printf("SMI");
 			break;
 		case APIC_LVT_DM_INIT:
 			printf("INIT");
 			break;
 		case APIC_LVT_DM_EXTINT:
 			printf("ExtINT");
 			break;
 		}
 		printf(" -> LINT%u\n", pin);
 	}
 	return (0);
 }
 
 int
 lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
 {
 
 	if (pin > LVT_MAX || pol == INTR_POLARITY_CONFORM)
 		return (EINVAL);
 	if (apic_id == APIC_ID_ALL) {
 		lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH);
 		if (bootverbose)
 			printf("lapic:");
 	} else {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		lapics[apic_id].la_lvts[pin].lvt_active = 1;
 		lapics[apic_id].la_lvts[pin].lvt_activehi =
 		    (pol == INTR_POLARITY_HIGH);
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	}
 	if (bootverbose)
 		printf(" LINT%u polarity: %s\n", pin,
 		    pol == INTR_POLARITY_HIGH ? "high" : "low");
 	return (0);
 }
 
 int
 lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger)
 {
 
 	if (pin > LVT_MAX || trigger == INTR_TRIGGER_CONFORM)
 		return (EINVAL);
 	if (apic_id == APIC_ID_ALL) {
 		lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE);
 		if (bootverbose)
 			printf("lapic:");
 	} else {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		lapics[apic_id].la_lvts[pin].lvt_edgetrigger =
 		    (trigger == INTR_TRIGGER_EDGE);
 		lapics[apic_id].la_lvts[pin].lvt_active = 1;
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	}
 	if (bootverbose)
 		printf(" LINT%u trigger: %s\n", pin,
 		    trigger == INTR_TRIGGER_EDGE ? "edge" : "level");
 	return (0);
 }
 
 /*
  * Adjust the TPR of the current CPU so that it blocks all interrupts below
  * the passed in vector.
  */
 void
 lapic_set_tpr(u_int vector)
 {
 #ifdef CHEAP_TPR
 	lapic->tpr = vector;
 #else
 	u_int32_t tpr;
 
 	tpr = lapic->tpr & ~APIC_TPR_PRIO;
 	tpr |= vector;
 	lapic->tpr = tpr;
 #endif
 }
 
 void
 lapic_eoi(void)
 {
 
 	lapic->eoi = 0;
 }
 
 void
 lapic_handle_intr(struct intrframe frame)
 {
 	struct intsrc *isrc;
 
 	if (frame.if_vec == -1)
 		panic("Couldn't get vector from ISR!");
 	isrc = intr_lookup_source(apic_idt_to_irq(frame.if_vec));
 	intr_execute_handlers(isrc, &frame);
 }
 
 void
 lapic_handle_timer(struct clockframe frame)
 {
 	struct lapic *la;
 
+	/* Send EOI first thing. */
+	lapic_eoi();
+
+	/* Look up our local APIC structure for the tick counters. */
 	la = &lapics[PCPU_GET(apic_id)];
 	(*la->la_timer_count)++;
 	critical_enter();
 
 	/* Fire hardclock at hz. */
 	la->la_hard_ticks += hz;
 	if (la->la_hard_ticks >= lapic_timer_hz) {
 		la->la_hard_ticks -= lapic_timer_hz;
 		if (PCPU_GET(cpuid) == 0)
 			hardclock(&frame);
 		else
 			hardclock_process(&frame);
 	}
 
 	/* Fire statclock at stathz. */
 	la->la_stat_ticks += stathz;
 	if (la->la_stat_ticks >= lapic_timer_hz) {
 		la->la_stat_ticks -= lapic_timer_hz;
 		statclock(&frame);
 	}
 
 	/* Fire profclock at profhz, but only when needed. */
 	la->la_prof_ticks += profhz;
 	if (la->la_prof_ticks >= lapic_timer_hz) {
 		la->la_prof_ticks -= lapic_timer_hz;
 		if (profprocs != 0)
 			profclock(&frame);
 	}
 	critical_exit();
 }
 
 static void
 lapic_timer_set_divisor(u_int divisor)
 {
 
 	KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor));
 	KASSERT(ffs(divisor) <= sizeof(lapic_timer_divisors) /
 	    sizeof(u_int32_t), ("lapic: invalid divisor %u", divisor));
 	lapic->dcr_timer = lapic_timer_divisors[ffs(divisor) - 1];
 }
 
 static void
 lapic_timer_oneshot(u_int count)
 {
 	u_int32_t value;
 
 	value = lapic->lvt_timer;
 	value &= ~APIC_LVTT_TM;
 	value |= APIC_LVTT_TM_ONE_SHOT;
 	lapic->lvt_timer = value;
 	lapic->icr_timer = count;
 }
 
 static void
 lapic_timer_periodic(u_int count)
 {
 	u_int32_t value;
 
 	value = lapic->lvt_timer;
 	value &= ~APIC_LVTT_TM;
 	value |= APIC_LVTT_TM_PERIODIC;
 	lapic->lvt_timer = value;
 	lapic->icr_timer = count;
 }
 
 static void
 lapic_timer_enable_intr(void)
 {
 	u_int32_t value;
 
 	value = lapic->lvt_timer;
 	value &= ~APIC_LVT_M;
 	lapic->lvt_timer = value;
 }
 
 /* Request a free IDT vector to be used by the specified IRQ. */
 u_int
 apic_alloc_vector(u_int irq)
 {
 	u_int vector;
 
 	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
 
 	/*
 	 * Search for a free vector.  Currently we just use a very simple
 	 * algorithm to find the first free vector.
 	 */
 	mtx_lock_spin(&icu_lock);
 	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
 		if (ioint_irqs[vector] != 0)
 			continue;
 		ioint_irqs[vector] = irq;
 		mtx_unlock_spin(&icu_lock);
 		return (vector + APIC_IO_INTS);
 	}
 	mtx_unlock_spin(&icu_lock);
 	panic("Couldn't find an APIC vector for IRQ %u", irq);
 }
 
 void
 apic_enable_vector(u_int vector)
 {
 
 	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
 	KASSERT(ioint_handlers[vector / 32] != NULL,
 	    ("No ISR handler for vector %u", vector));
 	setidt(vector, ioint_handlers[vector / 32], SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 }
 
 /* Release an APIC vector when it's no longer in use. */
 void
 apic_free_vector(u_int vector, u_int irq)
 {
 	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
 	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
 	    ("Vector %u does not map to an IRQ line", vector));
 	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
 	KASSERT(ioint_irqs[vector - APIC_IO_INTS] == irq, ("IRQ mismatch"));
 	mtx_lock_spin(&icu_lock);
 	ioint_irqs[vector - APIC_IO_INTS] = 0;
 	mtx_unlock_spin(&icu_lock);
 }
 
 /* Map an IDT vector (APIC) to an IRQ (interrupt source). */
 u_int
 apic_idt_to_irq(u_int vector)
 {
 
 	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
 	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
 	    ("Vector %u does not map to an IRQ line", vector));
 	return (ioint_irqs[vector - APIC_IO_INTS]);
 }
 
 #ifdef DDB
 /*
  * Dump data about APIC IDT vector mappings.
  */
 DB_SHOW_COMMAND(apic, db_show_apic)
 {
 	struct intsrc *isrc;
 	int quit, i, verbose;
 	u_int irq;
 
 	quit = 0;
 	if (strcmp(modif, "vv") == 0)
 		verbose = 2;
 	else if (strcmp(modif, "v") == 0)
 		verbose = 1;
 	else
 		verbose = 0;
 	db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
 	for (i = 0; i < APIC_NUM_IOINTS + 1 && !quit; i++) {
 		irq = ioint_irqs[i];
 		if (irq != 0 && irq != IRQ_SYSCALL) {
 			db_printf("vec 0x%2x -> ", i + APIC_IO_INTS);
 			if (irq == IRQ_TIMER)
 				db_printf("lapic timer\n");
 			else if (irq < NUM_IO_INTS) {
 				isrc = intr_lookup_source(irq);
 				if (isrc == NULL || verbose == 0)
 					db_printf("IRQ %u\n", irq);
 				else
 					db_dump_intr_event(isrc->is_event,
 					    verbose == 2);
 			} else
 				db_printf("IRQ %u ???\n", irq);
 		}
 	}
 }
 #endif
 
 /*
  * APIC probing support code.  This includes code to manage enumerators.
  */
 
 static SLIST_HEAD(, apic_enumerator) enumerators =
 	SLIST_HEAD_INITIALIZER(enumerators);
 static struct apic_enumerator *best_enum;
 	
 void
 apic_register_enumerator(struct apic_enumerator *enumerator)
 {
 #ifdef INVARIANTS
 	struct apic_enumerator *apic_enum;
 
 	SLIST_FOREACH(apic_enum, &enumerators, apic_next) {
 		if (apic_enum == enumerator)
 			panic("%s: Duplicate register of %s", __func__,
 			    enumerator->apic_name);
 	}
 #endif
 	SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next);
 }
 
 /*
  * Probe the APIC enumerators, enumerate CPUs, and initialize the
  * local APIC.
  */
 static void
 apic_init(void *dummy __unused)
 {
 	struct apic_enumerator *enumerator;
 	uint64_t apic_base;
 	int retval, best;
 
 	/* We only support built in local APICs. */
 	if (!(cpu_feature & CPUID_APIC))
 		return;
 
 	/* Don't probe if APIC mode is disabled. */
 	if (resource_disabled("apic", 0))
 		return;
 
 	/* First, probe all the enumerators to find the best match. */
 	best_enum = NULL;
 	best = 0;
 	SLIST_FOREACH(enumerator, &enumerators, apic_next) {
 		retval = enumerator->apic_probe();
 		if (retval > 0)
 			continue;
 		if (best_enum == NULL || best < retval) {
 			best_enum = enumerator;
 			best = retval;
 		}
 	}
 	if (best_enum == NULL) {
 		if (bootverbose)
 			printf("APIC: Could not find any APICs.\n");
 		return;
 	}
 
 	if (bootverbose)
 		printf("APIC: Using the %s enumerator.\n",
 		    best_enum->apic_name);
 
 	/*
 	 * To work around an errata, we disable the local APIC on some
 	 * CPUs during early startup.  We need to turn the local APIC back
 	 * on on such CPUs now.
 	 */
 	if (cpu == CPU_686 && strcmp(cpu_vendor, "GenuineIntel") == 0 &&
 	    (cpu_id & 0xff0) == 0x610) {
 		apic_base = rdmsr(MSR_APICBASE);
 		apic_base |= APICBASE_ENABLED;
 		wrmsr(MSR_APICBASE, apic_base);
 	}
 
 	/* Second, probe the CPU's in the system. */
 	retval = best_enum->apic_probe_cpus();
 	if (retval != 0)
 		printf("%s: Failed to probe CPUs: returned %d\n",
 		    best_enum->apic_name, retval);
 
 	/* Third, initialize the local APIC. */
 	retval = best_enum->apic_setup_local();
 	if (retval != 0)
 		printf("%s: Failed to setup the local APIC: returned %d\n",
 		    best_enum->apic_name, retval);
 #ifdef SMP
 	/* Last, setup the cpu topology now that we have probed CPUs */
 	mp_topology();
 #endif
 }
 SYSINIT(apic_init, SI_SUB_CPU, SI_ORDER_FIRST, apic_init, NULL)
 
 /*
  * Setup the I/O APICs.
  */
 static void
 apic_setup_io(void *dummy __unused)
 {
 	int retval;
 
 	if (best_enum == NULL)
 		return;
 	retval = best_enum->apic_setup_io();
 	if (retval != 0)
 		printf("%s: Failed to setup I/O APICs: returned %d\n",
 		    best_enum->apic_name, retval);
 
 	/*
 	 * Finish setting up the local APIC on the BSP once we know how to
 	 * properly program the LINT pins.
 	 */
 	lapic_setup();
 	if (bootverbose)
 		lapic_dump("BSP");
 }
 SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_SECOND, apic_setup_io, NULL)
 
 #ifdef SMP
 /*
  * Inter Processor Interrupt functions.  The lapic_ipi_*() functions are
  * private to the sys/i386 code.  The public interface for the rest of the
  * kernel is defined in mp_machdep.c.
  */
 int
 lapic_ipi_wait(int delay)
 {
 	int x, incr;
 
 	/*
 	 * Wait delay loops for IPI to be sent.  This is highly bogus
 	 * since this is sensitive to CPU clock speed.  If delay is
 	 * -1, we wait forever.
 	 */
 	if (delay == -1) {
 		incr = 0;
 		delay = 1;
 	} else
 		incr = 1;
 	for (x = 0; x < delay; x += incr) {
 		if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE)
 			return (1);
 		ia32_pause();
 	}
 	return (0);
 }
 
 void
 lapic_ipi_raw(register_t icrlo, u_int dest)
 {
 	register_t value, eflags;
 
 	/* XXX: Need more sanity checking of icrlo? */
 	KASSERT(lapic != NULL, ("%s called too early", __func__));
 	KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 	    ("%s: invalid dest field", __func__));
 	KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0,
 	    ("%s: reserved bits set in ICR LO register", __func__));
 
 	/* Set destination in ICR HI register if it is being used. */
 	eflags = intr_disable();
 	if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) {
 		value = lapic->icr_hi;
 		value &= ~APIC_ID_MASK;
 		value |= dest << APIC_ID_SHIFT;
 		lapic->icr_hi = value;
 	}
 
 	/* Program the contents of the IPI and dispatch it. */
 	value = lapic->icr_lo;
 	value &= APIC_ICRLO_RESV_MASK;
 	value |= icrlo;
 	lapic->icr_lo = value;
 	intr_restore(eflags);
 }
 
 #define	BEFORE_SPIN	1000000
 #ifdef DETECT_DEADLOCK
 #define	AFTER_SPIN	1000
 #endif
 
 void
 lapic_ipi_vectored(u_int vector, int dest)
 {
 	register_t icrlo, destfield;
 
 	KASSERT((vector & ~APIC_VECTOR_MASK) == 0,
 	    ("%s: invalid vector %d", __func__, vector));
 
 	icrlo = vector | APIC_DELMODE_FIXED | APIC_DESTMODE_PHY |
 	    APIC_LEVEL_DEASSERT | APIC_TRIGMOD_EDGE;
 	destfield = 0;
 	switch (dest) {
 	case APIC_IPI_DEST_SELF:
 		icrlo |= APIC_DEST_SELF;
 		break;
 	case APIC_IPI_DEST_ALL:
 		icrlo |= APIC_DEST_ALLISELF;
 		break;
 	case APIC_IPI_DEST_OTHERS:
 		icrlo |= APIC_DEST_ALLESELF;
 		break;
 	default:
 		KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 		    ("%s: invalid destination 0x%x", __func__, dest));
 		destfield = dest;
 	}
 
 	/* Wait for an earlier IPI to finish. */
 	if (!lapic_ipi_wait(BEFORE_SPIN)) {
 		if (panicstr != NULL)
 			return;
 		else
 			panic("APIC: Previous IPI is stuck");
 	}
 
 	lapic_ipi_raw(icrlo, destfield);
 
 #ifdef DETECT_DEADLOCK
 	/* Wait for IPI to be delivered. */
 	if (!lapic_ipi_wait(AFTER_SPIN)) {
 #ifdef needsattention
 		/*
 		 * XXX FIXME:
 		 *
 		 * The above function waits for the message to actually be
 		 * delivered.  It breaks out after an arbitrary timeout
 		 * since the message should eventually be delivered (at
 		 * least in theory) and that if it wasn't we would catch
 		 * the failure with the check above when the next IPI is
 		 * sent.
 		 *
 		 * We could skip this wait entirely, EXCEPT it probably
 		 * protects us from other routines that assume that the
 		 * message was delivered and acted upon when this function
 		 * returns.
 		 */
 		printf("APIC: IPI might be stuck\n");
 #else /* !needsattention */
 		/* Wait until mesage is sent without a timeout. */
 		while (lapic->icr_lo & APIC_DELSTAT_PEND)
 			ia32_pause();
 #endif /* needsattention */
 	}
 #endif /* DETECT_DEADLOCK */
 }
 #endif /* SMP */
Index: head/sys/i386/i386/mp_machdep.c
===================================================================
--- head/sys/i386/i386/mp_machdep.c	(revision 153140)
+++ head/sys/i386/i386/mp_machdep.c	(revision 153141)
@@ -1,1501 +1,1511 @@
 /*-
  * Copyright (c) 1996, by Steve Passe
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_apic.h"
 #include "opt_cpu.h"
 #include "opt_kstack_pages.h"
 #include "opt_mp_watchdog.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
 
 #if !defined(lint)
 #if !defined(SMP)
 #error How did you get here?
 #endif
 
 #ifndef DEV_APIC
 #error The apic device is required for SMP, add "device apic" to your config file.
 #endif
 #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
 #error SMP not supported with CPU_DISABLE_CMPXCHG
 #endif
 #endif /* not lint */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cons.h>	/* cngetc() */
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 #include <machine/apicreg.h>
 #include <machine/clock.h>
 #include <machine/md_var.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/privatespace.h>
 
 #define WARMBOOT_TARGET		0
 #define WARMBOOT_OFF		(KERNBASE + 0x0467)
 #define WARMBOOT_SEG		(KERNBASE + 0x0469)
 
 #define CMOS_REG		(0x70)
 #define CMOS_DATA		(0x71)
 #define BIOS_RESET		(0x0f)
 #define BIOS_WARM		(0x0a)
 
 /*
  * this code MUST be enabled here and in mpboot.s.
  * it follows the very early stages of AP boot by placing values in CMOS ram.
  * it NORMALLY will never be needed and thus the primitive method for enabling.
  *
 #define CHECK_POINTS
  */
 
 #if defined(CHECK_POINTS) && !defined(PC98)
 #define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
 #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
 
 #define CHECK_INIT(D);				\
 	CHECK_WRITE(0x34, (D));			\
 	CHECK_WRITE(0x35, (D));			\
 	CHECK_WRITE(0x36, (D));			\
 	CHECK_WRITE(0x37, (D));			\
 	CHECK_WRITE(0x38, (D));			\
 	CHECK_WRITE(0x39, (D));
 
 #define CHECK_PRINT(S);				\
 	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
 	   (S),					\
 	   CHECK_READ(0x34),			\
 	   CHECK_READ(0x35),			\
 	   CHECK_READ(0x36),			\
 	   CHECK_READ(0x37),			\
 	   CHECK_READ(0x38),			\
 	   CHECK_READ(0x39));
 
 #else				/* CHECK_POINTS */
 
 #define CHECK_INIT(D)
 #define CHECK_PRINT(S)
 #define CHECK_WRITE(A, D)
 
 #endif				/* CHECK_POINTS */
 
 /* lock region used by kernel profiling */
 int	mcount_lock;
 
 int	mp_naps;		/* # of Applications processors */
 int	boot_cpu_id = -1;	/* designated BSP */
 extern	int nkpt;
 
 /*
  * CPU topology map datastructures for HTT.
  */
 static struct cpu_group mp_groups[MAXCPU];
 static struct cpu_top mp_top;
 
 /* AP uses this during bootstrap.  Do not staticize.  */
 char *bootSTK;
 static int bootAP;
 
 /* Hotwire a 0->4MB V==P mapping */
 extern pt_entry_t *KPTphys;
 
 /* SMP page table page */
 extern pt_entry_t *SMPpt;
 
 struct pcb stoppcbs[MAXCPU];
 
 /* Variables needed for SMP tlb shootdown. */
 vm_offset_t smp_tlb_addr1;
 vm_offset_t smp_tlb_addr2;
 volatile int smp_tlb_wait;
 
 #ifdef STOP_NMI
 volatile cpumask_t ipi_nmi_pending;
 
 static void	ipi_nmi_selected(u_int32_t cpus);
 #endif 
 
 #ifdef COUNT_IPIS
 /* Interrupt counts. */
 #ifdef IPI_PREEMPTION
 static u_long *ipi_preempt_counts[MAXCPU];
 #endif
 static u_long *ipi_ast_counts[MAXCPU];
 u_long *ipi_invltlb_counts[MAXCPU];
 u_long *ipi_invlrng_counts[MAXCPU];
 u_long *ipi_invlpg_counts[MAXCPU];
 u_long *ipi_rendezvous_counts[MAXCPU];
 u_long *ipi_lazypmap_counts[MAXCPU];
 #endif
 
 /*
  * Local data and functions.
  */
 
 #ifdef STOP_NMI
 /* 
  * Provide an alternate method of stopping other CPUs. If another CPU has
  * disabled interrupts the conventional STOP IPI will be blocked. This 
  * NMI-based stop should get through in that case.
  */
 static int stop_cpus_with_nmi = 1;
 SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW,
     &stop_cpus_with_nmi, 0, "");
 TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi);
 #else
 #define	stop_cpus_with_nmi	0
 #endif
 
 static u_int logical_cpus;
 
 /* used to hold the AP's until we are ready to release them */
 static struct mtx ap_boot_mtx;
 
 /* Set to 1 once we're ready to let the APs out of the pen. */
 static volatile int aps_ready = 0;
 
 /*
  * Store data from cpu_add() until later in the boot when we actually setup
  * the APs.
  */
 struct cpu_info {
 	int	cpu_present:1;
 	int	cpu_bsp:1;
 	int	cpu_disabled:1;
 } static cpu_info[MAXCPU];
 static int cpu_apic_ids[MAXCPU];
 
 /* Holds pending bitmap based IPIs per CPU */
 static volatile u_int cpu_ipi_pending[MAXCPU];
 
 static u_int boot_address;
 
 static void	set_logical_apic_ids(void);
 static int	start_all_aps(void);
 static void	install_ap_tramp(void);
 static int	start_ap(int apic_id);
 static void	release_aps(void *dummy);
 
 static int	hlt_logical_cpus;
 static u_int	hyperthreading_cpus;
 static cpumask_t	hyperthreading_cpus_mask;
 static int	hyperthreading_allowed = 1;
 static struct	sysctl_ctx_list logical_cpu_clist;
 
 static void
 mem_range_AP_init(void)
 {
 	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
 		mem_range_softc.mr_op->initAP(&mem_range_softc);
 }
 
 void
 mp_topology(void)
 {
 	struct cpu_group *group;
 	int logical_cpus;
 	int apic_id;
 	int groups;
 	int cpu;
 
 	/* Build the smp_topology map. */
 	/* Nothing to do if there is no HTT support. */
 	if ((cpu_feature & CPUID_HTT) == 0)
 		return;
 	logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
 	if (logical_cpus <= 1)
 		return;
 	group = &mp_groups[0];
 	groups = 1;
 	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
 		if (!cpu_info[apic_id].cpu_present)
 			continue;
 		/*
 		 * If the current group has members and we're not a logical
 		 * cpu, create a new group.
 		 */
 		if (group->cg_count != 0 && (apic_id % logical_cpus) == 0) {
 			group++;
 			groups++;
 		}
 		group->cg_count++;
 		group->cg_mask |= 1 << cpu;
 		cpu++;
 	}
 
 	mp_top.ct_count = groups;
 	mp_top.ct_group = mp_groups;
 	smp_topology = &mp_top;
 }
 
 
 /*
  * Calculate usable address in base memory for AP trampoline code.
  */
 u_int
 mp_bootaddress(u_int basemem)
 {
 
 	boot_address = trunc_page(basemem);	/* round down to 4k boundary */
 	if ((basemem - boot_address) < bootMP_size)
 		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
 
 	return boot_address;
 }
 
 void
 cpu_add(u_int apic_id, char boot_cpu)
 {
 
 	if (apic_id >= MAXCPU) {
 		printf("SMP: CPU %d exceeds maximum CPU %d, ignoring\n",
 		    apic_id, MAXCPU - 1);
 		return;
 	}
 	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
 	    apic_id));
 	cpu_info[apic_id].cpu_present = 1;
 	if (boot_cpu) {
 		KASSERT(boot_cpu_id == -1,
 		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
 		    boot_cpu_id));
 		boot_cpu_id = apic_id;
 		cpu_info[apic_id].cpu_bsp = 1;
 	}
 	mp_ncpus++;
 	if (bootverbose)
 		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
 		    "AP");
 	
 }
 
 void
 cpu_mp_setmaxid(void)
 {
 
 	mp_maxid = MAXCPU - 1;
 }
 
 int
 cpu_mp_probe(void)
 {
 
 	/*
 	 * Always record BSP in CPU map so that the mbuf init code works
 	 * correctly.
 	 */
 	all_cpus = 1;
 	if (mp_ncpus == 0) {
 		/*
 		 * No CPUs were found, so this must be a UP system.  Setup
 		 * the variables to represent a system with a single CPU
 		 * with an id of 0.
 		 */
 		mp_ncpus = 1;
 		return (0);
 	}
 
 	/* At least one CPU was found. */
 	if (mp_ncpus == 1) {
 		/*
 		 * One CPU was found, so this must be a UP system with
 		 * an I/O APIC.
 		 */
 		return (0);
 	}
 
 	/* At least two CPUs were found. */
 	return (1);
 }
 
 /*
  * Initialize the IPI handlers and start up the AP's.
  */
 void
 cpu_mp_start(void)
 {
 	int i;
 	u_int threads_per_cache, p[4];
 
 	/* Initialize the logical ID to APIC ID table. */
 	for (i = 0; i < MAXCPU; i++) {
 		cpu_apic_ids[i] = -1;
 		cpu_ipi_pending[i] = 0;
 	}
 
 	/* Install an inter-CPU IPI for TLB invalidation */
 	setidt(IPI_INVLTLB, IDTVEC(invltlb),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IPI_INVLPG, IDTVEC(invlpg),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IPI_INVLRNG, IDTVEC(invlrng),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 	
 	/* Install an inter-CPU IPI for lazy pmap release */
 	setidt(IPI_LAZYPMAP, IDTVEC(lazypmap),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for all-CPU rendezvous */
 	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install generic inter-CPU IPI handler */
 	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for CPU stop/restart */
 	setidt(IPI_STOP, IDTVEC(cpustop),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
 		boot_cpu_id = PCPU_GET(apic_id);
 		cpu_info[boot_cpu_id].cpu_bsp = 1;
 	} else
 		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
 		    ("BSP's APIC ID doesn't match boot_cpu_id"));
 	cpu_apic_ids[0] = boot_cpu_id;
 
 	/* Start each Application Processor */
 	start_all_aps();
 
 	/* Setup the initial logical CPUs info. */
 	logical_cpus = logical_cpus_mask = 0;
 	if (cpu_feature & CPUID_HTT)
 		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
 
 	/*
 	 * Work out if hyperthreading is *really* enabled.  This
 	 * is made really ugly by the fact that processors lie: Dual
 	 * core processors claim to be hyperthreaded even when they're
 	 * not, presumably because they want to be treated the same
 	 * way as HTT with respect to per-cpu software licensing.
 	 * At the time of writing (May 12, 2005) the only hyperthreaded
 	 * cpus are from Intel, and Intel's dual-core processors can be
 	 * identified via the "deterministic cache parameters" cpuid
 	 * calls.
 	 */
 	/*
 	 * First determine if this is an Intel processor which claims
 	 * to have hyperthreading support.
 	 */
 	if ((cpu_feature & CPUID_HTT) &&
 	    (strcmp(cpu_vendor, "GenuineIntel") == 0)) {
 		/*
 		 * If the "deterministic cache parameters" cpuid calls
 		 * are available, use them.
 		 */
 		if (cpu_high >= 4) {
 			/* Ask the processor about up to 32 caches. */
 			for (i = 0; i < 32; i++) {
 				cpuid_count(4, i, p);
 				threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
 				if (hyperthreading_cpus < threads_per_cache)
 					hyperthreading_cpus = threads_per_cache;
 				if ((p[0] & 0x1f) == 0)
 					break;
 			}
 		}
 
 		/*
 		 * If the deterministic cache parameters are not
 		 * available, or if no caches were reported to exist,
 		 * just accept what the HTT flag indicated.
 		 */
 		if (hyperthreading_cpus == 0)
 			hyperthreading_cpus = logical_cpus;
 	}
 
 	set_logical_apic_ids();
 }
 
 
 /*
  * Print various information about the SMP system hardware and setup.
  */
 void
 cpu_mp_announce(void)
 {
 	int i, x;
 
 	/* List CPUs */
 	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
 	for (i = 1, x = 0; x < MAXCPU; x++) {
 		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
 			continue;
 		if (cpu_info[x].cpu_disabled)
 			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
 		else {
 			KASSERT(i < mp_ncpus,
 			    ("mp_ncpus and actual cpus are out of whack"));
 			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
 		}
 	}
 }
 
 /*
  * AP CPU's call this to initialize themselves.
  */
 void
 init_secondary(void)
 {
 	vm_offset_t addr;
 	int	gsel_tss;
 	int	x, myid;
 	u_int	cr0;
 
 	/* bootAP is set in start_ap() to our ID. */
 	myid = bootAP;
 	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
 	gdt_segs[GPROC0_SEL].ssd_base =
 		(int) &SMP_prvspace[myid].pcpu.pc_common_tss;
 	SMP_prvspace[myid].pcpu.pc_prvspace =
 		&SMP_prvspace[myid].pcpu;
 
 	for (x = 0; x < NGDT; x++) {
 		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
 	}
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base = (int) &gdt[myid * NGDT];
 	lgdt(&r_gdt);			/* does magic intra-segment return */
 
 	lidt(&r_idt);
 
 	lldt(_default_ldt);
 	PCPU_SET(currentldt, _default_ldt);
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
 	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
 	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 	ltr(gsel_tss);
 
 	PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd);
 
 	/*
 	 * Set to a known state:
 	 * Set by mpboot.s: CR0_PG, CR0_PE
 	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
 	 */
 	cr0 = rcr0();
 	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
 	load_cr0(cr0);
 	CHECK_WRITE(0x38, 5);
 	
 	/* Disable local APIC just to be sure. */
 	lapic_disable();
 
 	/* signal our startup to the BSP. */
 	mp_naps++;
 	CHECK_WRITE(0x39, 6);
 
 	/* Spin until the BSP releases the AP's. */
 	while (!aps_ready)
 		ia32_pause();
 
 	/* BSP may have changed PTD while we were waiting */
 	invltlb();
 	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
 		invlpg(addr);
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 	lidt(&r_idt);
 #endif
 
 	/* set up CPU registers and state */
 	cpu_setregs();
 
 	/* set up FPU state on the AP */
 	npxinit(__INITIAL_NPXCW__);
 
 	/* set up SSE registers */
 	enable_sse();
 
 	/* A quick check from sanity claus */
 	if (PCPU_GET(apic_id) != lapic_id()) {
 		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
 		printf("SMP: actual apic_id = %d\n", lapic_id());
 		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
 		printf("PTD[MPPTDI] = %#jx\n", (uintmax_t)PTD[MPPTDI]);
 		panic("cpuid mismatch! boom!!");
 	}
 
 	/* Initialize curthread. */
 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 	PCPU_SET(curthread, PCPU_GET(idlethread));
 
 	mtx_lock_spin(&ap_boot_mtx);
 
 	/* Init local apic for irq's */
 	lapic_setup();
 
 	/* Set memory range attributes for this CPU to match the BSP */
 	mem_range_AP_init();
 
 	smp_cpus++;
 
 	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
 	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
 
 	/* Determine if we are a logical CPU. */
 	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
 		logical_cpus_mask |= PCPU_GET(cpumask);
 	
 	/* Determine if we are a hyperthread. */
 	if (hyperthreading_cpus > 1 &&
 	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
 		hyperthreading_cpus_mask |= PCPU_GET(cpumask);
 
 	/* Build our map of 'other' CPUs. */
 	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
 
 	if (bootverbose)
 		lapic_dump("AP");
 
 	if (smp_cpus == mp_ncpus) {
 		/* enable IPI's, tlb shootdown, freezes etc */
 		atomic_store_rel_int(&smp_started, 1);
 		smp_active = 1;	 /* historic */
 	}
 
 	mtx_unlock_spin(&ap_boot_mtx);
 
 	/* wait until all the AP's are up */
 	while (smp_started == 0)
 		ia32_pause();
 
 	/* ok, now grab sched_lock and enter the scheduler */
 	mtx_lock_spin(&sched_lock);
 
 	/*
 	 * Correct spinlock nesting.  The idle thread context that we are
 	 * borrowing was created so that it would start out with a single
 	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
 	 * explicitly acquired locks in this function, the nesting count
 	 * is now 2 rather than 1.  Since we are nested, calling
 	 * spinlock_exit() will simply adjust the counts without allowing
 	 * spin lock using code to interrupt us.
 	 */
 	spinlock_exit();
 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
 
 	binuptime(PCPU_PTR(switchtime));
 	PCPU_SET(switchticks, ticks);
 
 	cpu_throw(NULL, choosethread());	/* doesn't return */
 
 	panic("scheduler returned us to %s", __func__);
 	/* NOTREACHED */
 }
 
 /*******************************************************************
  * local functions and data
  */
 
 /*
  * Set the APIC logical IDs.
  *
  * We want to cluster logical CPU's within the same APIC ID cluster.
  * Since logical CPU's are aligned simply filling in the clusters in
  * APIC ID order works fine.  Note that this does not try to balance
  * the number of CPU's in each cluster. (XXX?)
  */
 static void
 set_logical_apic_ids(void)
 {
 	u_int apic_id, cluster, cluster_id;
 
 	/* Force us to allocate cluster 0 at the start. */
 	cluster = -1;
 	cluster_id = APIC_MAX_INTRACLUSTER_ID;
 	for (apic_id = 0; apic_id < MAXCPU; apic_id++) {
 		if (!cpu_info[apic_id].cpu_present)
 			continue;
 		if (cluster_id == APIC_MAX_INTRACLUSTER_ID) {
 			cluster = ioapic_next_logical_cluster();
 			cluster_id = 0;
 		} else
 			cluster_id++;
 		if (bootverbose)
 			printf("APIC ID: physical %u, logical %u:%u\n",
 			    apic_id, cluster, cluster_id);
 		lapic_set_logical_id(apic_id, cluster, cluster_id);
 	}
 }
 
 /*
  * start each AP in our list
  */
 static int
 start_all_aps(void)
 {
 #ifndef PC98
 	u_char mpbiosreason;
 #endif
 	struct pcpu *pc;
 	char *stack;
 	uintptr_t kptbase;
 	u_int32_t mpbioswarmvec;
 	int apic_id, cpu, i, pg;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
 	/* install the AP 1st level boot code */
 	install_ap_tramp();
 
 	/* save the current value of the warm-start vector */
 	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
 #ifndef PC98
 	outb(CMOS_REG, BIOS_RESET);
 	mpbiosreason = inb(CMOS_DATA);
 #endif
 
 	/* set up temporary P==V mapping for AP boot */
 	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
 	kptbase = (uintptr_t)(void *)KPTphys;
 	for (i = 0; i < NKPT; i++)
 		PTD[i] = (pd_entry_t)(PG_V | PG_RW |
 		    ((kptbase + i * PAGE_SIZE) & PG_FRAME));
 	invltlb();
 
 	/* start each AP */
 	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
 
 		/* Ignore non-existent CPUs and the BSP. */
 		if (!cpu_info[apic_id].cpu_present ||
 		    cpu_info[apic_id].cpu_bsp)
 			continue;
 
 		/* Don't use this CPU if it has been disabled by a tunable. */
 		if (resource_disabled("lapic", apic_id)) {
 			cpu_info[apic_id].cpu_disabled = 1;
 			mp_ncpus--;
 			continue;
 		}
 
 		cpu++;
 
 		/* save APIC ID for this logical ID */
 		cpu_apic_ids[cpu] = apic_id;
 
 		/* first page of AP's private space */
 		pg = cpu * i386_btop(sizeof(struct privatespace));
 
 		/* allocate a new private data page */
 		pc = (struct pcpu *)kmem_alloc(kernel_map, PAGE_SIZE);
 
 		/* wire it into the private page table page */
 		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(pc));
 
 		/* allocate and set up an idle stack data page */
 		stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); /* XXXKSE */
 		for (i = 0; i < KSTACK_PAGES; i++)
 			SMPpt[pg + 1 + i] = (pt_entry_t)
 			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
 
 		/* prime data page for it to use */
 		pcpu_init(pc, cpu, sizeof(struct pcpu));
 		pc->pc_apic_id = apic_id;
 
 		/* setup a vector to our boot code */
 		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
 		*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
 #ifndef PC98
 		outb(CMOS_REG, BIOS_RESET);
 		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
 #endif
 
 		bootSTK = &SMP_prvspace[cpu].idlekstack[KSTACK_PAGES *
 		    PAGE_SIZE];
 		bootAP = cpu;
 
 		/* attempt to start the Application Processor */
 		CHECK_INIT(99);	/* setup checkpoints */
 		if (!start_ap(apic_id)) {
 			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
 			CHECK_PRINT("trace");	/* show checkpoints */
 			/* better panic as the AP may be running loose */
 			printf("panic y/n? [y] ");
 			if (cngetc() != 'n')
 				panic("bye-bye");
 		}
 		CHECK_PRINT("trace");		/* show checkpoints */
 
 		all_cpus |= (1 << cpu);		/* record AP in CPU map */
 	}
 
 	/* build our map of 'other' CPUs */
 	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
 
 	/* restore the warmstart vector */
 	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
 
 #ifndef PC98
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, mpbiosreason);
 #endif
 
 	/*
 	 * Set up the idle context for the BSP.  Similar to above except
 	 * that some was done by locore, some by pmap.c and some is implicit
 	 * because the BSP is cpu#0 and the page is initially zero and also
 	 * because we can refer to variables by name on the BSP..
 	 */
 
 	/* Allocate and setup BSP idle stack */
 	stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
 	for (i = 0; i < KSTACK_PAGES; i++)
 		SMPpt[1 + i] = (pt_entry_t)
 		    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
 
 	for (i = 0; i < NKPT; i++)
 		PTD[i] = 0;
 	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
 
 	/* number of APs actually started */
 	return mp_naps;
 }
 
 /*
  * load the 1st level AP boot code into base memory.
  */
 
 /* targets for relocation */
 extern void bigJump(void);
 extern void bootCodeSeg(void);
 extern void bootDataSeg(void);
 extern void MPentry(void);
 extern u_int MP_GDT;
 extern u_int mp_gdtbase;
 
 static void
 install_ap_tramp(void)
 {
 	int     x;
 	int     size = *(int *) ((u_long) & bootMP_size);
 	vm_offset_t va = boot_address + KERNBASE;
 	u_char *src = (u_char *) ((u_long) bootMP);
 	u_char *dst = (u_char *) va;
 	u_int   boot_base = (u_int) bootMP;
 	u_int8_t *dst8;
 	u_int16_t *dst16;
 	u_int32_t *dst32;
 
 	KASSERT (size <= PAGE_SIZE,
 	    ("'size' do not fit into PAGE_SIZE, as expected."));
 	pmap_kenter(va, boot_address);
 	pmap_invalidate_page (kernel_pmap, va);
 	for (x = 0; x < size; ++x)
 		*dst++ = *src++;
 
 	/*
 	 * modify addresses in code we just moved to basemem. unfortunately we
 	 * need fairly detailed info about mpboot.s for this to work.  changes
 	 * to mpboot.s might require changes here.
 	 */
 
 	/* boot code is located in KERNEL space */
 	dst = (u_char *) va;
 
 	/* modify the lgdt arg */
 	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
 	*dst32 = boot_address + ((u_int) & MP_GDT - boot_base);
 
 	/* modify the ljmp target for MPentry() */
 	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
 	*dst32 = ((u_int) MPentry - KERNBASE);
 
 	/* modify the target for boot code segment */
 	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
 	dst8 = (u_int8_t *) (dst16 + 1);
 	*dst16 = (u_int) boot_address & 0xffff;
 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
 
 	/* modify the target for boot data segment */
 	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
 	dst8 = (u_int8_t *) (dst16 + 1);
 	*dst16 = (u_int) boot_address & 0xffff;
 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
 }
 
 /*
  * This function starts the AP (application processor) identified
  * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
  * to accomplish this.  This is necessary because of the nuances
  * of the different hardware we might encounter.  It isn't pretty,
  * but it seems to work.
  */
 static int
 start_ap(int apic_id)
 {
 	int vector, ms;
 	int cpus;
 
 	/* calculate the vector */
 	vector = (boot_address >> 12) & 0xff;
 
 	/* used as a watchpoint to signal AP startup */
 	cpus = mp_naps;
 
 	/*
 	 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
 	 * and running the target CPU. OR this INIT IPI might be latched (P5
 	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
 	 * ignored.
 	 */
 
 	/* do an INIT IPI: assert RESET */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
 
 	/* wait for pending status end */
 	lapic_ipi_wait(-1);
 
 	/* do an INIT IPI: deassert RESET */
 	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
 
 	/* wait for pending status end */
 	DELAY(10000);		/* wait ~10mS */
 	lapic_ipi_wait(-1);
 
 	/*
 	 * next we do a STARTUP IPI: the previous INIT IPI might still be
 	 * latched, (P5 bug) this 1st STARTUP would then terminate
 	 * immediately, and the previously started INIT IPI would continue. OR
 	 * the previous INIT IPI has already run. and this STARTUP IPI will
 	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
 	 * will run.
 	 */
 
 	/* do a STARTUP IPI */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 	    vector, apic_id);
 	lapic_ipi_wait(-1);
 	DELAY(200);		/* wait ~200uS */
 
 	/*
 	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
 	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
 	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
 	 * recognized after hardware RESET or INIT IPI.
 	 */
 
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 	    vector, apic_id);
 	lapic_ipi_wait(-1);
 	DELAY(200);		/* wait ~200uS */
 
 	/* Wait up to 5 seconds for it to start. */
 	for (ms = 0; ms < 5000; ms++) {
 		if (mp_naps > cpus)
 			return 1;	/* return SUCCESS */
 		DELAY(1000);
 	}
 	return 0;		/* return FAILURE */
 }
 
 #ifdef COUNT_XINVLTLB_HITS
 u_int xhits_gbl[MAXCPU];
 u_int xhits_pg[MAXCPU];
 u_int xhits_rng[MAXCPU];
 SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
     sizeof(xhits_gbl), "IU", "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
     sizeof(xhits_pg), "IU", "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
     sizeof(xhits_rng), "IU", "");
 
 u_int ipi_global;
 u_int ipi_page;
 u_int ipi_range;
 u_int ipi_range_size;
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
     0, "");
 
 u_int ipi_masked_global;
 u_int ipi_masked_page;
 u_int ipi_masked_range;
 u_int ipi_masked_range_size;
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
     &ipi_masked_global, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
     &ipi_masked_page, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
     &ipi_masked_range, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
     &ipi_masked_range_size, 0, "");
 #endif /* COUNT_XINVLTLB_HITS */
 
 /*
  * Flush the TLB on all other CPU's
  */
 static void
 smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
 {
 	u_int ncpu;
 
 	ncpu = mp_ncpus - 1;	/* does not shootdown self */
 	if (ncpu < 1)
 		return;		/* no other cpus */
 	mtx_assert(&smp_ipi_mtx, MA_OWNED);
 	smp_tlb_addr1 = addr1;
 	smp_tlb_addr2 = addr2;
 	atomic_store_rel_int(&smp_tlb_wait, 0);
 	ipi_all_but_self(vector);
 	while (smp_tlb_wait < ncpu)
 		ia32_pause();
 }
 
 static void
 smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
 {
 	int ncpu, othercpus;
 
 	othercpus = mp_ncpus - 1;
 	if (mask == (u_int)-1) {
 		ncpu = othercpus;
 		if (ncpu < 1)
 			return;
 	} else {
 		mask &= ~PCPU_GET(cpumask);
 		if (mask == 0)
 			return;
 		ncpu = bitcount32(mask);
 		if (ncpu > othercpus) {
 			/* XXX this should be a panic offence */
 			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
 			    ncpu, othercpus);
 			ncpu = othercpus;
 		}
 		/* XXX should be a panic, implied by mask == 0 above */
 		if (ncpu < 1)
 			return;
 	}
 	mtx_assert(&smp_ipi_mtx, MA_OWNED);
 	smp_tlb_addr1 = addr1;
 	smp_tlb_addr2 = addr2;
 	atomic_store_rel_int(&smp_tlb_wait, 0);
 	if (mask == (u_int)-1)
 		ipi_all_but_self(vector);
 	else
 		ipi_selected(mask, vector);
 	while (smp_tlb_wait < ncpu)
 		ia32_pause();
 }
 
 void
 smp_invltlb(void)
 {
 
 	if (smp_started) {
 		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_global++;
 #endif
 	}
 }
 
 void
 smp_invlpg(vm_offset_t addr)
 {
 
 	if (smp_started) {
 		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_page++;
 #endif
 	}
 }
 
 void
 smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
 {
 
 	if (smp_started) {
 		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_range++;
 		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
 #endif
 	}
 }
 
 void
 smp_masked_invltlb(u_int mask)
 {
 
 	if (smp_started) {
 		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_masked_global++;
 #endif
 	}
 }
 
 void
 smp_masked_invlpg(u_int mask, vm_offset_t addr)
 {
 
 	if (smp_started) {
 		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_masked_page++;
 #endif
 	}
 }
 
 void
 smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
 {
 
 	if (smp_started) {
 		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_masked_range++;
 		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
 #endif
 	}
 }
 
 void
 ipi_bitmap_handler(struct clockframe frame)
 {
 	int cpu = PCPU_GET(cpuid);
 	u_int ipi_bitmap;
 
 	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
 
 #ifdef IPI_PREEMPTION
 	if (ipi_bitmap & IPI_PREEMPT) {
 #ifdef COUNT_IPIS
 		*ipi_preempt_counts[cpu]++;
 #endif
 		mtx_lock_spin(&sched_lock);
 		/* Don't preempt the idle thread */
 		if (curthread->td_priority <  PRI_MIN_IDLE) {
 			struct thread *running_thread = curthread;
 			if (running_thread->td_critnest > 1) 
 				running_thread->td_owepreempt = 1;
 			else 		
 				mi_switch(SW_INVOL | SW_PREEMPT, NULL);
 		}
 		mtx_unlock_spin(&sched_lock);
 	}
 #endif
 
 	if (ipi_bitmap & IPI_AST) {
 #ifdef COUNT_IPIS
 		*ipi_ast_counts[cpu]++;
 #endif
 		/* Nothing to do for AST */
 	}
 }
 
 /*
  * send an IPI to a set of cpus.
  */
 void
 ipi_selected(u_int32_t cpus, u_int ipi)
 {
 	int cpu;
 	u_int bitmap = 0;
 	u_int old_pending;
 	u_int new_pending;
 
 	if (IPI_IS_BITMAPED(ipi)) { 
 		bitmap = 1 << ipi;
 		ipi = IPI_BITMAP_VECTOR;
 	}
 
 #ifdef STOP_NMI
 	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
 		ipi_nmi_selected(cpus);
 		return;
 	}
 #endif
 	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
 	while ((cpu = ffs(cpus)) != 0) {
 		cpu--;
 		cpus &= ~(1 << cpu);
 
 		KASSERT(cpu_apic_ids[cpu] != -1,
 		    ("IPI to non-existent CPU %d", cpu));
 
 		if (bitmap) {
 			do {
 				old_pending = cpu_ipi_pending[cpu];
 				new_pending = old_pending | bitmap;
 			} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending));	
 
 			if (old_pending)
 				continue;
 		}
 
 		lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
 	}
 
 }
 
 /*
  * send an IPI INTerrupt containing 'vector' to all CPUs, including myself
  */
 void
 ipi_all(u_int ipi)
 {
 
 	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
 		ipi_selected(all_cpus, ipi);
 		return;
 	}
 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 	lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL);
 }
 
 /*
  * send an IPI to all CPUs EXCEPT myself
  */
 void
 ipi_all_but_self(u_int ipi)
 {
 
 	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
 		ipi_selected(PCPU_GET(other_cpus), ipi);
 		return;
 	}
 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
 }
 
 /*
  * send an IPI to myself
  */
 void
 ipi_self(u_int ipi)
 {
 
 	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
 		ipi_selected(PCPU_GET(cpumask), ipi);
 		return;
 	}
 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 	lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF);
 }
 
 #ifdef STOP_NMI
 /*
  * send NMI IPI to selected CPUs
  */
 
 #define	BEFORE_SPIN	1000000
 
 void
 ipi_nmi_selected(u_int32_t cpus)
 {
 	int cpu;
 	register_t icrlo;
 
 	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT 
 		| APIC_TRIGMOD_EDGE; 
 	
 	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);
 
 	atomic_set_int(&ipi_nmi_pending, cpus);
 
 	while ((cpu = ffs(cpus)) != 0) {
 		cpu--;
 		cpus &= ~(1 << cpu);
 
 		KASSERT(cpu_apic_ids[cpu] != -1,
 		    ("IPI NMI to non-existent CPU %d", cpu));
 		
 		/* Wait for an earlier IPI to finish. */
 		if (!lapic_ipi_wait(BEFORE_SPIN))
 			panic("ipi_nmi_selected: previous IPI has not cleared");
 
 		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
 	}
 }
 
 int
-ipi_nmi_handler()
+ipi_nmi_handler(void)
 {
-	int cpu = PCPU_GET(cpuid);
 	int cpumask = PCPU_GET(cpumask);
 
 	if (!(ipi_nmi_pending & cpumask))
 		return 1;
 
 	atomic_clear_int(&ipi_nmi_pending, cpumask);
+	cpustop_handler();
+	return 0;
+}
 
+#endif /* STOP_NMI */
+
+/*
+ * Handle an IPI_STOP by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpustop_handler(void)
+{
+	int cpu = PCPU_GET(cpuid);
+	int cpumask = PCPU_GET(cpumask);
+
 	savectx(&stoppcbs[cpu]);
 
 	/* Indicate that we are stopped */
 	atomic_set_int(&stopped_cpus, cpumask);
 
 	/* Wait for restart */
 	while (!(started_cpus & cpumask))
 	    ia32_pause();
 
 	atomic_clear_int(&started_cpus, cpumask);
 	atomic_clear_int(&stopped_cpus, cpumask);
 
 	if (cpu == 0 && cpustop_restartfunc != NULL) {
 		cpustop_restartfunc();
 		cpustop_restartfunc = NULL;
 	}
-
-	return 0;
 }
-     
-#endif /* STOP_NMI */
 
 /*
  * This is called once the rest of the system is up and running and we're
  * ready to let the AP's out of the pen.
  */
 static void
 release_aps(void *dummy __unused)
 {
 
 	if (mp_ncpus == 1) 
 		return;
 	mtx_lock_spin(&sched_lock);
 	atomic_store_rel_int(&aps_ready, 1);
 	while (smp_started == 0)
 		ia32_pause();
 	mtx_unlock_spin(&sched_lock);
 }
 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
 
 static int
 sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
 {
 	u_int mask;
 	int error;
 
 	mask = hlt_cpus_mask;
 	error = sysctl_handle_int(oidp, &mask, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (logical_cpus_mask != 0 &&
 	    (mask & logical_cpus_mask) == logical_cpus_mask)
 		hlt_logical_cpus = 1;
 	else
 		hlt_logical_cpus = 0;
 
 	if (! hyperthreading_allowed)
 		mask |= hyperthreading_cpus_mask;
 
 	if ((mask & all_cpus) == all_cpus)
 		mask &= ~(1<<0);
 	hlt_cpus_mask = mask;
 	return (error);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_hlt_cpus, "IU",
     "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");
 
 static int
 sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
 {
 	int disable, error;
 
 	disable = hlt_logical_cpus;
 	error = sysctl_handle_int(oidp, &disable, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (disable)
 		hlt_cpus_mask |= logical_cpus_mask;
 	else
 		hlt_cpus_mask &= ~logical_cpus_mask;
 
 	if (! hyperthreading_allowed)
 		hlt_cpus_mask |= hyperthreading_cpus_mask;
 
 	if ((hlt_cpus_mask & all_cpus) == all_cpus)
 		hlt_cpus_mask &= ~(1<<0);
 
 	hlt_logical_cpus = disable;
 	return (error);
 }
 
 static int
 sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
 {
 	int allowed, error;
 
 	allowed = hyperthreading_allowed;
 	error = sysctl_handle_int(oidp, &allowed, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (allowed)
 		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
 	else
 		hlt_cpus_mask |= hyperthreading_cpus_mask;
 
 	if (logical_cpus_mask != 0 &&
 	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
 		hlt_logical_cpus = 1;
 	else
 		hlt_logical_cpus = 0;
 
 	if ((hlt_cpus_mask & all_cpus) == all_cpus)
 		hlt_cpus_mask &= ~(1<<0);
 
 	hyperthreading_allowed = allowed;
 	return (error);
 }
 
 static void
 cpu_hlt_setup(void *dummy __unused)
 {
 
 	if (logical_cpus_mask != 0) {
 		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
 		    &hlt_logical_cpus);
 		sysctl_ctx_init(&logical_cpu_clist);
 		SYSCTL_ADD_PROC(&logical_cpu_clist,
 		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
 		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
 		    sysctl_hlt_logical_cpus, "IU", "");
 		SYSCTL_ADD_UINT(&logical_cpu_clist,
 		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
 		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
 		    &logical_cpus_mask, 0, "");
 
 		if (hlt_logical_cpus)
 			hlt_cpus_mask |= logical_cpus_mask;
 
 		/*
 		 * If necessary for security purposes, force
 		 * hyperthreading off, regardless of the value
 		 * of hlt_logical_cpus.
 		 */
 		if (hyperthreading_cpus_mask) {
 			TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
 			    &hyperthreading_allowed);
 			SYSCTL_ADD_PROC(&logical_cpu_clist,
 			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
 			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
 			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
 			if (! hyperthreading_allowed)
 				hlt_cpus_mask |= hyperthreading_cpus_mask;
 		}
 	}
 }
 SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
 
 int
 mp_grab_cpu_hlt(void)
 {
 	u_int mask = PCPU_GET(cpumask);
 #ifdef MP_WATCHDOG
 	u_int cpuid = PCPU_GET(cpuid);
 #endif
 	int retval;
 
 #ifdef MP_WATCHDOG
 	ap_watchdog(cpuid);
 #endif
 
 	retval = mask & hlt_cpus_mask;
 	while (mask & hlt_cpus_mask)
 		__asm __volatile("sti; hlt" : : : "memory");
 	return (retval);
 }
 
 #ifdef COUNT_IPIS
 /*
  * Setup interrupt counters for IPI handlers.
  */
 static void
 mp_ipi_intrcnt(void *dummy)
 {
 	char buf[64];
 	int i;
 
 	for (i = 0; i < mp_maxid; i++) {
 		if (CPU_ABSENT(i))
 			continue;
 		snprintf(buf, sizeof(buf), "cpu%d: invltlb", i);
 		intrcnt_add(buf, &ipi_invltlb_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d: invlrng", i);
 		intrcnt_add(buf, &ipi_invlrng_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d: invlpg", i);
 		intrcnt_add(buf, &ipi_invlpg_counts[i]);
 #ifdef IPI_PREEMPTION
 		snprintf(buf, sizeof(buf), "cpu%d: preempt", i);
 		intrcnt_add(buf, &ipi_preempt_counts[i]);
 #endif
 		snprintf(buf, sizeof(buf), "cpu%d: ast", i);
 		intrcnt_add(buf, &ipi_ast_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d: rendezvous", i);
 		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d: lazypmap", i);
 		intrcnt_add(buf, &ipi_lazypmap_counts[i]);
 	}		
 }
 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL)
 #endif
Index: head/sys/i386/i386/pmap.c
===================================================================
--- head/sys/i386/i386/pmap.c	(revision 153140)
+++ head/sys/i386/i386/pmap.c	(revision 153141)
@@ -1,3151 +1,3155 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	In addition to hardware address maps, this
  *	module is called upon to provide software-use-only
  *	maps which may or may not be stored in the same
  *	form as hardware maps.  These pseudo-maps are
  *	used to store intermediate results from copy
  *	operations to and from address spaces.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include "opt_cpu.h"
 #include "opt_pmap.h"
 #include "opt_msgbuf.h"
+#include "opt_smp.h"
 #include "opt_xbox.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/uma.h>
 
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 
 #ifdef XBOX
 #include <machine/xbox.h>
 #endif
 
 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 #define CPU_ENABLE_SSE
 #endif
 
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC 200
 #endif
 
 #if defined(DIAGNOSTIC)
 #define PMAP_DIAGNOSTIC
 #endif
 
 #if !defined(PMAP_DIAGNOSTIC)
 #define PMAP_INLINE __inline
 #else
 #define PMAP_INLINE
 #endif
 
 /*
  * Get PDEs and PTEs for user/kernel address space
  */
 #define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
 
 #define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
 #define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
 #define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
 #define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
 #define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
 
 #define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
     atomic_clear_int((u_int *)(pte), PG_W))
 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
 
 struct pmap kernel_pmap_store;
 LIST_HEAD(pmaplist, pmap);
 static struct pmaplist allpmaps;
 static struct mtx allpmaps_lock;
 
 vm_paddr_t avail_end;	/* PA of last available physical page */
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 int pgeflag = 0;		/* PG_G or-in */
 int pseflag = 0;		/* PG_PS or-in */
 
 static int nkpt;
 vm_offset_t kernel_vm_end;
 extern u_int32_t KERNend;
 
 #ifdef PAE
 static uma_zone_t pdptzone;
 #endif
 
 /*
  * Data for the pv entry allocation mechanism
  */
 static uma_zone_t pvzone;
 static struct vm_object pvzone_obj;
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 
 /*
  * All those kernel PT submaps that BSD is so fond of
  */
 struct sysmaps {
 	struct	mtx lock;
 	pt_entry_t *CMAP1;
 	pt_entry_t *CMAP2;
 	caddr_t	CADDR1;
 	caddr_t	CADDR2;
 };
 static struct sysmaps sysmaps_pcpu[MAXCPU];
 pt_entry_t *CMAP1 = 0;
 static pt_entry_t *CMAP3;
 caddr_t CADDR1 = 0, ptvmmap = 0;
 static caddr_t CADDR3;
 struct msgbuf *msgbufp = 0;
 
 /*
  * Crashdump maps.
  */
 static caddr_t crashdumpmap;
 
 #ifdef SMP
 extern pt_entry_t *SMPpt;
 #endif
 static pt_entry_t *PMAP1 = 0, *PMAP2;
 static pt_entry_t *PADDR1 = 0, *PADDR2;
 #ifdef SMP
 static int PMAP1cpu;
 static int PMAP1changedcpu;
 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 
 	   &PMAP1changedcpu, 0,
 	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
 #endif
 static int PMAP1changed;
 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 
 	   &PMAP1changed, 0,
 	   "Number of times pmap_pte_quick changed PMAP1");
 static int PMAP1unchanged;
 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 
 	   &PMAP1unchanged, 0,
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
 static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t locked_pmap);
 static void	pmap_clear_ptes(vm_page_t m, int bit);
 
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
 					vm_offset_t va);
 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m);
 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
 static void pmap_pte_release(pt_entry_t *pte);
 static int pmap_unuse_pt(pmap_t, vm_offset_t);
 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
 #ifdef PAE
 static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
 #endif
 
 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
 
 /*
  * Move the kernel virtual free pointer to the next
  * 4MB.  This is used to help improve performance
  * by using a large (4MB) page for much of the kernel
  * (.text, .data, .bss)
  */
 static vm_offset_t
 pmap_kmem_choose(vm_offset_t addr)
 {
 	vm_offset_t newaddr = addr;
 
 #ifndef DISABLE_PSE
 	if (cpu_feature & CPUID_PSE)
 		newaddr = (addr + PDRMASK) & ~PDRMASK;
 #endif
 	return newaddr;
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On the i386 this is called after mapping has already been enabled
  *	and just syncs the pmap module with what has already been done.
  *	[We can't call it easily with mapping off since the kernel is not
  *	mapped with PA == VA, hence we would have to relocate every address
  *	from the linked base (virtual) address "KERNBASE" to the actual
  *	(physical) address starting relative to 0]
  */
 void
 pmap_bootstrap(firstaddr, loadaddr)
 	vm_paddr_t firstaddr;
 	vm_paddr_t loadaddr;
 {
 	vm_offset_t va;
 	pt_entry_t *pte, *unused;
 	struct sysmaps *sysmaps;
 	int i;
 
 	/*
 	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
 	 * large. It should instead be correctly calculated in locore.s and
 	 * not based on 'first' (which is a physical address, not a virtual
 	 * address, for the start of unused physical memory). The kernel
 	 * page tables are NOT double mapped and thus should not be included
 	 * in this calculation.
 	 */
 	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
 	virtual_avail = pmap_kmem_choose(virtual_avail);
 
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
 #ifdef PAE
 	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
 #endif
 	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvlist);
 	LIST_INIT(&allpmaps);
 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 	nkpt = NKPT;
 
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 */
 #define	SYSMAP(c, p, v, n)	\
 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 
 	va = virtual_avail;
 	pte = vtopte(va);
 
 	/*
 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
 	 * CMAP3 is used for the idle process page zeroing.
 	 */
 	for (i = 0; i < MAXCPU; i++) {
 		sysmaps = &sysmaps_pcpu[i];
 		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
 		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
 		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
 	}
 	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
 	*CMAP3 = 0;
 
 	/*
 	 * Crashdump maps.
 	 */
 	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
 
 	/*
 	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
 	 */
 	SYSMAP(caddr_t, unused, ptvmmap, 1)
 
 	/*
 	 * msgbufp is used to map the system message buffer.
 	 */
 	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
 
 	/*
 	 * ptemap is used for pmap_pte_quick
 	 */
 	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
 	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
 
 	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
 
 	virtual_avail = va;
 
 	*CMAP1 = 0;
 
 #ifdef XBOX
 	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
 	 * an early stadium, we cannot yet neatly map video memory ... :-(
 	 * Better fixes are very welcome! */
 	if (!arch_i386_is_xbox)
 #endif
 	for (i = 0; i < NKPT; i++)
 		PTD[i] = 0;
 
 	/* Turn on PG_G on kernel page(s) */
 	pmap_set_pg();
 }
 
 /*
  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
  */
 void
 pmap_set_pg(void)
 {
 	pd_entry_t pdir;
 	pt_entry_t *pte;
 	vm_offset_t va, endva;
 	int i; 
 
 	if (pgeflag == 0)
 		return;
 
 	i = KERNLOAD/NBPDR;
 	endva = KERNBASE + KERNend;
 
 	if (pseflag) {
 		va = KERNBASE + KERNLOAD;
 		while (va  < endva) {
 			pdir = kernel_pmap->pm_pdir[KPTDI+i];
 			pdir |= pgeflag;
 			kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
 			invltlb();	/* Play it safe, invltlb() every time */
 			i++;
 			va += NBPDR;
 		}
 	} else {
 		va = (vm_offset_t)btext;
 		while (va < endva) {
 			pte = vtopte(va);
 			if (*pte)
 				*pte |= pgeflag;
 			invltlb();	/* Play it safe, invltlb() every time */
 			va += PAGE_SIZE;
 		}
 	}
 }
 
 /*
  * Initialize a vm_page's machine-dependent fields.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	TAILQ_INIT(&m->md.pv_list);
 	m->md.pv_list_count = 0;
 }
 
 #ifdef PAE
 
 static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");
 
 static void *
 pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
 {
 	*flags = UMA_SLAB_PRIV;
 	return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
 	    1, 0));
 }
 #endif
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
  */
 void
 pmap_init(void)
 {
 	int shpgperproc = PMAP_SHPGPERPROC;
 
 	/*
 	 * Initialize the address space (zone) for the pv entries.  Set a
 	 * high water mark so that the system can recover from excessive
 	 * numbers of pv entries.
 	 */
 	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, 
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
 
 #ifdef PAE
 	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
 	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
 #endif
 }
 
 
 /***************************************************
  * Low level helper routines.....
  ***************************************************/
 
 
 /*
  * this routine defines the region(s) of memory that should
  * not be tested for the modified bit.
  */
 static PMAP_INLINE int
 pmap_track_modified(vm_offset_t va)
 {
 	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) 
 		return 1;
 	else
 		return 0;
 }
 
 #ifdef SMP
 /*
  * For SMP, these functions have to use the IPI mechanism for coherence.
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	u_int cpumask;
 	u_int other_cpus;
 
 	if (smp_started) {
 		if (!(read_eflags() & PSL_I))
 			panic("%s: interrupts disabled", __func__);
 		mtx_lock_spin(&smp_ipi_mtx);
 	} else
 		critical_enter();
 	/*
 	 * We need to disable interrupt preemption but MUST NOT have
 	 * interrupts disabled here.
 	 * XXX we may need to hold schedlock to get a coherent pm_active
 	 * XXX critical sections disable interrupts again
 	 */
 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 		invlpg(va);
 		smp_invlpg(va);
 	} else {
 		cpumask = PCPU_GET(cpumask);
 		other_cpus = PCPU_GET(other_cpus);
 		if (pmap->pm_active & cpumask)
 			invlpg(va);
 		if (pmap->pm_active & other_cpus)
 			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
 	}
 	if (smp_started)
 		mtx_unlock_spin(&smp_ipi_mtx);
 	else
 		critical_exit();
 }
 
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	u_int cpumask;
 	u_int other_cpus;
 	vm_offset_t addr;
 
 	if (smp_started) {
 		if (!(read_eflags() & PSL_I))
 			panic("%s: interrupts disabled", __func__);
 		mtx_lock_spin(&smp_ipi_mtx);
 	} else
 		critical_enter();
 	/*
 	 * We need to disable interrupt preemption but MUST NOT have
 	 * interrupts disabled here.
 	 * XXX we may need to hold schedlock to get a coherent pm_active
 	 * XXX critical sections disable interrupts again
 	 */
 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
 		smp_invlpg_range(sva, eva);
 	} else {
 		cpumask = PCPU_GET(cpumask);
 		other_cpus = PCPU_GET(other_cpus);
 		if (pmap->pm_active & cpumask)
 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
 				invlpg(addr);
 		if (pmap->pm_active & other_cpus)
 			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
 			    sva, eva);
 	}
 	if (smp_started)
 		mtx_unlock_spin(&smp_ipi_mtx);
 	else
 		critical_exit();
 }
 
 void
 pmap_invalidate_all(pmap_t pmap)
 {
 	u_int cpumask;
 	u_int other_cpus;
 
 	if (smp_started) {
 		if (!(read_eflags() & PSL_I))
 			panic("%s: interrupts disabled", __func__);
 		mtx_lock_spin(&smp_ipi_mtx);
 	} else
 		critical_enter();
 	/*
 	 * We need to disable interrupt preemption but MUST NOT have
 	 * interrupts disabled here.
 	 * XXX we may need to hold schedlock to get a coherent pm_active
 	 * XXX critical sections disable interrupts again
 	 */
 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 		invltlb();
 		smp_invltlb();
 	} else {
 		cpumask = PCPU_GET(cpumask);
 		other_cpus = PCPU_GET(other_cpus);
 		if (pmap->pm_active & cpumask)
 			invltlb();
 		if (pmap->pm_active & other_cpus)
 			smp_masked_invltlb(pmap->pm_active & other_cpus);
 	}
 	if (smp_started)
 		mtx_unlock_spin(&smp_ipi_mtx);
 	else
 		critical_exit();
 }
 #else /* !SMP */
 /*
  * Normal, non-SMP, 486+ invalidation functions.
  * We inline these within pmap.c for speed.
  */
 PMAP_INLINE void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	if (pmap == kernel_pmap || pmap->pm_active)
 		invlpg(va);
 }
 
 PMAP_INLINE void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t addr;
 
 	if (pmap == kernel_pmap || pmap->pm_active)
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
 }
 
 PMAP_INLINE void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	if (pmap == kernel_pmap || pmap->pm_active)
 		invltlb();
 }
 #endif /* !SMP */
 
 /*
  * Are we current address space or kernel?  N.B. We return FALSE when
  * a pmap's page table is in use because a kernel thread is borrowing
  * it.  The borrowed page table can change spontaneously, making any
  * dependence on its continued use subject to a race condition.
  */
 static __inline int
 pmap_is_current(pmap_t pmap)
 {
 
 	return (pmap == kernel_pmap ||
 		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
 	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
 }
 
 /*
  * If the given pmap is not the current or kernel pmap, the returned pte must
  * be released by passing it to pmap_pte_release().
  */
 pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t newpf;
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	if (*pde & PG_PS)
 		return (pde);
 	if (*pde != 0) {
 		/* are we current address space or kernel? */
 		if (pmap_is_current(pmap))
 			return (vtopte(va));
 		mtx_lock(&PMAP2mutex);
 		newpf = *pde & PG_FRAME;
 		if ((*PMAP2 & PG_FRAME) != newpf) {
 			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
 		}
 		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
 	}
 	return (0);
 }
 
 /*
  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
  * being NULL.
  */
 static __inline void
 pmap_pte_release(pt_entry_t *pte)
 {
 
 	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
 		mtx_unlock(&PMAP2mutex);
 }
 
 static __inline void
 invlcaddr(void *caddr)
 {
 
 	invlpg((u_int)caddr);
 }
 
 /*
  * Super fast pmap_pte routine best used when scanning
  * the pv lists.  This eliminates many coarse-grained
  * invltlb calls.  Note that many of the pv list
  * scans are across different pmaps.  It is very wasteful
  * to do an entire invltlb for checking a single mapping.
  *
  * If the given pmap is not the current pmap, vm_page_queue_mtx
  * must be held and curthread pinned to a CPU.
  */
 static pt_entry_t *
 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t newpf;
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	if (*pde & PG_PS)
 		return (pde);
 	if (*pde != 0) {
 		/* are we current address space or kernel? */
 		if (pmap_is_current(pmap))
 			return (vtopte(va));
 		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 		newpf = *pde & PG_FRAME;
 		if ((*PMAP1 & PG_FRAME) != newpf) {
 			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
 #ifdef SMP
 			PMAP1cpu = PCPU_GET(cpuid);
 #endif
 			invlcaddr(PADDR1);
 			PMAP1changed++;
 		} else
 #ifdef SMP
 		if (PMAP1cpu != PCPU_GET(cpuid)) {
 			PMAP1cpu = PCPU_GET(cpuid);
 			invlcaddr(PADDR1);
 			PMAP1changedcpu++;
 		} else
 #endif
 			PMAP1unchanged++;
 		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
 	}
 	return (0);
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Extract the physical page address associated
  *		with the given map/virtual_address pair.
  */
 vm_paddr_t 
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	vm_paddr_t rtval;
 	pt_entry_t *pte;
 	pd_entry_t pde;
 
 	rtval = 0;
 	PMAP_LOCK(pmap);
 	pde = pmap->pm_pdir[va >> PDRSHIFT];
 	if (pde != 0) {
 		if ((pde & PG_PS) != 0) {
 			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
 			PMAP_UNLOCK(pmap);
 			return rtval;
 		}
 		pte = pmap_pte(pmap, va);
 		rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
 		pmap_pte_release(pte);
 	}
 	PMAP_UNLOCK(pmap);
 	return (rtval);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pd_entry_t pde;
 	pt_entry_t pte;
 	vm_page_t m;
 
 	m = NULL;
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	pde = *pmap_pde(pmap, va);
 	if (pde != 0) {
 		if (pde & PG_PS) {
 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 				m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
 				    (va & PDRMASK));
 				vm_page_hold(m);
 			}
 		} else {
 			sched_pin();
 			pte = *pmap_pte_quick(pmap, va);
 			if (pte != 0 &&
 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 				vm_page_hold(m);
 			}
 			sched_unpin();
 		}
 	}
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Add a wired page to the kva.
  * Note: not SMP coherent.
  */
 PMAP_INLINE void 
 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 	pt_entry_t *pte;
 
 	pte = vtopte(va);
 	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
 }
 
 /*
  * Remove a page from the kernel pagetables.
  * Note: not SMP coherent.
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 	pt_entry_t *pte;
 
 	pte = vtopte(va);
 	pte_clear(pte);
 }
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	The value passed in '*virt' is a suggested virtual address for
  *	the mapping. Architectures which can support a direct-mapped
  *	physical to virtual region can return the appropriate address
  *	within that region, leaving '*virt' unchanged. Other
  *	architectures should map the pages starting at '*virt' and
  *	update '*virt' with the first usable address after the mapped
  *	region.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 	vm_offset_t va, sva;
 
 	va = sva = *virt;
 	while (start < end) {
 		pmap_kenter(va, start);
 		va += PAGE_SIZE;
 		start += PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 	*virt = va;
 	return (sva);
 }
 
 
 /*
  * Add a list of wired pages to the kva
  * this routine is only used for temporary
  * kernel mappings that do not need to have
  * page modification or references recorded.
  * Note that old mappings are simply written
  * over.  The page *must* be wired.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
 {
 	vm_offset_t va;
 
 	va = sva;
 	while (count-- > 0) {
 		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
 		va += PAGE_SIZE;
 		m++;
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  * This routine tears out page mappings from the
  * kernel -- it is meant only for temporary mappings.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 
 	va = sva;
 	while (count-- > 0) {
 		pmap_kremove(va);
 		va += PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 
 /*
  * This routine unholds page table pages, and if the hold count
  * drops to zero, then it decrements the wire count.
  */
 static PMAP_INLINE int
 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
 {
 
 	--m->wire_count;
 	if (m->wire_count == 0)
 		return _pmap_unwire_pte_hold(pmap, m);
 	else
 		return 0;
 }
 
 static int 
 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
 {
 	vm_offset_t pteva;
 
 	/*
 	 * unmap the page table page
 	 */
 	pmap->pm_pdir[m->pindex] = 0;
 	--pmap->pm_stats.resident_count;
 
 	/*
 	 * Do an invltlb to make the invalidated mapping
 	 * take effect immediately.
 	 */
 	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
 	pmap_invalidate_page(pmap, pteva);
 
 	vm_page_free_zero(m);
 	atomic_subtract_int(&cnt.v_wire_count, 1);
 	return 1;
 }
 
 /*
  * After removing a page table entry, this routine is used to
  * conditionally free the page, and manage the hold/wire counts.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t ptepde;
 	vm_page_t mpte;
 
 	if (va >= VM_MAXUSER_ADDRESS)
 		return 0;
 	ptepde = *pmap_pde(pmap, va);
 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 	return pmap_unwire_pte_hold(pmap, mpte);
 }
 
 void
 pmap_pinit0(pmap)
 	struct pmap *pmap;
 {
 
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
 #ifdef PAE
 	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
 #endif
 	pmap->pm_active = 0;
 	PCPU_SET(curpmap, pmap);
 	TAILQ_INIT(&pmap->pm_pvlist);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  */
 void
 pmap_pinit(pmap)
 	register struct pmap *pmap;
 {
 	vm_page_t m, ptdpg[NPGPTD];
 	vm_paddr_t pa;
 	static int color;
 	int i;
 
 	PMAP_LOCK_INIT(pmap);
 
 	/*
 	 * No need to allocate page table space yet but we do need a valid
 	 * page directory table.
 	 */
 	if (pmap->pm_pdir == NULL) {
 		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
 		    NBPTD);
 #ifdef PAE
 		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
 		KASSERT(((vm_offset_t)pmap->pm_pdpt &
 		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
 		    ("pmap_pinit: pdpt misaligned"));
 		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
 		    ("pmap_pinit: pdpt above 4g"));
 #endif
 	}
 
 	/*
 	 * allocate the page directory page(s)
 	 */
 	for (i = 0; i < NPGPTD;) {
 		m = vm_page_alloc(NULL, color++,
 		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 		    VM_ALLOC_ZERO);
 		if (m == NULL)
 			VM_WAIT;
 		else {
 			ptdpg[i++] = m;
 		}
 	}
 
 	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
 
 	for (i = 0; i < NPGPTD; i++) {
 		if ((ptdpg[i]->flags & PG_ZERO) == 0)
 			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
 	}
 
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 	/* Wire in kernel global address entries. */
 	/* XXX copies current process, does not fill in MPPTDI */
 	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
 #ifdef SMP
 	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
 #endif
 
 	/* install self-referential address mapping entry(s) */
 	for (i = 0; i < NPGPTD; i++) {
 		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
 		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
 #ifdef PAE
 		pmap->pm_pdpt[i] = pa | PG_V;
 #endif
 	}
 
 	pmap->pm_active = 0;
 	TAILQ_INIT(&pmap->pm_pvlist);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 }
 
 /*
  * this routine is called if the page table page is not
  * mapped correctly.
  */
 static vm_page_t
 _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
 {
 	vm_paddr_t ptepa;
 	vm_page_t m;
 
 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (flags & M_WAITOK) {
 			PMAP_UNLOCK(pmap);
 			vm_page_unlock_queues();
 			VM_WAIT;
 			vm_page_lock_queues();
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	pmap->pm_stats.resident_count++;
 
 	ptepa = VM_PAGE_TO_PHYS(m);
 	pmap->pm_pdir[ptepindex] =
 		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
 
 	return m;
 }
 
 static vm_page_t
 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
 {
 	unsigned ptepindex;
 	pd_entry_t ptepa;
 	vm_page_t m;
 
 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 
 	/*
 	 * Calculate pagetable page index
 	 */
 	ptepindex = va >> PDRSHIFT;
 retry:
 	/*
 	 * Get the page directory entry
 	 */
 	ptepa = pmap->pm_pdir[ptepindex];
 
 	/*
 	 * This supports switching from a 4MB page to a
 	 * normal 4K page.
 	 */
 	if (ptepa & PG_PS) {
 		pmap->pm_pdir[ptepindex] = 0;
 		ptepa = 0;
 		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 		pmap_invalidate_all(kernel_pmap);
 	}
 
 	/*
 	 * If the page table page is mapped, we just increment the
 	 * hold count, and activate it.
 	 */
 	if (ptepa) {
 		m = PHYS_TO_VM_PAGE(ptepa);
 		m->wire_count++;
 	} else {
 		/*
 		 * Here if the pte page isn't mapped, or if it has
 		 * been deallocated. 
 		 */
 		m = _pmap_allocpte(pmap, ptepindex, flags);
 		if (m == NULL && (flags & M_WAITOK))
 			goto retry;
 	}
 	return (m);
 }
 
 
 /***************************************************
 * Pmap allocation/deallocation routines.
  ***************************************************/
 
 #ifdef SMP
 /*
  * Deal with a SMP shootdown of other users of the pmap that we are
  * trying to dispose of.  This can be a bit hairy.
  */
 static u_int *lazymask;
 static u_int lazyptd;
 static volatile u_int lazywait;
 
 void pmap_lazyfix_action(void);
 
 void
 pmap_lazyfix_action(void)
 {
 	u_int mymask = PCPU_GET(cpumask);
 
+#ifdef COUNT_IPIS
+	*ipi_lazypmap_counts[PCPU_GET(cpuid)]++;
+#endif
 	if (rcr3() == lazyptd)
 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
 	atomic_clear_int(lazymask, mymask);
 	atomic_store_rel_int(&lazywait, 1);
 }
 
 static void
 pmap_lazyfix_self(u_int mymask)
 {
 
 	if (rcr3() == lazyptd)
 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
 	atomic_clear_int(lazymask, mymask);
 }
 
 
 static void
 pmap_lazyfix(pmap_t pmap)
 {
 	u_int mymask;
 	u_int mask;
 	register u_int spins;
 
 	while ((mask = pmap->pm_active) != 0) {
 		spins = 50000000;
 		mask = mask & -mask;	/* Find least significant set bit */
 		mtx_lock_spin(&smp_ipi_mtx);
 #ifdef PAE
 		lazyptd = vtophys(pmap->pm_pdpt);
 #else
 		lazyptd = vtophys(pmap->pm_pdir);
 #endif
 		mymask = PCPU_GET(cpumask);
 		if (mask == mymask) {
 			lazymask = &pmap->pm_active;
 			pmap_lazyfix_self(mymask);
 		} else {
 			atomic_store_rel_int((u_int *)&lazymask,
 			    (u_int)&pmap->pm_active);
 			atomic_store_rel_int(&lazywait, 0);
 			ipi_selected(mask, IPI_LAZYPMAP);
 			while (lazywait == 0) {
 				ia32_pause();
 				if (--spins == 0)
 					break;
 			}
 		}
 		mtx_unlock_spin(&smp_ipi_mtx);
 		if (spins == 0)
 			printf("pmap_lazyfix: spun for 50000000\n");
 	}
 }
 
 #else	/* SMP */
 
 /*
  * Cleaning up on uniprocessor is easy.  For various reasons, we're
  * unlikely to have to even execute this code, including the fact
  * that the cleanup is deferred until the parent does a wait(2), which
  * means that another userland process has run.
  */
 static void
 pmap_lazyfix(pmap_t pmap)
 {
 	u_int cr3;
 
 	cr3 = vtophys(pmap->pm_pdir);
 	if (cr3 == rcr3()) {
 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
 		pmap->pm_active &= ~(PCPU_GET(cpumask));
 	}
 }
 #endif	/* SMP */
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	vm_page_t m, ptdpg[NPGPTD];
 	int i;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 
 	pmap_lazyfix(pmap);
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_REMOVE(pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 
 	for (i = 0; i < NPGPTD; i++)
 		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);
 
 	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
 	    sizeof(*pmap->pm_pdir));
 #ifdef SMP
 	pmap->pm_pdir[MPPTDI] = 0;
 #endif
 
 	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
 
 	vm_page_lock_queues();
 	for (i = 0; i < NPGPTD; i++) {
 		m = ptdpg[i];
 #ifdef PAE
 		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
 		    ("pmap_release: got wrong ptd page"));
 #endif
 		m->wire_count--;
 		atomic_subtract_int(&cnt.v_wire_count, 1);
 		vm_page_free_zero(m);
 	}
 	vm_page_unlock_queues();
 	PMAP_LOCK_DESTROY(pmap);
 }
 
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
 
 	return sysctl_handle_long(oidp, &ksize, 0, req);
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_size, "IU", "Size of KVM");
 
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 
 	return sysctl_handle_long(oidp, &kfree, 0, req);
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_free, "IU", "Amount of KVM free");
 
 /*
  * grow the number of kernel page table entries, if needed
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	struct pmap *pmap;
 	vm_paddr_t ptppaddr;
 	vm_page_t nkpg;
 	pd_entry_t newpdir;
 	pt_entry_t *pde;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 	if (kernel_vm_end == 0) {
 		kernel_vm_end = KERNBASE;
 		nkpt = 0;
 		while (pdir_pde(PTD, kernel_vm_end)) {
 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 			nkpt++;
 		}
 	}
 	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
 	while (kernel_vm_end < addr) {
 		if (pdir_pde(PTD, kernel_vm_end)) {
 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 			continue;
 		}
 
 		/*
 		 * This index is bogus, but out of the way
 		 */
 		nkpg = vm_page_alloc(NULL, nkpt,
 		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
 		if (!nkpg)
 			panic("pmap_growkernel: no memory to grow kernel");
 
 		nkpt++;
 
 		pmap_zero_page(nkpg);
 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
 		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
 		pdir_pde(PTD, kernel_vm_end) = newpdir;
 
 		mtx_lock_spin(&allpmaps_lock);
 		LIST_FOREACH(pmap, &allpmaps, pm_list) {
 			pde = pmap_pde(pmap, kernel_vm_end);
 			pde_store(pde, newpdir);
 		}
 		mtx_unlock_spin(&allpmaps_lock);
 		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 	}
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * free the pv_entry back to the free list
  */
 static PMAP_INLINE void
 free_pv_entry(pv_entry_t pv)
 {
 	pv_entry_count--;
 	uma_zfree(pvzone, pv);
 }
 
 /*
  * get a new pv_entry, allocating a block from the system
  * when needed.
  */
 static pv_entry_t
 get_pv_entry(pmap_t locked_pmap)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
 	struct vpgqueues *vpq;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	pv_entry_t allocated_pv, next_pv, pv;
 	vm_offset_t va;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
 	if (allocated_pv != NULL) {
 		pv_entry_count++;
 		if (pv_entry_count > pv_entry_high_water)
 			pagedaemon_wakeup();
 		else
 			return (allocated_pv);
 	}
 
 	/*
 	 * Reclaim pv entries: At first, destroy mappings to inactive
 	 * pages.  After that, if a pv entry is still needed, destroy
 	 * mappings to active pages.
 	 */
 	if (ratecheck(&lastprint, &printinterval))
 		printf("Approaching the limit on PV entries, "
 		    "increase the vm.pmap.shpgperproc tunable.\n");
 	vpq = &vm_page_queues[PQ_INACTIVE];
 retry:
 	sched_pin();
 	TAILQ_FOREACH(m, &vpq->pl, pageq) {
 		if (m->hold_count || m->busy || (m->flags & PG_BUSY))
 			continue;
 		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
 			va = pv->pv_va;
 			pmap = pv->pv_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
 				continue;
 			pmap->pm_stats.resident_count--;
 			pte = pmap_pte_quick(pmap, va);
 			tpte = pte_load_clear(pte);
 			KASSERT((tpte & PG_W) == 0,
 			    ("get_pv_entry: wired pte %#jx", (uintmax_t)tpte));
 			if (tpte & PG_A)
 				vm_page_flag_set(m, PG_REFERENCED);
 			if (tpte & PG_M) {
 				KASSERT((tpte & PG_RW),
 	("get_pv_entry: modified page not writable: va: %#x, pte: %#jx",
 				    va, (uintmax_t)tpte));
 				if (pmap_track_modified(va))
 					vm_page_dirty(m);
 			}
 			pmap_invalidate_page(pmap, va);
 			TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 			if (TAILQ_EMPTY(&m->md.pv_list))
 				vm_page_flag_clear(m, PG_WRITEABLE);
 			m->md.pv_list_count--;
 			pmap_unuse_pt(pmap, va);
 			if (pmap != locked_pmap)
 				PMAP_UNLOCK(pmap);
 			if (allocated_pv == NULL)
 				allocated_pv = pv;
 			else
 				free_pv_entry(pv);
 		}
 	}
 	sched_unpin();
 	if (allocated_pv == NULL) {
 		if (vpq == &vm_page_queues[PQ_INACTIVE]) {
 			vpq = &vm_page_queues[PQ_ACTIVE];
 			goto retry;
 		}
 		panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
 	}
 	return (allocated_pv);
 }
 
 static void
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 {
 	pv_entry_t pv;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
 		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 			if (pmap == pv->pv_pmap && va == pv->pv_va) 
 				break;
 		}
 	} else {
 		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
 			if (va == pv->pv_va) 
 				break;
 		}
 	}
 	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
 	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 	m->md.pv_list_count--;
 	if (TAILQ_EMPTY(&m->md.pv_list))
 		vm_page_flag_clear(m, PG_WRITEABLE);
 	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
 	free_pv_entry(pv);
 }
 
 /*
  * Create a pv entry for page at pa for
  * (pmap, va).
  */
 static void
 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t pv;
 
 	pv = get_pv_entry(pmap);
 	pv->pv_va = va;
 	pv->pv_pmap = pmap;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 	m->md.pv_list_count++;
 }
 
 /*
  * pmap_remove_pte: do the things to unmap a page in a process
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
 {
 	pt_entry_t oldpte;
 	vm_page_t m;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpte = pte_load_clear(ptq);
 	if (oldpte & PG_W)
 		pmap->pm_stats.wired_count -= 1;
 	/*
 	 * Machines that don't support invlpg, also don't support
 	 * PG_G.
 	 */
 	if (oldpte & PG_G)
 		pmap_invalidate_page(kernel_pmap, va);
 	pmap->pm_stats.resident_count -= 1;
 	if (oldpte & PG_MANAGED) {
 		m = PHYS_TO_VM_PAGE(oldpte);
 		if (oldpte & PG_M) {
 			KASSERT((oldpte & PG_RW),
 	("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx",
 			    va, (uintmax_t)oldpte));
 			if (pmap_track_modified(va))
 				vm_page_dirty(m);
 		}
 		if (oldpte & PG_A)
 			vm_page_flag_set(m, PG_REFERENCED);
 		pmap_remove_entry(pmap, m, va);
 	}
 	return (pmap_unuse_pt(pmap, va));
 }
 
 /*
  * Remove a single page from a process address space
  */
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va)
 {
 	pt_entry_t *pte;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
 		return;
 	pmap_remove_pte(pmap, pte, va);
 	pmap_invalidate_page(pmap, va);
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
 	int anyvalid;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	anyvalid = 0;
 
 	vm_page_lock_queues();
 	sched_pin();
 	PMAP_LOCK(pmap);
 
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
 	 * code.
 	 */
 	if ((sva + PAGE_SIZE == eva) && 
 	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
 		pmap_remove_page(pmap, sva);
 		goto out;
 	}
 
 	for (; sva < eva; sva = pdnxt) {
 		unsigned pdirindex;
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		pdirindex = sva >> PDRSHIFT;
 		ptpaddr = pmap->pm_pdir[pdirindex];
 
 		/*
 		 * Weed out invalid mappings. Note: we assume that the page
 		 * directory table is always allocated, and in kernel virtual.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			pmap->pm_pdir[pdirindex] = 0;
 			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 			anyvalid = 1;
 			continue;
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (pdnxt > eva)
 			pdnxt = eva;
 
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			if (*pte == 0)
 				continue;
 			anyvalid = 1;
 			if (pmap_remove_pte(pmap, pte, sva))
 				break;
 		}
 	}
 out:
 	sched_unpin();
 	vm_page_unlock_queues();
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	register pv_entry_t pv;
 	pt_entry_t *pte, tpte;
 
 #if defined(PMAP_DIAGNOSTIC)
 	/*
 	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
 	 */
 	if (m->flags & PG_FICTITIOUS) {
 		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
 		    VM_PAGE_TO_PHYS(m));
 	}
 #endif
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	sched_pin();
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		PMAP_LOCK(pv->pv_pmap);
 		pv->pv_pmap->pm_stats.resident_count--;
 		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
 		tpte = pte_load_clear(pte);
 		if (tpte & PG_W)
 			pv->pv_pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_flag_set(m, PG_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if (tpte & PG_M) {
 			KASSERT((tpte & PG_RW),
 	("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx",
 			    pv->pv_va, (uintmax_t)tpte));
 			if (pmap_track_modified(pv->pv_va))
 				vm_page_dirty(m);
 		}
 		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
 		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 		m->md.pv_list_count--;
 		pmap_unuse_pt(pv->pv_pmap, pv->pv_va);
 		PMAP_UNLOCK(pv->pv_pmap);
 		free_pv_entry(pv);
 	}
 	vm_page_flag_clear(m, PG_WRITEABLE);
 	sched_unpin();
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
 	int anychanged;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 	if (prot & VM_PROT_WRITE)
 		return;
 
 	anychanged = 0;
 
 	vm_page_lock_queues();
 	sched_pin();
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
 		unsigned obits, pbits, pdirindex;
 
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 
 		pdirindex = sva >> PDRSHIFT;
 		ptpaddr = pmap->pm_pdir[pdirindex];
 
 		/*
 		 * Weed out invalid mappings. Note: we assume that the page
 		 * directory table is always allocated, and in kernel virtual.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
 			anychanged = 1;
 			continue;
 		}
 
 		if (pdnxt > eva)
 			pdnxt = eva;
 
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			vm_page_t m;
 
 retry:
 			/*
 			 * Regardless of whether a pte is 32 or 64 bits in
 			 * size, PG_RW, PG_A, and PG_M are among the least
 			 * significant 32 bits.
 			 */
 			obits = pbits = *(u_int *)pte;
 			if (pbits & PG_MANAGED) {
 				m = NULL;
 				if (pbits & PG_A) {
 					m = PHYS_TO_VM_PAGE(*pte);
 					vm_page_flag_set(m, PG_REFERENCED);
 					pbits &= ~PG_A;
 				}
 				if ((pbits & PG_M) != 0 &&
 				    pmap_track_modified(sva)) {
 					if (m == NULL)
 						m = PHYS_TO_VM_PAGE(*pte);
 					vm_page_dirty(m);
 				}
 			}
 
 			pbits &= ~(PG_RW | PG_M);
 
 			if (pbits != obits) {
 				if (!atomic_cmpset_int((u_int *)pte, obits,
 				    pbits))
 					goto retry;
 				if (obits & PG_G)
 					pmap_invalidate_page(pmap, sva);
 				else
 					anychanged = 1;
 			}
 		}
 	}
 	sched_unpin();
 	vm_page_unlock_queues();
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 	   boolean_t wired)
 {
 	vm_paddr_t pa;
 	register pt_entry_t *pte;
 	vm_paddr_t opa;
 	pt_entry_t origpte, newpte;
 	vm_page_t mpte, om;
 	boolean_t invlva;
 
 	va &= PG_FRAME;
 #ifdef PMAP_DIAGNOSTIC
 	if (va > VM_MAX_KERNEL_ADDRESS)
 		panic("pmap_enter: toobig");
 	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
 		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
 #endif
 
 	mpte = NULL;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	sched_pin();
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		mpte = pmap_allocpte(pmap, va, M_WAITOK);
 	}
 #if 0 && defined(PMAP_DIAGNOSTIC)
 	else {
 		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
 		origpte = *pdeaddr;
 		if ((origpte & PG_V) == 0) { 
 			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
 				pmap->pm_pdir[PTDPTDI], origpte, va);
 		}
 	}
 #endif
 
 	pte = pmap_pte_quick(pmap, va);
 
 	/*
 	 * Page Directory table entry not valid, we need a new PT page
 	 */
 	if (pte == NULL) {
 		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
 			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
 	}
 
 	pa = VM_PAGE_TO_PHYS(m);
 	om = NULL;
 	origpte = *pte;
 	opa = origpte & PG_FRAME;
 
 	if (origpte & PG_PS) {
 		/*
 		 * Yes, I know this will truncate upper address bits for PAE,
 		 * but I'm actually more interested in the lower bits
 		 */
 		printf("pmap_enter: va %p, pte %p, origpte %p\n",
 		    (void *)va, (void *)pte, (void *)(uintptr_t)origpte);
 		panic("pmap_enter: attempted pmap_enter on 4MB page");
 	}
 
 	/*
 	 * Mapping has not changed, must be protection or wiring change.
 	 */
 	if (origpte && (opa == pa)) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if (wired && ((origpte & PG_W) == 0))
 			pmap->pm_stats.wired_count++;
 		else if (!wired && (origpte & PG_W))
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove extra pte reference
 		 */
 		if (mpte)
 			mpte->wire_count--;
 
 		/*
 		 * We might be turning off write access to the page,
 		 * so we go ahead and sense modify status.
 		 */
 		if (origpte & PG_MANAGED) {
 			om = m;
 			pa |= PG_MANAGED;
 		}
 		goto validate;
 	} 
 	/*
 	 * Mapping has changed, invalidate old range and fall through to
 	 * handle validating new mapping.
 	 */
 	if (opa) {
 		if (origpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (origpte & PG_MANAGED) {
 			om = PHYS_TO_VM_PAGE(opa);
 			pmap_remove_entry(pmap, om, va);
 		}
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			KASSERT(mpte->wire_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%x", va));
 		}
 	} else
 		pmap->pm_stats.resident_count++;
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
 		pmap_insert_entry(pmap, va, m);
 		pa |= PG_MANAGED;
 	}
 
 	/*
 	 * Increment counters
 	 */
 	if (wired)
 		pmap->pm_stats.wired_count++;
 
 validate:
 	/*
 	 * Now validate mapping with desired protection/wiring.
 	 */
 	newpte = (pt_entry_t)(pa | PG_V);
 	if ((prot & VM_PROT_WRITE) != 0)
 		newpte |= PG_RW;
 	if (wired)
 		newpte |= PG_W;
 	if (va < VM_MAXUSER_ADDRESS)
 		newpte |= PG_U;
 	if (pmap == kernel_pmap)
 		newpte |= pgeflag;
 
 	/*
 	 * if the mapping or permission bits are different, we need
 	 * to update the pte.
 	 */
 	if ((origpte & ~(PG_M|PG_A)) != newpte) {
 		if (origpte & PG_V) {
 			invlva = FALSE;
 			origpte = pte_load_store(pte, newpte | PG_A);
 			if (origpte & PG_A) {
 				if (origpte & PG_MANAGED)
 					vm_page_flag_set(om, PG_REFERENCED);
 				if (opa != VM_PAGE_TO_PHYS(m))
 					invlva = TRUE;
 			}
 			if (origpte & PG_M) {
 				KASSERT((origpte & PG_RW),
 	("pmap_enter: modified page not writable: va: %#x, pte: %#jx",
 				    va, (uintmax_t)origpte));
 				if ((origpte & PG_MANAGED) &&
 				    pmap_track_modified(va))
 					vm_page_dirty(om);
 				if ((prot & VM_PROT_WRITE) == 0)
 					invlva = TRUE;
 			}
 			if (invlva)
 				pmap_invalidate_page(pmap, va);
 		} else
 			pte_store(pte, newpte | PG_A);
 	}
 	sched_unpin();
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * this code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  * but is *MUCH* faster than pmap_enter...
  */
 
 vm_page_t
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     vm_page_t mpte)
 {
 	pt_entry_t *pte;
 	vm_paddr_t pa;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	PMAP_LOCK(pmap);
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		unsigned ptepindex;
 		pd_entry_t ptepa;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		ptepindex = va >> PDRSHIFT;
 		if (mpte && (mpte->pindex == ptepindex)) {
 			mpte->wire_count++;
 		} else {
 retry:
 			/*
 			 * Get the page directory entry
 			 */
 			ptepa = pmap->pm_pdir[ptepindex];
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.
 			 */
 			if (ptepa) {
 				if (ptepa & PG_PS)
 					panic("pmap_enter_quick: unexpected mapping into 4MB page");
 				mpte = PHYS_TO_VM_PAGE(ptepa);
 				mpte->wire_count++;
 			} else {
 				mpte = _pmap_allocpte(pmap, ptepindex,
 				    M_NOWAIT);
 				if (mpte == NULL) {
 					PMAP_UNLOCK(pmap);
 					vm_page_busy(m);
 					vm_page_unlock_queues();
 					VM_OBJECT_UNLOCK(m->object);
 					VM_WAIT;
 					VM_OBJECT_LOCK(m->object);
 					vm_page_lock_queues();
 					vm_page_wakeup(m);
 					PMAP_LOCK(pmap);
 					goto retry;
 				}
 			}
 		}
 	} else {
 		mpte = NULL;
 	}
 
 	/*
 	 * This call to vtopte makes the assumption that we are
 	 * entering the page into the current pmap.  In order to support
 	 * quick entry into any pmap, one would likely use pmap_pte_quick.
 	 * But that isn't as quick as vtopte.
 	 */
 	pte = vtopte(va);
 	if (*pte) {
 		if (mpte != NULL) {
 			pmap_unwire_pte_hold(pmap, mpte);
 			mpte = NULL;
 		}
 		goto out;
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory. Note that we
 	 * raise IPL while manipulating pv_table since pmap_enter can be
 	 * called at interrupt time.
 	 */
 	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
 		pmap_insert_entry(pmap, va, m);
 
 	/*
 	 * Increment counters
 	 */
 	pmap->pm_stats.resident_count++;
 
 	pa = VM_PAGE_TO_PHYS(m);
 
 	/*
 	 * Now validate mapping with RO protection
 	 */
 	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
 		pte_store(pte, pa | PG_V | PG_U);
 	else
 		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
 out:
 	PMAP_UNLOCK(pmap);
 	return mpte;
 }
 
 /*
  * Make a temporary mapping for a physical address.  This is only intended
  * to be used for panic dumps.
  */
 void *
 pmap_kenter_temporary(vm_paddr_t pa, int i)
 {
 	vm_offset_t va;
 
 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 	pmap_kenter(va, pa);
 	invlpg(va);
 	return ((void *)crashdumpmap);
 }
 
 /*
  * This code maps large physical mmap regions into the
  * processor address space.  Note that some shortcuts
  * are taken, but the code works.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
 		    vm_object_t object, vm_pindex_t pindex,
 		    vm_size_t size)
 {
 	vm_page_t p;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE,
 	    ("pmap_object_init_pt: non-device object"));
 	if (pseflag && 
 	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
 		int i;
 		vm_page_t m[1];
 		unsigned int ptepindex;
 		int npdes;
 		pd_entry_t ptepa;
 
 		PMAP_LOCK(pmap);
 		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
 			goto out;
 		PMAP_UNLOCK(pmap);
 retry:
 		p = vm_page_lookup(object, pindex);
 		if (p != NULL) {
 			vm_page_lock_queues();
 			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
 				goto retry;
 		} else {
 			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
 			if (p == NULL)
 				return;
 			m[0] = p;
 
 			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
 				vm_page_lock_queues();
 				vm_page_free(p);
 				vm_page_unlock_queues();
 				return;
 			}
 
 			p = vm_page_lookup(object, pindex);
 			vm_page_lock_queues();
 			vm_page_wakeup(p);
 		}
 		vm_page_unlock_queues();
 
 		ptepa = VM_PAGE_TO_PHYS(p);
 		if (ptepa & (NBPDR - 1))
 			return;
 
 		p->valid = VM_PAGE_BITS_ALL;
 
 		PMAP_LOCK(pmap);
 		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
 		npdes = size >> PDRSHIFT;
 		for(i = 0; i < npdes; i++) {
 			pde_store(&pmap->pm_pdir[ptepindex],
 			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
 			ptepa += NBPDR;
 			ptepindex += 1;
 		}
 		pmap_invalidate_all(pmap);
 out:
 		PMAP_UNLOCK(pmap);
 	}
 }
 
 /*
  *	Routine:	pmap_change_wiring
  *	Function:	Change the wiring attribute for a map/virtual-address
  *			pair.
  *	In/out conditions:
  *			The mapping must already exist in the pmap.
  */
 void
 pmap_change_wiring(pmap, va, wired)
 	register pmap_t pmap;
 	vm_offset_t va;
 	boolean_t wired;
 {
 	register pt_entry_t *pte;
 
 	PMAP_LOCK(pmap);
 	pte = pmap_pte(pmap, va);
 
 	if (wired && !pmap_pte_w(pte))
 		pmap->pm_stats.wired_count++;
 	else if (!wired && pmap_pte_w(pte))
 		pmap->pm_stats.wired_count--;
 
 	/*
 	 * Wiring is not a hardware characteristic so there is no need to
 	 * invalidate TLB.
 	 */
 	pmap_pte_set_w(pte, wired);
 	pmap_pte_release(pte);
 	PMAP_UNLOCK(pmap);
 }
 
 
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  */
 
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 	  vm_offset_t src_addr)
 {
 	vm_offset_t addr;
 	vm_offset_t end_addr = src_addr + len;
 	vm_offset_t pdnxt;
 	vm_page_t m;
 
 	if (dst_addr != src_addr)
 		return;
 
 	if (!pmap_is_current(src_pmap))
 		return;
 
 	vm_page_lock_queues();
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	sched_pin();
 	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
 		pt_entry_t *src_pte, *dst_pte;
 		vm_page_t dstmpte, srcmpte;
 		pd_entry_t srcptepaddr;
 		unsigned ptepindex;
 
 		if (addr >= UPT_MIN_ADDRESS)
 			panic("pmap_copy: invalid to pmap_copy page tables");
 
 		/*
 		 * Don't let optional prefaulting of pages make us go
 		 * way below the low water mark of free pages or way
 		 * above high water mark of used pv entries.
 		 */
 		if (cnt.v_free_count < cnt.v_free_reserved ||
 		    pv_entry_count > pv_entry_high_water)
 			break;
 		
 		pdnxt = (addr + NBPDR) & ~PDRMASK;
 		ptepindex = addr >> PDRSHIFT;
 
 		srcptepaddr = src_pmap->pm_pdir[ptepindex];
 		if (srcptepaddr == 0)
 			continue;
 			
 		if (srcptepaddr & PG_PS) {
 			if (dst_pmap->pm_pdir[ptepindex] == 0) {
 				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
 				dst_pmap->pm_stats.resident_count +=
 				    NBPDR / PAGE_SIZE;
 			}
 			continue;
 		}
 
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 		if (srcmpte->wire_count == 0)
 			panic("pmap_copy: source page table page is unused");
 
 		if (pdnxt > end_addr)
 			pdnxt = end_addr;
 
 		src_pte = vtopte(addr);
 		while (addr < pdnxt) {
 			pt_entry_t ptetemp;
 			ptetemp = *src_pte;
 			/*
 			 * we only virtual copy managed pages
 			 */
 			if ((ptetemp & PG_MANAGED) != 0) {
 				/*
 				 * We have to check after allocpte for the
 				 * pte still being around...  allocpte can
 				 * block.
 				 */
 				dstmpte = pmap_allocpte(dst_pmap, addr,
 				    M_NOWAIT);
 				if (dstmpte == NULL)
 					break;
 				dst_pte = pmap_pte_quick(dst_pmap, addr);
 				if (*dst_pte == 0) {
 					/*
 					 * Clear the modified and
 					 * accessed (referenced) bits
 					 * during the copy.
 					 */
 					m = PHYS_TO_VM_PAGE(ptetemp);
 					*dst_pte = ptetemp & ~(PG_M | PG_A);
 					dst_pmap->pm_stats.resident_count++;
 					pmap_insert_entry(dst_pmap, addr, m);
 	 			} else
 					pmap_unwire_pte_hold(dst_pmap, dstmpte);
 				if (dstmpte->wire_count >= srcmpte->wire_count)
 					break;
 			}
 			addr += PAGE_SIZE;
 			src_pte++;
 		}
 	}
 	sched_unpin();
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }	
 
 static __inline void
 pagezero(void *page)
 {
 #if defined(I686_CPU)
 	if (cpu_class == CPUCLASS_686) {
 #if defined(CPU_ENABLE_SSE)
 		if (cpu_feature & CPUID_SSE2)
 			sse2_pagezero(page);
 		else
 #endif
 			i686_pagezero(page);
 	} else
 #endif
 		bzero(page, PAGE_SIZE);
 }
 
 /*
  *	pmap_zero_page zeros the specified hardware page by mapping 
  *	the page into KVM and using bzero to clear its contents.
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	struct sysmaps *sysmaps;
 
 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
 	mtx_lock(&sysmaps->lock);
 	if (*sysmaps->CMAP2)
 		panic("pmap_zero_page: CMAP2 busy");
 	sched_pin();
 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
 	invlcaddr(sysmaps->CADDR2);
 	pagezero(sysmaps->CADDR2);
 	*sysmaps->CMAP2 = 0;
 	sched_unpin();
 	mtx_unlock(&sysmaps->lock);
 }
 
 /*
  *	pmap_zero_page_area zeros the specified hardware page by mapping 
  *	the page into KVM and using bzero to clear its contents.
  *
  *	off and size may not cover an area beyond a single hardware page.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	struct sysmaps *sysmaps;
 
 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
 	mtx_lock(&sysmaps->lock);
 	if (*sysmaps->CMAP2)
 		panic("pmap_zero_page: CMAP2 busy");
 	sched_pin();
 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
 	invlcaddr(sysmaps->CADDR2);
 	if (off == 0 && size == PAGE_SIZE) 
 		pagezero(sysmaps->CADDR2);
 	else
 		bzero((char *)sysmaps->CADDR2 + off, size);
 	*sysmaps->CMAP2 = 0;
 	sched_unpin();
 	mtx_unlock(&sysmaps->lock);
 }
 
 /*
  *	pmap_zero_page_idle zeros the specified hardware page by mapping 
  *	the page into KVM and using bzero to clear its contents.  This
  *	is intended to be called from the vm_pagezero process only and
  *	outside of Giant.
  */
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 
 	if (*CMAP3)
 		panic("pmap_zero_page: CMAP3 busy");
 	sched_pin();
 	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
 	invlcaddr(CADDR3);
 	pagezero(CADDR3);
 	*CMAP3 = 0;
 	sched_unpin();
 }
 
 /*
  *	pmap_copy_page copies the specified (machine independent)
  *	page by mapping the page into virtual memory and using
  *	bcopy to copy the page, one machine dependent page at a
  *	time.
  */
 void
 pmap_copy_page(vm_page_t src, vm_page_t dst)
 {
 	struct sysmaps *sysmaps;
 
 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
 	mtx_lock(&sysmaps->lock);
 	if (*sysmaps->CMAP1)
 		panic("pmap_copy_page: CMAP1 busy");
 	if (*sysmaps->CMAP2)
 		panic("pmap_copy_page: CMAP2 busy");
 	sched_pin();
 	invlpg((u_int)sysmaps->CADDR1);
 	invlpg((u_int)sysmaps->CADDR2);
 	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
 	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
 	*sysmaps->CMAP1 = 0;
 	*sysmaps->CMAP2 = 0;
 	sched_unpin();
 	mtx_unlock(&sysmaps->lock);
 }
 
 /*
  * Returns true if the pmap's pv is one of the first
  * 16 pvs linked to from this page.  This count may
  * be changed upwards or downwards in the future; it
  * is only necessary that true be returned for a small
  * subset of pmaps for proper page aging.
  */
 boolean_t
 pmap_page_exists_quick(pmap, m)
 	pmap_t pmap;
 	vm_page_t m;
 {
 	pv_entry_t pv;
 	int loops = 0;
 
 	if (m->flags & PG_FICTITIOUS)
 		return FALSE;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if (pv->pv_pmap == pmap) {
 			return TRUE;
 		}
 		loops++;
 		if (loops >= 16)
 			break;
 	}
 	return (FALSE);
 }
 
 #define PMAP_REMOVE_PAGES_CURPROC_ONLY
 /*
  * Remove all pages from specified address space
  * this aids process exit speeds.  Also, this code
  * is special cased for current process only, but
  * can have the more generic (and slightly slower)
  * mode enabled.  This is much faster than pmap_remove
  * in the case of running down an entire address space.
  */
 void
 pmap_remove_pages(pmap, sva, eva)
 	pmap_t pmap;
 	vm_offset_t sva, eva;
 {
 	pt_entry_t *pte, tpte;
 	vm_page_t m;
 	pv_entry_t pv, npv;
 
 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
 #endif
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	sched_pin();
 	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
 
 		if (pv->pv_va >= eva || pv->pv_va < sva) {
 			npv = TAILQ_NEXT(pv, pv_plist);
 			continue;
 		}
 
 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
 		pte = vtopte(pv->pv_va);
 #else
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 #endif
 		tpte = *pte;
 
 		if (tpte == 0) {
 			printf("TPTE at %p  IS ZERO @ VA %08x\n",
 							pte, pv->pv_va);
 			panic("bad pte");
 		}
 
 /*
  * We cannot remove wired pages from a process' mapping at this time
  */
 		if (tpte & PG_W) {
 			npv = TAILQ_NEXT(pv, pv_plist);
 			continue;
 		}
 
 		m = PHYS_TO_VM_PAGE(tpte);
 		KASSERT(m->phys_addr == (tpte & PG_FRAME),
 		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
 
 		KASSERT(m < &vm_page_array[vm_page_array_size],
 			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
 
 		pmap->pm_stats.resident_count--;
 
 		pte_clear(pte);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if (tpte & PG_M) {
 			vm_page_dirty(m);
 		}
 
 		npv = TAILQ_NEXT(pv, pv_plist);
 		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
 
 		m->md.pv_list_count--;
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 		if (TAILQ_EMPTY(&m->md.pv_list))
 			vm_page_flag_clear(m, PG_WRITEABLE);
 
 		pmap_unuse_pt(pmap, pv->pv_va);
 		free_pv_entry(pv);
 	}
 	sched_unpin();
 	pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 	vm_page_unlock_queues();
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Return whether or not the specified physical page was modified
  *	in any physical maps.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	pv_entry_t pv;
 	pt_entry_t *pte;
 	boolean_t rv;
 
 	rv = FALSE;
 	if (m->flags & PG_FICTITIOUS)
 		return (rv);
 
 	sched_pin();
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		/*
 		 * if the bit being tested is the modified bit, then
 		 * mark clean_map and ptes as never
 		 * modified.
 		 */
 		if (!pmap_track_modified(pv->pv_va))
 			continue;
 		PMAP_LOCK(pv->pv_pmap);
 		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
 		rv = (*pte & PG_M) != 0;
 		PMAP_UNLOCK(pv->pv_pmap);
 		if (rv)
 			break;
 	}
 	sched_unpin();
 	return (rv);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is elgible
  *	for prefault.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pt_entry_t *pte;
 	boolean_t rv;
 
 	rv = FALSE;
 	PMAP_LOCK(pmap);
 	if (*pmap_pde(pmap, addr)) {
 		pte = vtopte(addr);
 		rv = *pte == 0;
 	}
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  *	Clear the given bit in each of the given page's ptes.  The bit is
  *	expressed as a 32-bit mask.  Consequently, if the pte is 64 bits in
  *	size, only a bit within the least significant 32 can be cleared.
  */
 static __inline void
 pmap_clear_ptes(vm_page_t m, int bit)
 {
 	register pv_entry_t pv;
 	pt_entry_t pbits, *pte;
 
 	if ((m->flags & PG_FICTITIOUS) ||
 	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
 		return;
 
 	sched_pin();
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/*
 	 * Loop over all current mappings setting/clearing as appropos If
 	 * setting RO do we need to clear the VAC?
 	 */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		/*
 		 * don't write protect pager mappings
 		 */
 		if (bit == PG_RW) {
 			if (!pmap_track_modified(pv->pv_va))
 				continue;
 		}
 
 		PMAP_LOCK(pv->pv_pmap);
 		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
 retry:
 		pbits = *pte;
 		if (pbits & bit) {
 			if (bit == PG_RW) {
 				/*
 				 * Regardless of whether a pte is 32 or 64 bits
 				 * in size, PG_RW and PG_M are among the least
 				 * significant 32 bits.
 				 */
 				if (!atomic_cmpset_int((u_int *)pte, pbits,
 				    pbits & ~(PG_RW | PG_M)))
 					goto retry;
 				if (pbits & PG_M) {
 					vm_page_dirty(m);
 				}
 			} else {
 				atomic_clear_int((u_int *)pte, bit);
 			}
 			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pv->pv_pmap);
 	}
 	if (bit == PG_RW)
 		vm_page_flag_clear(m, PG_WRITEABLE);
 	sched_unpin();
 }
 
 /*
  *      pmap_page_protect:
  *
  *      Lower the permission for all mappings to a given page.
  */
 void
 pmap_page_protect(vm_page_t m, vm_prot_t prot)
 {
 	if ((prot & VM_PROT_WRITE) == 0) {
 		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
 			pmap_clear_ptes(m, PG_RW);
 		} else {
 			pmap_remove_all(m);
 		}
 	}
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	XXX: The exact number of bits to check and clear is a matter that
  *	should be tested and standardized at some point in the future for
  *	optimal aging of shared pages.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	register pv_entry_t pv, pvf, pvn;
 	pt_entry_t *pte;
 	pt_entry_t v;
 	int rtval = 0;
 
 	if (m->flags & PG_FICTITIOUS)
 		return (rtval);
 
 	sched_pin();
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 
 		pvf = pv;
 
 		do {
 			pvn = TAILQ_NEXT(pv, pv_list);
 
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 
 			if (!pmap_track_modified(pv->pv_va))
 				continue;
 
 			PMAP_LOCK(pv->pv_pmap);
 			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
 
 			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
 				atomic_clear_int((u_int *)pte, PG_A);
 				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
 
 				rtval++;
 				if (rtval > 4) {
 					PMAP_UNLOCK(pv->pv_pmap);
 					break;
 				}
 			}
 			PMAP_UNLOCK(pv->pv_pmap);
 		} while ((pv = pvn) != NULL && pv != pvf);
 	}
 	sched_unpin();
 
 	return (rtval);
 }
 
 /*
  *	Clear the modify bits on the specified physical page.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	pmap_clear_ptes(m, PG_M);
 }
 
 /*
  *	pmap_clear_reference:
  *
  *	Clear the reference bit on the specified physical page.
  */
 void
 pmap_clear_reference(vm_page_t m)
 {
 	pmap_clear_ptes(m, PG_A);
 }
 
 /*
  * Miscellaneous support routines follow
  */
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space. Return a pointer to where it is mapped. This
  * routine is intended to be used for mapping device memory,
  * NOT real memory.
  */
 void *
 pmap_mapdev(pa, size)
 	vm_paddr_t pa;
 	vm_size_t size;
 {
 	vm_offset_t va, tmpva, offset;
 
 	offset = pa & PAGE_MASK;
 	size = roundup(offset + size, PAGE_SIZE);
 	pa = pa & PG_FRAME;
 
 	if (pa < KERNLOAD && pa + size <= KERNLOAD)
 		va = KERNBASE + pa;
 	else
 		va = kmem_alloc_nofault(kernel_map, size);
 	if (!va)
 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
 
 	for (tmpva = va; size > 0; ) {
 		pmap_kenter(tmpva, pa);
 		size -= PAGE_SIZE;
 		tmpva += PAGE_SIZE;
 		pa += PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, va, tmpva);
 	return ((void *)(va + offset));
 }
 
 void
 pmap_unmapdev(va, size)
 	vm_offset_t va;
 	vm_size_t size;
 {
 	vm_offset_t base, offset, tmpva;
 
 	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
 		return;
 	base = va & PG_FRAME;
 	offset = va & PAGE_MASK;
 	size = roundup(offset + size, PAGE_SIZE);
 	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
 		pmap_kremove(tmpva);
 	pmap_invalidate_range(kernel_pmap, va, tmpva);
 	kmem_free(kernel_map, base, size);
 }
 
 /*
  * perform the pmap work for mincore
  */
 int
 pmap_mincore(pmap, addr)
 	pmap_t pmap;
 	vm_offset_t addr;
 {
 	pt_entry_t *ptep, pte;
 	vm_page_t m;
 	int val = 0;
 	
 	PMAP_LOCK(pmap);
 	ptep = pmap_pte(pmap, addr);
 	pte = (ptep != NULL) ? *ptep : 0;
 	pmap_pte_release(ptep);
 	PMAP_UNLOCK(pmap);
 
 	if (pte != 0) {
 		vm_paddr_t pa;
 
 		val = MINCORE_INCORE;
 		if ((pte & PG_MANAGED) == 0)
 			return val;
 
 		pa = pte & PG_FRAME;
 
 		m = PHYS_TO_VM_PAGE(pa);
 
 		/*
 		 * Modified by us
 		 */
 		if (pte & PG_M)
 			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
 		else {
 			/*
 			 * Modified by someone else
 			 */
 			vm_page_lock_queues();
 			if (m->dirty || pmap_is_modified(m))
 				val |= MINCORE_MODIFIED_OTHER;
 			vm_page_unlock_queues();
 		}
 		/*
 		 * Referenced by us
 		 */
 		if (pte & PG_A)
 			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
 		else {
 			/*
 			 * Referenced by someone else
 			 */
 			vm_page_lock_queues();
 			if ((m->flags & PG_REFERENCED) ||
 			    pmap_ts_referenced(m)) {
 				val |= MINCORE_REFERENCED_OTHER;
 				vm_page_flag_set(m, PG_REFERENCED);
 			}
 			vm_page_unlock_queues();
 		}
 	} 
 	return val;
 }
 
 void
 pmap_activate(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	pmap_t	pmap, oldpmap;
 	u_int32_t  cr3;
 
 	critical_enter();
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	oldpmap = PCPU_GET(curpmap);
 #if defined(SMP)
 	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
 	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
 #else
 	oldpmap->pm_active &= ~1;
 	pmap->pm_active |= 1;
 #endif
 #ifdef PAE
 	cr3 = vtophys(pmap->pm_pdpt);
 #else
 	cr3 = vtophys(pmap->pm_pdir);
 #endif
 	/* XXXKSE this is wrong.
 	 * pmap_activate is for the current thread on the current cpu
 	 */
 	if (p->p_flag & P_SA) {
 		/* Make sure all other cr3 entries are updated. */
 		/* what if they are running?  XXXKSE (maybe abort them) */
 		FOREACH_THREAD_IN_PROC(p, td) {
 			td->td_pcb->pcb_cr3 = cr3;
 		}
 	} else {
 		td->td_pcb->pcb_cr3 = cr3;
 	}
 	load_cr3(cr3);
 	PCPU_SET(curpmap, pmap);
 	critical_exit();
 }
 
 vm_offset_t
 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
 {
 
 	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
 		return addr;
 	}
 
 	addr = (addr + PDRMASK) & ~PDRMASK;
 	return addr;
 }
 
 
 #if defined(PMAP_DEBUG)
 pmap_pid_dump(int pid)
 {
 	pmap_t pmap;
 	struct proc *p;
 	int npte = 0;
 	int index;
 
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(p, &allproc, p_list) {
 		if (p->p_pid != pid)
 			continue;
 
 		if (p->p_vmspace) {
 			int i,j;
 			index = 0;
 			pmap = vmspace_pmap(p->p_vmspace);
 			for (i = 0; i < NPDEPTD; i++) {
 				pd_entry_t *pde;
 				pt_entry_t *pte;
 				vm_offset_t base = i << PDRSHIFT;
 				
 				pde = &pmap->pm_pdir[i];
 				if (pde && pmap_pde_v(pde)) {
 					for (j = 0; j < NPTEPG; j++) {
 						vm_offset_t va = base + (j << PAGE_SHIFT);
 						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
 							if (index) {
 								index = 0;
 								printf("\n");
 							}
 							sx_sunlock(&allproc_lock);
 							return npte;
 						}
 						pte = pmap_pte(pmap, va);
 						if (pte && pmap_pte_v(pte)) {
 							pt_entry_t pa;
 							vm_page_t m;
 							pa = *pte;
 							m = PHYS_TO_VM_PAGE(pa);
 							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
 								va, pa, m->hold_count, m->wire_count, m->flags);
 							npte++;
 							index++;
 							if (index >= 2) {
 								index = 0;
 								printf("\n");
 							} else {
 								printf(" ");
 							}
 						}
 					}
 				}
 			}
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	return npte;
 }
 #endif
 
 #if defined(DEBUG)
 
 static void	pads(pmap_t pm);
 void		pmap_pvdump(vm_offset_t pa);
 
 /* print address space of pmap*/
 static void
 pads(pm)
 	pmap_t pm;
 {
 	int i, j;
 	vm_paddr_t va;
 	pt_entry_t *ptep;
 
 	if (pm == kernel_pmap)
 		return;
 	for (i = 0; i < NPDEPTD; i++)
 		if (pm->pm_pdir[i])
 			for (j = 0; j < NPTEPG; j++) {
 				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
 				if (pm == kernel_pmap && va < KERNBASE)
 					continue;
 				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
 					continue;
 				ptep = pmap_pte(pm, va);
 				if (pmap_pte_v(ptep))
 					printf("%x:%x ", va, *ptep);
 			};
 
 }
 
 void
 pmap_pvdump(pa)
 	vm_paddr_t pa;
 {
 	pv_entry_t pv;
 	vm_page_t m;
 
 	printf("pa %x", pa);
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
 		pads(pv->pv_pmap);
 	}
 	printf(" ");
 }
 #endif
Index: head/sys/i386/include/smp.h
===================================================================
--- head/sys/i386/include/smp.h	(revision 153140)
+++ head/sys/i386/include/smp.h	(revision 153141)
@@ -1,76 +1,84 @@
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * $FreeBSD$
  *
  */
 
 #ifndef _MACHINE_SMP_H_
 #define _MACHINE_SMP_H_
 
 #ifdef _KERNEL
 
 #ifdef SMP
 
 #ifndef LOCORE
 
 #include <sys/bus.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <machine/apicvar.h>
 
 /* global data in mpboot.s */
 extern int			bootMP_size;
 
 /* functions in mpboot.s */
 void	bootMP(void);
 
 /* global data in mp_machdep.c */
 extern int			mp_naps;
 extern int			boot_cpu_id;
 extern struct pcb		stoppcbs[];
 extern struct mtx		smp_tlb_mtx;
+#ifdef COUNT_IPIS
+extern u_long *ipi_invltlb_counts[MAXCPU];
+extern u_long *ipi_invlrng_counts[MAXCPU];
+extern u_long *ipi_invlpg_counts[MAXCPU];
+extern u_long *ipi_rendezvous_counts[MAXCPU];
+extern u_long *ipi_lazypmap_counts[MAXCPU];
+#endif
 
 /* IPI handlers */
 inthand_t
 	IDTVEC(invltlb),	/* TLB shootdowns - global */
 	IDTVEC(invlpg),		/* TLB shootdowns - 1 page */
 	IDTVEC(invlrng),	/* TLB shootdowns - page range */
 	IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ 
 	IDTVEC(cpustop),	/* CPU stops & waits to be restarted */
 	IDTVEC(rendezvous),	/* handle CPU rendezvous */
 	IDTVEC(lazypmap);	/* handle lazy pmap release */
 
 /* functions in mp_machdep.c */
 void	cpu_add(u_int apic_id, char boot_cpu);
+void	cpustop_handler(void);
 void	init_secondary(void);
 void	ipi_selected(u_int cpus, u_int ipi);
 void	ipi_all(u_int ipi);
 void	ipi_all_but_self(u_int ipi);
 void	ipi_self(u_int ipi);
 void 	ipi_bitmap_handler(struct clockframe frame);
 u_int	mp_bootaddress(u_int);
 int	mp_grab_cpu_hlt(void);
 void	mp_topology(void);
 void	smp_invlpg(vm_offset_t addr);
 void	smp_masked_invlpg(u_int mask, vm_offset_t addr);
 void	smp_invlpg_range(vm_offset_t startva, vm_offset_t endva);
 void	smp_masked_invlpg_range(u_int mask, vm_offset_t startva,
 	    vm_offset_t endva);
 void	smp_invltlb(void);
 void	smp_masked_invltlb(u_int mask);
 
 #ifdef STOP_NMI
 int	ipi_nmi_handler(void);
 #endif
 
 #endif /* !LOCORE */
 #endif /* SMP */
 
 #endif /* _KERNEL */
 #endif /* _MACHINE_SMP_H_ */