Index: stable/9/sys/i386/acpica/acpi_wakecode.S =================================================================== --- stable/9/sys/i386/acpica/acpi_wakecode.S (revision 235795) +++ stable/9/sys/i386/acpica/acpi_wakecode.S (revision 235796) @@ -1,259 +1,205 @@ /*- * Copyright (c) 2001 Takanori Watanabe - * Copyright (c) 2001 Mitsuru IWASAKI + * Copyright (c) 2001-2012 Mitsuru IWASAKI + * Copyright (c) 2003 Peter Wemm + * Copyright (c) 2008-2012 Jung-uk Kim * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include <machine/asmacros.h> +#include <machine/ppireg.h> #include <machine/specialreg.h> +#include <machine/timerreg.h> #include "assym.s" /* * Resume entry point. The BIOS enters here in real mode after POST with * CS set to the page where we stored this code. It should configure the * segment registers with a flat 4 GB address space and EFLAGS.IF = 0. * Depending on the previous sleep state, we may need to initialize more * of the system (i.e., S3 suspend-to-RAM vs. S4 suspend-to-disk). */ - .align 4 - .code16 -wakeup_16: - nop - cli - cld + .data /* So we can modify it */ + + ALIGN_TEXT + .code16 +wakeup_start: /* * Set up segment registers for real mode, a small stack for * any calls we make, and clear any flags. */ - movw %cs,%ax - movw %ax,%ds - movw %ax,%ss - movw $PAGE_SIZE,%sp - pushl $0 - popfl + cli /* make sure no interrupts */ + mov %cs, %ax /* copy %cs to %ds. Remember these */ + mov %ax, %ds /* are offsets rather than selectors */ + mov %ax, %ss + movw $PAGE_SIZE, %sp + xorw %ax, %ax + pushw %ax + popfw /* To debug resume hangs, beep the speaker if the user requested it. */ - cmpl $1,resume_beep - jne nobeep - movb $0xc0,%al - outb %al,$0x42 - movb $0x04,%al - outb %al,$0x42 - inb $0x61,%al - orb $0x3,%al - outb %al,$0x61 -nobeep: + testb $~0, resume_beep - wakeup_start + jz 1f + movb $0, resume_beep - wakeup_start - /* Re-initialize video BIOS if the reset_video tunable is set. */ - cmpl $1,reset_video - jne nobiosreset - lcall $0xc000,$3 + /* Set PIT timer 2 to beep. */ + movb $(TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT), %al + outb %al, $TIMER_MODE - /* - * Set up segment registers for real mode again in case the - * previous BIOS call clobbers them. - */ - movw %cs,%ax - movw %ax,%ds - movw %ax,%ss -nobiosreset: + /* Turn on speaker. */ + inb $IO_PPI, %al + orb $PIT_SPKR, %al + outb %al, $IO_PPI - /* Load GDT for real mode.
Use 32 bit prefix for addresses >16 MB. */ - lgdtl physical_gdt + /* Set frequency. */ + movw $0x4c0, %ax + outb %al, $TIMER_CNTR2 + shrw $8, %ax + outb %al, $TIMER_CNTR2 +1: - /* Restore CR2, CR3 and CR4 */ - movl previous_cr2,%eax - movl %eax,%cr2 - movl previous_cr3,%eax - movl %eax,%cr3 - movl previous_cr4,%eax - movl %eax,%cr4 + /* Re-initialize video BIOS if the reset_video tunable is set. */ + testb $~0, reset_video - wakeup_start + jz 1f + movb $0, reset_video - wakeup_start + lcall $0xc000, $3 - /* Transfer some values to protected mode with an inline stack */ -#define NVALUES 9 -#define TRANSFER_STACK32(val, idx) \ - movl val,%eax; \ - movl %eax,wakeup_32stack+(idx+1)+(idx*4) + /* When we reach here, int 0x10 should be ready. Hide cursor. */ + movb $0x01, %ah + movb $0x20, %ch + int $0x10 - TRANSFER_STACK32(previous_ss, (NVALUES - 9)) - TRANSFER_STACK32(previous_fs, (NVALUES - 8)) - TRANSFER_STACK32(previous_ds, (NVALUES - 7)) - TRANSFER_STACK32(physical_gdt+2, (NVALUES - 6)) - TRANSFER_STACK32(where_to_recover, (NVALUES - 5)) - TRANSFER_STACK32(previous_idt+2, (NVALUES - 4)) - TRANSFER_STACK32(previous_ldt, (NVALUES - 3)) - TRANSFER_STACK32(previous_gdt+2, (NVALUES - 2)) - TRANSFER_STACK32(previous_tr, (NVALUES - 1)) - TRANSFER_STACK32(previous_cr0, (NVALUES - 0)) + /* Re-start in case the previous BIOS call clobbers the segment registers. */ + jmp wakeup_start +1: - mov physical_esp,%esi /* to be used in 32bit code */ + /* + * Find the relocation base and patch the GDT descriptor and ljmp targets. + */ + xorl %ebx, %ebx + mov %cs, %bx + sall $4, %ebx /* %ebx is now our relocation base */ + /* + * Load the descriptor table pointer. We'll need it when we switch + * to protected mode. + */ + lgdtl bootgdtdesc - wakeup_start + /* Enable protected mode */ - movl %cr0,%eax - orl $(CR0_PE),%eax - movl %eax,%cr0 + movl $CR0_PE, %eax + mov %eax, %cr0 + /* + * Now execute a far jump to turn on protected mode. This + * causes the segment registers to turn into selectors and causes + * %cs to be loaded from the gdt. + * + * The following instruction is: + * ljmpl $bootcode32 - bootgdt, $wakeup_32 - wakeup_start + * but gas cannot assemble that. And besides, we patch the targets + * in early startup and it's a little clearer what we are patching. + */ wakeup_sw32: - /* Switch to protected mode by intersegmental jump */ - ljmpl $KCSEL,$0x12345678 /* Code location, to be replaced */ + .byte 0x66 /* size override to 32 bits */ + .byte 0xea /* opcode for far jump */ + .long wakeup_32 - wakeup_start /* offset in segment */ + .word bootcode32 - bootgdt /* index in gdt for 32 bit code */ /* - * Now switched to protected mode without paging enabled. - * %esi: KERNEL stack pointer (physical address) + * At this point, we are running in 32 bit legacy protected mode.
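+ * The newly loaded %cs selector refers to bootcode32 below, a flat + * 4 GB code segment with base 0, so the offset patched in at install + * time is simply the physical address of wakeup_32.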
*/ + ALIGN_TEXT .code32 wakeup_32: - nop - /* Set up segment registers for protected mode */ - movw $KDSEL,%ax /* KDSEL to segment registers */ - movw %ax,%ds - movw %ax,%es - movw %ax,%gs - movw %ax,%ss - movw $KPSEL,%ax /* KPSEL to %fs */ - movw %ax,%fs - movl %esi,%esp /* physical address stack pointer */ + mov $bootdata32 - bootgdt, %eax + mov %ax, %ds -wakeup_32stack: - /* Operands are overwritten in 16 bit code by TRANSFER_STACK32 macro */ - pushl $0xabcdef09 /* ss + dummy */ - pushl $0xabcdef08 /* fs + gs */ - pushl $0xabcdef07 /* ds + es */ - pushl $0xabcdef06 /* gdt:base (physical address) */ - pushl $0xabcdef05 /* recover address */ - pushl $0xabcdef04 /* idt:base */ - pushl $0xabcdef03 /* ldt + idt:limit */ - pushl $0xabcdef02 /* gdt:base */ - pushl $0xabcdef01 /* TR + gdt:limit */ - pushl $0xabcdef00 /* CR0 */ + /* Get PCB and return address. */ + movl wakeup_pcb - wakeup_start(%ebx), %esi + movl wakeup_ret - wakeup_start(%ebx), %edi - movl %esp,%ebp -#define CR0_REGISTER 0(%ebp) -#define TASK_REGISTER 4(%ebp) -#define PREVIOUS_GDT 6(%ebp) -#define PREVIOUS_LDT 12(%ebp) -#define PREVIOUS_IDT 14(%ebp) -#define RECOVER_ADDR 20(%ebp) -#define PHYSICAL_GDT_BASE 24(%ebp) -#define PREVIOUS_DS 28(%ebp) -#define PREVIOUS_ES 30(%ebp) -#define PREVIOUS_FS 32(%ebp) -#define PREVIOUS_GS 34(%ebp) -#define PREVIOUS_SS 36(%ebp) + /* Restore CR4 and CR3. */ + movl wakeup_cr4 - wakeup_start(%ebx), %eax + mov %eax, %cr4 + movl wakeup_cr3 - wakeup_start(%ebx), %eax + mov %eax, %cr3 - /* Fixup TSS type field */ -#define TSS_TYPEFIX_MASK 0xf9 - xorl %esi,%esi - movl PHYSICAL_GDT_BASE,%ebx - movw TASK_REGISTER,%si - leal (%ebx,%esi),%eax /* get TSS segment descriptor */ - andb $TSS_TYPEFIX_MASK,5(%eax) + /* + * Finally, re-enable paging. We have + * to be very careful here because all the segmentation disappears + * out from underneath us. The spec says we can depend on the + * subsequent pipelined branch to execute, but *only if* everything + * is still identity mapped. If any mappings change, the pipeline + * will flush. + */ + mov %cr0, %eax + orl $CR0_PG, %eax + mov %eax, %cr0 - /* Prepare to return to sleep/wakeup code point */ - lgdtl PREVIOUS_GDT - lidtl PREVIOUS_IDT - - /* Pack values from the GDT to be loaded into segment registers. */ - movl PREVIOUS_DS,%ebx - movl PREVIOUS_FS,%ecx - movl PREVIOUS_SS,%edx - movw TASK_REGISTER,%si - shll $16,%esi - movw PREVIOUS_LDT,%si - movl RECOVER_ADDR,%edi - - /* Enable paging and etc. */ - movl CR0_REGISTER,%eax - movl %eax,%cr0 - - /* Flush the prefetch queue */ jmp 1f -1: jmp 1f 1: + /* Jump to return address. */ + jmp *%edi - /* - * Now we are in kernel virtual memory addressing with the following - * original register values: - * %ebx: ds + es - * %ecx: fs + gs - * %edx: ss + dummy - * %esi: LDTR + TR - * %edi: recover address - * We'll load these back into the segment registers now.
- */ - nop + .data - movl %esi,%eax /* LDTR + TR */ - lldt %ax /* load LDT register */ - shrl $16,%eax - ltr %ax /* load task register */ +resume_beep: + .byte 0 +reset_video: + .byte 0 - /* Restore segment registers */ - movl %ebx,%eax /* ds + es */ - movw %ax,%ds - shrl $16,%eax - movw %ax,%es - movl %ecx,%eax /* fs + gs */ - movw %ax,%fs - shrl $16,%eax - movw %ax,%gs - movl %edx,%eax /* ss */ - movw %ax,%ss + ALIGN_DATA +bootgdt: + .long 0x00000000 + .long 0x00000000 - /* Jump to acpi_restorecpu() */ - jmp *%edi +bootcode32: + .long 0x0000ffff + .long 0x00cf9b00 -/* used in real mode */ -physical_gdt: .word 0 - .long 0 -physical_esp: .long 0 -previous_cr2: .long 0 -previous_cr3: .long 0 -previous_cr4: .long 0 -resume_beep: .long 0 -reset_video: .long 0 +bootdata32: + .long 0x0000ffff + .long 0x00cf9300 +bootgdtend: -/* - * Transfer from real mode to protected mode. The order of these variables - * is very important, DO NOT INSERT OR CHANGE unless you know why. - */ -previous_cr0: .long 0 -previous_tr: .word 0 -previous_gdt: .word 0 - .long 0 -previous_ldt: .word 0 -previous_idt: .word 0 - .long 0 -where_to_recover: .long 0 -previous_ds: .word 0 -previous_es: .word 0 -previous_fs: .word 0 -previous_gs: .word 0 -previous_ss: .word 0 -dummy: .word 0 +bootgdtdesc: + .word bootgdtend - bootgdt /* Length */ + .long bootgdt - wakeup_start /* Offset plus %ds << 4 */ + + ALIGN_DATA +wakeup_cr4: + .long 0 +wakeup_cr3: + .long 0 +wakeup_pcb: + .long 0 +wakeup_ret: + .long 0 +dummy: Index: stable/9/sys/i386/acpica/acpi_wakeup.c =================================================================== --- stable/9/sys/i386/acpica/acpi_wakeup.c (revision 235795) +++ stable/9/sys/i386/acpica/acpi_wakeup.c (revision 235796) @@ -1,371 +1,380 @@ /*- * Copyright (c) 2001 Takanori Watanabe - * Copyright (c) 2001 Mitsuru IWASAKI + * Copyright (c) 2001-2012 Mitsuru IWASAKI + * Copyright (c) 2003 Peter Wemm + * Copyright (c) 2008-2012 Jung-uk Kim * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include -#include -#include #include -#include +#include +#include #include #include -#include -#include +#include #include #include -#include -#include -#include -#include -#include +#include #include #include -#include +#include +#include +#include +#include +#ifdef SMP +#include +#include +#include +#endif + #include #include #include "acpi_wakecode.h" #include "acpi_wakedata.h" -/* Make sure the code is less than one page and leave room for the stack. */ +/* Make sure the code is less than a page and leave room for the stack. */ CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024); -#ifndef _SYS_CDEFS_H_ -#error this file needs sys/cdefs.h as a prerequisite +extern int acpi_resume_beep; +extern int acpi_reset_video; + +#ifdef SMP +extern struct pcb **susppcbs; +#else +static struct pcb **susppcbs; #endif -extern uint32_t acpi_resume_beep; -extern uint32_t acpi_reset_video; -extern void initializecpu(void); +static void *acpi_alloc_wakeup_handler(void); +static void acpi_stop_beep(void *); -static struct region_descriptor __used saved_idt, saved_gdt; -static struct region_descriptor *p_gdt; -static uint16_t __used saved_ldt; +#ifdef SMP +static int acpi_wakeup_ap(struct acpi_softc *, int); +static void acpi_wakeup_cpus(struct acpi_softc *, const cpuset_t *); +#endif -static uint32_t __used r_eax, r_ebx, r_ecx, r_edx, r_ebp, r_esi, r_edi, - r_efl, r_cr0, r_cr2, r_cr3, r_cr4, ret_addr; +#define ACPI_PAGETABLES 0 +#define WAKECODE_VADDR(sc) ((sc)->acpi_wakeaddr + (ACPI_PAGETABLES * PAGE_SIZE)) +#define WAKECODE_PADDR(sc) ((sc)->acpi_wakephys + (ACPI_PAGETABLES * PAGE_SIZE)) +#define WAKECODE_FIXUP(offset, type, val) do { \ + type *addr; \ + addr = (type *)(WAKECODE_VADDR(sc) + offset); \ + *addr = val; \ +} while (0) -static uint16_t __used r_cs, r_ds, r_es, r_fs, r_gs, r_ss, r_tr; -static uint32_t __used r_esp; +static void +acpi_stop_beep(void *arg) +{ -static void acpi_printcpu(void); -static void acpi_realmodeinst(void *arg, bus_dma_segment_t *segs, - int nsegs, int error); -static void acpi_alloc_wakeup_handler(void); + if (acpi_resume_beep != 0) + timer_spkr_release(); +} -/* XXX shut gcc up */ -extern int acpi_savecpu(void); -extern int acpi_restorecpu(void); +#ifdef SMP +static int +acpi_wakeup_ap(struct acpi_softc *sc, int cpu) +{ + int vector = (WAKECODE_PADDR(sc) >> 12) & 0xff; + int apic_id = cpu_apic_ids[cpu]; + int ms; -#ifdef __GNUCLIKE_ASM -__asm__(" \n\ - .text \n\ - .p2align 2, 0x90 \n\ - .type acpi_restorecpu, @function\n\ -acpi_restorecpu: \n\ - .align 4 \n\ - movl r_eax,%eax \n\ - movl r_ebx,%ebx \n\ - movl r_ecx,%ecx \n\ - movl r_edx,%edx \n\ - movl r_ebp,%ebp \n\ - movl r_esi,%esi \n\ - movl r_edi,%edi \n\ - movl r_esp,%esp \n\ - \n\ - pushl r_efl \n\ - popfl \n\ - \n\ - movl ret_addr,%eax \n\ - movl %eax,(%esp) \n\ - xorl %eax,%eax \n\ - ret \n\ - \n\ - .text \n\ - .p2align 2, 0x90 \n\ - .type acpi_savecpu, @function \n\ -acpi_savecpu: \n\ - movw %cs,r_cs \n\ - movw %ds,r_ds \n\ - movw %es,r_es \n\ - movw %fs,r_fs \n\ - movw %gs,r_gs \n\ - movw %ss,r_ss \n\ - \n\ - movl %eax,r_eax \n\ - movl %ebx,r_ebx \n\ - movl %ecx,r_ecx \n\ - movl %edx,r_edx \n\ - movl %ebp,r_ebp \n\ - movl %esi,r_esi \n\ - movl %edi,r_edi \n\ - \n\ - movl %cr0,%eax \n\ - movl %eax,r_cr0 \n\ - movl %cr2,%eax \n\ - movl %eax,r_cr2 \n\ - movl %cr3,%eax \n\ - movl %eax,r_cr3 \n\ - movl %cr4,%eax \n\ - movl %eax,r_cr4 \n\ - \n\ - pushfl \n\ - popl r_efl \n\ - \n\ - movl %esp,r_esp \n\ - \n\ - sgdt saved_gdt \n\ - sidt saved_idt \n\ - sldt saved_ldt \n\ - str r_tr \n\ - 
\n\ - movl (%esp),%eax \n\ - movl %eax,ret_addr \n\ - movl $1,%eax \n\ - ret \n\ -"); -#endif /* __GNUCLIKE_ASM */ + WAKECODE_FIXUP(wakeup_pcb, struct pcb *, susppcbs[cpu]); -static void -acpi_printcpu(void) -{ - printf("======== acpi_printcpu() debug dump ========\n"); - printf("gdt[%04x:%08x] idt[%04x:%08x] ldt[%04x] tr[%04x] efl[%08x]\n", - saved_gdt.rd_limit, saved_gdt.rd_base, - saved_idt.rd_limit, saved_idt.rd_base, - saved_ldt, r_tr, r_efl); - printf("eax[%08x] ebx[%08x] ecx[%08x] edx[%08x]\n", - r_eax, r_ebx, r_ecx, r_edx); - printf("esi[%08x] edi[%08x] ebp[%08x] esp[%08x]\n", - r_esi, r_edi, r_ebp, r_esp); - printf("cr0[%08x] cr2[%08x] cr3[%08x] cr4[%08x]\n", - r_cr0, r_cr2, r_cr3, r_cr4); - printf("cs[%04x] ds[%04x] es[%04x] fs[%04x] gs[%04x] ss[%04x]\n", - r_cs, r_ds, r_es, r_fs, r_gs, r_ss); + /* do an INIT IPI: assert RESET */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); + + /* wait for pending status end */ + lapic_ipi_wait(-1); + + /* do an INIT IPI: deassert RESET */ + lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL | + APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0); + + /* wait for pending status end */ + DELAY(10000); /* wait ~10 ms */ + lapic_ipi_wait(-1); + + /* + * Next we do a STARTUP IPI: the previous INIT IPI might still be + * latched (P5 bug); this first STARTUP would then terminate + * immediately, and the previously started INIT IPI would continue. Or + * the previous INIT IPI has already run, and this STARTUP IPI will + * run. Or the previous INIT IPI was ignored, and this STARTUP IPI + * will run. + */ + + /* do a STARTUP IPI */ + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | + vector, apic_id); + lapic_ipi_wait(-1); + DELAY(200); /* wait ~200 us */ + + /* + * Finally, we do a second STARTUP IPI: it should run if the + * previous STARTUP IPI was cancelled by a latched INIT IPI; otherwise + * it will be ignored, as only one STARTUP IPI is + * recognized after hardware RESET or INIT IPI. + */ + + lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | + APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | + vector, apic_id); + lapic_ipi_wait(-1); + DELAY(200); /* wait ~200 us */ + + /* Wait up to 5 seconds for it to start. */ + for (ms = 0; ms < 5000; ms++) { + if (susppcbs[cpu]->pcb_eip == 0) + return (1); /* return SUCCESS */ + DELAY(1000); + } + return (0); /* return FAILURE */ } -#define WAKECODE_FIXUP(offset, type, val) do { \ - type *addr; \ - addr = (type *)(sc->acpi_wakeaddr + offset); \ - *addr = val; \ -} while (0) +#define WARMBOOT_TARGET 0 +#define WARMBOOT_OFF (KERNBASE + 0x0467) +#define WARMBOOT_SEG (KERNBASE + 0x0469) -#define WAKECODE_BCOPY(offset, type, val) do { \ - void *addr; \ - addr = (void *)(sc->acpi_wakeaddr + offset); \ - bcopy(&(val), addr, sizeof(type)); \ -} while (0) +#define CMOS_REG (0x70) +#define CMOS_DATA (0x71) +#define BIOS_RESET (0x0f) +#define BIOS_WARM (0x0a) -/* Turn off bits 1&2 of the PIT, stopping the beep.
*/ static void -acpi_stop_beep(void *arg) +acpi_wakeup_cpus(struct acpi_softc *sc, const cpuset_t *wakeup_cpus) { - outb(0x61, inb(0x61) & ~0x3); + uint32_t mpbioswarmvec; + int cpu; + u_char mpbiosreason; + + /* save the current value of the warm-start vector */ + mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF); + outb(CMOS_REG, BIOS_RESET); + mpbiosreason = inb(CMOS_DATA); + + /* setup a vector to our boot code */ + *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET; + *((volatile u_short *)WARMBOOT_SEG) = WAKECODE_PADDR(sc) >> 4; + outb(CMOS_REG, BIOS_RESET); + outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ + + /* Wake up each AP. */ + for (cpu = 1; cpu < mp_ncpus; cpu++) { + if (!CPU_ISSET(cpu, wakeup_cpus)) + continue; + if (acpi_wakeup_ap(sc, cpu) == 0) { + /* restore the warmstart vector */ + *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; + panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)", + cpu, cpu_apic_ids[cpu]); + } + } + + /* restore the warmstart vector */ + *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; + + outb(CMOS_REG, BIOS_RESET); + outb(CMOS_DATA, mpbiosreason); } +#endif int acpi_sleep_machdep(struct acpi_softc *sc, int state) { - ACPI_STATUS status; - struct pmap *pm; - int ret; - uint32_t cr3; - u_long ef; +#ifdef SMP + cpuset_t wakeup_cpus; +#endif + register_t cr3, rf; + ACPI_STATUS status; + struct pmap *pm; + int ret; ret = -1; - if (sc->acpi_wakeaddr == 0) + + if (sc->acpi_wakeaddr == 0ul) return (ret); - AcpiSetFirmwareWakingVector(sc->acpi_wakephys); +#ifdef SMP + wakeup_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &wakeup_cpus); +#endif - ef = intr_disable(); + if (acpi_resume_beep != 0) + timer_spkr_acquire(); + + AcpiSetFirmwareWakingVector(WAKECODE_PADDR(sc)); + + rf = intr_disable(); intr_suspend(); /* - * Temporarily switch to the kernel pmap because it provides an - * identity mapping (setup at boot) for the low physical memory - * region containing the wakeup code. + * Temporarily switch to the kernel pmap because it provides + * an identity mapping (setup at boot) for the low physical + * memory region containing the wakeup code. 
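+ * The trampoline re-enables paging while %eip still holds a low + * physical address, so that address must remain identity mapped until + * the final jump into resumectx lands in kernel virtual address space.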
*/ pm = kernel_pmap; cr3 = rcr3(); #ifdef PAE load_cr3(vtophys(pm->pm_pdpt)); #else load_cr3(vtophys(pm->pm_pdir)); #endif - ret_addr = 0; - if (acpi_savecpu()) { - /* Execute Sleep */ + if (suspendctx(susppcbs[0])) { +#ifdef SMP + if (!CPU_EMPTY(&wakeup_cpus) && + suspend_cpus(wakeup_cpus) == 0) { + device_printf(sc->acpi_dev, "Failed to suspend APs\n"); + goto out; + } +#endif - p_gdt = (struct region_descriptor *) - (sc->acpi_wakeaddr + physical_gdt); - p_gdt->rd_limit = saved_gdt.rd_limit; - p_gdt->rd_base = vtophys(saved_gdt.rd_base); + WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0)); + WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0)); - WAKECODE_FIXUP(physical_esp, uint32_t, vtophys(r_esp)); - WAKECODE_FIXUP(previous_cr0, uint32_t, r_cr0); - WAKECODE_FIXUP(previous_cr2, uint32_t, r_cr2); - WAKECODE_FIXUP(previous_cr3, uint32_t, r_cr3); - WAKECODE_FIXUP(previous_cr4, uint32_t, r_cr4); + WAKECODE_FIXUP(wakeup_cr4, register_t, susppcbs[0]->pcb_cr4); + WAKECODE_FIXUP(wakeup_cr3, register_t, susppcbs[0]->pcb_cr3); - WAKECODE_FIXUP(resume_beep, uint32_t, acpi_resume_beep); - WAKECODE_FIXUP(reset_video, uint32_t, acpi_reset_video); + WAKECODE_FIXUP(wakeup_pcb, struct pcb *, susppcbs[0]); - WAKECODE_FIXUP(previous_tr, uint16_t, r_tr); - WAKECODE_BCOPY(previous_gdt, struct region_descriptor, saved_gdt); - WAKECODE_FIXUP(previous_ldt, uint16_t, saved_ldt); - WAKECODE_BCOPY(previous_idt, struct region_descriptor, saved_idt); - - WAKECODE_FIXUP(where_to_recover, void *, acpi_restorecpu); - - WAKECODE_FIXUP(previous_ds, uint16_t, r_ds); - WAKECODE_FIXUP(previous_es, uint16_t, r_es); - WAKECODE_FIXUP(previous_fs, uint16_t, r_fs); - WAKECODE_FIXUP(previous_gs, uint16_t, r_gs); - WAKECODE_FIXUP(previous_ss, uint16_t, r_ss); - - if (bootverbose) - acpi_printcpu(); - /* Call ACPICA to enter the desired sleep state */ if (state == ACPI_STATE_S4 && sc->acpi_s4bios) status = AcpiEnterSleepStateS4bios(); else status = AcpiEnterSleepState(state); if (status != AE_OK) { device_printf(sc->acpi_dev, - "AcpiEnterSleepState failed - %s\n", - AcpiFormatException(status)); + "AcpiEnterSleepState failed - %s\n", + AcpiFormatException(status)); goto out; } for (;;) ia32_pause(); } else { pmap_init_pat(); + initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); - if (bootverbose) { - acpi_savecpu(); - acpi_printcpu(); - } +#ifdef SMP + if (!CPU_EMPTY(&wakeup_cpus)) + acpi_wakeup_cpus(sc, &wakeup_cpus); +#endif ret = 0; } out: +#ifdef SMP + if (!CPU_EMPTY(&wakeup_cpus)) + restart_cpus(wakeup_cpus); +#endif + load_cr3(cr3); mca_resume(); intr_resume(); - intr_restore(ef); + intr_restore(rf); + AcpiSetFirmwareWakingVector(0); + if (ret == 0 && mem_range_softc.mr_op != NULL && mem_range_softc.mr_op->reinit != NULL) mem_range_softc.mr_op->reinit(&mem_range_softc); - /* If we beeped, turn it off after a delay. */ - if (acpi_resume_beep) - timeout(acpi_stop_beep, NULL, 3 * hz); - return (ret); } -static bus_dma_tag_t acpi_waketag; -static bus_dmamap_t acpi_wakemap; -static vm_offset_t acpi_wakeaddr; - -static void +static void * acpi_alloc_wakeup_handler(void) { - void *wakeaddr; + void *wakeaddr; + int i; - if (!cold) - return; - /* * Specify the region for our wakeup code. We want it in the low 1 MB - * region, excluding video memory and above (0xa0000). We ask for - * it to be page-aligned, just to be safe. 
+ * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA + * (less than 128KB, below 0xa0000, must be excluded by SMAP and DSDT), + * and ROM area (0xa0000 and above). The temporary page tables must be + * page-aligned. */ - if (bus_dma_tag_create(/*parent*/ NULL, - /*alignment*/ PAGE_SIZE, /*no boundary*/ 0, - /*lowaddr*/ 0x9ffff, /*highaddr*/ BUS_SPACE_MAXADDR, NULL, NULL, - /*maxsize*/ PAGE_SIZE, /*segments*/ 1, /*maxsegsize*/ PAGE_SIZE, - 0, busdma_lock_mutex, &Giant, &acpi_waketag) != 0) { - printf("acpi_alloc_wakeup_handler: can't create wake tag\n"); - return; + wakeaddr = contigmalloc((ACPI_PAGETABLES + 1) * PAGE_SIZE, M_DEVBUF, + M_NOWAIT, 0x500, 0xa0000, PAGE_SIZE, 0ul); + if (wakeaddr == NULL) { + printf("%s: can't alloc wake memory\n", __func__); + return (NULL); } - if (bus_dmamem_alloc(acpi_waketag, &wakeaddr, BUS_DMA_NOWAIT, - &acpi_wakemap) != 0) { - printf("acpi_alloc_wakeup_handler: can't alloc wake memory\n"); - return; + if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL, + EVENTHANDLER_PRI_LAST) == NULL) { + printf("%s: can't register event handler\n", __func__); + contigfree(wakeaddr, (ACPI_PAGETABLES + 1) * PAGE_SIZE, M_DEVBUF); + return (NULL); } - acpi_wakeaddr = (vm_offset_t)wakeaddr; -} - -SYSINIT(acpiwakeup, SI_SUB_KMEM, SI_ORDER_ANY, acpi_alloc_wakeup_handler, 0); - -static void -acpi_realmodeinst(void *arg, bus_dma_segment_t *segs, int nsegs, int error) -{ - struct acpi_softc *sc; - uint32_t *addr; - - /* Overwrite the ljmp target with the real address */ - sc = arg; - sc->acpi_wakephys = segs[0].ds_addr; - addr = (uint32_t *)&wakecode[wakeup_sw32 + 2]; - *addr = sc->acpi_wakephys + wakeup_32; - - /* Copy the wake code into our low page and save its physical addr. */ - bcopy(wakecode, (void *)sc->acpi_wakeaddr, sizeof(wakecode)); - if (bootverbose) { - device_printf(sc->acpi_dev, "wakeup code va %#x pa %#jx\n", - acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys); + susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK); + for (i = 0; i < mp_ncpus; i++) { + susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK); } + + return (wakeaddr); } void acpi_install_wakeup_handler(struct acpi_softc *sc) { - if (acpi_wakeaddr == 0) + static void *wakeaddr = NULL; + + if (wakeaddr != NULL) return; - sc->acpi_waketag = acpi_waketag; - sc->acpi_wakeaddr = acpi_wakeaddr; - sc->acpi_wakemap = acpi_wakemap; + wakeaddr = acpi_alloc_wakeup_handler(); + if (wakeaddr == NULL) + return; - bus_dmamap_load(sc->acpi_waketag, sc->acpi_wakemap, - (void *)sc->acpi_wakeaddr, PAGE_SIZE, acpi_realmodeinst, sc, 0); + sc->acpi_wakeaddr = (vm_offset_t)wakeaddr; + sc->acpi_wakephys = vtophys(wakeaddr); + + bcopy(wakecode, (void *)WAKECODE_VADDR(sc), sizeof(wakecode)); + + /* Patch GDT base address, ljmp target. */ + WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t, + WAKECODE_PADDR(sc) + bootgdt); + WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t, + WAKECODE_PADDR(sc) + wakeup_32); + + /* Save pointers to some global data. */ + WAKECODE_FIXUP(wakeup_ret, void *, resumectx); + + if (bootverbose) + device_printf(sc->acpi_dev, "wakeup code va %#x pa %#jx\n", + sc->acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys); } Index: stable/9/sys/i386/i386/apic_vector.s =================================================================== --- stable/9/sys/i386/i386/apic_vector.s (revision 235795) +++ stable/9/sys/i386/i386/apic_vector.s (revision 235796) @@ -1,376 +1,396 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: vector.s, 386BSD 0.1 unknown origin * $FreeBSD$ */ /* * Interrupt entry points for external interrupts triggered by I/O APICs * as well as IPI handlers. */ #include "opt_smp.h" #include #include #include "assym.s" /* * I/O Interrupt Entry Point. Rather than having one entry point for * each interrupt source, we use one entry point for each 32-bit word * in the ISR. The handler determines the highest bit set in the ISR, * translates that into a vector, and passes the vector to the * lapic_handle_intr() function. */ #define ISR_VEC(index, vec_name) \ .text ; \ SUPERALIGN_TEXT ; \ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ SET_KERNEL_SREGS ; \ cld ; \ FAKE_MCOUNT(TF_EIP(%esp)) ; \ movl lapic, %edx ; /* pointer to local APIC */ \ movl LA_ISR + 16 * (index)(%edx), %eax ; /* load ISR */ \ bsrl %eax, %eax ; /* index of highest set bit in ISR */ \ jz 1f ; \ addl $(32 * index),%eax ; \ pushl %esp ; \ pushl %eax ; /* pass the IRQ */ \ call lapic_handle_intr ; \ addl $8, %esp ; /* discard parameter */ \ 1: ; \ MEXITCOUNT ; \ jmp doreti /* * Handle "spurious INTerrupts". * Notes: * This is different than the "spurious INTerrupt" generated by an * 8259 PIC for missing INTs. See the APIC documentation for details. * This routine should NOT do an 'EOI' cycle. */ .text SUPERALIGN_TEXT IDTVEC(spuriousint) /* No EOI cycle used here */ iret ISR_VEC(1, apic_isr1) ISR_VEC(2, apic_isr2) ISR_VEC(3, apic_isr3) ISR_VEC(4, apic_isr4) ISR_VEC(5, apic_isr5) ISR_VEC(6, apic_isr6) ISR_VEC(7, apic_isr7) /* * Local APIC periodic timer handler. */ .text SUPERALIGN_TEXT IDTVEC(timerint) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp call lapic_handle_timer add $4, %esp MEXITCOUNT jmp doreti /* * Local APIC CMCI handler. */ .text SUPERALIGN_TEXT IDTVEC(cmcint) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) call lapic_handle_cmc MEXITCOUNT jmp doreti /* * Local APIC error interrupt handler. 
*/ .text SUPERALIGN_TEXT IDTVEC(errorint) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) call lapic_handle_error MEXITCOUNT jmp doreti #ifdef SMP /* * Global address space TLB shootdown. */ .text SUPERALIGN_TEXT IDTVEC(invltlb) pushl %eax pushl %ds movl $KDSEL, %eax /* Kernel data selector */ movl %eax, %ds #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) pushl %fs movl $KPSEL, %eax /* Private space selector */ movl %eax, %fs movl PCPU(CPUID), %eax popl %fs #ifdef COUNT_XINVLTLB_HITS incl xhits_gbl(,%eax,4) #endif #ifdef COUNT_IPIS movl ipi_invltlb_counts(,%eax,4),%eax incl (%eax) #endif #endif movl %cr3, %eax /* invalidate the TLB */ movl %eax, %cr3 movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popl %ds popl %eax iret /* * Single page TLB shootdown */ .text SUPERALIGN_TEXT IDTVEC(invlpg) pushl %eax pushl %ds movl $KDSEL, %eax /* Kernel data selector */ movl %eax, %ds #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) pushl %fs movl $KPSEL, %eax /* Private space selector */ movl %eax, %fs movl PCPU(CPUID), %eax popl %fs #ifdef COUNT_XINVLTLB_HITS incl xhits_pg(,%eax,4) #endif #ifdef COUNT_IPIS movl ipi_invlpg_counts(,%eax,4),%eax incl (%eax) #endif #endif movl smp_tlb_addr1, %eax invlpg (%eax) /* invalidate single page */ movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popl %ds popl %eax iret /* * Page range TLB shootdown. */ .text SUPERALIGN_TEXT IDTVEC(invlrng) pushl %eax pushl %edx pushl %ds movl $KDSEL, %eax /* Kernel data selector */ movl %eax, %ds #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) pushl %fs movl $KPSEL, %eax /* Private space selector */ movl %eax, %fs movl PCPU(CPUID), %eax popl %fs #ifdef COUNT_XINVLTLB_HITS incl xhits_rng(,%eax,4) #endif #ifdef COUNT_IPIS movl ipi_invlrng_counts(,%eax,4),%eax incl (%eax) #endif #endif movl smp_tlb_addr1, %edx movl smp_tlb_addr2, %eax 1: invlpg (%edx) /* invalidate single page */ addl $PAGE_SIZE, %edx cmpl %eax, %edx jb 1b movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popl %ds popl %edx popl %eax iret /* * Invalidate cache. */ .text SUPERALIGN_TEXT IDTVEC(invlcache) pushl %eax pushl %ds movl $KDSEL, %eax /* Kernel data selector */ movl %eax, %ds #ifdef COUNT_IPIS pushl %fs movl $KPSEL, %eax /* Private space selector */ movl %eax, %fs movl PCPU(CPUID), %eax popl %fs movl ipi_invlcache_counts(,%eax,4),%eax incl (%eax) #endif wbinvd movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popl %ds popl %eax iret /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ #ifndef XEN .text SUPERALIGN_TEXT IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME SET_KERNEL_SREGS cld movl lapic, %edx movl $0, LA_EOI(%edx) /* End Of Interrupt to APIC */ FAKE_MCOUNT(TF_EIP(%esp)) call ipi_bitmap_handler MEXITCOUNT jmp doreti #endif /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ .text SUPERALIGN_TEXT IDTVEC(cpustop) PUSH_FRAME SET_KERNEL_SREGS cld movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ call cpustop_handler POP_FRAME iret /* + * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. 
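+ * The handler saves this CPU's context into its susppcbs[] entry and + * spins until the BSP releases it via restart_cpus() after resume.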
+ */ +#ifndef XEN + .text + SUPERALIGN_TEXT +IDTVEC(cpususpend) + PUSH_FRAME + SET_KERNEL_SREGS + cld + + movl lapic, %eax + movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ + + call cpususpend_handler + + POP_FRAME + jmp doreti_iret +#endif + +/* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. * * - Calls the generic rendezvous action function. */ .text SUPERALIGN_TEXT IDTVEC(rendezvous) PUSH_FRAME SET_KERNEL_SREGS cld #ifdef COUNT_IPIS movl PCPU(CPUID), %eax movl ipi_rendezvous_counts(,%eax,4), %eax incl (%eax) #endif call smp_rendezvous_action movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ POP_FRAME iret /* * Clean up when we lose out on the lazy context switch optimization. * ie: when we are about to release a PTD but a cpu is still borrowing it. */ SUPERALIGN_TEXT IDTVEC(lazypmap) PUSH_FRAME SET_KERNEL_SREGS cld call pmap_lazyfix_action movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ POP_FRAME iret #endif /* SMP */ Index: stable/9/sys/i386/i386/genassym.c =================================================================== --- stable/9/sys/i386/i386/genassym.c (revision 235795) +++ stable/9/sys/i386/i386/genassym.c (revision 235796) @@ -1,243 +1,256 @@ /*- * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_kstack_pages.h" #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #endif #include #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_MD, offsetof(struct thread, td_md)); ASSYM(TD_TID, offsetof(struct thread, td_tid)); ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); /* ASSYM(UPAGES, UPAGES);*/ ASSYM(KSTACK_PAGES, KSTACK_PAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(NPDEPTD, NPDEPTD); ASSYM(NPGPTD, NPGPTD); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PDESHIFT, PDESHIFT); ASSYM(PTESHIFT, PTESHIFT); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(PDRMASK, PDRMASK); ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(KERNLOAD, KERNLOAD); ASSYM(MCLBYTES, MCLBYTES); +ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0)); +ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2)); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); +ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); +ASSYM(PCB_DS, offsetof(struct pcb, pcb_ds)); +ASSYM(PCB_ES, offsetof(struct pcb, pcb_es)); +ASSYM(PCB_FS, offsetof(struct pcb, pcb_fs)); ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); +ASSYM(PCB_SS, offsetof(struct pcb, pcb_ss)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_USERFPU, offsetof(struct pcb, pcb_user_save)); ASSYM(PCB_PSL, offsetof(struct pcb, pcb_psl)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd)); +ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd)); ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); 
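/* These ASSYM() constants are emitted into assym.s, which lets assembly code such as suspendctx() and resumectx() address struct pcb fields by name. */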
ASSYM(PCB_SIZE, sizeof(struct pcb)); ASSYM(PCB_VM86CALL, PCB_VM86CALL); + +ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt)); +ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt)); +ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt)); +ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr)); ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); #ifdef COMPAT_43 ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); #endif ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); #ifdef COMPAT_FREEBSD4 ASSYM(SIGF_UC4, offsetof(struct sigframe4, sf_uc)); #endif #ifdef COMPAT_43 ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); #endif #ifdef COMPAT_FREEBSD4 ASSYM(UC4_EFLAGS, offsetof(struct ucontext4, uc_mcontext.mc_eflags)); ASSYM(UC4_GS, offsetof(struct ucontext4, uc_mcontext.mc_gs)); #endif ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_COMMON_TSS, offsetof(struct pcpu, pc_common_tss)); ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd)); ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt)); ASSYM(PC_FSGS_GDT, offsetof(struct pcpu, pc_fsgs_gdt)); ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss)); #ifdef DEV_APIC ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); ASSYM(LA_ISR, offsetof(struct LAPIC, isr0)); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); #ifdef PC98 #include ASSYM(BUS_SPACE_HANDLE_BASE, offsetof(struct bus_space_handle, bsh_base)); ASSYM(BUS_SPACE_HANDLE_IAT, 
offsetof(struct bus_space_handle, bsh_iat)); #endif #ifdef XEN #include ASSYM(PC_CR3, offsetof(struct pcpu, pc_cr3)); ASSYM(HYPERVISOR_VIRT_START, __HYPERVISOR_VIRT_START); #endif #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif Index: stable/9/sys/i386/i386/initcpu.c =================================================================== --- stable/9/sys/i386/i386/initcpu.c (revision 235795) +++ stable/9/sys/i386/i386/initcpu.c (revision 235796) @@ -1,1041 +1,1040 @@ /*- * Copyright (c) KATO Takenori, 1997, 1998. * * All rights reserved. Unpublished rights reserved under the copyright * laws of Japan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer as * the first lines of this file unmodified. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif -void initializecpu(void); #if defined(I586_CPU) && defined(CPU_WT_ALLOC) void enable_K5_wt_alloc(void); void enable_K6_wt_alloc(void); void enable_K6_2_wt_alloc(void); #endif #ifdef I486_CPU static void init_5x86(void); static void init_bluelightning(void); static void init_486dlc(void); static void init_cy486dx(void); #ifdef CPU_I486_ON_386 static void init_i486_on_386(void); #endif static void init_6x86(void); #endif /* I486_CPU */ #ifdef I686_CPU static void init_6x86MX(void); static void init_ppro(void); static void init_mendocino(void); #endif static int hw_instruction_sse; SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU"); /* * -1: automatic (default) * 0: keep enable CLFLUSH * 1: force disable CLFLUSH */ static int hw_clflush_disable = -1; /* Must *NOT* be BSS or locore will bzero these after setting them */ int cpu = 0; /* Are we 386, 386sx, 486, etc? 
*/ u_int cpu_feature = 0; /* Feature flags */ u_int cpu_feature2 = 0; /* Feature flags */ u_int amd_feature = 0; /* AMD feature flags */ u_int amd_feature2 = 0; /* AMD feature flags */ u_int amd_pminfo = 0; /* AMD advanced power management info */ u_int via_feature_rng = 0; /* VIA RNG features */ u_int via_feature_xcrypt = 0; /* VIA ACE features */ u_int cpu_high = 0; /* Highest arg to CPUID */ u_int cpu_id = 0; /* Stepping ID */ u_int cpu_procinfo = 0; /* HyperThreading Info / Brand Index / CLFUSH */ u_int cpu_procinfo2 = 0; /* Multicore info */ char cpu_vendor[20] = ""; /* CPU Origin code */ u_int cpu_vendor_id = 0; /* CPU vendor ID */ u_int cpu_clflush_line_size = 32; SYSCTL_UINT(_hw, OID_AUTO, via_feature_rng, CTLFLAG_RD, &via_feature_rng, 0, "VIA RNG feature available in CPU"); SYSCTL_UINT(_hw, OID_AUTO, via_feature_xcrypt, CTLFLAG_RD, &via_feature_xcrypt, 0, "VIA xcrypt feature available in CPU"); #ifdef CPU_ENABLE_SSE u_int cpu_fxsr; /* SSE enabled */ u_int cpu_mxcsr_mask; /* valid bits in mxcsr */ #endif #ifdef I486_CPU /* * IBM Blue Lightning */ static void init_bluelightning(void) { register_t saveintr; #if defined(PC98) && !defined(CPU_UPGRADE_HW_CACHE) need_post_dma_flush = 1; #endif saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); invd(); #ifdef CPU_BLUELIGHTNING_FPU_OP_CACHE wrmsr(0x1000, 0x9c92LL); /* FP operand can be cacheable on Cyrix FPU */ #else wrmsr(0x1000, 0x1c92LL); /* Intel FPU */ #endif /* Enables 13MB and 0-640KB cache. */ wrmsr(0x1001, (0xd0LL << 32) | 0x3ff); #ifdef CPU_BLUELIGHTNING_3X wrmsr(0x1002, 0x04000000LL); /* Enables triple-clock mode. */ #else wrmsr(0x1002, 0x03000000LL); /* Enables double-clock mode. */ #endif /* Enable caching in CR0. */ load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ invd(); intr_restore(saveintr); } /* * Cyrix 486SLC/DLC/SR/DR series */ static void init_486dlc(void) { register_t saveintr; u_char ccr0; saveintr = intr_disable(); invd(); ccr0 = read_cyrix_reg(CCR0); #ifndef CYRIX_CACHE_WORKS ccr0 |= CCR0_NC1 | CCR0_BARB; write_cyrix_reg(CCR0, ccr0); invd(); #else ccr0 &= ~CCR0_NC0; #ifndef CYRIX_CACHE_REALLY_WORKS ccr0 |= CCR0_NC1 | CCR0_BARB; #else ccr0 |= CCR0_NC1; #endif #ifdef CPU_DIRECT_MAPPED_CACHE ccr0 |= CCR0_CO; /* Direct mapped mode. */ #endif write_cyrix_reg(CCR0, ccr0); /* Clear non-cacheable region. */ write_cyrix_reg(NCR1+2, NCR_SIZE_0K); write_cyrix_reg(NCR2+2, NCR_SIZE_0K); write_cyrix_reg(NCR3+2, NCR_SIZE_0K); write_cyrix_reg(NCR4+2, NCR_SIZE_0K); write_cyrix_reg(0, 0); /* dummy write */ /* Enable caching in CR0. */ load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ invd(); #endif /* !CYRIX_CACHE_WORKS */ intr_restore(saveintr); } /* * Cyrix 486S/DX series */ static void init_cy486dx(void) { register_t saveintr; u_char ccr2; saveintr = intr_disable(); invd(); ccr2 = read_cyrix_reg(CCR2); #ifdef CPU_SUSP_HLT ccr2 |= CCR2_SUSP_HLT; #endif #ifdef PC98 /* Enables WB cache interface pin and Lock NW bit in CR0. */ ccr2 |= CCR2_WB | CCR2_LOCK_NW; /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, ccr2 & ~CCR2_LOCK_NW); load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0, NW = 1 */ #endif write_cyrix_reg(CCR2, ccr2); intr_restore(saveintr); } /* * Cyrix 5x86 */ static void init_5x86(void) { register_t saveintr; u_char ccr2, ccr3, ccr4, pcr0; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); (void)read_cyrix_reg(CCR3); /* dummy */ /* Initialize CCR2. 
*/ ccr2 = read_cyrix_reg(CCR2); ccr2 |= CCR2_WB; #ifdef CPU_SUSP_HLT ccr2 |= CCR2_SUSP_HLT; #else ccr2 &= ~CCR2_SUSP_HLT; #endif ccr2 |= CCR2_WT1; write_cyrix_reg(CCR2, ccr2); /* Initialize CCR4. */ ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); ccr4 = read_cyrix_reg(CCR4); ccr4 |= CCR4_DTE; ccr4 |= CCR4_MEM; #ifdef CPU_FASTER_5X86_FPU ccr4 |= CCR4_FASTFPE; #else ccr4 &= ~CCR4_FASTFPE; #endif ccr4 &= ~CCR4_IOMASK; /******************************************************************** * WARNING: The "BIOS Writers Guide" mentions that I/O recovery time * should be 0 for errata fix. ********************************************************************/ #ifdef CPU_IORT ccr4 |= CPU_IORT & CCR4_IOMASK; #endif write_cyrix_reg(CCR4, ccr4); /* Initialize PCR0. */ /**************************************************************** * WARNING: RSTK_EN and LOOP_EN could make your system unstable. * BTB_EN might make your system unstable. ****************************************************************/ pcr0 = read_cyrix_reg(PCR0); #ifdef CPU_RSTK_EN pcr0 |= PCR0_RSTK; #else pcr0 &= ~PCR0_RSTK; #endif #ifdef CPU_BTB_EN pcr0 |= PCR0_BTB; #else pcr0 &= ~PCR0_BTB; #endif #ifdef CPU_LOOP_EN pcr0 |= PCR0_LOOP; #else pcr0 &= ~PCR0_LOOP; #endif /**************************************************************** * WARNING: if you use a memory mapped I/O device, don't use * DISABLE_5X86_LSSER option, which may reorder memory mapped * I/O access. * IF YOUR MOTHERBOARD HAS PCI BUS, DON'T DISABLE LSSER. ****************************************************************/ #ifdef CPU_DISABLE_5X86_LSSER pcr0 &= ~PCR0_LSSER; #else pcr0 |= PCR0_LSSER; #endif write_cyrix_reg(PCR0, pcr0); /* Restore CCR3. */ write_cyrix_reg(CCR3, ccr3); (void)read_cyrix_reg(0x80); /* dummy */ /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0, NW = 1 */ /* Lock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); intr_restore(saveintr); } #ifdef CPU_I486_ON_386 /* * There are i486 based upgrade products for i386 machines. * In this case, BIOS doesn't enable CPU cache. */ static void init_i486_on_386(void) { register_t saveintr; #if defined(PC98) && !defined(CPU_UPGRADE_HW_CACHE) need_post_dma_flush = 1; #endif saveintr = intr_disable(); load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0, NW = 0 */ intr_restore(saveintr); } #endif /* * Cyrix 6x86 * * XXX - What should I do here? Please let me know. */ static void init_6x86(void) { register_t saveintr; u_char ccr3, ccr4; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); /* Initialize CCR0. */ write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1); /* Initialize CCR1. */ #ifdef CPU_CYRIX_NO_LOCK write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK); #else write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK); #endif /* Initialize CCR2. */ #ifdef CPU_SUSP_HLT write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT); #else write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT); #endif ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); /* Initialize CCR4. */ ccr4 = read_cyrix_reg(CCR4); ccr4 |= CCR4_DTE; ccr4 &= ~CCR4_IOMASK; #ifdef CPU_IORT write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK)); #else write_cyrix_reg(CCR4, ccr4 | 7); #endif /* Initialize CCR5. */ #ifdef CPU_WT_ALLOC write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC); #endif /* Restore CCR3. 
*/ write_cyrix_reg(CCR3, ccr3); /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); /* * Earlier revisions of the 6x86 CPU could crash the system if * the L1 cache is in write-back mode. */ if ((cyrix_did & 0xff00) > 0x1600) load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ else { /* Revision 2.6 and lower. */ #ifdef CYRIX_CACHE_REALLY_WORKS load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ #else load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0 and NW = 1 */ #endif } /* Lock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); intr_restore(saveintr); } #endif /* I486_CPU */ #ifdef I586_CPU /* * IDT WinChip C6/2/2A/2B/3 * * http://www.centtech.com/winchip_bios_writers_guide_v4_0.pdf */ static void init_winchip(void) { u_int regs[4]; uint64_t fcr; fcr = rdmsr(0x0107); /* * Set ECX8, DSMC, DTLOCK/EDCTLB, EMMX, and ERETSTK and clear DPDC. */ fcr |= (1 << 1) | (1 << 7) | (1 << 8) | (1 << 9) | (1 << 16); fcr &= ~(1ULL << 11); /* * Additionally, set EBRPRED, E2MMX and EAMD3D for WinChip 2 and 3. */ if (CPUID_TO_MODEL(cpu_id) >= 8) fcr |= (1 << 12) | (1 << 19) | (1 << 20); wrmsr(0x0107, fcr); do_cpuid(1, regs); cpu_feature = regs[3]; } #endif #ifdef I686_CPU /* * Cyrix 6x86MX (code-named M2) * * XXX - What should I do here? Please let me know. */ static void init_6x86MX(void) { register_t saveintr; u_char ccr3, ccr4; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); /* Initialize CCR0. */ write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1); /* Initialize CCR1. */ #ifdef CPU_CYRIX_NO_LOCK write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK); #else write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK); #endif /* Initialize CCR2. */ #ifdef CPU_SUSP_HLT write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT); #else write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT); #endif ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); /* Initialize CCR4. */ ccr4 = read_cyrix_reg(CCR4); ccr4 &= ~CCR4_IOMASK; #ifdef CPU_IORT write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK)); #else write_cyrix_reg(CCR4, ccr4 | 7); #endif /* Initialize CCR5. */ #ifdef CPU_WT_ALLOC write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC); #endif /* Restore CCR3. */ write_cyrix_reg(CCR3, ccr3); /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ /* Lock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); intr_restore(saveintr); } static void init_ppro(void) { u_int64_t apicbase; /* * Local APIC should be disabled if it is not going to be used. */ apicbase = rdmsr(MSR_APICBASE); apicbase &= ~APICBASE_ENABLED; wrmsr(MSR_APICBASE, apicbase); } /* * Initialize BBL_CR_CTL3 (Control register 3: used to configure the * L2 cache). */ static void init_mendocino(void) { #ifdef CPU_PPRO2CELERON register_t saveintr; u_int64_t bbl_cr_ctl3; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); bbl_cr_ctl3 = rdmsr(MSR_BBL_CR_CTL3); /* If the L2 cache is configured, do nothing. */ if (!(bbl_cr_ctl3 & 1)) { bbl_cr_ctl3 = 0x134052bLL; /* Set L2 Cache Latency (Default: 5). */ #ifdef CPU_CELERON_L2_LATENCY #if CPU_L2_LATENCY > 15 #error invalid CPU_L2_LATENCY.
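/* The latency value occupies bits 1-4 of BBL_CR_CTL3, hence the limit of 15 and the << 1 shift below. */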
#endif bbl_cr_ctl3 |= CPU_L2_LATENCY << 1; #else bbl_cr_ctl3 |= 5 << 1; #endif wrmsr(MSR_BBL_CR_CTL3, bbl_cr_ctl3); } load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); intr_restore(saveintr); #endif /* CPU_PPRO2CELERON */ } /* * Initialize special VIA features */ static void init_via(void) { u_int regs[4], val; uint64_t fcr; /* * Explicitly enable CX8 and PGE on C3. * * http://www.via.com.tw/download/mainboards/6/13/VIA_C3_EBGA%20datasheet110.pdf */ if (CPUID_TO_MODEL(cpu_id) <= 9) fcr = (1 << 1) | (1 << 7); else fcr = 0; /* * Check extended CPUID for PadLock features. * * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf */ do_cpuid(0xc0000000, regs); if (regs[0] >= 0xc0000001) { do_cpuid(0xc0000001, regs); val = regs[3]; } else val = 0; /* Enable RNG if present. */ if ((val & VIA_CPUID_HAS_RNG) != 0) { via_feature_rng = VIA_HAS_RNG; wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG); } /* Enable PadLock if present. */ if ((val & VIA_CPUID_HAS_ACE) != 0) via_feature_xcrypt |= VIA_HAS_AES; if ((val & VIA_CPUID_HAS_ACE2) != 0) via_feature_xcrypt |= VIA_HAS_AESCTR; if ((val & VIA_CPUID_HAS_PHE) != 0) via_feature_xcrypt |= VIA_HAS_SHA; if ((val & VIA_CPUID_HAS_PMM) != 0) via_feature_xcrypt |= VIA_HAS_MM; if (via_feature_xcrypt != 0) fcr |= 1 << 28; wrmsr(0x1107, rdmsr(0x1107) | fcr); } #endif /* I686_CPU */ #if defined(I586_CPU) || defined(I686_CPU) static void init_transmeta(void) { u_int regs[4]; /* Expose all hidden features. */ wrmsr(0x80860004, rdmsr(0x80860004) | ~0UL); do_cpuid(1, regs); cpu_feature = regs[3]; } #endif /* * Initialize CR4 (Control register 4) to enable SSE instructions. */ void enable_sse(void) { #if defined(CPU_ENABLE_SSE) if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { load_cr4(rcr4() | CR4_FXSR | CR4_XMM); cpu_fxsr = hw_instruction_sse = 1; } #endif } void initializecpu(void) { switch (cpu) { #ifdef I486_CPU case CPU_BLUE: init_bluelightning(); break; case CPU_486DLC: init_486dlc(); break; case CPU_CY486DX: init_cy486dx(); break; case CPU_M1SC: init_5x86(); break; #ifdef CPU_I486_ON_386 case CPU_486: init_i486_on_386(); break; #endif case CPU_M1: init_6x86(); break; #endif /* I486_CPU */ #ifdef I586_CPU case CPU_586: switch (cpu_vendor_id) { case CPU_VENDOR_CENTAUR: init_winchip(); break; case CPU_VENDOR_TRANSMETA: init_transmeta(); break; } break; #endif #ifdef I686_CPU case CPU_M2: init_6x86MX(); break; case CPU_686: switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: switch (cpu_id & 0xff0) { case 0x610: init_ppro(); break; case 0x660: init_mendocino(); break; } break; #ifdef CPU_ATHLON_SSE_HACK case CPU_VENDOR_AMD: /* * Sometimes the BIOS doesn't enable SSE instructions. * According to AMD document 20734, the mobile * Duron, the (mobile) Athlon 4 and the Athlon MP * support SSE. These correspond to cpu_id 0x66X, * 0x67X or 0x68X. 
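 *
 * For reference, cpu_id is the raw CPUID leaf 1 %eax value, so masking
 * off the low nibble compares family and model while ignoring the
 * stepping.  A small illustrative decode (base fields only, ignoring
 * the extended family/model bits newer CPUs define):
 *
 *	u_int stepping = cpu_id & 0xf;		// bits 3..0
 *	u_int model = (cpu_id >> 4) & 0xf;	// bits 7..4
 *	u_int family = (cpu_id >> 8) & 0xf;	// bits 11..8
 *
 * Thus "(cpu_id & ~0xf) == 0x660" below means family 6, model 6, any
 * stepping.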
*/ if ((cpu_feature & CPUID_XMM) == 0 && ((cpu_id & ~0xf) == 0x660 || (cpu_id & ~0xf) == 0x670 || (cpu_id & ~0xf) == 0x680)) { u_int regs[4]; wrmsr(MSR_HWCR, rdmsr(MSR_HWCR) & ~0x08000); do_cpuid(1, regs); cpu_feature = regs[3]; } break; #endif case CPU_VENDOR_CENTAUR: init_via(); break; case CPU_VENDOR_TRANSMETA: init_transmeta(); break; } #ifdef PAE if ((amd_feature & AMDID_NX) != 0) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); pg_nx = PG_NX; } #endif break; #endif default: break; } enable_sse(); /* * CPUID with %eax = 1, %ebx returns * Bits 15-8: CLFLUSH line size * (Value * 8 = cache line size in bytes) */ if ((cpu_feature & CPUID_CLFSH) != 0) cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8; /* * XXXKIB: (temporary) hack to work around traps generated * when CLFLUSHing APIC register window under virtualization * environments. These environments tend to disable the * CPUID_SS feature even though the native CPU supports it. */ TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable); if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) cpu_feature &= ~CPUID_CLFSH; /* * Allow the CLFLUSH feature to be disabled manually via the * hw.clflush_disable tunable. */ if (hw_clflush_disable == 1) cpu_feature &= ~CPUID_CLFSH; #if defined(PC98) && !defined(CPU_UPGRADE_HW_CACHE) /* * OS should flush L1 cache by itself because no PC-98 supports * non-Intel CPUs. Use wbinvd instruction before DMA transfer * when need_pre_dma_flush = 1, use invd instruction after DMA * transfer when need_post_dma_flush = 1. If your CPU upgrade * product supports hardware cache control, you can add the * CPU_UPGRADE_HW_CACHE option in your kernel configuration file. * This option eliminates unneeded cache flush instruction(s). */ if (cpu_vendor_id == CPU_VENDOR_CYRIX) { switch (cpu) { #ifdef I486_CPU case CPU_486DLC: need_post_dma_flush = 1; break; case CPU_M1SC: need_pre_dma_flush = 1; break; case CPU_CY486DX: need_pre_dma_flush = 1; #ifdef CPU_I486_ON_386 need_post_dma_flush = 1; #endif break; #endif default: break; } } else if (cpu_vendor_id == CPU_VENDOR_AMD) { switch (cpu_id & 0xFF0) { case 0x470: /* Enhanced Am486DX2 WB */ case 0x490: /* Enhanced Am486DX4 WB */ case 0x4F0: /* Am5x86 WB */ need_pre_dma_flush = 1; break; } } else if (cpu_vendor_id == CPU_VENDOR_IBM) { need_post_dma_flush = 1; } else { #ifdef CPU_I486_ON_386 need_pre_dma_flush = 1; #endif } #endif /* PC98 && !CPU_UPGRADE_HW_CACHE */ } #if defined(I586_CPU) && defined(CPU_WT_ALLOC) /* * Enable the write allocate feature of AMD processors. * The following two functions require the Maxmem variable to be set. */ void enable_K5_wt_alloc(void) { u_int64_t msr; register_t saveintr; /* * Write allocate is supported only on models 1, 2, and 3, with * a stepping of 4 or greater. */ if (((cpu_id & 0xf0) > 0) && ((cpu_id & 0x0f) > 3)) { saveintr = intr_disable(); msr = rdmsr(0x83); /* HWCR */ wrmsr(0x83, msr & ~0x10); /* * We have to tell the chip where the top of memory is, * since video cards could have frame buffers there, * memory-mapped I/O could be there, etc. */ if (Maxmem > 0) msr = Maxmem / 16; else msr = 0; msr |= AMD_WT_ALLOC_TME | AMD_WT_ALLOC_FRE; #ifdef PC98 if (!(inb(0x43b) & 4)) { wrmsr(0x86, 0x0ff00f0); msr |= AMD_WT_ALLOC_PRE; } #else /* * There is no way to know whether the 15-16M hole exists or not. * Therefore, we disable write allocate for this range. 
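 *
 * Aside on the unit conversions these write-allocate routines perform
 * (a sketch of the arithmetic only; the unit interpretation is an
 * inference from the code, not taken from AMD documentation).  Maxmem
 * counts 4 KB pages, so the K5 top-of-memory computed above,
 *
 *	msr = Maxmem / 16;	// 16 pages per unit, i.e. 64 KB granules
 *
 * and the K6 sizing in enable_K6_wt_alloc() below,
 *
 *	size = ((Maxmem >> 8) + 3) >> 2;
 *	// Maxmem >> 8: 256 pages = 1 MB, so this is whole megabytes
 *	// (+3) >> 2:   divide by four, rounding up, giving 4 MB units
 *	// hence the 0x7f cap below: 127 * 4 MB = 508 MB
 *
 * both just rescale the same page count.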
*/ wrmsr(0x86, 0x0ff00f0); msr |= AMD_WT_ALLOC_PRE; #endif wrmsr(0x85, msr); msr = rdmsr(0x83); wrmsr(0x83, msr | 0x10); /* enable write allocate */ intr_restore(saveintr); } } void enable_K6_wt_alloc(void) { quad_t size; u_int64_t whcr; register_t saveintr; saveintr = intr_disable(); wbinvd(); #ifdef CPU_DISABLE_CACHE /* * Certain K6-2 boxes become unstable when write allocation is * enabled. */ /* * The AMD-K6 processor provides the 64-bit Test Register 12 (TR12), * but only the Cache Inhibit (CI) bit (bit 3 of TR12) is supported. * All other bits in TR12 have no effect on the processor's operation. * The I/O Trap Restart function (bit 9 of TR12) is always enabled * on the AMD-K6. */ wrmsr(0x0000000e, (u_int64_t)0x0008); #endif /* Don't assume that memory size is aligned to 4M. */ if (Maxmem > 0) size = ((Maxmem >> 8) + 3) >> 2; else size = 0; /* Limit is 508M bytes. */ if (size > 0x7f) size = 0x7f; whcr = (rdmsr(0xc0000082) & ~(0x7fLL << 1)) | (size << 1); #if defined(PC98) || defined(NO_MEMORY_HOLE) if (whcr & (0x7fLL << 1)) { #ifdef PC98 /* * If bit 2 of port 0x43b is 0, disable write allocate for the * 15-16M range. */ if (!(inb(0x43b) & 4)) whcr &= ~0x0001LL; else #endif whcr |= 0x0001LL; } #else /* * There is no way to know whether the 15-16M hole exists or not. * Therefore, we disable write allocate for this range. */ whcr &= ~0x0001LL; #endif wrmsr(0x0c0000082, whcr); intr_restore(saveintr); } void enable_K6_2_wt_alloc(void) { quad_t size; u_int64_t whcr; register_t saveintr; saveintr = intr_disable(); wbinvd(); #ifdef CPU_DISABLE_CACHE /* * Certain K6-2 boxes become unstable when write allocation is * enabled. */ /* * The AMD-K6 processor provides the 64-bit Test Register 12 (TR12), * but only the Cache Inhibit (CI) bit (bit 3 of TR12) is supported. * All other bits in TR12 have no effect on the processor's operation. * The I/O Trap Restart function (bit 9 of TR12) is always enabled * on the AMD-K6. */ wrmsr(0x0000000e, (u_int64_t)0x0008); #endif /* Don't assume that memory size is aligned to 4M. */ if (Maxmem > 0) size = ((Maxmem >> 8) + 3) >> 2; else size = 0; /* Limit is 4092M bytes. */ if (size > 0x3ff) size = 0x3ff; whcr = (rdmsr(0xc0000082) & ~(0x3ffLL << 22)) | (size << 22); #if defined(PC98) || defined(NO_MEMORY_HOLE) if (whcr & (0x3ffLL << 22)) { #ifdef PC98 /* * If bit 2 of port 0x43b is 0, disable write allocate for the * 15-16M range. */ if (!(inb(0x43b) & 4)) whcr &= ~(1LL << 16); else #endif whcr |= 1LL << 16; } #else /* * There is no way to know whether the 15-16M hole exists or not. * Therefore, we disable write allocate for this range. */ whcr &= ~(1LL << 16); #endif wrmsr(0x0c0000082, whcr); intr_restore(saveintr); } #endif /* I586_CPU && CPU_WT_ALLOC */ #include "opt_ddb.h" #ifdef DDB #include <ddb/ddb.h> DB_SHOW_COMMAND(cyrixreg, cyrixreg) { register_t saveintr; u_int cr0; u_char ccr1, ccr2, ccr3; u_char ccr0 = 0, ccr4 = 0, ccr5 = 0, pcr0 = 0; cr0 = rcr0(); if (cpu_vendor_id == CPU_VENDOR_CYRIX) { saveintr = intr_disable(); if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX)) { ccr0 = read_cyrix_reg(CCR0); } ccr1 = read_cyrix_reg(CCR1); ccr2 = read_cyrix_reg(CCR2); ccr3 = read_cyrix_reg(CCR3); if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) { write_cyrix_reg(CCR3, CCR3_MAPEN0); ccr4 = read_cyrix_reg(CCR4); if ((cpu == CPU_M1) || (cpu == CPU_M2)) ccr5 = read_cyrix_reg(CCR5); else pcr0 = read_cyrix_reg(PCR0); write_cyrix_reg(CCR3, ccr3); /* Restore CCR3. 
*/ } intr_restore(saveintr); if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX)) printf("CCR0=%x, ", (u_int)ccr0); printf("CCR1=%x, CCR2=%x, CCR3=%x", (u_int)ccr1, (u_int)ccr2, (u_int)ccr3); if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) { printf(", CCR4=%x, ", (u_int)ccr4); if (cpu == CPU_M1SC) printf("PCR0=%x\n", pcr0); else printf("CCR5=%x\n", ccr5); } } printf("CR0=%x\n", cr0); } #endif /* DDB */ Index: stable/9/sys/i386/i386/machdep.c =================================================================== --- stable/9/sys/i386/i386/machdep.c (revision 235795) +++ stable/9/sys/i386/i386/machdep.c (revision 235796) @@ -1,3706 +1,3705 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_atalk.h" #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_npx.h" #include "opt_perfmon.h" #include "opt_xbox.h" #include "opt_kdtrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! 
#endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #endif #ifdef DEV_APIC #include #endif #ifdef DEV_ISA #include #endif #ifdef XBOX #include int arch_i386_is_xbox = 0; uint32_t arch_i386_xbox_memsize = 0; #endif #ifdef XEN /* XEN includes */ #include #include #include #include #include #include void Xhypervisor_callback(void); void failsafe_callback(void); extern trap_info_t trap_table[]; struct proc_ldt default_proc_ldt; extern int init_first; int running_xen = 1; extern unsigned long physfree; #endif /* XEN */ /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern void init386(int first); extern void dblfault_handler(void); extern void printcpuinfo(void); /* XXX header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); -extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp); static int set_fpcontext(struct thread *td, const mcontext_t *mcp); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm(struct save87 *, struct savexmm *); static void fill_fpregs_xmm(struct savexmm *, struct save87 *); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel; u_int basemem; int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; #ifdef PAE FEATURE(pae, "Physical Address Extensions"); #endif /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. 
*/ sysenv = getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif realmem = Maxmem; /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)cnt.v_free_count)) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)cnt.v_free_count), ptoa((uintmax_t)cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifndef XEN cpu_setregs(); #endif /* * Add BSP as an interrupt target. */ intr_add_cpu(0); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; sf.sf_addr = 0; } else { /* Old FreeBSD-style arguments. 
*/ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); bzero(sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); bzero(sf.sf_uc.uc_mcontext.__spare__, sizeof(sf.sf_uc.uc_mcontext.__spare__)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. 
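 *
 * The placement logic that follows is the pattern all three sendsig
 * variants share; as a simplified sketch:
 *
 *	if (an alternate stack is configured, we are not already on it,
 *	    and this signal was registered with SA_ONSTACK)
 *		sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
 *		    td->td_sigstk.ss_size - sizeof(struct sigframe4));
 *	else
 *		sfp = (struct sigframe4 *)regs->tf_esp - 1;
 *
 * i.e. the frame sits at the very top of the alternate stack, or one
 * whole frame below the interrupted user %esp (the "- 1" is pointer
 * arithmetic, so it subtracts sizeof(struct sigframe4) bytes).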
*/ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; struct segment_descriptor *sdp; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 
1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); /* * Unconditionally fill the fsbase and gsbase into the mcontext. */ sdp = &td->td_pcb->pcb_fsd; sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sf.sf_uc.uc_mcontext.mc_flags = 0; bzero(sf.sf_uc.uc_mcontext.mc_spare2, sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). 
* Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. 
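 *
 * For reference, the EFL_SECURE() test applied above (with PSL_RF
 * masked off both sides first, as the XXX comment explains) expands to
 *
 *	((eflags ^ regs->tf_eflags) & ~PSL_USERCHANGE) == 0
 *
 * i.e. XOR the proposed and current flag words to find the bits the
 * caller is trying to change, then require that none of them fall
 * outside the user-changeable set.  CS_SECURE() is simpler: it only
 * insists that the requested %cs selector has user privilege,
 * ISPL(cs) == SEL_UPL.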
*/ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, SIGPROCMASK_OLD); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct trapframe *regs; struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct trapframe *regs; ucontext_t *ucp; int cs, eflags, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); if ((cpu_feature & CPUID_TSC) == 0) return (EOPNOTSUPP); /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } #ifdef XEN void cpu_halt(void) { HYPERVISOR_shutdown(SHUTDOWN_poweroff); } int scheduler_running; static void cpu_idle_hlt(int busy) { scheduler_running = 1; enable_intr(); idle_block(); } #else /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } #endif void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ TUNABLE_INT("machdep.idle_mwait", &idle_mwait); SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 static void cpu_idle_acpi(int busy) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; disable_intr(); if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) cpu_idle_hook(); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } #ifndef XEN static void cpu_idle_hlt(int busy) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* * We must absolutely guarantee that hlt is the next instruction * after sti or we introduce a timing window. */ disable_intr(); if (sched_runnable()) enable_intr(); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } #endif /* * MWAIT cpu power states. Lower 4 bits are sub-states. */
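/*
 * A short sketch of the monitor/mwait handshake that cpu_idle_mwait()
 * below relies on (illustrative only; the real function also keeps the
 * idle-state bookkeeping straight):
 *
 *	*state = STATE_MWAIT;
 *	cpu_monitor(state, 0, 0);	// arm the monitor on our flag word
 *	if (*state == STATE_MWAIT)	// nothing changed it in the window
 *		cpu_mwait(0, MWAIT_C1);	// nap until the line is written
 *
 * cpu_idle_wakeup() can then wake the CPU without an IPI simply by
 * storing STATE_RUNNING to the monitored word.  In the hint values
 * that follow, the upper nibble encodes the target C-state (one less
 * than the C-number, with 0xf requesting C0) and the lower nibble a
 * hardware sub-state.
 */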
#define MWAIT_C0 0xf0 #define MWAIT_C1 0x00 #define MWAIT_C2 0x10 #define MWAIT_C3 0x20 #define MWAIT_C4 0x30 static void cpu_idle_mwait(int busy) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_MWAIT; if (!sched_runnable()) { cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) cpu_mwait(0, MWAIT_C1); } *state = STATE_RUNNING; } static void cpu_idle_spin(int busy) { int *state; int i; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_RUNNING; for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } /* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). * * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ */ #define MSR_AMDK8_IPM 0xc0010055 #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) #define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) static void cpu_probe_amdc1e(void) { /* * Detect the presence of the C1E capability, mostly on the latest * dual-core (or future) K8 family. */ if (cpu_vendor_id == CPU_VENDOR_AMD && (cpu_id & 0x00000f00) == 0x00000f00 && (cpu_id & 0x0fff0000) >= 0x00040000) { cpu_ident_amdc1e = 1; } } #ifdef XEN void (*cpu_idle_fn)(int) = cpu_idle_hlt; #else void (*cpu_idle_fn)(int) = cpu_idle_acpi; #endif void cpu_idle(int busy) { #ifndef XEN uint64_t msr; #endif CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #if defined(MP_WATCHDOG) && !defined(XEN) ap_watchdog(PCPU_GET(cpuid)); #endif #ifndef XEN /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } #endif /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); cpu_idleclock(); } #ifndef XEN /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_deep_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } #endif /* Call main idle method. */ cpu_idle_fn(busy); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } #ifndef XEN out: #endif CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { struct pcpu *pcpu; int *state; pcpu = pcpu_find(cpu); state = (int *)pcpu->pc_monitorbuf; /* * This doesn't need to be atomic since missing the race will * simply result in unnecessary IPIs. */ if (*state == STATE_SLEEPING) return (0); if (*state == STATE_MWAIT) *state = STATE_RUNNING; return (1); } /* * Ordered by speed/power consumption. */ struct { void *id_fn; char *id_name; } idle_tbl[] = { { cpu_idle_spin, "spin" }, { cpu_idle_mwait, "mwait" }, { cpu_idle_hlt, "hlt" }, { cpu_idle_acpi, "acpi" }, { NULL, NULL } }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; p += sprintf(p, "%s%s", p != avail ? 
", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static int idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; char *p; int i; p = "unknown"; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; if (strcmp(idle_tbl[i].id_name, buf)) continue; cpu_idle_fn = idle_tbl[i].id_fn; return (0); } return (EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, idle_sysctl, "A", "currently selected idle function"); uint64_t (*atomic_load_acq_64)(volatile uint64_t *) = atomic_load_acq_64_i386; void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) = atomic_store_rel_64_i386; static void cpu_probe_cmpxchg8b(void) { if ((cpu_feature & CPUID_CX8) != 0 || cpu_vendor_id == CPU_VENDOR_RISE) { atomic_load_acq_64 = atomic_load_acq_64_i586; atomic_store_rel_64 = atomic_store_rel_64_i586; } } /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); mtx_lock_spin(&dt_lock); if (td->td_proc->p_md.md_ldt) user_ldt_free(td); else mtx_unlock_spin(&dt_lock); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = imgp->entry_addr; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = imgp->ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ td->td_pcb->pcb_flags &= ~FP_SOFTFP; pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. 
*/ td->td_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; #ifdef XEN union descriptor *gdt; union descriptor *ldt; #else union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #endif static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ struct mtx dt_lock; /* lock for GDT and LDT */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. 
* * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = SEL_KPL, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUFS_SEL 2 %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS_SEL 3 %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 6 Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { .ssd_base = 0x400, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, #ifndef XEN /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GLDT_SEL 10 LDT Descriptor */ { .ssd_base = (int) ldt, .ssd_limit = sizeof(ldt)-1, .ssd_type = SDT_SYSLDT, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { .ssd_base = (int) ldt, .ssd_limit = (512 * sizeof(union descriptor)-1), .ssd_type = SDT_SYSLDT, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { .ssd_base = (int) &dblfault_tss, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSUTIL_SEL 16 
BIOS 16-bit interface (Utility) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GNDIS_SEL 18 NDIS Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, #endif /* !XEN */ }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), #endif IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = (ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. 
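 *
 * (Aside on the gate encoding used above: setidt() splits the 32-bit
 * handler address across the descriptor's two 16-bit offset fields,
 * and db_show_idt() reassembles it:
 *
 *	ip->gd_looffset = (int)func;		// field is 16 bits wide,
 *						// so only the low half sticks
 *	ip->gd_hioffset = ((int)func) >> 16;	// high 16 bits
 *	...
 *	func = (ip->gd_hioffset << 16) | ip->gd_looffset;
 *
 * The assignment to the low field relies on the bitfield width to
 * truncate silently.)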
*/ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { uint64_t idtr, gdtr; idtr = ridt(); db_printf("idtr\t0x%08x/%04x\n", (u_int)(idtr >> 16), (u_int)idtr & 0xffff); gdtr = rgdt(); db_printf("gdtr\t0x%08x/%04x\n", (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); db_printf("ldtr\t0x%04x\n", rldt()); db_printf("tr\t0x%04x\n", rtr()); db_printf("cr0\t0x%08x\n", rcr0()); db_printf("cr2\t0x%08x\n", rcr2()); db_printf("cr3\t0x%08x\n", rcr3()); db_printf("cr4\t0x%08x\n", rcr4()); } #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } #ifndef XEN static int add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016llx len=%016llx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) return (1); if (smap->length == 0) return (1); #ifndef PAE if (smap->base > 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); return (1); } #endif /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. */ insert_idx = physmap_idx + 2; for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (smap->base + smap->length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && smap->base + smap->length == physmap[insert_idx]) { physmap[insert_idx] = smap->base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += smap->length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = smap->base; physmap[insert_idx + 1] = smap->base + smap->length; return (1); } static void basemem_setup(void) { vm_paddr_t pa; pt_entry_t *pte; int i; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) 
* * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) pmap_kenter(KERNBASE + pa, pa); /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using * the vm86 map too. XXX: why 2 ways for this and only 1 way for * page 0, at least as initialized here? */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } #endif /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(int first) { int has_smap, off, physmap_idx, pa_indx, da_indx; u_long physmem_tunable, memtest; vm_paddr_t physmap[PHYSMAP_SIZE]; pt_entry_t *pte; quad_t dcons_addr, dcons_size; #ifndef XEN int hasbrokenint12, i; u_int extmem; struct vm86frame vmf; struct vm86context vmc; vm_paddr_t pa; struct bios_smap *smap, *smapbase, *smapend; u_int32_t smapsize; caddr_t kmdp; #endif has_smap = 0; #if defined(XEN) Maxmem = xen_start_info->nr_pages - init_first; physmem = Maxmem; basemem = 0; physmap[0] = init_first << PAGE_SHIFT; physmap[1] = ptoa(Maxmem) - round_page(msgbufsize); physmap_idx = 0; #else #ifdef XBOX if (arch_i386_is_xbox) { /* * We queried the memory size before, so chop off 4MB for * the framebuffer and inform the OS of this. */ physmap[0] = 0; physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE; physmap_idx = 0; goto physmap_done; } #endif bzero(&vmf, sizeof(vmf)); bzero(physmap, sizeof(physmap)); basemem = 0; /* * Check if the loader supplied an SMAP memory map. If so, * use that and do not make any VM86 calls. */ physmap_idx = 0; smapbase = NULL; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf32 kernel"); if (kmdp != NULL) smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase != NULL) { /* * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes SMAP. */ smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); has_smap = 1; for (smap = smapbase; smap < smapend; smap++) if (!add_smap_entry(smap, physmap, &physmap_idx)) break; goto have_smap; } /* * Some newer BIOSes have a broken INT 12H implementation * which causes a kernel panic immediately. In this case, we * need use the SMAP to determine the base memory size. */ hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); if (hasbrokenint12 == 0) { /* Use INT12 to determine base memory size. */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; basemem_setup(); } /* * Fetch the memory map with INT 15:E820. Map page 1 R/W into * the kernel page table so we can use it as a buffer. The * kernel will unmap this page later. 
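 *
 * Roughly, each E820 call is made with %eax = 0xe820, %edx = the
 * 'SMAP' signature, %ebx = a continuation value (0 on the first
 * call) and %es:%di pointing at the buffer; the BIOS fills in one
 * entry per call:
 *
 *	struct bios_smap {
 *		u_int64_t base;		physical start address
 *		u_int64_t length;	length in bytes
 *		u_int32_t type;		SMAP_TYPE_MEMORY, etc.
 *	};
 *
 * The loop below ends when %ebx comes back zero, or when a call
 * fails or returns without the 'SMAP' signature in %eax.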
*/ pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; has_smap = 1; if (!add_smap_entry(smap, physmap, &physmap_idx)) break; } while (vmf.vmf_ebx != 0); have_smap: /* * If we didn't fetch the "base memory" size from INT12, * figure it out from the SMAP (or just guess). */ if (basemem == 0) { for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } /* XXX: If we couldn't find basemem from SMAP, just guess. */ if (basemem == 0) basemem = 640; basemem_setup(); } if (physmap[1] != 0) goto physmap_done; /* * If we failed to find an SMAP, figure out the extended * memory size. We will then build a simple memory map with * two segments, one for "base memory" and the second for * "extended memory". Note that "extended memory" starts at a * physical address of 1MB and that both basemem and extmem * are in units of 1KB. * * First, try to fetch the extended memory size via INT 15:E801. */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { /* * If INT15:E801 fails, this is our last ditch effort * to determine the extended memory size. Currently * we prefer the RTC value over INT15:88. */ #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: #endif /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1]); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend * the amount of memory in the system. */ if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); /* * By default enable the memory test on real hardware, and disable * it if we appear to be running in a VM. This avoids touching all * pages unnecessarily, which doesn't matter on real hardware but is * bad for shared VM hosts. Use a general name so that * one could eventually do more with the code than just disable it. */ memtest = (vm_guest > VM_GUEST_NO) ? 
0 : 1; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; #ifndef XEN /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= KERNLOAD && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. 
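 *
 * phys_avail[] is kept as [start, end) pairs, e.g. after scanning
 * a machine with a hole at 0x9f000 it might read:
 *
 *	phys_avail[0] = 0x00001000  phys_avail[1] = 0x0009f000
 *	phys_avail[2] = 0x00100000  phys_avail[3] = ...
 *
 * (illustrative values only).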
*/ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); #else phys_avail[0] = physfree; phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; dump_avail[0] = 0; dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; #endif /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); PT_UPDATES_FLUSH(); } #ifdef XEN #define MTOPSIZE (1<<(14 + PAGE_SHIFT)) void init386(first) int first; { unsigned long gdtmachpfn; int error, gsel_tss, metadata_missing, x, pa; size_t kstack0_sz; struct pcpu *pc; struct callback_register event = { .type = CALLBACKTYPE_event, .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback }, }; struct callback_register failsafe = { .type = CALLBACKTYPE_failsafe, .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback }, }; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = KSTACK_PAGES; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. 
*/ proc_linkup0(&proc0, &thread0); metadata_missing = 0; if (xen_start_info->mod_start) { preload_metadata = (caddr_t)xen_start_info->mod_start; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (envmode == 1) kern_envp = static_env; else if ((caddr_t)xen_start_info->cmd_line) kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); boothowto |= xen_boothowto(kern_envp); /* Init basic tunables, hz etc */ init_param1(); /* * XEN occupies a portion of the upper virtual address space * At its base it manages an array mapping machine page frames * to physical page frames - hence we need to be able to * access 4GB - (64MB - 4MB + 64k) */ gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); pc = &__pcpu[0]; gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW); bzero(gdt, PAGE_SIZE); for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT; PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V); PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0); lgdt(&r_gdt); gdtset = 1; if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { panic("set_trap_table failed - error %d\n", error); } error = HYPERVISOR_callback_op(CALLBACKOP_register, &event); if (error == 0) error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); #if CONFIG_XEN_COMPAT <= 0x030002 if (error == -ENOXENSYS) HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback, GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); #endif pcpu_init(pc, 0, sizeof(struct pcpu)); for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) pmap_kenter(pa + KERNBASE, pa); dpcpu_init((void *)(first + KERNBASE), 0); first += DPCPU_SIZE; physfree += DPCPU_SIZE; init_first += DPCPU_SIZE / PAGE_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); /* make ldt memory segments */ PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW); bzero(ldt, PAGE_SIZE); ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); default_proc_ldt.ldt_base = (caddr_t)ldt; default_proc_ldt.ldt_len = 6; _default_ldt = (int)&default_proc_ldt; PCPU_SET(currentldt, _default_ldt); PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW); xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0])); #if defined(XEN_PRIVILEGED) /* * Initialize the i8254 before the console so that console * initialization can use DELAY(). 
*/ i8254_init(); #endif /* * Initialize the console before we print anything out. */ cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #endif #ifdef DDB ksym_start = bootinfo.bi_symtab; ksym_end = bootinfo.bi_esymtab; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + kstack0_sz - sizeof(struct pcb) - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), PCPU_GET(common_tss.tss_esp0)); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); #ifdef PAE dblfault_tss.tss_cr3 = (int)IdlePDPT; #else dblfault_tss.tss_cr3 = (int)IdlePTD; #endif dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, msgbufsize); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; #ifdef PAE thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; #else thread0.td_pcb->pcb_cr3 = (int)IdlePTD; #endif thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0]; thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; cpu_probe_amdc1e(); cpu_probe_cmpxchg8b(); } #else void init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, x, pa; size_t kstack0_sz; struct pcpu *pc; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = KSTACK_PAGES; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. 
*/ proc_linkup0(&proc0, &thread0); metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (envmode == 1) kern_envp = static_env; else if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* Init basic tunables, hz etc */ init_param1(); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); pc = &__pcpu[0]; gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) pmap_kenter(pa + KERNBASE, pa); dpcpu_init((void *)(first + KERNBASE), 0); first += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); /* make ldt memory segments */ ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL , GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MC, 
&IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); #ifdef XBOX /* * The following code queries the PCI ID of 0:0:0. For the XBOX, * This should be 0x10de / 0x02a5. * * This is exactly what Linux does. */ outl(0xcf8, 0x80000000); if (inl(0xcfc) == 0x02a510de) { arch_i386_is_xbox = 1; pic16l_setled(XBOX_LED_GREEN); /* * We are an XBOX, but we may have either 64MB or 128MB of * memory. The PCI host bridge should be programmed for this, * so we just query it. */ outl(0xcf8, 0x80000084); arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; } #endif /* XBOX */ /* * Initialize the i8254 before the console so that console * initialization can use DELAY(). */ i8254_init(); /* * Initialize the console before we print anything out. */ cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #endif #ifdef DDB ksym_start = bootinfo.bi_symtab; ksym_end = bootinfo.bi_esymtab; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! 
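 * (On a ring 3 -> ring 0 transition the CPU loads %ss:%esp from
 * tss_ss0/tss_esp0, so these must point at the top of thread0's
 * kernel stack, just below the pcb, before the first trap is
 * taken.)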
*/ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + kstack0_sz - sizeof(struct pcb) - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); #ifdef PAE dblfault_tss.tss_cr3 = (int)IdlePDPT; #else dblfault_tss.tss_cr3 = (int)IdlePTD; #endif dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, msgbufsize); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(lcall_syscall); gdp->gd_looffset = x; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = x >> 16; /* XXX does this work? */ /* XXX yes! */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; #ifdef PAE thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; #else thread0.td_pcb->pcb_cr3 = (int)IdlePTD; #endif thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; cpu_probe_amdc1e(); cpu_probe_cmpxchg8b(); } #endif void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; critical_exit(); flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(flags); } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; vm_offset_t tmp; if (!has_f00f_bug) return; GIANT_REQUIRED; printf("Intel Pentium detected, installing workaround for F00F bug\n"); tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); /* Put the problematic entry (#6) at the end of the lower page. 
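 * (Background, roughly: the F00F instruction is "lock cmpxchg8b"
 * with a register operand, bytes 0xf0 0x0f 0xc7 0xc8.  It raises
 * #UD, vector 6, but the erratum leaves the bus locked while the
 * descriptor is fetched; a locked access needs write permission,
 * so with entry #6 ending the read-only lower page that fetch
 * turns into a recoverable page fault instead of a hard hang.)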
*/ new_idt = (struct gate_descriptor*) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_edi = tf->tf_edi; pcb->pcb_esi = tf->tf_esi; pcb->pcb_ebp = tf->tf_ebp; pcb->pcb_ebx = tf->tf_ebx; pcb->pcb_eip = tf->tf_eip; pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_eip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_eflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_eflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; pcb = td->td_pcb; regs->r_gs = pcb->pcb_gs; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb->pcb_gs = regs->r_gs; return (0); } #ifdef CPU_ENABLE_SSE static void fill_fpregs_xmm(sv_xmm, sv_87) struct savexmm *sv_xmm; struct save87 *sv_87; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; bzero(sv_87, sizeof(*sv_87)); /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_tw = penv_xmm->en_tw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; } static void set_fpregs_xmm(sv_87, sv_xmm) struct save87 *sv_87; struct savexmm *sv_xmm; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_tw = penv_87->en_tw; penv_xmm->en_fip = penv_87->en_fip; 
penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; } #endif /* CPU_ENABLE_SSE */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); #ifdef DEV_NPX npxgetregs(td); #else bzero(fpregs, sizeof(*fpregs)); #endif #ifdef CPU_ENABLE_SSE if (cpu_fxsr) fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm, (struct save87 *)fpregs); else #endif /* CPU_ENABLE_SSE */ bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs, sizeof(*fpregs)); return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) set_fpregs_xmm((struct save87 *)fpregs, &td->td_pcb->pcb_user_save.sv_xmm); else #endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87, sizeof(*fpregs)); #ifdef DEV_NPX npxuserinited(td); #endif return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; struct segment_descriptor *sdp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_esp); PROC_UNLOCK(curthread->td_proc); mcp->mc_gs = td->td_pcb->pcb_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_edi; mcp->mc_esi = tp->tf_esi; mcp->mc_ebp = tp->tf_ebp; mcp->mc_isp = tp->tf_isp; mcp->mc_eflags = tp->tf_eflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_eax; mcp->mc_edx = tp->tf_edx; } mcp->mc_ebx = tp->tf_ebx; mcp->mc_ecx = tp->tf_ecx; mcp->mc_eip = tp->tf_eip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp); sdp = &td->td_pcb->pcb_fsd; mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; mcp->mc_flags = 0; bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. 
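 *
 * The merge below keeps kernel-owned bits intact:
 *
 *	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
 *	    (tp->tf_eflags & ~PSL_USERCHANGE);
 *
 * i.e. the caller's values are honored only for the bits in
 * PSL_USERCHANGE (the arithmetic flags, the direction flag and the
 * like).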
*/ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct trapframe *tp; int eflags, ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp)) return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); if ((ret = set_fpcontext(td, mcp)) == 0) { tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_edi = mcp->mc_edi; tp->tf_esi = mcp->mc_esi; tp->tf_ebp = mcp->mc_ebp; tp->tf_ebx = mcp->mc_ebx; tp->tf_edx = mcp->mc_edx; tp->tf_ecx = mcp->mc_ecx; tp->tf_eax = mcp->mc_eax; tp->tf_eip = mcp->mc_eip; tp->tf_eflags = eflags; tp->tf_esp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_gs = mcp->mc_gs; ret = 0; } return (ret); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { #ifndef DEV_NPX mcp->mc_fpformat = _MC_FPFMT_NODEV; mcp->mc_ownedfp = _MC_FPOWNED_NONE; bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); #else mcp->mc_ownedfp = npxgetregs(td); bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = npxformat(); #endif } static int set_fpcontext(struct thread *td, const mcontext_t *mcp) { if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { #ifdef DEV_NPX #ifdef CPU_ENABLE_SSE if (cpu_fxsr) ((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env. en_mxcsr &= cpu_mxcsr_mask; #endif npxsetregs(td, (union savefpu *)&mcp->mc_fpstate); #endif } else return (EINVAL); return (0); } static void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); #ifdef DEV_NPX if (PCPU_GET(fpcurthread) == td) npxdrop(); #endif /* * XXX force a full drop of the npx. The above only drops it if we * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. * * XXX I don't much like npxgetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of npxgetregs()... perhaps we just * have too many layers. */ curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[4] = rdr4(); dbregs->dr[5] = rdr5(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr4(dbregs->dr[4]); load_dr5(dbregs->dr[5]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. 
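 *
 * (Each breakpoint has a 2-bit R/W and a 2-bit LEN field in %dr7;
 * the loop below rejects R/W = 0x02, I/O breakpoints, which are
 * reserved unless CR4.DE is set, and LEN = 0x02, the 8-byte size,
 * which is undefined on i386.)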
*/ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) return (EINVAL); } pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ Index: stable/9/sys/i386/i386/mp_machdep.c =================================================================== --- stable/9/sys/i386/i386/mp_machdep.c (revision 235795) +++ stable/9/sys/i386/i386/mp_machdep.c (revision 235796) @@ -1,1555 +1,1591 @@ /*- * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #if !defined(lint) #if !defined(SMP) #error How did you get here? #endif #ifndef DEV_APIC #error The apic device is required for SMP, add "device apic" to your config file. #endif #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) #error SMP not supported with CPU_DISABLE_CMPXCHG #endif #endif /* not lint */ #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) && !defined(PC98) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #define CHECK_WRITE(A, D) #endif /* CHECK_POINTS */ /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern struct pcpu __pcpu[]; /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; static int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; static void *dpcpu; /* Hotwire a 0->4MB V==P mapping */ extern pt_entry_t *KPTphys; struct pcb stoppcbs[MAXCPU]; +struct pcb **susppcbs = NULL; /* Variables needed for SMP tlb shootdown. 
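 *
 * Rough protocol: the initiating CPU stores the target range in
 * smp_tlb_addr1/smp_tlb_addr2, sends one of the invl* IPIs, and
 * then spins until every target CPU has invalidated its TLB and
 * bumped smp_tlb_wait to signal completion.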
*/ vm_offset_t smp_tlb_addr1; vm_offset_t smp_tlb_addr2; volatile int smp_tlb_wait; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; u_long *ipi_lazypmap_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* * Local data and functions. */ static volatile cpuset_t ipi_nmi_pending; /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ static volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info { int cpu_present:1; int cpu_bsp:1; int cpu_disabled:1; int cpu_hyperthread:1; } static cpu_info[MAX_APIC_ID + 1]; int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; /* Holds pending bitmap based IPIs per CPU */ static volatile u_int cpu_ipi_pending[MAXCPU]; static u_int boot_address; static int cpu_logical; /* logical cpus per core */ static int cpu_cores; /* cores per package */ static void assign_cpu_ids(void); static void install_ap_tramp(void); static void set_interrupt_apic_ids(void); static int start_all_aps(void); static int start_ap(int apic_id); static void release_aps(void *dummy); static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ static int hyperthreading_allowed = 1; static void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } static void topo_probe_amd(void) { int core_id_bits; int id; /* AMD processors do not support HTT. */ cpu_logical = 1; if ((amd_feature2 & AMDID2_CMP) == 0) { cpu_cores = 1; return; } core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> AMDID_COREID_SIZE_SHIFT; if (core_id_bits == 0) { cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; return; } /* Fam 10h and newer should get here. */ for (id = 0; id <= MAX_APIC_ID; id++) { /* Check logical CPU availability. */ if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) continue; /* Check if logical CPU has the same package ID. */ if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) continue; cpu_cores++; } } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } static void topo_probe_0x4(void) { u_int p[4]; int pkg_id_bits; int core_id_bits; int max_cores; int max_logical; int id; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; /* * Because of uniformity assumption we examine only * those logical processors that belong to the same * package as BSP. Further, we count number of * logical processors that belong to the same core * as BSP thus deducing number of threads per core. */ if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; core_id_bits = mask_width(max_logical/max_cores); if (core_id_bits < 0) return; pkg_id_bits = core_id_bits + mask_width(max_cores); for (id = 0; id <= MAX_APIC_ID; id++) { /* Check logical CPU availability. 
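 * (The code assumes APIC IDs are carved up as
 *	[ package id | core id | thread id ]
 * with mask_width() bits per level, e.g. mask_width(6) == 3, so
 * shifting an ID right by pkg_id_bits yields its package number.)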
 */
		if (!cpu_info[id].cpu_present ||
		    cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
			continue;
		cpu_cores++;
		/* Check if logical CPU has the same package and core IDs. */
		if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
			cpu_logical++;
	}

	KASSERT(cpu_cores >= 1 && cpu_logical >= 1,
	    ("topo_probe_0x4 couldn't find BSP"));

	cpu_cores /= cpu_logical;
	hyperthreading_cpus = cpu_logical;
}

static void
topo_probe_0xb(void)
{
	u_int p[4];
	int bits;
	int cnt;
	int i;
	int logical;
	int type;
	int x;

	/* We only support three levels for now. */
	for (i = 0; i < 3; i++) {
		cpuid_count(0x0b, i, p);

		/* Fall back if CPU leaf 11 doesn't really exist. */
		if (i == 0 && p[1] == 0) {
			topo_probe_0x4();
			return;
		}

		bits = p[0] & 0x1f;
		logical = p[1] &= 0xffff;
		type = (p[2] >> 8) & 0xff;
		if (type == 0 || logical == 0)
			break;
		/*
		 * Because of the uniformity assumption we examine only
		 * those logical processors that belong to the same
		 * package as the BSP.
		 */
		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
			if (!cpu_info[x].cpu_present ||
			    cpu_info[x].cpu_disabled)
				continue;
			if (x >> bits == boot_cpu_id >> bits)
				cnt++;
		}
		if (type == CPUID_TYPE_SMT)
			cpu_logical = cnt;
		else if (type == CPUID_TYPE_CORE)
			cpu_cores = cnt;
	}
	if (cpu_logical == 0)
		cpu_logical = 1;
	cpu_cores /= cpu_logical;
}

/*
 * Both the topology discovery code and the code that consumes topology
 * information assume top-down uniformity of the topology.
 * That is, all physical packages must be identical and each
 * core in a package must have the same number of threads.
 * Topology information is queried only on the BSP, on which this
 * code runs and for which it can query CPUID information.
 * The topology is then extrapolated to all packages using the
 * uniformity assumption.
 */
static void
topo_probe(void)
{
	static int cpu_topo_probed = 0;

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);
	if (mp_ncpus <= 1)
		cpu_cores = cpu_logical = 1;
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		/*
		 * See the Intel(R) 64 Architecture Processor
		 * Topology Enumeration article for details.
		 *
		 * Note that the 0x1 <= cpu_high < 4 case should be
		 * compatible with topo_probe_0x4() logic when
		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
		 * or it should trigger the fallback otherwise.
		 */
		if (cpu_high >= 0xb)
			topo_probe_0xb();
		else if (cpu_high >= 0x1)
			topo_probe_0x4();
	}

	/*
	 * Fallback: assume each logical CPU is in a separate
	 * physical package.  That is, no multi-core, no SMT.
	 */
	if (cpu_cores == 0 || cpu_logical == 0)
		cpu_cores = cpu_logical = 1;
	cpu_topo_probed = 1;
}

struct cpu_group *
cpu_topo(void)
{
	int cg_flags;

	/*
	 * Determine whether any threading flags are
	 * necessary.
	 */
	topo_probe();
	if (cpu_logical > 1 && hyperthreading_cpus)
		cg_flags = CG_FLAG_HTT;
	else if (cpu_logical > 1)
		cg_flags = CG_FLAG_SMT;
	else
		cg_flags = 0;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * Neither multi-core nor hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT, no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
	/*
	 * Only multi-core, no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
	/*
	 * Both HTT and multi-core.
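 *
 * e.g. a hypothetical 4-core box with 2 threads per core becomes a
 * two-level tree: one L2-shared group of 4 cores, each of which is
 * an L1-shared group of 2 SMT threads.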
*/ return (smp_topo_2level(CG_SHARE_L2, cpu_cores, CG_SHARE_L1, cpu_logical, cg_flags)); } /* * Calculate usable address in base memory for AP trampoline code. */ u_int mp_bootaddress(u_int basemem) { boot_address = trunc_page(basemem); /* round down to 4k boundary */ if ((basemem - boot_address) < bootMP_size) boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ return boot_address; } void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) { mp_ncpus++; mp_maxid = mp_ncpus - 1; } if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_maxid should be already set by calls to cpu_add(). * Just sanity check its value here. */ if (mp_ncpus == 0) KASSERT(mp_maxid == 0, ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); else if (mp_ncpus == 1) mp_maxid = 0; else KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); if (mp_ncpus == 0) { /* * No CPUs were found, so this must be a UP system. Setup * the variables to represent a system with a single CPU * with an id of 0. */ mp_ncpus = 1; return (0); } /* At least one CPU was found. */ if (mp_ncpus == 1) { /* * One CPU was found, so this must be a UP system with * an I/O APIC. */ mp_maxid = 0; return (0); } /* At least two CPUs were found. */ return (1); } /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; cpu_ipi_pending[i] = 0; } /* Install an inter-CPU IPI for TLB invalidation */ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for cache invalidation. */ setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for lazy pmap release */ setidt(IPI_LAZYPMAP, IDTVEC(lazypmap), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + /* Install an inter-CPU IPI for CPU suspend/resume */ + setidt(IPI_SUSPEND, IDTVEC(cpususpend), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Set boot_cpu_id if needed. 
*/ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); set_interrupt_apic_ids(); } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { const char *hyperthread; int i; printf("FreeBSD/SMP: %d package(s) x %d core(s)", mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); if (hyperthreading_cpus > 1) printf(" x %d HTT threads", cpu_logical); else if (cpu_logical > 1) printf(" x %d SMT threads", cpu_logical); printf("\n"); /* List active CPUs first. */ printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); for (i = 1; i < mp_ncpus; i++) { if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, cpu_apic_ids[i]); } /* List disabled CPUs last. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) continue; if (cpu_info[i].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, i); } } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; vm_offset_t addr; int gsel_tss; int x, myid; u_int cpuid, cr0; /* bootAP is set in start_ap() to our ID. */ myid = bootAP; /* Get per-cpu data */ pc = &__pcpu[myid]; /* prime data page for it to use */ pcpu_init(pc, myid, sizeof(struct pcpu)); dpcpu_init(dpcpu, myid); pc->pc_apic_id = cpu_apic_ids[myid]; pc->pc_prvspace = pc; pc->pc_curthread = 0; gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) { ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) &gdt[myid * NGDT]; lgdt(&r_gdt); /* does magic intra-segment return */ lidt(&r_idt); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); CHECK_WRITE(0x38, 5); /* Disable local APIC just to be sure. */ lapic_disable(); /* signal our startup to the BSP. */ mp_naps++; CHECK_WRITE(0x39, 6); /* Spin until the BSP releases the AP's. */ while (!aps_ready) ia32_pause(); /* BSP may have changed PTD while we were waiting */ invltlb(); for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) invlpg(addr); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif /* Initialize the PAT MSR if present. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up FPU state on the AP */ npxinit(); /* set up SSE registers */ enable_sse(); #ifdef PAE /* Enable the PTE no-execute bit. 
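* With PAE, PTEs are 64 bits wide and bit 63 is the no-execute bit. EFER is a per-CPU MSR, so each AP must set EFER.NXE for itself here just as the BSP did; otherwise pages the BSP marked no-execute would still be executable on this AP.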
*/ if ((amd_feature & AMDID_NX) != 0) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); } #endif /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mca_init(); mtx_lock_spin(&ap_boot_mtx); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); printf("SMP: AP CPU #%d Launched!\n", cpuid); /* Determine if we are a logical CPU. */ /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); smp_active = 1; /* historic */ } mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (smp_started == 0) ia32_pause(); /* Start per-CPU event timers. */ cpu_initclocks_ap(); /* Enter the scheduler. */ sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } /******************************************************************* * local functions and data */ /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. */ static void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (hyperthreading_cpus > 1 && apic_id % hyperthreading_cpus != 0) continue; intr_add_cpu(i); } } /* * Assign logical CPU IDs to local APICs. */ static void assign_cpu_ids(void) { u_int i; TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", &hyperthreading_allowed); /* Check for explicitly disabled CPUs. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) continue; if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { cpu_info[i].cpu_hyperthread = 1; /* * Don't use HT CPU if it has been disabled by a * tunable. */ if (hyperthreading_allowed == 0) { cpu_info[i].cpu_disabled = 1; continue; } } /* Don't use this CPU if it has been disabled by a tunable. */ if (resource_disabled("lapic", i)) { cpu_info[i].cpu_disabled = 1; continue; } } if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { hyperthreading_cpus = 0; cpu_logical = 1; } /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. * * To minimize confusion for userland, we attempt to number * CPUs such that all threads and cores in a package are * grouped together. For now we assume that the BSP is always * the first thread in a package and just start adding APs * starting with the BSP's APIC ID. */ mp_ncpus = 1; cpu_apic_ids[0] = boot_cpu_id; apic_cpuids[boot_cpu_id] = 0; for (i = boot_cpu_id + 1; i != boot_cpu_id; i == MAX_APIC_ID ? 
i = 0 : i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || cpu_info[i].cpu_disabled) continue; if (mp_ncpus < MAXCPU) { cpu_apic_ids[mp_ncpus] = i; apic_cpuids[i] = mp_ncpus; mp_ncpus++; } else cpu_info[i].cpu_disabled = 1; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } /* * Start each AP in our list. */ /* Lowest 1MB is already mapped: don't touch. */ #define TMPMAP_START 1 static int start_all_aps(void) { #ifndef PC98 u_char mpbiosreason; #endif uintptr_t kptbase; u_int32_t mpbioswarmvec; int apic_id, cpu, i; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* install the AP 1st level boot code */ install_ap_tramp(); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); #endif /* set up temporary P==V mapping for AP boot */ /* XXX this is a hack, we should boot the AP on its own stack/PTD */ kptbase = (uintptr_t)(void *)KPTphys; for (i = TMPMAP_START; i < NKPT; i++) PTD[i] = (pd_entry_t)(PG_V | PG_RW | ((kptbase + i * PAGE_SIZE) & PG_FRAME)); invltlb(); /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; /* allocate and set up a boot stack data page */ bootstacks[cpu] = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ #endif bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 4; bootAP = cpu; /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(apic_id)) { printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); CHECK_PRINT("trace"); /* show checkpoints */ /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } CHECK_PRINT("trace"); /* show checkpoints */ CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } /* restore the warm-start vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); #endif /* Undo V==P hack from above */ for (i = TMPMAP_START; i < NKPT; i++) PTD[i] = 0; pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); /* number of APs actually started */ return mp_naps; } /* * Load the 1st level AP boot code into base memory. */ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(void) { int x; int size = *(int *) ((u_long) & bootMP_size); vm_offset_t va = boot_address + KERNBASE; u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) va; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; KASSERT(size <= PAGE_SIZE, ("'size' does not fit into PAGE_SIZE, as expected.")); pmap_kenter(va, boot_address); pmap_invalidate_page(kernel_pmap, va); for (x = 0; x < size; ++x) *dst++ = *src++; /* * Modify addresses in the code we just moved to basemem. Unfortunately * we need fairly detailed info about mpboot.s for this to work. Changes * to mpboot.s might require changes here. 
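* As a sketch (the address is illustrative only; the real offsets come from the linker via the extern labels above): with boot_address at 0x9f000, the 32-bit word at mp_gdtbase becomes 0x9f000 plus the offset of MP_GDT within the trampoline, the ljmp operand inside bigJump becomes the physical address of MPentry(), and the 16-bit and 8-bit segment fields receive 0xf000 and 0x09, the low and high parts of the 24-bit physical boot_address.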
*/ /* boot code is located in KERNEL space */ dst = (u_char *) va; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_address + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); *dst32 = ((u_int) MPentry - KERNBASE); /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; } /* * This function starts the AP (application processor) identified * by the APIC ID 'apic_id'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; /* * First we send an INIT/RESET IPI. This INIT IPI might be acted on * immediately, resetting and running the target CPU; it might be * latched (P5 bug), leaving the CPU waiting for a STARTUP IPI; or * it might be ignored outright. */ /* do an INIT IPI: assert RESET */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); /* wait for pending status end */ lapic_ipi_wait(-1); /* do an INIT IPI: deassert RESET */ lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0); /* wait for pending status end */ DELAY(10000); /* wait ~10 ms */ lapic_ipi_wait(-1); /* * Next we send a STARTUP IPI. If the previous INIT IPI is still * latched (P5 bug), this first STARTUP terminates immediately and * the pending INIT continues; if the INIT IPI has already run, or * was ignored, this STARTUP IPI runs. */ /* do a STARTUP IPI */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); lapic_ipi_wait(-1); DELAY(200); /* wait ~200 us */ /* * Finally we send a second STARTUP IPI. It runs only if the first * STARTUP was cancelled by a latched INIT IPI; otherwise it is * ignored, since only ONE STARTUP IPI is recognized after a * hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); lapic_ipi_wait(-1); DELAY(200); /* wait ~200 us */ /* Wait up to 5 seconds for it to start. 
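* mp_naps is incremented by the AP itself in init_secondary(), so the loop below just watches for it to move past the value sampled into 'cpus' before the IPIs were sent; 5000 iterations of DELAY(1000) bound the wait at roughly five seconds.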
*/ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); u_int ipi_masked_global; u_int ipi_masked_page; u_int ipi_masked_range; u_int ipi_masked_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, &ipi_masked_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, &ipi_masked_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, &ipi_masked_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, &ipi_masked_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Send an IPI to specified CPU handling the bitmap logic. */ static void ipi_send_cpu(int cpu, u_int ipi) { u_int bitmap, old_pending, new_pending; KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (old_pending) return; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } /* * Flush the TLB on all other CPU's */ static void smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) { u_int ncpu; ncpu = mp_ncpus - 1; /* does not shootdown self */ if (ncpu < 1) return; /* no other cpus */ if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); ipi_all_but_self(vector); while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } static void smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) { int cpu, ncpu, othercpus; othercpus = mp_ncpus - 1; if (CPU_ISFULLSET(&mask)) { if (othercpus < 1) return; } else { CPU_CLR(PCPU_GET(cpuid), &mask); if (CPU_EMPTY(&mask)) return; } if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); if (CPU_ISFULLSET(&mask)) { ncpu = othercpus; ipi_all_but_self(vector); } else { ncpu = 0; while ((cpu = cpusetobj_ffs(&mask)) != 0) { cpu--; CPU_CLR(cpu, &mask); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, vector); ipi_send_cpu(cpu, vector); ncpu++; } } while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_cache_flush(void) { if (smp_started) smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); } void smp_invltlb(void) { if (smp_started) { smp_tlb_shootdown(IPI_INVLTLB, 0, 0); #ifdef COUNT_XINVLTLB_HITS 
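/* Profiling only: count one global TLB shootdown request. */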
ipi_global++; #endif } } void smp_invlpg(vm_offset_t addr) { if (smp_started) { smp_tlb_shootdown(IPI_INVLPG, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif } } void smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void smp_masked_invltlb(cpuset_t mask) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_global++; #endif } } void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_page++; #endif } } void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_masked_range++; ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); while ((cpu = cpusetobj_ffs(&cpus)) != 0) { cpu--; CPU_CLR(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); if (IPI_IS_BITMAPED(ipi)) { ipi_selected(other_cpus, ipi); return; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int ipi_nmi_handler() { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. 
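* Returning 1 tells the NMI trap code this NMI was not an IPI_STOP_HARD, so it should fall through to normal NMI handling (e.g., hardware failure reporting); returning 0 means we consumed it by parking this CPU in cpustop_handler().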
*/ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); cpustop_handler(); return (0); } /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { u_int cpu; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } +/* + * Handle an IPI_SUSPEND by saving our current context and spinning until we + * are resumed. + */ +void +cpususpend_handler(void) +{ + u_int cpu; + + cpu = PCPU_GET(cpuid); + + if (suspendctx(susppcbs[cpu])) { + wbinvd(); + CPU_SET_ATOMIC(cpu, &stopped_cpus); + } else { + pmap_init_pat(); + PCPU_SET(switchtime, 0); + PCPU_SET(switchticks, ticks); + susppcbs[cpu]->pcb_eip = 0; + } + + /* Wait for resume */ + while (!CPU_ISSET(cpu, &started_cpus)) + ia32_pause(); + + CPU_CLR_ATOMIC(cpu, &started_cpus); + CPU_CLR_ATOMIC(cpu, &stopped_cpus); + + /* Resume MCA and local APIC */ + mca_resume(); + lapic_setup(0); +} /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. */ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); intrcnt_add(buf, &ipi_invlcache_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:lazypmap", i); intrcnt_add(buf, &ipi_lazypmap_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif Index: stable/9/sys/i386/i386/swtch.s =================================================================== --- stable/9/sys/i386/i386/swtch.s (revision 235795) +++ stable/9/sys/i386/i386/swtch.s (revision 235796) @@ -1,429 +1,566 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_npx.h" #include "opt_sched.h" #include #include "assym.s" #if defined(SMP) && defined(SCHED_ULE) #define SETOP xchgl #define BLOCK_SPIN(reg) \ movl $blocked_lock,%eax ; \ 100: ; \ lock ; \ cmpxchgl %eax,TD_LOCK(reg) ; \ jne 101f ; \ pause ; \ jmp 100b ; \ 101: #else #define SETOP movl #define BLOCK_SPIN(reg) #endif /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .text /* * cpu_throw() * * This is the second half of cpu_switch(). It is used when the current * thread is either a dummy or slated to die, and we no longer care * about its state. This is only a slight optimization and is probably * not worth it anymore. Note that we need to clear the pm_active bits so * we do need the old proc if it still exists. * 0(%esp) = ret * 4(%esp) = oldtd * 8(%esp) = newtd */ ENTRY(cpu_throw) movl PCPU(CPUID), %esi movl 4(%esp),%ecx /* Old thread */ testl %ecx,%ecx /* no thread? */ jz 1f /* release bit from old pm_active */ movl PCPU(CURPMAP), %ebx #ifdef SMP lock #endif btrl %esi, PM_ACTIVE(%ebx) /* clear old */ 1: movl 8(%esp),%ecx /* New thread */ movl TD_PCB(%ecx),%edx movl PCB_CR3(%edx),%eax LOAD_CR3(%eax) /* set bit in new pm_active */ movl TD_PROC(%ecx),%eax movl P_VMSPACE(%eax), %ebx addl $VM_PMAP, %ebx movl %ebx, PCPU(CURPMAP) #ifdef SMP lock #endif btsl %esi, PM_ACTIVE(%ebx) /* set new */ jmp sw1 END(cpu_throw) /* * cpu_switch(old, new) * * Save the current thread state, then select the next thread to run * and load its state. * 0(%esp) = ret * 4(%esp) = oldtd * 8(%esp) = newtd * 12(%esp) = newlock */ ENTRY(cpu_switch) /* Switch to new thread. First, save context. */ movl 4(%esp),%ecx #ifdef INVARIANTS testl %ecx,%ecx /* no thread? */ jz badsw2 /* no, panic */ #endif movl TD_PCB(%ecx),%edx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%edx) movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) movl %esi,PCB_ESI(%edx) movl %edi,PCB_EDI(%edx) mov %gs,PCB_GS(%edx) pushfl /* PSL */ popl PCB_PSL(%edx) /* Test if debug registers should be saved. 
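* PCB_DBREGS is typically set only when a debugger has installed hardware breakpoints via ptrace(2), so the common path skips the %dr traffic below; clearing the enable bits in %dr7 also keeps stale watchpoints from firing while the new thread's state is loaded.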
*/ testl $PCB_DBREGS,PCB_FLAGS(%edx) jz 1f /* no, skip over */ movl %dr7,%eax /* yes, do the save */ movl %eax,PCB_DR7(%edx) andl $0x0000fc00, %eax /* disable all watchpoints */ movl %eax,%dr7 movl %dr6,%eax movl %eax,PCB_DR6(%edx) movl %dr3,%eax movl %eax,PCB_DR3(%edx) movl %dr2,%eax movl %eax,PCB_DR2(%edx) movl %dr1,%eax movl %eax,PCB_DR1(%edx) movl %dr0,%eax movl %eax,PCB_DR0(%edx) 1: #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(FPCURTHREAD) jne 1f pushl PCB_SAVEFPU(%edx) /* h/w bugs make saving complicated */ call npxsave /* do it in a big C function */ popl %eax 1: #endif /* Save is done. Now fire up new thread. Leave old vmspace. */ movl 4(%esp),%edi movl 8(%esp),%ecx /* New thread */ movl 12(%esp),%esi /* New lock */ #ifdef INVARIANTS testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif movl TD_PCB(%ecx),%edx /* switch address space */ movl PCB_CR3(%edx),%eax #ifdef PAE cmpl %eax,IdlePDPT /* Kernel address space? */ #else cmpl %eax,IdlePTD /* Kernel address space? */ #endif je sw0 READ_CR3(%ebx) /* The same address space? */ cmpl %ebx,%eax je sw0 LOAD_CR3(%eax) /* new address space */ movl %esi,%eax movl PCPU(CPUID),%esi SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */ /* Release bit from old pmap->pm_active */ movl PCPU(CURPMAP), %ebx #ifdef SMP lock #endif btrl %esi, PM_ACTIVE(%ebx) /* clear old */ /* Set bit in new pmap->pm_active */ movl TD_PROC(%ecx),%eax /* newproc */ movl P_VMSPACE(%eax), %ebx addl $VM_PMAP, %ebx movl %ebx, PCPU(CURPMAP) #ifdef SMP lock #endif btsl %esi, PM_ACTIVE(%ebx) /* set new */ jmp sw1 sw0: SETOP %esi,TD_LOCK(%edi) /* Switchout td_lock */ sw1: BLOCK_SPIN(%ecx) #ifdef XEN pushl %eax pushl %ecx pushl %edx call xen_handle_thread_switch popl %edx popl %ecx popl %eax /* * XXX set IOPL */ #else /* * At this point, we've switched address spaces and are ready * to load up the rest of the next context. */ cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ movl $1, PCPU(PRIVATE_TSS) /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f /* Load it up */ 1: /* * Use the common default TSS instead of our own. * Set our stack pointer into the TSS, it's set to just * below the PCB. In C, common_tss.tss_esp0 = &pcb - 16; */ leal -16(%edx), %ebx /* leave space for vm86 */ movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 /* * Test this CPU's bit in the bitmap to see if this * CPU was using a private TSS. */ cmpl $0, PCPU(PRIVATE_TSS) /* Already using the common? */ je 3f /* if so, skip reloading */ movl $0, PCPU(PRIVATE_TSS) PCPU_ADDR(COMMON_TSSD, %edi) 2: /* Move correct tss descriptor into GDT slot, then reload tr. */ movl PCPU(TSS_GDT), %ebx /* entry in GDT */ movl 0(%edi), %eax movl 4(%edi), %esi movl %eax, 0(%ebx) movl %esi, 4(%ebx) movl $GPROC0_SEL*8, %esi /* GSEL(GPROC0_SEL, SEL_KPL) */ ltr %si 3: /* Copy the %fs and %gs selectors into this pcpu gdt */ leal PCB_FSD(%edx), %esi movl PCPU(FSGS_GDT), %edi movl 0(%esi), %eax /* %fs selector */ movl 4(%esi), %ebx movl %eax, 0(%edi) movl %ebx, 4(%edi) movl 8(%esi), %eax /* %gs selector, comes straight after */ movl 12(%esi), %ebx movl %eax, 8(%edi) movl %ebx, 12(%edi) #endif /* Restore context. 
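* %edx still points at the new thread's PCB. Loading PCB_ESP switches stacks, and rewriting the word at the new (%esp) with PCB_EIP redirects the final ret: usually back into cpu_switch() where the thread last blocked, or to fork_trampoline() for a newly created thread.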
*/ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) pushl PCB_PSL(%edx) popfl movl %edx, PCPU(CURPCB) movl TD_TID(%ecx),%eax movl %ecx, PCPU(CURTHREAD) /* into next thread */ /* * Determine the LDT to use and load it if is the default one and * that is not the current one. */ movl TD_PROC(%ecx),%eax cmpl $0,P_MD+MD_LDT(%eax) jnz 1f movl _default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f LLDT(_default_ldt) movl %eax,PCPU(CURRENTLDT) jmp 2f 1: /* Load the LDT when it is not the default one. */ pushl %edx /* Preserve pointer to pcb. */ addl $P_MD,%eax /* Pointer to mdproc is arg. */ pushl %eax call set_user_ldt addl $4,%esp popl %edx 2: /* This must be done after loading the user LDT. */ .globl cpu_switch_load_gs cpu_switch_load_gs: mov PCB_GS(%edx),%gs /* Test if debug registers should be restored. */ testl $PCB_DBREGS,PCB_FLAGS(%edx) jz 1f /* * Restore debug registers. The special code for dr7 is to * preserve the current values of its reserved bits. */ movl PCB_DR6(%edx),%eax movl %eax,%dr6 movl PCB_DR3(%edx),%eax movl %eax,%dr3 movl PCB_DR2(%edx),%eax movl %eax,%dr2 movl PCB_DR1(%edx),%eax movl %eax,%dr1 movl PCB_DR0(%edx),%eax movl %eax,%dr0 movl %dr7,%eax andl $0x0000fc00,%eax movl PCB_DR7(%edx),%ecx andl $~0x0000fc00,%ecx orl %ecx,%eax movl %eax,%dr7 1: ret #ifdef INVARIANTS badsw1: pushal pushl $sw0_1 call panic sw0_1: .asciz "cpu_throw: no newthread supplied" badsw2: pushal pushl $sw0_2 call panic sw0_2: .asciz "cpu_switch: no curthread supplied" badsw3: pushal pushl $sw0_3 call panic sw0_3: .asciz "cpu_switch: no newthread supplied" #endif END(cpu_switch) /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* Fetch PCB. */ movl 4(%esp),%ecx /* Save caller's return address. Child won't execute this routine. */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %cr3,%eax movl %eax,PCB_CR3(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) mov %gs,PCB_GS(%ecx) pushfl popl PCB_PSL(%ecx) #ifdef DEV_NPX /* * If fpcurthread == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If fpcurthread != NULL, then we have to save the npx h/w state to * fpcurthread's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. */ pushfl CLI movl PCPU(FPCURTHREAD),%eax testl %eax,%eax je 1f pushl %ecx movl TD_PCB(%eax),%eax movl PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_USERFPU(%ecx),%ecx pushl %ecx pushl %eax call bcopy addl $12,%esp 1: popfl #endif /* DEV_NPX */ ret END(savectx) + +/* + * suspendctx(pcb) + * Update pcb, suspending current processor state. + */ +ENTRY(suspendctx) + /* Fetch PCB. */ + movl 4(%esp),%ecx + + /* Save context by calling savectx(). */ + pushl %ecx + call savectx + addl $4,%esp + + /* Fetch PCB again. */ + movl 4(%esp),%ecx + + /* Update caller's return address and stack pointer. */ + movl (%esp),%eax + movl %eax,PCB_EIP(%ecx) + movl %esp,PCB_ESP(%ecx) + + /* Save other registers and descriptor tables. 
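+ * savectx() above covered the general registers; a suspend must also capture everything the firmware may clobber across S3, namely the control and debug registers, every segment selector, and the GDT/IDT/LDT/TR descriptors, so that resumectx() can rebuild the execution environment from the PCB alone.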
*/ + movl %cr0,%eax + movl %eax,PCB_CR0(%ecx) + movl %cr2,%eax + movl %eax,PCB_CR2(%ecx) + movl %cr4,%eax + movl %eax,PCB_CR4(%ecx) + + movl %dr0,%eax + movl %eax,PCB_DR0(%ecx) + movl %dr1,%eax + movl %eax,PCB_DR1(%ecx) + movl %dr2,%eax + movl %eax,PCB_DR2(%ecx) + movl %dr3,%eax + movl %eax,PCB_DR3(%ecx) + movl %dr6,%eax + movl %eax,PCB_DR6(%ecx) + movl %dr7,%eax + movl %eax,PCB_DR7(%ecx) + + mov %ds,PCB_DS(%ecx) + mov %es,PCB_ES(%ecx) + mov %fs,PCB_FS(%ecx) + mov %ss,PCB_SS(%ecx) + + sgdt PCB_GDT(%ecx) + sidt PCB_IDT(%ecx) + sldt PCB_LDT(%ecx) + str PCB_TR(%ecx) + + movl $1,%eax + ret +END(suspendctx) + +/* + * resumectx(pcb in %esi) + * Resuming processor state from pcb. + */ +ENTRY(resumectx) + /* Fetch PCB. */ + movl %esi,%ecx + + /* Restore GDT. */ + lgdt PCB_GDT(%ecx) + + /* Restore segment registers */ + movzwl PCB_DS(%ecx),%eax + mov %ax,%ds + movzwl PCB_ES(%ecx),%eax + mov %ax,%es + movzwl PCB_FS(%ecx),%eax + mov %ax,%fs + movzwl PCB_GS(%ecx),%eax + movw %ax,%gs + movzwl PCB_SS(%ecx),%eax + mov %ax,%ss + + /* Restore CR2, CR4, CR3 and CR0 */ + movl PCB_CR2(%ecx),%eax + movl %eax,%cr2 + movl PCB_CR4(%ecx),%eax + movl %eax,%cr4 + movl PCB_CR3(%ecx),%eax + movl %eax,%cr3 + movl PCB_CR0(%ecx),%eax + movl %eax,%cr0 + jmp 1f +1: + + /* Restore descriptor tables */ + lidt PCB_IDT(%ecx) + lldt PCB_LDT(%ecx) + +#define SDT_SYS386TSS 9 +#define SDT_SYS386BSY 11 + /* Clear "task busy" bit and reload TR */ + movl PCPU(TSS_GDT),%eax + andb $(~SDT_SYS386BSY | SDT_SYS386TSS),5(%eax) + movzwl PCB_TR(%ecx),%eax + ltr %ax +#undef SDT_SYS386TSS +#undef SDT_SYS386BSY + + /* Restore debug registers */ + movl PCB_DR0(%ecx),%eax + movl %eax,%dr0 + movl PCB_DR1(%ecx),%eax + movl %eax,%dr1 + movl PCB_DR2(%ecx),%eax + movl %eax,%dr2 + movl PCB_DR3(%ecx),%eax + movl %eax,%dr3 + movl PCB_DR6(%ecx),%eax + movl %eax,%dr6 + movl PCB_DR7(%ecx),%eax + movl %eax,%dr7 + +#ifdef DEV_NPX + /* XXX FIX ME */ +#endif + + /* Restore other registers */ + movl PCB_EDI(%ecx),%edi + movl PCB_ESI(%ecx),%esi + movl PCB_EBP(%ecx),%ebp + movl PCB_ESP(%ecx),%esp + movl PCB_EBX(%ecx),%ebx + + /* reload code selector by turning return into intersegmental return */ + pushl PCB_EIP(%ecx) + movl $KCSEL,4(%esp) + xorl %eax,%eax + lret +END(resumectx) Index: stable/9/sys/i386/include/apicvar.h =================================================================== --- stable/9/sys/i386/include/apicvar.h (revision 235795) +++ stable/9/sys/i386/include/apicvar.h (revision 235796) @@ -1,260 +1,262 @@ /*- * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_APICVAR_H_ #define _MACHINE_APICVAR_H_ /* * Local && I/O APIC variable definitions. */ /* * Layout of local APIC interrupt vectors: * * 0xff (255) +-------------+ * | | 15 (Spurious / IPIs / Local Interrupts) * 0xf0 (240) +-------------+ * | | 14 (I/O Interrupts / Timer) * 0xe0 (224) +-------------+ * | | 13 (I/O Interrupts) * 0xd0 (208) +-------------+ * | | 12 (I/O Interrupts) * 0xc0 (192) +-------------+ * | | 11 (I/O Interrupts) * 0xb0 (176) +-------------+ * | | 10 (I/O Interrupts) * 0xa0 (160) +-------------+ * | | 9 (I/O Interrupts) * 0x90 (144) +-------------+ * | | 8 (I/O Interrupts / System Calls) * 0x80 (128) +-------------+ * | | 7 (I/O Interrupts) * 0x70 (112) +-------------+ * | | 6 (I/O Interrupts) * 0x60 (96) +-------------+ * | | 5 (I/O Interrupts) * 0x50 (80) +-------------+ * | | 4 (I/O Interrupts) * 0x40 (64) +-------------+ * | | 3 (I/O Interrupts) * 0x30 (48) +-------------+ * | | 2 (ATPIC Interrupts) * 0x20 (32) +-------------+ * | | 1 (Exceptions, traps, faults, etc.) * 0x10 (16) +-------------+ * | | 0 (Exceptions, traps, faults, etc.) * 0x00 (0) +-------------+ * * Note: 0x80 needs to be handled specially and not allocated to an * I/O device! */ #define MAX_APIC_ID 0xfe #define APIC_ID_ALL 0xff /* I/O Interrupts are used for external devices such as ISA, PCI, etc. */ #define APIC_IO_INTS (IDT_IO_INTS + 16) #define APIC_NUM_IOINTS 191 /* The timer interrupt is used for clock handling and drives hardclock, etc. */ #define APIC_TIMER_INT (APIC_IO_INTS + APIC_NUM_IOINTS) /* ********************* !!! WARNING !!! ****************************** * Each local apic has an interrupt receive fifo that is two entries deep * for each interrupt priority class (higher 4 bits of interrupt vector). * Once the fifo is full the APIC can no longer receive interrupts for this * class and sending IPIs from other CPUs will be blocked. * To avoid deadlocks there should be no more than two IPI interrupts * pending at the same time. * Currently this is guaranteed by dividing the IPIs in two groups that have * each at most one IPI interrupt pending. The first group is protected by the * smp_ipi_mtx and waits for the completion of the IPI (Only one IPI user * at a time) The second group uses a single interrupt and a bitmap to avoid * redundant IPI interrupts. */ /* Interrupts for local APIC LVT entries other than the timer. */ #ifdef XEN /* These are the Xen i386 APIC definitions */ #define APIC_LOCAL_INTS 240 #define APIC_ERROR_INT APIC_LOCAL_INTS #define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1) #define APIC_CMC_INT (APIC_LOCAL_INTS + 2) #define APIC_IPI_INTS (APIC_LOCAL_INTS + 3) #define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */ #define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */ #define IPI_INVLPG (APIC_IPI_INTS + 2) #define IPI_INVLRNG (APIC_IPI_INTS + 3) #define IPI_INVLCACHE (APIC_IPI_INTS + 4) #define IPI_LAZYPMAP (APIC_IPI_INTS + 5) /* Lazy pmap release. 
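* (Used by the i386 pmap to chase a pmap that may still be active on another CPU while it is being released.)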
*/ /* Vector to handle bitmap based IPIs */ #define IPI_BITMAP_VECTOR (APIC_IPI_INTS + 6) /* IPIs handled by IPI_BITMAPED_VECTOR (XXX ups is there a better place?) */ #define IPI_AST 0 /* Generate software trap. */ #define IPI_PREEMPT 1 #define IPI_HARDCLOCK 2 #define IPI_BITMAP_LAST IPI_HARDCLOCK #define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST) #define IPI_STOP (APIC_IPI_INTS + 7) /* Stop CPU until restarted. */ -#define IPI_STOP_HARD (APIC_IPI_INTS + 8) /* Stop CPU with a NMI. */ +#define IPI_SUSPEND (APIC_IPI_INTS + 8) /* Suspend CPU until restarted. */ +#define IPI_STOP_HARD (APIC_IPI_INTS + 9) /* Stop CPU with a NMI. */ #else /* XEN */ /* These are the normal i386 APIC definitions */ #define APIC_LOCAL_INTS 240 #define APIC_ERROR_INT APIC_LOCAL_INTS #define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1) #define APIC_CMC_INT (APIC_LOCAL_INTS + 2) #define APIC_IPI_INTS (APIC_LOCAL_INTS + 3) #define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */ #define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */ #define IPI_INVLPG (APIC_IPI_INTS + 2) #define IPI_INVLRNG (APIC_IPI_INTS + 3) #define IPI_INVLCACHE (APIC_IPI_INTS + 4) #define IPI_LAZYPMAP (APIC_IPI_INTS + 5) /* Lazy pmap release. */ /* Vector to handle bitmap based IPIs */ #define IPI_BITMAP_VECTOR (APIC_IPI_INTS + 6) /* IPIs handled by IPI_BITMAPED_VECTOR (XXX ups is there a better place?) */ #define IPI_AST 0 /* Generate software trap. */ #define IPI_PREEMPT 1 #define IPI_HARDCLOCK 2 #define IPI_BITMAP_LAST IPI_HARDCLOCK #define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST) #define IPI_STOP (APIC_IPI_INTS + 7) /* Stop CPU until restarted. */ -#define IPI_STOP_HARD (APIC_IPI_INTS + 8) /* Stop CPU with a NMI. */ +#define IPI_SUSPEND (APIC_IPI_INTS + 8) /* Suspend CPU until restarted. */ +#define IPI_STOP_HARD (APIC_IPI_INTS + 9) /* Stop CPU with a NMI. */ #endif /* XEN */ /* * The spurious interrupt can share the priority class with the IPIs since * it is not a normal interrupt. (Does not use the APIC's interrupt fifo) */ #define APIC_SPURIOUS_INT 255 #define LVT_LINT0 0 #define LVT_LINT1 1 #define LVT_TIMER 2 #define LVT_ERROR 3 #define LVT_PMC 4 #define LVT_THERMAL 5 #define LVT_CMCI 6 #define LVT_MAX LVT_CMCI #ifndef LOCORE #define APIC_IPI_DEST_SELF -1 #define APIC_IPI_DEST_ALL -2 #define APIC_IPI_DEST_OTHERS -3 #define APIC_BUS_UNKNOWN -1 #define APIC_BUS_ISA 0 #define APIC_BUS_EISA 1 #define APIC_BUS_PCI 2 #define APIC_BUS_MAX APIC_BUS_PCI /* * An APIC enumerator is a psuedo bus driver that enumerates APIC's including * CPU's and I/O APIC's. 
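* For example, both the MP Table and the ACPI MADT code register an enumerator here, and the probe callbacks decide at boot which one drives CPU and I/O APIC configuration.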
*/ struct apic_enumerator { const char *apic_name; int (*apic_probe)(void); int (*apic_probe_cpus)(void); int (*apic_setup_local)(void); int (*apic_setup_io)(void); SLIST_ENTRY(apic_enumerator) apic_next; }; inthand_t IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint), IDTVEC(spuriousint), IDTVEC(timerint); extern vm_paddr_t lapic_paddr; extern int apic_cpuids[]; u_int apic_alloc_vector(u_int apic_id, u_int irq); u_int apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align); void apic_disable_vector(u_int apic_id, u_int vector); void apic_enable_vector(u_int apic_id, u_int vector); void apic_free_vector(u_int apic_id, u_int vector, u_int irq); u_int apic_idt_to_irq(u_int apic_id, u_int vector); void apic_register_enumerator(struct apic_enumerator *enumerator); u_int apic_cpuid(u_int apic_id); void *ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase); int ioapic_disable_pin(void *cookie, u_int pin); int ioapic_get_vector(void *cookie, u_int pin); void ioapic_register(void *cookie); int ioapic_remap_vector(void *cookie, u_int pin, int vector); int ioapic_set_bus(void *cookie, u_int pin, int bus_type); int ioapic_set_extint(void *cookie, u_int pin); int ioapic_set_nmi(void *cookie, u_int pin); int ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol); int ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger); int ioapic_set_smi(void *cookie, u_int pin); void lapic_create(u_int apic_id, int boot_cpu); void lapic_disable(void); void lapic_disable_pmc(void); void lapic_dump(const char *str); void lapic_enable_cmc(void); int lapic_enable_pmc(void); void lapic_eoi(void); int lapic_id(void); void lapic_init(vm_paddr_t addr); int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); int lapic_ipi_wait(int delay); void lapic_handle_cmc(void); void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); void lapic_reenable_pmc(void); void lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id); int lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked); int lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode); int lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol); int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger); void lapic_set_tpr(u_int vector); void lapic_setup(int boot); #endif /* !LOCORE */ #endif /* _MACHINE_APICVAR_H_ */ Index: stable/9/sys/i386/include/md_var.h =================================================================== --- stable/9/sys/i386/include/md_var.h (revision 235795) +++ stable/9/sys/i386/include/md_var.h (revision 235796) @@ -1,106 +1,107 @@ /*- * Copyright (c) 1995 Bruce D. Evans. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the author nor the names of contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_MD_VAR_H_ #define _MACHINE_MD_VAR_H_ /* * Miscellaneous machine-dependent declarations. */ extern long Maxmem; extern u_int basemem; /* PA of original top of base memory */ extern int busdma_swi_pending; extern u_int cpu_exthigh; extern u_int cpu_feature; extern u_int cpu_feature2; extern u_int amd_feature; extern u_int amd_feature2; extern u_int amd_pminfo; extern u_int via_feature_rng; extern u_int via_feature_xcrypt; extern u_int cpu_clflush_line_size; extern u_int cpu_fxsr; extern u_int cpu_high; extern u_int cpu_id; extern u_int cpu_mxcsr_mask; extern u_int cpu_procinfo; extern u_int cpu_procinfo2; extern char cpu_vendor[]; extern u_int cpu_vendor_id; extern u_int cyrix_did; extern char kstack[]; extern char sigcode[]; extern int szsigcode; #ifdef COMPAT_FREEBSD4 extern int szfreebsd4_sigcode; #endif #ifdef COMPAT_43 extern int szosigcode; #endif extern uint32_t *vm_page_dump; extern int vm_page_dump_size; extern int workaround_erratum383; typedef void alias_for_inthand_t(u_int cs, u_int ef, u_int esp, u_int ss); struct thread; struct reg; struct fpreg; struct dbreg; struct dumperinfo; void bcopyb(const void *from, void *to, size_t len); void busdma_swi(void); void cpu_setregs(void); void cpu_switch_load_gs(void) __asm(__STRING(cpu_switch_load_gs)); void doreti_iret(void) __asm(__STRING(doreti_iret)); void doreti_iret_fault(void) __asm(__STRING(doreti_iret_fault)); void doreti_popl_ds(void) __asm(__STRING(doreti_popl_ds)); void doreti_popl_ds_fault(void) __asm(__STRING(doreti_popl_ds_fault)); void doreti_popl_es(void) __asm(__STRING(doreti_popl_es)); void doreti_popl_es_fault(void) __asm(__STRING(doreti_popl_es_fault)); void doreti_popl_fs(void) __asm(__STRING(doreti_popl_fs)); void doreti_popl_fs_fault(void) __asm(__STRING(doreti_popl_fs_fault)); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); +void initializecpu(void); void enable_sse(void); void fillw(int /*u_short*/ pat, void *base, size_t cnt); void i686_pagezero(void *addr); void sse2_pagezero(void *addr); void init_AMD_Elan_sc520(void); int is_physical_memory(vm_paddr_t addr); int isa_nmi(int cd); vm_paddr_t kvtop(void *addr); void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int selec); int user_dbreg_trap(void); void minidumpsys(struct dumperinfo *); #endif /* !_MACHINE_MD_VAR_H_ */ Index: stable/9/sys/i386/include/pcb.h =================================================================== --- stable/9/sys/i386/include/pcb.h (revision 235795) +++ stable/9/sys/i386/include/pcb.h (revision 235796) @@ -1,90 +1,104 @@ /*- * Copyright (c) 1990 
The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pcb.h 5.10 (Berkeley) 5/12/91 * $FreeBSD$ */ #ifndef _I386_PCB_H_ #define _I386_PCB_H_ /* * Intel 386 process control block */ #ifndef _KERNEL #include #endif #include struct pcb { + int pcb_cr0; + int pcb_cr2; int pcb_cr3; + int pcb_cr4; int pcb_edi; int pcb_esi; int pcb_ebp; int pcb_esp; int pcb_ebx; int pcb_eip; int pcb_dr0; int pcb_dr1; int pcb_dr2; int pcb_dr3; int pcb_dr6; int pcb_dr7; union savefpu pcb_user_save; uint16_t pcb_initial_npxcw; u_int pcb_flags; #define FP_SOFTFP 0x01 /* process using software fltng pnt emulator */ #define PCB_DBREGS 0x02 /* process using debug registers */ #define PCB_NPXINITDONE 0x08 /* fpu state is initialized */ #define PCB_VM86CALL 0x10 /* in vm86 call */ #define PCB_NPXUSERINITDONE 0x20 /* user fpu state is initialized */ #define PCB_KERNNPX 0x40 /* kernel uses npx */ caddr_t pcb_onfault; /* copyin/out fault recovery */ + int pcb_ds; + int pcb_es; + int pcb_fs; int pcb_gs; + int pcb_ss; struct segment_descriptor pcb_fsd; struct segment_descriptor pcb_gsd; struct pcb_ext *pcb_ext; /* optional pcb extension */ int pcb_psl; /* process status long */ u_long pcb_vm86[2]; /* vm86bios scratch space */ union savefpu *pcb_save; + + struct region_descriptor pcb_gdt; + struct region_descriptor pcb_idt; + uint16_t pcb_ldt; + uint16_t pcb_tr; }; #ifdef _KERNEL struct trapframe; void makectx(struct trapframe *, struct pcb *); void savectx(struct pcb *) __returns_twice; +int suspendctx(struct pcb *) __returns_twice; +void resumectx(struct pcb *); #endif #endif /* _I386_PCB_H_ */ Index: stable/9/sys/i386/include/smp.h =================================================================== --- stable/9/sys/i386/include/smp.h (revision 235795) +++ stable/9/sys/i386/include/smp.h (revision 235796) @@ -1,92 +1,96 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. 
As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ * */ #ifndef _MACHINE_SMP_H_ #define _MACHINE_SMP_H_ #ifdef _KERNEL #ifdef SMP #ifndef LOCORE #include #include #include #include #include /* global data in mpboot.s */ extern int bootMP_size; /* functions in mpboot.s */ void bootMP(void); /* global data in mp_machdep.c */ extern int mp_naps; extern int boot_cpu_id; extern struct pcb stoppcbs[]; extern int cpu_apic_ids[]; #ifdef COUNT_IPIS extern u_long *ipi_invltlb_counts[MAXCPU]; extern u_long *ipi_invlrng_counts[MAXCPU]; extern u_long *ipi_invlpg_counts[MAXCPU]; extern u_long *ipi_invlcache_counts[MAXCPU]; extern u_long *ipi_rendezvous_counts[MAXCPU]; extern u_long *ipi_lazypmap_counts[MAXCPU]; #endif /* IPI handlers */ inthand_t IDTVEC(invltlb), /* TLB shootdowns - global */ IDTVEC(invlpg), /* TLB shootdowns - 1 page */ IDTVEC(invlrng), /* TLB shootdowns - page range */ IDTVEC(invlcache), /* Write back and invalidate cache */ IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ IDTVEC(cpustop), /* CPU stops & waits to be restarted */ + IDTVEC(cpususpend), /* CPU suspends & waits to be resumed */ IDTVEC(rendezvous), /* handle CPU rendezvous */ IDTVEC(lazypmap); /* handle lazy pmap release */ /* functions in mp_machdep.c */ void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); +#ifndef XEN +void cpususpend_handler(void); +#endif void init_secondary(void); void ipi_all_but_self(u_int ipi); #ifndef XEN void ipi_bitmap_handler(struct trapframe frame); #endif void ipi_cpu(int cpu, u_int ipi); int ipi_nmi_handler(void); void ipi_selected(cpuset_t cpus, u_int ipi); u_int mp_bootaddress(u_int); void smp_cache_flush(void); void smp_invlpg(vm_offset_t addr); void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr); void smp_invlpg_range(vm_offset_t startva, vm_offset_t endva); void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva, vm_offset_t endva); void smp_invltlb(void); void smp_masked_invltlb(cpuset_t mask); #ifdef XEN void ipi_to_irq_init(void); #define RESCHEDULE_VECTOR 0 #define CALL_FUNCTION_VECTOR 1 #define NR_IPIS 2 #endif #endif /* !LOCORE */ #endif /* SMP */ #endif /* _KERNEL */ #endif /* _MACHINE_SMP_H_ */ Index: stable/9/sys/kern/subr_smp.c =================================================================== --- stable/9/sys/kern/subr_smp.c (revision 235795) +++ stable/9/sys/kern/subr_smp.c (revision 235796) @@ -1,726 +1,726 @@ /*- * Copyright (c) 2001, John Baldwin . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and machine independent functions * used for the kernel SMP support. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_sched.h" #ifdef SMP volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; cpuset_t hlt_cpus_mask; cpuset_t logical_cpus_mask; void (*cpustop_restartfunc)(void); #endif /* This is used in modules that need to work in both SMP and UP. */ cpuset_t all_cpus; int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; volatile int smp_started; u_int mp_maxid; SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL, "Kernel SMP"); SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0, "Max CPU ID."); SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus, 0, "Max number of CPUs that the system was compiled for."); int smp_active = 0; /* are the APs allowed to run? */ SYSCTL_INT(_kern_smp, OID_AUTO, active, CTLFLAG_RW, &smp_active, 0, "Number of Auxillary Processors (APs) that were successfully started"); int smp_disabled = 0; /* has smp been disabled? */ SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD, &smp_disabled, 0, "SMP has been disabled from the loader"); TUNABLE_INT("kern.smp.disabled", &smp_disabled); int smp_cpus = 1; /* how many cpu's running */ SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0, "Number of CPUs online"); int smp_topology = 0; /* Which topology we're using. */ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RD, &smp_topology, 0, "Topology override setting; 0 is default provided by hardware."); TUNABLE_INT("kern.smp.topology", &smp_topology); #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, &forward_signal_enabled, 0, "Forwarding of a signal to a process on a different CPU"); /* Variables needed for SMP rendezvous. */ static volatile int smp_rv_ncpus; static void (*volatile smp_rv_setup_func)(void *arg); static void (*volatile smp_rv_action_func)(void *arg); static void (*volatile smp_rv_teardown_func)(void *arg); static void *volatile smp_rv_func_arg; static volatile int smp_rv_waiters[4]; /* * Shared mutex to restrict busywaits between smp_rendezvous() and * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these * functions trigger at once and cause multiple CPUs to busywait with * interrupts disabled. */ struct mtx smp_ipi_mtx; /* * Let the MD SMP code initialize mp_maxid very early if it can. 
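The IDTVEC(cpususpend) handler and cpususpend_handler() declared above are the AP side of the new suspend path: on IPI_SUSPEND each AP saves its state into a per-CPU pcb and parks until released. A minimal sketch of the handler's shape, assuming suspendctx() (like savectx()) returns nonzero on the direct call and zero when re-entered via resumectx(), and assuming an illustrative susppcbs[] array of per-CPU pcb pointers; this is not the committed implementation:

	void
	cpususpend_handler(void)
	{
		u_int cpu;

		cpu = PCPU_GET(cpuid);
		if (suspendctx(susppcbs[cpu])) {
			/* Direct return: context is saved.  Flush and park. */
			wbinvd();
			CPU_SET_ATOMIC(cpu, &stopped_cpus);
		} else {
			/* Second return, via resumectx() after wakeup. */
			PCPU_SET(switchtime, 0);
			PCPU_SET(switchticks, ticks);
		}
		/* Spin until the BSP tells us to go. */
		while (!CPU_ISSET(cpu, &started_cpus))
			cpu_spinwait();
		CPU_CLR_ATOMIC(cpu, &started_cpus);
		CPU_CLR_ATOMIC(cpu, &stopped_cpus);
	}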
*/ static void mp_setmaxid(void *dummy) { cpu_mp_setmaxid(); } SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL); /* * Call the MD SMP initialization code. */ static void mp_start(void *dummy) { mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN); /* Probe for MP hardware. */ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncpus = 1; CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } cpu_mp_start(); printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", mp_ncpus); cpu_mp_announce(); } SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL); void forward_signal(struct thread *td) { int id; /* * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); if (!smp_started || cold || panicstr) return; if (!forward_signal_enabled) return; /* No need to IPI ourself. */ if (td == curthread) return; id = td->td_oncpu; if (id == NOCPU) return; ipi_cpu(id, IPI_AST); } /* * When called the executing CPU will send an IPI to all other CPUs * requesting that they halt execution. * * Usually (but not necessarily) called with 'other_cpus' as its arg. * * - Signals all CPUs in map to stop. * - Waits for each to stop. * * Returns: * -1: error * 0: NA * 1: ok * */ static int generic_stop_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif static volatile u_int stopping_cpu = NOCPU; int i; KASSERT( -#if defined(__amd64__) +#if defined(__amd64__) || defined(__i386__) type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND, #else type == IPI_STOP || type == IPI_STOP_HARD, #endif ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR2(KTR_SMP, "stop_cpus(%s) with %u type", cpusetobj_strprint(cpusetbuf, &map), type); if (stopping_cpu != PCPU_GET(cpuid)) while (atomic_cmpset_int(&stopping_cpu, NOCPU, PCPU_GET(cpuid)) == 0) while (stopping_cpu != NOCPU) cpu_spinwait(); /* spin */ /* send the stop IPI to all CPUs in map */ ipi_selected(map, type); i = 0; while (!CPU_SUBSET(&stopped_cpus, &map)) { /* spin */ cpu_spinwait(); i++; if (i == 100000000) { printf("timeout stopping cpus\n"); break; } } stopping_cpu = NOCPU; return (1); } int stop_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP)); } int stop_cpus_hard(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP_HARD)); } -#if defined(__amd64__) +#if defined(__amd64__) || defined(__i386__) int suspend_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_SUSPEND)); } #endif /* * Called by a CPU to restart stopped CPUs. * * Usually (but not necessarily) called with 'stopped_cpus' as its arg. * * - Signals all CPUs in map to restart. * - Waits for each to restart. * * Returns: * -1: error * 0: NA * 1: ok */ int restart_cpus(cpuset_t map) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif if (!smp_started) return 0; CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); /* signal other cpus to restart */ CPU_COPY_STORE_REL(&map, &started_cpus); /* wait for each to clear its bit */ while (CPU_OVERLAP(&stopped_cpus, &map)) cpu_spinwait(); return 1; } /* * All-CPU rendezvous. 
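suspend_cpus() above reuses the stop-CPU handshake with IPI_SUSPEND, which generic_stop_cpus() now also accepts on i386. From the BSP the calling convention looks roughly like this (a sketch of the API, not the ACPI sleep code itself):

	cpuset_t map;

	if (smp_started) {
		map = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &map);	/* everyone but ourselves */
		if (suspend_cpus(map) == 0)
			printf("failed to suspend APs\n");
	}
	/* ... enter the sleep state, wake up, restore the BSP ... */
	if (smp_started)
		restart_cpus(stopped_cpus);	/* release the parked APs */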
CPUs are signalled, all execute the setup function * (if specified), rendezvous, execute the action function (if specified), * rendezvous again, execute the teardown function (if specified), and then * resume. * * Note that the supplied external functions _must_ be reentrant and aware * that they are running in parallel and in an unknown lock context. */ void smp_rendezvous_action(void) { struct thread *td; void *local_func_arg; void (*local_setup_func)(void*); void (*local_action_func)(void*); void (*local_teardown_func)(void*); #ifdef INVARIANTS int owepreempt; #endif /* Ensure we have up-to-date values. */ atomic_add_acq_int(&smp_rv_waiters[0], 1); while (smp_rv_waiters[0] < smp_rv_ncpus) cpu_spinwait(); /* Fetch rendezvous parameters after acquire barrier. */ local_func_arg = smp_rv_func_arg; local_setup_func = smp_rv_setup_func; local_action_func = smp_rv_action_func; local_teardown_func = smp_rv_teardown_func; /* * Use a nested critical section to prevent any preemptions * from occurring during a rendezvous action routine. * Specifically, if a rendezvous handler is invoked via an IPI * and the interrupted thread was in the critical_exit() * function after setting td_critnest to 0 but before * performing a deferred preemption, this routine can be * invoked with td_critnest set to 0 and td_owepreempt true. * In that case, a critical_exit() during the rendezvous * action would trigger a preemption which is not permitted in * a rendezvous action. To fix this, wrap all of the * rendezvous action handlers in a critical section. We * cannot use a regular critical section however as having * critical_exit() preempt from this routine would also be * problematic (the preemption must not occur before the IPI * has been acknowledged via an EOI). Instead, we * intentionally ignore td_owepreempt when leaving the * critical section. This should be harmless because we do * not permit rendezvous action routines to schedule threads, * and thus td_owepreempt should never transition from 0 to 1 * during this routine. */ td = curthread; td->td_critnest++; #ifdef INVARIANTS owepreempt = td->td_owepreempt; #endif /* * If requested, run a setup function before the main action * function. Ensure all CPUs have completed the setup * function before moving on to the action function. */ if (local_setup_func != smp_no_rendevous_barrier) { if (smp_rv_setup_func != NULL) smp_rv_setup_func(smp_rv_func_arg); atomic_add_int(&smp_rv_waiters[1], 1); while (smp_rv_waiters[1] < smp_rv_ncpus) cpu_spinwait(); } if (local_action_func != NULL) local_action_func(local_func_arg); if (local_teardown_func != smp_no_rendevous_barrier) { /* * Signal that the main action has been completed. If a * full exit rendezvous is requested, then all CPUs will * wait here until all CPUs have finished the main action. */ atomic_add_int(&smp_rv_waiters[2], 1); while (smp_rv_waiters[2] < smp_rv_ncpus) cpu_spinwait(); if (local_teardown_func != NULL) local_teardown_func(local_func_arg); } /* * Signal that the rendezvous is fully completed by this CPU. * This means that no member of smp_rv_* pseudo-structure will be * accessed by this target CPU after this point; in particular, * memory pointed by smp_rv_func_arg. 
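A concrete (hypothetical) caller of the interface described above: every CPU increments a counter, with no setup or teardown work. Note the asymmetry implemented in smp_rendezvous_action(): passing smp_no_rendevous_barrier skips the corresponding barrier entirely, while passing NULL still synchronizes but calls nothing.

	static u_int ncounted;

	static void
	count_action(void *arg)
	{

		atomic_add_int((u_int *)arg, 1);
	}

	smp_rendezvous(smp_no_rendevous_barrier, count_action,
	    smp_no_rendevous_barrier, &ncounted);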
*/ atomic_add_int(&smp_rv_waiters[3], 1); td->td_critnest--; KASSERT(owepreempt == td->td_owepreempt, ("rendezvous action changed td_owepreempt")); } void smp_rendezvous_cpus(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { int curcpumap, i, ncpus = 0; /* Look comments in the !SMP case. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } CPU_FOREACH(i) { if (CPU_ISSET(i, &map)) ncpus++; } if (ncpus == 0) panic("ncpus is 0 with non-zero map"); mtx_lock_spin(&smp_ipi_mtx); /* Pass rendezvous parameters via global variables. */ smp_rv_ncpus = ncpus; smp_rv_setup_func = setup_func; smp_rv_action_func = action_func; smp_rv_teardown_func = teardown_func; smp_rv_func_arg = arg; smp_rv_waiters[1] = 0; smp_rv_waiters[2] = 0; smp_rv_waiters[3] = 0; atomic_store_rel_int(&smp_rv_waiters[0], 0); /* * Signal other processors, which will enter the IPI with * interrupts off. */ curcpumap = CPU_ISSET(curcpu, &map); CPU_CLR(curcpu, &map); ipi_selected(map, IPI_RENDEZVOUS); /* Check if the current CPU is in the map */ if (curcpumap != 0) smp_rendezvous_action(); /* * Ensure that the master CPU waits for all the other * CPUs to finish the rendezvous, so that smp_rv_* * pseudo-structure and the arg are guaranteed to not * be in use. */ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus) cpu_spinwait(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_rendezvous(void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } static struct cpu_group group[MAXCPU]; struct cpu_group * smp_topo(void) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; struct cpu_group *top; /* * Check for a fake topology request for debugging purposes. */ switch (smp_topology) { case 1: /* Dual core with no sharing. */ top = smp_topo_1level(CG_SHARE_NONE, 2, 0); break; case 2: /* No topology, all cpus are equal. */ top = smp_topo_none(); break; case 3: /* Dual core with shared L2. */ top = smp_topo_1level(CG_SHARE_L2, 2, 0); break; case 4: /* quad core, shared l3 among each package, private l2. */ top = smp_topo_1level(CG_SHARE_L3, 4, 0); break; case 5: /* quad core, 2 dualcore parts on each package share l2. */ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); break; case 6: /* Single-core 2xHTT */ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); break; case 7: /* quad core with a shared l3, 8 threads sharing L2. */ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, CG_FLAG_SMT); break; default: /* Default, ask the system what it wants. */ top = cpu_topo(); break; } /* * Verify the returned topology. */ if (top->cg_count != mp_ncpus) panic("Built bad topology at %p. CPU count %d != %d", top, top->cg_count, mp_ncpus); if (CPU_CMP(&top->cg_mask, &all_cpus)) panic("Built bad topology at %p. 
CPU mask (%s) != (%s)", top, cpusetobj_strprint(cpusetbuf, &top->cg_mask), cpusetobj_strprint(cpusetbuf2, &all_cpus)); return (top); } struct cpu_group * smp_topo_none(void) { struct cpu_group *top; top = &group[0]; top->cg_parent = NULL; top->cg_child = NULL; top->cg_mask = all_cpus; top->cg_count = mp_ncpus; top->cg_children = 0; top->cg_level = CG_SHARE_NONE; top->cg_flags = 0; return (top); } static int smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, int count, int flags, int start) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; cpuset_t mask; int i; CPU_ZERO(&mask); for (i = 0; i < count; i++, start++) CPU_SET(start, &mask); child->cg_parent = parent; child->cg_child = NULL; child->cg_children = 0; child->cg_level = share; child->cg_count = count; child->cg_flags = flags; child->cg_mask = mask; parent->cg_children++; for (; parent != NULL; parent = parent->cg_parent) { if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask)) panic("Duplicate children in %p. mask (%s) child (%s)", parent, cpusetobj_strprint(cpusetbuf, &parent->cg_mask), cpusetobj_strprint(cpusetbuf2, &child->cg_mask)); CPU_OR(&parent->cg_mask, &child->cg_mask); parent->cg_count += child->cg_count; } return (start); } struct cpu_group * smp_topo_1level(int share, int count, int flags) { struct cpu_group *child; struct cpu_group *top; int packages; int cpu; int i; cpu = 0; top = &group[0]; packages = mp_ncpus / count; top->cg_child = child = &group[1]; top->cg_level = CG_SHARE_NONE; for (i = 0; i < packages; i++, child++) cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); return (top); } struct cpu_group * smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags) { struct cpu_group *top; struct cpu_group *l1g; struct cpu_group *l2g; int cpu; int i; int j; cpu = 0; top = &group[0]; l2g = &group[1]; top->cg_child = l2g; top->cg_level = CG_SHARE_NONE; top->cg_children = mp_ncpus / (l2count * l1count); l1g = l2g + top->cg_children; for (i = 0; i < top->cg_children; i++, l2g++) { l2g->cg_parent = top; l2g->cg_child = l1g; l2g->cg_level = l2share; for (j = 0; j < l2count; j++, l1g++) cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, l1flags, cpu); } return (top); } struct cpu_group * smp_topo_find(struct cpu_group *top, int cpu) { struct cpu_group *cg; cpuset_t mask; int children; int i; CPU_SETOF(cpu, &mask); cg = top; for (;;) { if (!CPU_OVERLAP(&cg->cg_mask, &mask)) return (NULL); if (cg->cg_children == 0) return (cg); children = cg->cg_children; for (i = 0, cg = cg->cg_child; i < children; cg++, i++) if (CPU_OVERLAP(&cg->cg_mask, &mask)) break; } return (NULL); } #else /* !SMP */ void smp_rendezvous_cpus(cpuset_t map, void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* * In the !SMP case we just need to ensure the same initial conditions * as the SMP case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } void smp_rendezvous(void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* Look comments in the smp_rendezvous_cpus() case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } /* * Provide dummy SMP support for UP kernels. Modules that need to use SMP * APIs will still work using this dummy support. 
*/ static void mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_maxid = PCPU_GET(cpuid); CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setvariables_for_up, NULL); #endif /* SMP */ void smp_no_rendevous_barrier(void *dummy) { #ifdef SMP KASSERT((!smp_started),("smp_no_rendevous called and smp is started")); #endif } Index: stable/9/sys/pc98/pc98/machdep.c =================================================================== --- stable/9/sys/pc98/pc98/machdep.c (revision 235795) +++ stable/9/sys/pc98/pc98/machdep.c (revision 235796) @@ -1,3005 +1,3004 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_atalk.h" #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_npx.h" #include "opt_perfmon.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! 
#endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #endif #ifdef DEV_APIC #include #endif #ifdef DEV_ISA #include #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern void init386(int first); extern void dblfault_handler(void); extern void printcpuinfo(void); /* XXX header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); -extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp); static int set_fpcontext(struct thread *td, const mcontext_t *mcp); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm(struct save87 *, struct savexmm *); static void fill_fpregs_xmm(struct savexmm *, struct save87 *); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); int need_pre_dma_flush; /* If 1, use wbinvd befor DMA transfer. */ int need_post_dma_flush; /* If 1, use invd after DMA transfer. */ #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int _udatasel, _ucodesel; u_int basemem; static int ispc98 = 1; SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif realmem = Maxmem; /* * Display physical memory. */ memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)cnt.v_free_count), ptoa((uintmax_t)cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. 
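The phys_avail[] and dump_avail[] arrays consumed by cpu_startup() above hold base/bound pairs terminated by a zero bound, with the bound exclusive; the walking idiom is the same one the bootverbose loop uses:

	vm_paddr_t base, bound;
	int indx;

	for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
		base = phys_avail[indx];
		bound = phys_avail[indx + 1];
		/* [base, bound) is usable physical memory. */
	}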
*/ bufinit(); vm_pager_bufferinit(); cpu_setregs(); /* * Add BSP as an interrupt target. */ intr_add_cpu(0); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; sf.sf_addr = 0; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. 
*/ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); bzero(sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); bzero(sf.sf_uc.uc_mcontext.__spare__, sizeof(sf.sf_uc.uc_mcontext.__spare__)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. 
*/ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; struct segment_descriptor *sdp; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); /* * Unconditionally fill the fsbase and gsbase into the mcontext. */ sdp = &td->td_pcb->pcb_fsd; sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sf.sf_uc.uc_mcontext.mc_flags = 0; bzero(sf.sf_uc.uc_mcontext.mc_spare2, sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Translate the signal if appropriate. 
*/ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. 
*/ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, SIGPROCMASK_OLD); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct trapframe *regs; struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. 
*/ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct trapframe *regs; ucontext_t *ucp; int cs, eflags, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. 
*/ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); if ((cpu_feature & CPUID_TSC) == 0) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. 
*/ TUNABLE_INT("machdep.idle_mwait", &idle_mwait); SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 static void cpu_idle_hlt(int busy) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* * We must absolutely guarentee that hlt is the next instruction * after sti or we introduce a timing window. */ disable_intr(); if (sched_runnable()) enable_intr(); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } /* * MWAIT cpu power states. Lower 4 bits are sub-states. */ #define MWAIT_C0 0xf0 #define MWAIT_C1 0x00 #define MWAIT_C2 0x10 #define MWAIT_C3 0x20 #define MWAIT_C4 0x30 static void cpu_idle_mwait(int busy) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_MWAIT; if (!sched_runnable()) { cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) cpu_mwait(0, MWAIT_C1); } *state = STATE_RUNNING; } static void cpu_idle_spin(int busy) { int *state; int i; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_RUNNING; for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } void (*cpu_idle_fn)(int) = cpu_idle_hlt; void cpu_idle(int busy) { CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); cpu_idleclock(); } /* Call main idle method. */ cpu_idle_fn(busy); /* Switch timers mack into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { struct pcpu *pcpu; int *state; pcpu = pcpu_find(cpu); state = (int *)pcpu->pc_monitorbuf; /* * This doesn't need to be atomic since missing the race will * simply result in unnecessary IPIs. */ if (*state == STATE_SLEEPING) return (0); if (*state == STATE_MWAIT) *state = STATE_RUNNING; return (1); } /* * Ordered by speed/power consumption. */ struct { void *id_fn; char *id_name; } idle_tbl[] = { { cpu_idle_spin, "spin" }, { cpu_idle_mwait, "mwait" }, { cpu_idle_hlt, "hlt" }, { NULL, NULL } }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; p += sprintf(p, "%s%s", p != avail ? 
", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static int idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; char *p; int i; p = "unknown"; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; if (strcmp(idle_tbl[i].id_name, buf)) continue; cpu_idle_fn = idle_tbl[i].id_fn; return (0); } return (EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, idle_sysctl, "A", "currently selected idle function"); uint64_t (*atomic_load_acq_64)(volatile uint64_t *) = atomic_load_acq_64_i386; void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) = atomic_store_rel_64_i386; static void cpu_probe_cmpxchg8b(void) { if ((cpu_feature & CPUID_CX8) != 0) { atomic_load_acq_64 = atomic_load_acq_64_i586; atomic_store_rel_64 = atomic_store_rel_64_i586; } } /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); mtx_lock_spin(&dt_lock); if (td->td_proc->p_md.md_ldt) user_ldt_free(td); else mtx_unlock_spin(&dt_lock); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = imgp->entry_addr; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = imgp->ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ td->td_pcb->pcb_flags &= ~FP_SOFTFP; pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ td->td_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. 
We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ struct mtx dt_lock; /* lock for GDT and LDT */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. * * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = SEL_KPL, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUFS_SEL 2 %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS_SEL 3 %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 6 Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { .ssd_base = 0x400, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, 
.ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GLDT_SEL 10 LDT Descriptor */ { .ssd_base = (int) ldt, .ssd_limit = sizeof(ldt)-1, .ssd_type = SDT_SYSLDT, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { .ssd_base = (int) ldt, .ssd_limit = (512 * sizeof(union descriptor)-1), .ssd_type = SDT_SYSLDT, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { .ssd_base = (int) &dblfault_tss, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GNDIS_SEL 18 NDIS Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; 
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu),
	IDTVEC(align), IDTVEC(xmm), IDTVEC(lcall_syscall),
	IDTVEC(int0x80_syscall);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = (ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	uint64_t idtr, gdtr;

	idtr = ridt();
	db_printf("idtr\t0x%08x/%04x\n",
	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
	gdtr = rgdt();
	db_printf("gdtr\t0x%08x/%04x\n",
	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
	db_printf("ldtr\t0x%04x\n", rldt());
	db_printf("tr\t0x%04x\n", rtr());
	db_printf("cr0\t0x%08x\n", rcr0());
	db_printf("cr2\t0x%08x\n", rcr2());
	db_printf("cr3\t0x%08x\n", rcr3());
	db_printf("cr4\t0x%08x\n", rcr4());
}
#endif

void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

static void
basemem_setup(void)
{
	vm_paddr_t pa;
	pt_entry_t *pte;
	int i;

	if (basemem > 640) {
		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
			basemem);
		basemem = 640;
	}

	/*
	 * XXX if biosbasemem is now < 640, there is a `hole'
	 * between the end of base memory and the start of
	 * ISA memory.  The hole may be empty or it may
	 * contain BIOS code or data.  Map it read/write so
	 * that the BIOS can write to it.  (Memory from 0 to
	 * the physical end of the kernel is mapped read-only
	 * to begin with and then parts of it are remapped.
	 * The parts that aren't remapped form holes that
	 * remain read-only and are unused by the kernel.
	 * The base memory area is below the physical end of
	 * the kernel and right now forms a read-only hole.
	 * The part of it from PAGE_SIZE to
	 * (trunc_page(biosbasemem * 1024) - 1) will be
	 * remapped and used by the kernel later.)
	 *
	 * This code is similar to the code used in
	 * pmap_mapdev, but since no memory needs to be
	 * allocated we simply change the mapping.
	 */
	for (pa = trunc_page(basemem * 1024);
	     pa < ISA_HOLE_START; pa += PAGE_SIZE)
		pmap_kenter(KERNBASE + pa, pa);

	/*
	 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
	 * the vm86 page table so that vm86 can scribble on them using
	 * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
	 * page 0, at least as initialized here?
	 */
	pte = (pt_entry_t *)vm86paddr;
	for (i = basemem / 4; i < 160; i++)
		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}
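/*
 * Editor's sketch (illustration only): the vm86 loop above builds identity
 * PTEs for the 4 KB pages between basemem and 640 KB (160 pages).  Assuming
 * the usual i386 constants (PAGE_SHIFT = 12, PG_V/PG_RW/PG_U = 0x1/0x2/0x4),
 * page index i maps physical address i << 12:
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PG_V		0x001
#define PG_RW		0x002
#define PG_U		0x004

int
main(void)
{
	unsigned basemem = 636;			/* hypothetical 636K basemem */
	int i;

	for (i = basemem / 4; i < 160; i++)	/* 160 pages == 640 KB */
		printf("pte[%d] = %#010x\n", i,
		    (unsigned)((i << PAGE_SHIFT) | PG_V | PG_RW | PG_U));
	return (0);
}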
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(int first)
{
	int off, physmap_idx, pa_indx, da_indx;
	u_long physmem_tunable, memtest;
	vm_paddr_t physmap[PHYSMAP_SIZE];
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int i;
	int pg_n;
	u_int extmem;
	u_int under16;
	vm_paddr_t pa;

	bzero(physmap, sizeof(physmap));

	/* XXX - some of EPSON machines can't use PG_N */
	pg_n = PG_N;
	if (pc98_machine_type & M_EPSON_PC98) {
		switch (epson_machine_id) {
#ifdef WB_CACHE
		default:
#endif
		case EPSON_PC486_HX:
		case EPSON_PC486_HG:
		case EPSON_PC486_HA:
			pg_n = 0;
			break;
		}
	}

	under16 = pc98_getmemsize(&basemem, &extmem);
	basemem_setup();

	physmap[0] = 0;
	physmap[1] = basemem * 1024;
	physmap_idx = 2;
	physmap[physmap_idx] = 0x100000;
	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;

	/*
	 * Now, physmap contains a map of physical memory.
	 */

#ifdef SMP
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(physmap[1]);
#endif

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * By default keep the memtest enabled.  Use a general name so that
	 * one could eventually do more with the code than just disable it.
	 */
	memtest = 1;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);

	/*
	 * We need to divide chunk if Maxmem is larger than 16MB and
	 * under 16MB area is not full of memory.
	 * (1) system area (15-16MB region) is cut off
	 * (2) extended memory is only over 16MB area (ex. Melco "HYPERMEMORY")
	 */
	if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) {
		/* 15M - 16M region is cut off, so need to divide chunk */
		physmap[physmap_idx + 1] = under16 * 1024;
		physmap_idx += 2;
		physmap[physmap_idx] = 0x1000000;
		physmap[physmap_idx + 1] = physmap[2] + extmem * 1024;
	}

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(first);

	/*
	 * Size up each available chunk of physical memory.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= KERNLOAD && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | pg_n;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
		    off);
}
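/*
 * Editor's sketch (not FreeBSD source): the memory test in getmemsize()
 * above writes 0xaaaaaaaa, 0x55555555, all-ones and all-zeroes through a
 * temporary mapping and reads each back, marking the page bad on any
 * mismatch.  The stand-alone fragment below applies the same pattern check
 * to an ordinary word in user space:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
word_ok(volatile uint32_t *p)
{
	static const uint32_t patterns[] =
	    { 0xaaaaaaaa, 0x55555555, 0xffffffff, 0x00000000 };
	uint32_t saved = *p;
	bool ok = true;
	unsigned i;

	for (i = 0; i < 4; i++) {
		*p = patterns[i];
		if (*p != patterns[i])
			ok = false;
	}
	*p = saved;		/* restore, as getmemsize() does */
	return (ok);
}

int
main(void)
{
	uint32_t word = 0xdeadbeef;

	printf("page %s\n", word_ok(&word) ? "good" : "bad");
	return (0);
}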
void
init386(first)
	int first;
{
	struct gate_descriptor *gdp;
	int gsel_tss, metadata_missing, x, pa;
	size_t kstack0_sz;
	struct pcpu *pc;

	thread0.td_kstack = proc0kstack;
	thread0.td_kstack_pages = KSTACK_PAGES;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/*
	 * Initialize DMAC
	 */
	pc98_init_dmac();

	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}

	if (envmode == 1)
		kern_envp = static_env;
	else if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * Make gdt memory segments.  All segments cover the full 4GB
	 * of address space and permissions are enforced at page level.
	 */
	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);

	pc = &__pcpu[0];
	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

	for (x = 0; x < NGDT; x++)
		ssdtosd(&gdt_segs[x], &gdt[x].sd);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base =  (int) gdt;
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
	lgdt(&r_gdt);

	pcpu_init(pc, 0, sizeof(struct pcpu));
	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
		pmap_kenter(pa + KERNBASE, pa);
	dpcpu_init((void *)(first + KERNBASE), 0);
	first += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL,
	    MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);

	/* make ldt memory segments */
	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL,
	    GSEL(GPANIC_SEL, SEL_KPL));
	setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);

	/*
	 * Initialize the i8254 before the console so that console
	 * initialization can use DELAY().
	 */
	i8254_init();

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#endif
#endif

#ifdef DDB
	ksym_start = bootinfo.bi_symtab;
	ksym_end = bootinfo.bi_esymtab;
#endif

	kdb_init();

#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif

	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	initializecpu();	/* Initialize CPU registers */

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
	    kstack0_sz - sizeof(struct pcb) - 16);
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	ltr(gsel_tss);

	/* pointer to selector slot for %fs/%gs */
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
	    dblfault_tss.tss_esp2 =
	    (int)&dblfault_stack[sizeof(dblfault_stack)];
	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_cr3 = (int)IdlePTD;
	dblfault_tss.tss_eip = (int)dblfault_handler;
	dblfault_tss.tss_eflags = PSL_KERNEL;
	dblfault_tss.tss_ds = dblfault_tss.tss_es =
	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);

	vm86_initialize();
	getmemsize(first);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	msgbufinit(msgbufp, msgbufsize);

	/* make a call gate to reenter kernel with */
	gdp = &ldt[LSYS5CALLS_SEL].gd;

	x = (int) &IDTVEC(lcall_syscall);
	gdp->gd_looffset = x;
	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
	gdp->gd_stkcpy = 1;
	gdp->gd_type = SDT_SYS386CGT;
	gdp->gd_dpl = SEL_UPL;
	gdp->gd_p = 1;
	gdp->gd_hioffset = x >> 16;

	/* XXX does this work? */
	/* XXX yes! */
	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
	thread0.td_pcb->pcb_ext = 0;
	thread0.td_frame = &proc0_tf;

	cpu_probe_cmpxchg8b();
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

}

void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

static void
f00f_hack(void *unused)
{
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	GIANT_REQUIRED;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
	if (tmp == 0)
		panic("kmem_alloc returned 0");

	/* Put the problematic entry (#6) at the end of the lower page. */
	new_idt = (struct gate_descriptor*)
	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (u_int)new_idt;
	lidt(&r_idt);
	idt = new_idt;
	if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
	    VM_PROT_READ, FALSE) != KERN_SUCCESS)
		panic("vm_map_protect failed");
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_edi = tf->tf_edi;
	pcb->pcb_esi = tf->tf_esi;
	pcb->pcb_ebp = tf->tf_ebp;
	pcb->pcb_ebx = tf->tf_ebx;
	pcb->pcb_eip = tf->tf_eip;
	pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
}
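/*
 * Editor's sketch (illustration only): f00f_hack() above relocates the IDT
 * so the problematic entry #6 is the last descriptor on a read-only page.
 * With 8-byte gate descriptors and 4 KB pages (values assumed here), the
 * arithmetic below shows entry #6 ends exactly on the page boundary.
 */
#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE	4096
#define GATE_SIZE	8	/* sizeof(struct gate_descriptor) on i386 */

int
main(void)
{
	uintptr_t tmp = 0x100000;	/* hypothetical 2-page allocation */
	uintptr_t new_idt = tmp + PAGE_SIZE - 7 * GATE_SIZE;

	/* Entries #0..#6 fill the last 7 slots of the lower page. */
	assert(new_idt + 7 * GATE_SIZE == tmp + PAGE_SIZE);
	return (0);
}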
int
ptrace_set_pc(struct thread *td, u_long addr)
{

	td->td_frame->tf_eip = addr;
	return (0);
}

int
ptrace_single_step(struct thread *td)
{

	td->td_frame->tf_eflags |= PSL_T;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{

	td->td_frame->tf_eflags &= ~PSL_T;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	pcb = td->td_pcb;
	regs->r_gs = pcb->pcb_gs;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	return (0);
}

int
set_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	pcb = td->td_pcb;
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	pcb->pcb_gs = regs->r_gs;
	return (0);
}

#ifdef CPU_ENABLE_SSE
static void
fill_fpregs_xmm(sv_xmm, sv_87)
	struct savexmm *sv_xmm;
	struct save87 *sv_87;
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	bzero(sv_87, sizeof(*sv_87));

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}

static void
set_fpregs_xmm(sv_87, sv_xmm)
	struct save87 *sv_87;
	struct savexmm *sv_xmm;
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
#endif /* CPU_ENABLE_SSE */

int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td),
	    ("not suspended thread %p", td));
#ifdef DEV_NPX
	npxgetregs(td);
#else
	bzero(fpregs, sizeof(*fpregs));
#endif
#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr)
		fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
		    (struct save87 *)fpregs);
	else
#endif /* CPU_ENABLE_SSE */
		bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
		    sizeof(*fpregs));
	return (0);
}

int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr)
		set_fpregs_xmm((struct save87 *)fpregs,
		    &td->td_pcb->pcb_user_save.sv_xmm);
	else
#endif /* CPU_ENABLE_SSE */
		bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
		    sizeof(*fpregs));
#ifdef DEV_NPX
	npxuserinited(td);
#endif
	return (0);
}
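/*
 * Editor's sketch (illustration only): ptrace_single_step() above arms
 * single-stepping by setting PSL_T (the x86 trap flag, bit 8, value assumed
 * here) in the saved eflags; clearing it disarms.  Stand-alone check:
 */
#include <assert.h>
#include <stdint.h>

#define PSL_T	0x00000100	/* trap flag */

int
main(void)
{
	uint32_t eflags = 0x202;	/* typical user eflags */

	eflags |= PSL_T;		/* ptrace_single_step() */
	assert(eflags & PSL_T);
	eflags &= ~PSL_T;		/* ptrace_clear_single_step() */
	assert((eflags & PSL_T) == 0);
	return (0);
}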
/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct trapframe *tp;
	struct segment_descriptor *sdp;

	tp = td->td_frame;

	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_esp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_gs = td->td_pcb->pcb_gs;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_es = tp->tf_es;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_edi = tp->tf_edi;
	mcp->mc_esi = tp->tf_esi;
	mcp->mc_ebp = tp->tf_ebp;
	mcp->mc_isp = tp->tf_isp;
	mcp->mc_eflags = tp->tf_eflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_eax = 0;
		mcp->mc_edx = 0;
		mcp->mc_eflags &= ~PSL_C;
	} else {
		mcp->mc_eax = tp->tf_eax;
		mcp->mc_edx = tp->tf_edx;
	}
	mcp->mc_ebx = tp->tf_ebx;
	mcp->mc_ecx = tp->tf_ecx;
	mcp->mc_eip = tp->tf_eip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_esp = tp->tf_esp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp);
	sdp = &td->td_pcb->pcb_fsd;
	mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
	sdp = &td->td_pcb->pcb_gsd;
	mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
	mcp->mc_flags = 0;
	bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
	struct trapframe *tp;
	int eflags, ret;

	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp))
		return (EINVAL);
	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
	    (tp->tf_eflags & ~PSL_USERCHANGE);
	if ((ret = set_fpcontext(td, mcp)) == 0) {
		tp->tf_fs = mcp->mc_fs;
		tp->tf_es = mcp->mc_es;
		tp->tf_ds = mcp->mc_ds;
		tp->tf_edi = mcp->mc_edi;
		tp->tf_esi = mcp->mc_esi;
		tp->tf_ebp = mcp->mc_ebp;
		tp->tf_ebx = mcp->mc_ebx;
		tp->tf_edx = mcp->mc_edx;
		tp->tf_ecx = mcp->mc_ecx;
		tp->tf_eax = mcp->mc_eax;
		tp->tf_eip = mcp->mc_eip;
		tp->tf_eflags = eflags;
		tp->tf_esp = mcp->mc_esp;
		tp->tf_ss = mcp->mc_ss;
		td->td_pcb->pcb_gs = mcp->mc_gs;
		ret = 0;
	}
	return (ret);
}

static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{

#ifndef DEV_NPX
	mcp->mc_fpformat = _MC_FPFMT_NODEV;
	mcp->mc_ownedfp = _MC_FPOWNED_NONE;
	bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
#else
	mcp->mc_ownedfp = npxgetregs(td);
	bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate,
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = npxformat();
#endif
}

static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
	    mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
#ifdef DEV_NPX
#ifdef CPU_ENABLE_SSE
		if (cpu_fxsr)
			((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
			    en_mxcsr &= cpu_mxcsr_mask;
#endif
		npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
#endif
	} else
		return (EINVAL);
	return (0);
}

static void
fpstate_drop(struct thread *td)
{

	critical_enter();
#ifdef DEV_NPX
	if (PCPU_GET(fpcurthread) == td)
		npxdrop();
#endif
	/*
	 * XXX force a full drop of the npx.  The above only drops it if we
	 * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
	 *
	 * XXX I don't much like npxgetregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of npxgetregs()... perhaps we just
	 * have too many layers.
	 */
	curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
	    PCB_NPXUSERINITDONE);
	critical_exit();
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[4] = 0;
		dbregs->dr[5] = 0;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	return (0);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * result in undefined behaviour and can lead to an unexpected
		 * TRCTRAP.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
		}

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
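/*
 * Editor's sketch (not FreeBSD source): set_dbregs() above rejects undefined
 * dr7 encodings.  The macros below mirror what we understand the kernel's
 * DBREG_DR7_* macros to extract, assuming the usual dr7 layout (enable bits
 * in the low byte, per-breakpoint R/W and LEN fields packed four bits each
 * from bit 16 up); names and layout are stated here for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define DR7_ENABLED(d, i)	(((d) >> ((i) * 2)) & 0x3)
#define DR7_ACCESS(d, i)	(((d) >> (16 + (i) * 4)) & 0x3)
#define DR7_LEN(d, i)		(((d) >> (16 + (i) * 4 + 2)) & 0x3)

int
main(void)
{
	/* Local-enable breakpoint 0 with R/W = 0x2 (undefined on i386). */
	uint32_t dr7 = 0x1 | (0x2 << 16);

	if (DR7_ENABLED(dr7, 0) &&
	    (DR7_ACCESS(dr7, 0) == 0x02 || DR7_LEN(dr7, 0) == 0x02))
		printf("rejected: undefined dr7 encoding\n");
	return (0);
}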
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
	u_int32_t bp;       /* breakpoint bits extracted from dr6 */
	int nbp;            /* number of breakpoints that triggered */
	caddr_t addr[4];    /* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints were hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

Index: stable/9/sys/sys/smp.h
===================================================================
--- stable/9/sys/sys/smp.h	(revision 235795)
+++ stable/9/sys/sys/smp.h	(revision 235796)
@@ -1,185 +1,185 @@
/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD$
 */

#ifndef _SYS_SMP_H_
#define _SYS_SMP_H_

#ifdef _KERNEL

#ifndef LOCORE

#include <sys/cpuset.h>

/*
 * Topology of a NUMA or HTT system.
 *
 * The top level topology is an array of pointers to groups.  Each group
 * contains a bitmask of cpus in its group or subgroups.  It may also
 * contain a pointer to an array of child groups.
 *
 * The bitmasks at non leaf groups may be used by consumers who support
 * a smaller depth than the hardware provides.
 *
 * The topology may be omitted by systems where all CPUs are equal.
 */

struct cpu_group {
	struct cpu_group *cg_parent;	/* Our parent group. */
	struct cpu_group *cg_child;	/* Optional children groups. */
	cpuset_t	cg_mask;	/* Mask of cpus in this group. */
	int32_t		cg_count;	/* Count of cpus in this group. */
	int16_t		cg_children;	/* Number of children groups. */
	int8_t		cg_level;	/* Shared cache level. */
	int8_t		cg_flags;	/* Traversal modifiers. */
};

typedef struct cpu_group *cpu_group_t;

/*
 * Defines common resources for CPUs in the group.  The highest level
 * resource should be used when multiple are shared.
 */
#define	CG_SHARE_NONE	0
#define	CG_SHARE_L1	1
#define	CG_SHARE_L2	2
#define	CG_SHARE_L3	3

/*
 * Behavior modifiers for load balancing and affinity.
 */
#define	CG_FLAG_HTT	0x01		/* Schedule the alternate core last. */
#define	CG_FLAG_SMT	0x02		/* New age htt, less crippled. */
#define	CG_FLAG_THREAD	(CG_FLAG_HTT | CG_FLAG_SMT)	/* Any threading. */
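/*
 * Editor's sketch (not FreeBSD source): a toy picture of how cpu_group
 * nodes can describe one package containing two HTT pairs.  The kernel
 * builds these with the smp_topo_*() helpers below; the simplified
 * toy_group type here is ours, purely to illustrate the parent/mask
 * relationships.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_group {
	struct toy_group *parent;
	uint32_t mask;		/* stand-in for cpuset_t */
	int count;
	int flags;		/* e.g. CG_FLAG_HTT */
};

int
main(void)
{
	struct toy_group top =   { NULL, 0xf, 4, 0 };	 /* cpus 0-3 */
	struct toy_group pair0 = { &top, 0x3, 2, 0x01 }; /* cpus 0,1: HTT */
	struct toy_group pair1 = { &top, 0xc, 2, 0x01 }; /* cpus 2,3: HTT */

	/* The parent's mask covers the union of its children. */
	printf("union ok: %d\n", (pair0.mask | pair1.mask) == top.mask);
	return (0);
}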
/*
 * Convenience routines for building topologies.
 */
#ifdef SMP
struct cpu_group *smp_topo(void);
struct cpu_group *smp_topo_none(void);
struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags);
struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share,
    int l1count, int l1flags);
struct cpu_group *smp_topo_find(struct cpu_group *top, int cpu);

extern void (*cpustop_restartfunc)(void);
extern int smp_active;
extern int smp_cpus;
extern volatile cpuset_t started_cpus;
extern volatile cpuset_t stopped_cpus;
extern cpuset_t hlt_cpus_mask;
extern cpuset_t logical_cpus_mask;
#endif /* SMP */

extern u_int mp_maxid;
extern int mp_maxcpus;
extern int mp_ncpus;
extern volatile int smp_started;

extern cpuset_t all_cpus;

/*
 * Macro allowing us to determine whether a CPU is absent at any given
 * time, thus permitting us to configure sparse maps of cpuid-dependent
 * (per-CPU) structures.
 */
#define	CPU_ABSENT(x_cpu)	(!CPU_ISSET(x_cpu, &all_cpus))

/*
 * Macros to iterate over non-absent CPUs.  CPU_FOREACH() takes an
 * integer iterator and iterates over the available set of CPUs.
 * CPU_FIRST() returns the id of the first non-absent CPU.  CPU_NEXT()
 * returns the id of the next non-absent CPU.  It will wrap back to
 * CPU_FIRST() once the end of the list is reached.  The iterators are
 * currently implemented via inline functions.
 */
#define	CPU_FOREACH(i)							\
	for ((i) = 0; (i) <= mp_maxid; (i)++)				\
		if (!CPU_ABSENT((i)))

static __inline int
cpu_first(void)
{
	int i;

	for (i = 0;; i++)
		if (!CPU_ABSENT(i))
			return (i);
}

static __inline int
cpu_next(int i)
{

	for (;;) {
		i++;
		if (i > mp_maxid)
			i = 0;
		if (!CPU_ABSENT(i))
			return (i);
	}
}

#define	CPU_FIRST()	cpu_first()
#define	CPU_NEXT(i)	cpu_next((i))
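/*
 * Editor's sketch (not FreeBSD source): how the CPU_FOREACH() pattern above
 * behaves with a sparse CPU map.  The mock mask and macros below stand in
 * for all_cpus/CPU_ISSET; with cpus {0, 2, 3} present and mp_maxid = 3,
 * the loop visits 0, 2 and 3, skipping the absent id 1.
 */
#include <stdio.h>

static unsigned mp_maxid = 3;
static unsigned all_cpus = 0xd;			/* cpus 0, 2, 3 */

#define CPU_ABSENT(c)	(!((all_cpus >> (c)) & 1))
#define CPU_FOREACH(i)					\
	for ((i) = 0; (i) <= mp_maxid; (i)++)		\
		if (!CPU_ABSENT((i)))

int
main(void)
{
	unsigned i;

	CPU_FOREACH(i)
		printf("cpu%u\n", i);
	return (0);
}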
#ifdef SMP

/*
 * Machine dependent functions used to initialize MP support.
 *
 * The cpu_mp_probe() should check to see if MP support is present and return
 * zero if it is not or non-zero if it is.  If MP support is present, then
 * cpu_mp_start() will be called so that MP can be enabled.  This function
 * should do things such as startup secondary processors.  It should also
 * setup mp_ncpus, all_cpus, and smp_cpus.  It should also ensure that
 * smp_active and smp_started are initialized at the appropriate time.
 * Once cpu_mp_start() returns, machine independent MP startup code will be
 * executed and a simple message will be output to the console.  Finally,
 * cpu_mp_announce() will be called so that machine dependent messages about
 * the MP support may be output to the console if desired.
 *
 * The cpu_setmaxid() function is called very early during the boot process
 * so that the MD code may set mp_maxid to provide an upper bound on CPU IDs
 * that other subsystems may use.  If a platform is not able to determine
 * the exact maximum ID that early, then it may set mp_maxid to MAXCPU - 1.
 */
struct thread;

struct cpu_group *cpu_topo(void);
void	cpu_mp_announce(void);
int	cpu_mp_probe(void);
void	cpu_mp_setmaxid(void);
void	cpu_mp_start(void);

void	forward_signal(struct thread *);
int	restart_cpus(cpuset_t);
int	stop_cpus(cpuset_t);
int	stop_cpus_hard(cpuset_t);
-#if defined(__amd64__)
+#if defined(__amd64__) || defined(__i386__)
int	suspend_cpus(cpuset_t);
#endif
void	smp_rendezvous_action(void);
extern	struct mtx smp_ipi_mtx;

#endif /* SMP */

void	smp_no_rendevous_barrier(void *);
void	smp_rendezvous(void (*)(void *),
		       void (*)(void *),
		       void (*)(void *),
		       void *arg);
void	smp_rendezvous_cpus(cpuset_t,
		       void (*)(void *),
		       void (*)(void *),
		       void (*)(void *),
		       void *arg);
#endif /* !LOCORE */
#endif /* _KERNEL */
#endif /* _SYS_SMP_H_ */

Index: stable/9/sys
===================================================================
--- stable/9/sys	(revision 235795)
+++ stable/9/sys	(revision 235796)

Property changes on: stable/9/sys
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys:r235622,235639,235683