Index: head/sys/amd64/amd64/apic_vector.S
===================================================================
--- head/sys/amd64/amd64/apic_vector.S	(revision 315958)
+++ head/sys/amd64/amd64/apic_vector.S	(revision 315959)
@@ -1,331 +1,309 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: vector.s, 386BSD 0.1 unknown origin
  * $FreeBSD$
  */
 
 /*
  * Interrupt entry points for external interrupts triggered by I/O APICs
  * as well as IPI handlers.
  */
 
 #include "opt_smp.h"
 
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
 #include "assym.s"
 
 #ifdef SMP
 #define LK	lock ;
 #else
 #define LK
 #endif
 
-	.text
-	SUPERALIGN_TEXT
-	/* End Of Interrupt to APIC */
-as_lapic_eoi:
-	cmpl	$0,x2apic_mode
-	jne	1f
-	movq	lapic_map,%rax
-	movl	$0,LA_EOI(%rax)
-	ret
-1:
-	movl	$MSR_APIC_EOI,%ecx
-	xorl	%eax,%eax
-	xorl	%edx,%edx
-	wrmsr
-	ret
-
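
The SMP IPI handlers below now pass the vector being acknowledged to native_lapic_eoi in %edi (the first integer argument register in the amd64 calling convention) instead of calling the as_lapic_eoi stub removed above. For orientation, a minimal C sketch of the EOI logic that stub performed, assuming the same kernel globals it referenced (x2apic_mode, lapic_map); the real native_lapic_eoi in the local APIC driver may differ:

	/* Sketch only; mirrors the removed assembly above. */
	static void
	sketch_lapic_eoi(void)
	{

		if (x2apic_mode) {
			/* x2APIC mode: EOI is a WRMSR of zero. */
			wrmsr(MSR_APIC_EOI, 0);
		} else {
			/* xAPIC mode: store zero to the memory-mapped EOI register. */
			*(volatile uint32_t *)((char *)lapic_map +
			    LAPIC_EOI * LAPIC_MEM_MUL) = 0;
		}
	}
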
 /*
  * I/O Interrupt Entry Point.  Rather than having one entry point for
  * each interrupt source, we use one entry point for each 32-bit word
  * in the ISR.  The handler determines the highest bit set in the ISR,
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
 #define	ISR_VEC(index, vec_name)					\
 	.text ;								\
 	SUPERALIGN_TEXT ;						\
 IDTVEC(vec_name) ;							\
 	PUSH_FRAME ;							\
 	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
 	cmpl	$0,x2apic_mode ;					\
 	je	1f ;							\
 	movl	$(MSR_APIC_ISR0 + index),%ecx ;				\
 	rdmsr ;								\
 	jmp	2f ;							\
 1: ;									\
 	movq	lapic_map, %rdx ;	/* pointer to local APIC */	\
 	movl	LA_ISR + 16 * (index)(%rdx), %eax ;	/* load ISR */	\
 2: ;									\
 	bsrl	%eax, %eax ;	/* index of highest set bit in ISR */	\
 	jz	3f ;							\
 	addl	$(32 * index),%eax ;					\
 	movq	%rsp, %rsi	;                                       \
 	movl	%eax, %edi ;	/* pass the IRQ */			\
 	call	lapic_handle_intr ;					\
 3: ;									\
 	MEXITCOUNT ;							\
 	jmp	doreti
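
Each ISR_VEC(index, vec_name) expansion above therefore services 32 vectors through a single entry point. A minimal C sketch of the same computation, assuming the kernel globals the macro uses (x2apic_mode, lapic_map) and the existing lapic_handle_intr() entry point; it is illustrative only, the real work is done by the assembly:

	static void
	sketch_isr_vec(int index, struct trapframe *frame)
	{
		uint32_t isr;
		int vector;

		if (x2apic_mode)
			isr = (uint32_t)rdmsr(MSR_APIC_ISR0 + index);
		else
			isr = *(volatile uint32_t *)((char *)lapic_map +
			    LAPIC_ISR0 * LAPIC_MEM_MUL + 16 * index);
		if (isr != 0) {
			/* fls() is 1-based, so fls(isr) - 1 matches bsrl. */
			vector = 32 * index + fls(isr) - 1;
			lapic_handle_intr(vector, frame);
		}
	}
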
 
 /*
  * Handle "spurious INTerrupts".
  * Notes:
  *  This is different than the "spurious INTerrupt" generated by an
  *   8259 PIC for missing INTs.  See the APIC documentation for details.
  *  This routine should NOT do an 'EOI' cycle.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
 
 	/* No EOI cycle used here */
 
 	jmp	doreti_iret
 
 	ISR_VEC(1, apic_isr1)
 	ISR_VEC(2, apic_isr2)
 	ISR_VEC(3, apic_isr3)
 	ISR_VEC(4, apic_isr4)
 	ISR_VEC(5, apic_isr5)
 	ISR_VEC(6, apic_isr6)
 	ISR_VEC(7, apic_isr7)
 
 /*
  * Local APIC periodic timer handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(timerint)
 	PUSH_FRAME
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	lapic_handle_timer
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC CMCI handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cmcint)
 	PUSH_FRAME
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_cmc
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC error interrupt handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(errorint)
 	PUSH_FRAME
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_error
 	MEXITCOUNT
 	jmp	doreti
 
 #ifdef XENHVM
 /*
  * Xen event channel upcall interrupt handler.
  * Only used when the hypervisor supports direct vector callbacks.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(xen_intr_upcall)
 	PUSH_FRAME
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	xen_intr_handle_upcall
 	MEXITCOUNT
 	jmp	doreti
 #endif
 
 #ifdef SMP
 /*
  * Global address space TLB shootdown.
  */
 	.text
 
 	SUPERALIGN_TEXT
 invltlb_ret:
-	call	as_lapic_eoi
+	call	native_lapic_eoi
 	POP_FRAME
 	jmp	doreti_iret
 
 	SUPERALIGN_TEXT
 IDTVEC(invltlb)
 	PUSH_FRAME
 
 	call	invltlb_handler
+	movl	$IPI_INVLTLB, %edi
 	jmp	invltlb_ret
 
 IDTVEC(invltlb_pcid)
 	PUSH_FRAME
 
 	call	invltlb_pcid_handler
+	movl	$IPI_INVLTLB, %edi
 	jmp	invltlb_ret
 
 IDTVEC(invltlb_invpcid)
 	PUSH_FRAME
 
 	call	invltlb_invpcid_handler
+	movl	$IPI_INVLTLB, %edi
 	jmp	invltlb_ret
 
 /*
  * Single page TLB shootdown
  */
 	.text
 
 	SUPERALIGN_TEXT
 IDTVEC(invlpg)
 	PUSH_FRAME
 
 	call	invlpg_handler
+	movl	$IPI_INVLPG, %edi
 	jmp	invltlb_ret
 
 /*
  * Page range TLB shootdown.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlrng)
 	PUSH_FRAME
 
 	call	invlrng_handler
+	movl	$IPI_INVLRNG, %edi
 	jmp	invltlb_ret
 
 /*
  * Invalidate cache.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlcache)
 	PUSH_FRAME
 
 	call	invlcache_handler
+	movl	$IPI_INVLCACHE, %edi
 	jmp	invltlb_ret
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(ipi_intr_bitmap_handler)		
 	PUSH_FRAME
 
-	call	as_lapic_eoi
+	movl	$IPI_BITMAP_VECTOR, %edi
+	call	native_lapic_eoi
 	
 	FAKE_MCOUNT(TF_RIP(%rsp))
 
 	call	ipi_bitmap_handler
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cpustop)
 	PUSH_FRAME
 
-	call	as_lapic_eoi
+	movl	$IPI_STOP, %edi
+	call	native_lapic_eoi
 
 	call	cpustop_handler
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cpususpend)
 	PUSH_FRAME
 
 	call	cpususpend_handler
-	call	as_lapic_eoi
+	movl	$IPI_SUSPEND, %edi
+	call	native_lapic_eoi
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
  *
  * - Calls the generic rendezvous action function.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(rendezvous)
 	PUSH_FRAME
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movq	ipi_rendezvous_counts(,%rax,8), %rax
 	incq	(%rax)
 #endif
 	call	smp_rendezvous_action
-	call	as_lapic_eoi
+	movl	$IPI_RENDEZVOUS, %edi
+	call	native_lapic_eoi
 	jmp	doreti
 
 /*
  * IPI handler whose purpose is to interrupt the CPU with minimum overhead.
  * This is used by bhyve to force a host cpu executing in guest context to
  * trap into the hypervisor.
- *
- * This handler is different from other IPI handlers in the following aspects:
- *
- * 1. It doesn't push a trapframe on the stack.
- *
- * This implies that a DDB backtrace involving 'justreturn' will skip the
- * function that was interrupted by this handler.
- *
- * 2. It doesn't 'swapgs' when userspace is interrupted.
- *
- * The 'justreturn' handler does not access any pcpu data so it is not an
- * issue. Moreover the 'justreturn' handler can only be interrupted by an NMI
- * whose handler already doesn't trust GS.base when kernel code is interrupted.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(justreturn)
-	pushq	%rax
-	pushq	%rcx
-	pushq	%rdx
-	call	as_lapic_eoi
-	popq	%rdx
-	popq	%rcx
-	popq	%rax
+	PUSH_FRAME
+	movl	vmm_ipinum, %edi
+	call	native_lapic_eoi
+	POP_FRAME
 	jmp	doreti_iret
 
 #endif /* SMP */
Index: head/sys/amd64/amd64/genassym.c
===================================================================
--- head/sys/amd64/amd64/genassym.c	(revision 315958)
+++ head/sys/amd64/amd64/genassym.c	(revision 315959)
@@ -1,237 +1,249 @@
 /*-
  * Copyright (c) 1982, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)genassym.c	5.11 (Berkeley) 5/10/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_kstack_pages.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/assym.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
+#include <sys/bus.h>
 #include <sys/proc.h>
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 #include <sys/errno.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/socket.h>
 #include <sys/resourcevar.h>
 #include <sys/ucontext.h>
 #include <machine/tss.h>
 #include <sys/vmmeter.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/proc.h>
+#include <machine/intr_machdep.h>
 #include <x86/apicreg.h>
+#include <x86/apicvar.h>
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/sigframe.h>
 #include <machine/proc.h>
 #include <machine/segments.h>
 
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
 ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
 
 ASSYM(P_MD, offsetof(struct proc, p_md));
 ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
 ASSYM(MD_LDT_SD, offsetof(struct mdproc, md_ldt_sd));
 
 ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
 ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
 ASSYM(TD_PCB, offsetof(struct thread, td_pcb));
 ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags));
 ASSYM(TD_PROC, offsetof(struct thread, td_proc));
 ASSYM(TD_TID, offsetof(struct thread, td_tid));
 ASSYM(TD_FRAME, offsetof(struct thread, td_frame));
 
 ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
 ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
 
 ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN);
 ASSYM(TDP_KTHREAD, TDP_KTHREAD);
 
 ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap));
 ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall));
 ASSYM(V_INTR, offsetof(struct vmmeter, v_intr));
 ASSYM(PAGE_SIZE, PAGE_SIZE);
 ASSYM(NPTEPG, NPTEPG);
 ASSYM(NPDEPG, NPDEPG);
 ASSYM(addr_PTmap, addr_PTmap);
 ASSYM(addr_PDmap, addr_PDmap);
 ASSYM(addr_PDPmap, addr_PDPmap);
 ASSYM(addr_PML4map, addr_PML4map);
 ASSYM(addr_PML4pml4e, addr_PML4pml4e);
 ASSYM(PDESIZE, sizeof(pd_entry_t));
 ASSYM(PTESIZE, sizeof(pt_entry_t));
 ASSYM(PAGE_SHIFT, PAGE_SHIFT);
 ASSYM(PAGE_MASK, PAGE_MASK);
 ASSYM(PDRSHIFT, PDRSHIFT);
 ASSYM(PDPSHIFT, PDPSHIFT);
 ASSYM(PML4SHIFT, PML4SHIFT);
 ASSYM(val_KPDPI, KPDPI);
 ASSYM(val_KPML4I, KPML4I);
 ASSYM(val_PML4PML4I, PML4PML4I);
 ASSYM(USRSTACK, USRSTACK);
 ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS);
 ASSYM(KERNBASE, KERNBASE);
 ASSYM(DMAP_MIN_ADDRESS, DMAP_MIN_ADDRESS);
 ASSYM(DMAP_MAX_ADDRESS, DMAP_MAX_ADDRESS);
 ASSYM(MCLBYTES, MCLBYTES);
 
 ASSYM(PCB_R15, offsetof(struct pcb, pcb_r15));
 ASSYM(PCB_R14, offsetof(struct pcb, pcb_r14));
 ASSYM(PCB_R13, offsetof(struct pcb, pcb_r13));
 ASSYM(PCB_R12, offsetof(struct pcb, pcb_r12));
 ASSYM(PCB_RBP, offsetof(struct pcb, pcb_rbp));
 ASSYM(PCB_RSP, offsetof(struct pcb, pcb_rsp));
 ASSYM(PCB_RBX, offsetof(struct pcb, pcb_rbx));
 ASSYM(PCB_RIP, offsetof(struct pcb, pcb_rip));
 ASSYM(PCB_FSBASE, offsetof(struct pcb, pcb_fsbase));
 ASSYM(PCB_GSBASE, offsetof(struct pcb, pcb_gsbase));
 ASSYM(PCB_KGSBASE, offsetof(struct pcb, pcb_kgsbase));
 ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0));
 ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2));
 ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3));
 ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4));
 ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0));
 ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1));
 ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2));
 ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3));
 ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6));
 ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7));
 ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt));
 ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt));
 ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt));
 ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
 ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
 ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp));
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
 ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer));
 ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star));
 ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar));
 ASSYM(PCB_CSTAR, offsetof(struct pcb, pcb_cstar));
 ASSYM(PCB_SFMASK, offsetof(struct pcb, pcb_sfmask));
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_FULL_IRET, PCB_FULL_IRET);
 ASSYM(PCB_DBREGS, PCB_DBREGS);
 ASSYM(PCB_32BIT, PCB_32BIT);
 
 ASSYM(COMMON_TSS_RSP0, offsetof(struct amd64tss, tss_rsp0));
 
 ASSYM(TF_R15, offsetof(struct trapframe, tf_r15));
 ASSYM(TF_R14, offsetof(struct trapframe, tf_r14));
 ASSYM(TF_R13, offsetof(struct trapframe, tf_r13));
 ASSYM(TF_R12, offsetof(struct trapframe, tf_r12));
 ASSYM(TF_R11, offsetof(struct trapframe, tf_r11));
 ASSYM(TF_R10, offsetof(struct trapframe, tf_r10));
 ASSYM(TF_R9, offsetof(struct trapframe, tf_r9));
 ASSYM(TF_R8, offsetof(struct trapframe, tf_r8));
 ASSYM(TF_RDI, offsetof(struct trapframe, tf_rdi));
 ASSYM(TF_RSI, offsetof(struct trapframe, tf_rsi));
 ASSYM(TF_RBP, offsetof(struct trapframe, tf_rbp));
 ASSYM(TF_RBX, offsetof(struct trapframe, tf_rbx));
 ASSYM(TF_RDX, offsetof(struct trapframe, tf_rdx));
 ASSYM(TF_RCX, offsetof(struct trapframe, tf_rcx));
 ASSYM(TF_RAX, offsetof(struct trapframe, tf_rax));
 ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno));
 ASSYM(TF_ADDR, offsetof(struct trapframe, tf_addr));
 ASSYM(TF_ERR, offsetof(struct trapframe, tf_err));
 ASSYM(TF_RIP, offsetof(struct trapframe, tf_rip));
 ASSYM(TF_CS, offsetof(struct trapframe, tf_cs));
 ASSYM(TF_RFLAGS, offsetof(struct trapframe, tf_rflags));
 ASSYM(TF_RSP, offsetof(struct trapframe, tf_rsp));
 ASSYM(TF_SS, offsetof(struct trapframe, tf_ss));
 ASSYM(TF_DS, offsetof(struct trapframe, tf_ds));
 ASSYM(TF_ES, offsetof(struct trapframe, tf_es));
 ASSYM(TF_FS, offsetof(struct trapframe, tf_fs));
 ASSYM(TF_GS, offsetof(struct trapframe, tf_gs));
 ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags));
 ASSYM(TF_SIZE, sizeof(struct trapframe));
 ASSYM(TF_HASSEGS, TF_HASSEGS);
 
 ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
 ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
 ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags));
 ASSYM(ENOENT, ENOENT);
 ASSYM(EFAULT, EFAULT);
 ASSYM(ENAMETOOLONG, ENAMETOOLONG);
 ASSYM(MAXCOMLEN, MAXCOMLEN);
 ASSYM(MAXPATHLEN, MAXPATHLEN);
 ASSYM(PC_SIZEOF, sizeof(struct pcpu));
 ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace));
 ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
 ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread));
 ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread));
 ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
 ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
 ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp));
 ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
 ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp));
 ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0));
 ASSYM(PC_FS32P, offsetof(struct pcpu, pc_fs32p));
 ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p));
 ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
 ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
 ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
 ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
- 
+
 ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
 ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL);
+
+ASSYM(IPI_INVLTLB, IPI_INVLTLB);
+ASSYM(IPI_INVLPG, IPI_INVLPG);
+ASSYM(IPI_INVLRNG, IPI_INVLRNG);
+ASSYM(IPI_INVLCACHE, IPI_INVLCACHE);
+ASSYM(IPI_BITMAP_VECTOR, IPI_BITMAP_VECTOR);
+ASSYM(IPI_STOP, IPI_STOP);
+ASSYM(IPI_SUSPEND, IPI_SUSPEND);
+ASSYM(IPI_RENDEZVOUS, IPI_RENDEZVOUS);
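
These constants are what let the handlers in apic_vector.S above load the vector with instructions such as "movl $IPI_INVLTLB, %edi" before calling native_lapic_eoi: at build time genassym.c is compiled and post-processed into assym.s, which the .S files include. Roughly (the exact generated form may differ), each ASSYM() entry ends up as a preprocessor definition along the lines of:

	#define	IPI_INVLTLB	0x...	/* the numeric value of IPI_INVLTLB */
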
 
 ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL));
 ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL));
 ASSYM(KUCSEL, GSEL(GUCODE_SEL, SEL_UPL));
 ASSYM(KUDSEL, GSEL(GUDATA_SEL, SEL_UPL));
 ASSYM(KUC32SEL, GSEL(GUCODE32_SEL, SEL_UPL));
 ASSYM(KUF32SEL, GSEL(GUFS32_SEL, SEL_UPL));
 ASSYM(KUG32SEL, GSEL(GUGS32_SEL, SEL_UPL));
 ASSYM(TSSSEL, GSEL(GPROC0_SEL, SEL_KPL));
 ASSYM(LDTSEL, GSEL(GUSERLDT_SEL, SEL_KPL));
 ASSYM(SEL_RPL_MASK, SEL_RPL_MASK);
 
 ASSYM(__FreeBSD_version, __FreeBSD_version);
 
 #ifdef	HWPMC_HOOKS
 ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN);
 #endif
Index: head/sys/amd64/vmm/amd/svm.c
===================================================================
--- head/sys/amd64/vmm/amd/svm.c	(revision 315958)
+++ head/sys/amd64/vmm/amd/svm.c	(revision 315959)
@@ -1,2246 +1,2247 @@
 /*-
  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/cpufunc.h>
 #include <machine/psl.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <machine/smp.h>
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
 
+#include "vmm_host.h"
 #include "vmm_lapic.h"
 #include "vmm_stat.h"
 #include "vmm_ktr.h"
 #include "vmm_ioport.h"
 #include "vatpic.h"
 #include "vlapic.h"
 #include "vlapic_priv.h"
 
 #include "x86.h"
 #include "vmcb.h"
 #include "svm.h"
 #include "svm_softc.h"
 #include "svm_msr.h"
 #include "npt.h"
 
 SYSCTL_DECL(_hw_vmm);
 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);
 
 /*
  * SVM CPUID function 0x8000_000A, edx bit decoding.
  */
 #define AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
 #define AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
 #define AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
 #define AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
 #define AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
 #define AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
 #define AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
 #define AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
 #define AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
 #define AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
 #define	AMD_CPUID_SVM_AVIC		BIT(13)	/* AVIC present */
 
 #define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID 	|	\
 				VMCB_CACHE_IOPM		|	\
 				VMCB_CACHE_I		|	\
 				VMCB_CACHE_TPR		|	\
 				VMCB_CACHE_CR2		|	\
 				VMCB_CACHE_CR		|	\
 				VMCB_CACHE_DT		|	\
 				VMCB_CACHE_SEG		|	\
 				VMCB_CACHE_NP)
 
 static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
     0, NULL);
 
 static MALLOC_DEFINE(M_SVM, "svm", "svm");
 static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
 
 /* Per-CPU context area. */
 extern struct pcpu __pcpu[];
 
 static uint32_t svm_feature = ~0U;	/* AMD SVM features. */
 SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0,
     "SVM features advertised by CPUID.8000000AH:EDX");
 
 static int disable_npf_assist;
 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
     &disable_npf_assist, 0, NULL);
 
 /* Maximum ASIDs supported by the processor */
 static uint32_t nasid;
 SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0,
     "Number of ASIDs supported by this processor");
 
 /* Current ASID generation for each host cpu */
 static struct asid asid[MAXCPU];
 
 /* 
  * SVM host state saved area of size 4KB for each core.
  */
 static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
 
 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
 
 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
 
 static __inline int
 flush_by_asid(void)
 {
 
 	return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
 }
 
 static __inline int
 decode_assist(void)
 {
 
 	return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
 }
 
 static void
 svm_disable(void *arg __unused)
 {
 	uint64_t efer;
 
 	efer = rdmsr(MSR_EFER);
 	efer &= ~EFER_SVM;
 	wrmsr(MSR_EFER, efer);
 }
 
 /*
  * Disable SVM on all CPUs.
  */
 static int
 svm_cleanup(void)
 {
 
 	smp_rendezvous(NULL, svm_disable, NULL, NULL);
 	return (0);
 }
 
 /*
  * Verify that all the features required by bhyve are available.
  */
 static int
 check_svm_features(void)
 {
 	u_int regs[4];
 
 	/* CPUID Fn8000_000A is for SVM */
 	do_cpuid(0x8000000A, regs);
 	svm_feature &= regs[3];
 
 	/*
 	 * The number of ASIDs can be configured to be less than what is
 	 * supported by the hardware but not more.
 	 */
 	if (nasid == 0 || nasid > regs[1])
 		nasid = regs[1];
 	KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));
 
 	/* bhyve requires the Nested Paging feature */
 	if (!(svm_feature & AMD_CPUID_SVM_NP)) {
 		printf("SVM: Nested Paging feature not available.\n");
 		return (ENXIO);
 	}
 
 	/* bhyve requires the NRIP Save feature */
 	if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) {
 		printf("SVM: NRIP Save feature not available.\n");
 		return (ENXIO);
 	}
 
 	return (0);
 }
 
 static void
 svm_enable(void *arg __unused)
 {
 	uint64_t efer;
 
 	efer = rdmsr(MSR_EFER);
 	efer |= EFER_SVM;
 	wrmsr(MSR_EFER, efer);
 
 	wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
 }
 
 /*
  * Return 1 if SVM is enabled on this processor and 0 otherwise.
  */
 static int
 svm_available(void)
 {
 	uint64_t msr;
 
 	/* Section 15.4 Enabling SVM from APM2. */
 	if ((amd_feature2 & AMDID2_SVM) == 0) {
 		printf("SVM: not available.\n");
 		return (0);
 	}
 
 	msr = rdmsr(MSR_VM_CR);
 	if ((msr & VM_CR_SVMDIS) != 0) {
 		printf("SVM: disabled by BIOS.\n");
 		return (0);
 	}
 
 	return (1);
 }
 
 static int
 svm_init(int ipinum)
 {
 	int error, cpu;
 
 	if (!svm_available())
 		return (ENXIO);
 
 	error = check_svm_features();
 	if (error)
 		return (error);
 
 	vmcb_clean &= VMCB_CACHE_DEFAULT;
 
 	for (cpu = 0; cpu < MAXCPU; cpu++) {
 		/*
 		 * Initialize the host ASIDs to their "highest" valid values.
 		 *
 		 * The next ASID allocation will roll over both 'gen' and 'num'
 		 * and start off the sequence at {1,1}.
 		 */
 		asid[cpu].gen = ~0UL;
 		asid[cpu].num = nasid - 1;
 	}
 
 	svm_msr_init();
 	svm_npt_init(ipinum);
 
 	/* Enable SVM on all CPUs */
 	smp_rendezvous(NULL, svm_enable, NULL, NULL);
 
 	return (0);
 }
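
The per-cpu ASID state initialized above (gen = ~0UL, num = nasid - 1) is chosen so that the first allocation on each host cpu rolls over to {1,1}, as the comment in the loop notes. A hedged sketch of an allocator consistent with that comment (the real allocation happens later, on the VMRUN path, and may differ in detail):

	/* Sketch only; ASID 0 is reserved for the host, so 'num' restarts at 1. */
	static void
	sketch_asid_alloc(struct asid *a)
	{

		if (++a->num >= nasid) {
			a->num = 1;
			if (++a->gen == 0)	/* generation counter wrapped */
				a->gen = 1;
			/* A full TLB/ASID flush would be needed here. */
		}
	}
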
 
 static void
 svm_restore(void)
 {
 
 	svm_enable(NULL);
 }		
 
 /* Pentium compatible MSRs */
 #define MSR_PENTIUM_START 	0	
 #define MSR_PENTIUM_END 	0x1FFF
 /* AMD 6th generation and Intel compatible MSRs */
 #define MSR_AMD6TH_START 	0xC0000000UL	
 #define MSR_AMD6TH_END 		0xC0001FFFUL	
 /* AMD 7th and 8th generation compatible MSRs */
 #define MSR_AMD7TH_START 	0xC0010000UL	
 #define MSR_AMD7TH_END 		0xC0011FFFUL	
 
 /*
  * Get the index and bit position for an MSR in the permission bitmap.
  * Two bits are used for each MSR: lower bit for read and higher bit for write.
  */
 static int
 svm_msr_index(uint64_t msr, int *index, int *bit)
 {
 	uint32_t base, off;
 
 	*index = -1;
 	*bit = (msr % 4) * 2;
 	base = 0;
 
 	if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
 		*index = msr / 4;
 		return (0);
 	}
 
 	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); 
 	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
 		off = (msr - MSR_AMD6TH_START); 
 		*index = (off + base) / 4;
 		return (0);
 	} 
 
 	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
 	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
 		off = (msr - MSR_AMD7TH_START);
 		*index = (off + base) / 4;
 		return (0);
 	}
 
 	return (EINVAL);
 }
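
To make the bitmap layout concrete, a worked example for MSR_EFER (0xC0000080), which falls in the MSR_AMD6TH_START range handled above; the arithmetic follows directly from the code:

	/*
	 *	off   = 0xC0000080 - MSR_AMD6TH_START            = 0x80
	 *	base  = MSR_PENTIUM_END - MSR_PENTIUM_START + 1  = 0x2000
	 *	index = (0x80 + 0x2000) / 4                      = 0x820
	 *	bit   = (0xC0000080 % 4) * 2                     = 0
	 *
	 * Clearing bit 0 of perm_bitmap[0x820] permits guest reads of EFER and
	 * clearing bit 1 would permit writes; svm_vminit() below only calls
	 * svm_msr_rd_ok() for MSR_EFER, so writes keep trapping into bhyve.
	 */
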
 
 /*
  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
  */
 static void
 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
 {
 	int index, bit, error;
 
 	error = svm_msr_index(msr, &index, &bit);
 	KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr));
 	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
 	    ("%s: invalid index %d for msr %#lx", __func__, index, msr));
 	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
 	    "msr %#lx", __func__, bit, msr));
 
 	if (read)
 		perm_bitmap[index] &= ~(1UL << bit);
 
 	if (write)
 		perm_bitmap[index] &= ~(2UL << bit);
 }
 
 static void
 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
 {
 
 	svm_msr_perm(perm_bitmap, msr, true, true);
 }
 
 static void
 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
 {
 
 	svm_msr_perm(perm_bitmap, msr, true, false);
 }
 
 static __inline int
 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
 {
 	struct vmcb_ctrl *ctrl;
 
 	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
 }
 
 static __inline void
 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
     int enabled)
 {
 	struct vmcb_ctrl *ctrl;
 	uint32_t oldval;
 
 	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 	oldval = ctrl->intercept[idx];
 
 	if (enabled)
 		ctrl->intercept[idx] |= bitmask;
 	else
 		ctrl->intercept[idx] &= ~bitmask;
 
 	if (ctrl->intercept[idx] != oldval) {
 		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
 		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
 		    "from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
 	}
 }
 
 static __inline void
 svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
 {
 
 	svm_set_intercept(sc, vcpu, off, bitmask, 0);
 }
 
 static __inline void
 svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
 {
 
 	svm_set_intercept(sc, vcpu, off, bitmask, 1);
 }
 
 static void
 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
     uint64_t msrpm_base_pa, uint64_t np_pml4)
 {
 	struct vmcb_ctrl *ctrl;
 	struct vmcb_state *state;
 	uint32_t mask;
 	int n;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 	state = svm_get_vmcb_state(sc, vcpu);
 
 	ctrl->iopm_base_pa = iopm_base_pa;
 	ctrl->msrpm_base_pa = msrpm_base_pa;
 
 	/* Enable nested paging */
 	ctrl->np_enable = 1;
 	ctrl->n_cr3 = np_pml4;
 
 	/*
 	 * Intercept accesses to the control registers that are not shadowed
 	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
 	 */
 	for (n = 0; n < 16; n++) {
 		mask = (BIT(n) << 16) | BIT(n);
 		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
 			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
 		else
 			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
 	}
 
 
 	/*
 	 * Intercept everything when tracing guest exceptions, otherwise
 	 * just intercept the machine check exception.
 	 */
 	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
 		for (n = 0; n < 32; n++) {
 			/*
 			 * Skip unimplemented vectors in the exception bitmap.
 			 */
 			if (n == 2 || n == 9) {
 				continue;
 			}
 			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
 		}
 	} else {
 		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
 	}
 
 	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 	    VMCB_INTCPT_FERR_FREEZE);
 
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
 
 	/*
 	 * From the section "Canonicalization and Consistency Checks" in APMv2,
 	 * the VMRUN intercept bit must be set to pass the consistency check.
 	 */
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
 
 	/*
 	 * The ASID will be set to a non-zero value just before VMRUN.
 	 */
 	ctrl->asid = 0;
 
 	/*
 	 * Section 15.21.1, Interrupt Masking in EFLAGS
 	 * Section 15.21.2, Virtualizing APIC.TPR
 	 *
 	 * This must be set for %rflags and %cr8 isolation of guest and host.
 	 */
 	ctrl->v_intr_masking = 1;
 
 	/* Enable Last Branch Record aka LBR for debugging */
 	ctrl->lbr_virt_en = 1;
 	state->dbgctl = BIT(0);
 
 	/* EFER_SVM must always be set when the guest is executing */
 	state->efer = EFER_SVM;
 
 	/* Set up the PAT to power-on state */
 	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
 	    PAT_VALUE(2, PAT_UNCACHED)		|
 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
 	    PAT_VALUE(6, PAT_UNCACHED)		|
 	    PAT_VALUE(7, PAT_UNCACHEABLE);
 }
 
 /*
  * Initialize a virtual machine.
  */
 static void *
 svm_vminit(struct vm *vm, pmap_t pmap)
 {
 	struct svm_softc *svm_sc;
 	struct svm_vcpu *vcpu;
 	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
 	int i;
 
 	svm_sc = contigmalloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO,
 	    0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
 	svm_sc->vm = vm;
 	svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
 
 	/*
 	 * Intercept read and write accesses to all MSRs.
 	 */
 	memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap));
 
 	/*
 	 * Access to the following MSRs is redirected to the VMCB when the
 	 * guest is executing. Therefore it is safe to allow the guest to
 	 * read/write these MSRs directly without hypervisor involvement.
 	 */
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
 
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
 
 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
 
 	/*
 	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
 	 */
 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
 
 	/* Intercept access to all I/O ports. */
 	memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap));
 
 	iopm_pa = vtophys(svm_sc->iopm_bitmap);
 	msrpm_pa = vtophys(svm_sc->msr_bitmap);
 	pml4_pa = svm_sc->nptp;
 	for (i = 0; i < VM_MAXCPU; i++) {
 		vcpu = svm_get_vcpu(svm_sc, i);
 		vcpu->nextrip = ~0;
 		vcpu->lastcpu = NOCPU;
 		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
 		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
 		svm_msr_guest_init(svm_sc, i);
 	}
 	return (svm_sc);
 }
 
 /*
  * Collateral for a generic SVM VM-exit.
  */
 static void
 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
 {
 
 	vme->exitcode = VM_EXITCODE_SVM;
 	vme->u.svm.exitcode = code;
 	vme->u.svm.exitinfo1 = info1;
 	vme->u.svm.exitinfo2 = info2;
 }
 
 static int
 svm_cpl(struct vmcb_state *state)
 {
 
 	/*
 	 * From APMv2:
 	 *   "Retrieve the CPL from the CPL field in the VMCB, not
 	 *    from any segment DPL"
 	 */
 	return (state->cpl);
 }
 
 static enum vm_cpu_mode
 svm_vcpu_mode(struct vmcb *vmcb)
 {
 	struct vmcb_segment seg;
 	struct vmcb_state *state;
 	int error;
 
 	state = &vmcb->state;
 
 	if (state->efer & EFER_LMA) {
 		error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
 		KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__,
 		    error));
 
 		/*
 		 * Section 4.8.1 of APM2: check if the Code Segment has the
 		 * Long attribute set in its descriptor.
 		 */
 		if (seg.attrib & VMCB_CS_ATTRIB_L)
 			return (CPU_MODE_64BIT);
 		else
 			return (CPU_MODE_COMPATIBILITY);
 	} else  if (state->cr0 & CR0_PE) {
 		return (CPU_MODE_PROTECTED);
 	} else {
 		return (CPU_MODE_REAL);
 	}
 }
 
 static enum vm_paging_mode
 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
 {
 
 	if ((cr0 & CR0_PG) == 0)
 		return (PAGING_MODE_FLAT);
 	if ((cr4 & CR4_PAE) == 0)
 		return (PAGING_MODE_32);
 	if (efer & EFER_LME)
 		return (PAGING_MODE_64);
 	else
 		return (PAGING_MODE_PAE);
 }
 
 /*
  * ins/outs utility routines
  */
 static uint64_t
 svm_inout_str_index(struct svm_regctx *regs, int in)
 {
 	uint64_t val;
 
 	val = in ? regs->sctx_rdi : regs->sctx_rsi;
 
 	return (val);
 }
 
 static uint64_t
 svm_inout_str_count(struct svm_regctx *regs, int rep)
 {
 	uint64_t val;
 
 	val = rep ? regs->sctx_rcx : 1;
 
 	return (val);
 }
 
 static void
 svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
     int in, struct vm_inout_str *vis)
 {
 	int error, s;
 
 	if (in) {
 		vis->seg_name = VM_REG_GUEST_ES;
 	} else {
 		/* The segment field has standard encoding */
 		s = (info1 >> 10) & 0x7;
 		vis->seg_name = vm_segment_name(s);
 	}
 
 	error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
 	KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
 }
 
 static int
 svm_inout_str_addrsize(uint64_t info1)
 {
         uint32_t size;
 
         size = (info1 >> 7) & 0x7;
         switch (size) {
         case 1:
                 return (2);     /* 16 bit */
         case 2:
                 return (4);     /* 32 bit */
         case 4:
                 return (8);     /* 64 bit */
         default:
                 panic("%s: invalid size encoding %d", __func__, size);
         }
 }
 
 static void
 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
 {
 	struct vmcb_state *state;
 
 	state = &vmcb->state;
 	paging->cr3 = state->cr3;
 	paging->cpl = svm_cpl(state);
 	paging->cpu_mode = svm_vcpu_mode(vmcb);
 	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
 	    state->efer);
 }
 
 #define	UNHANDLED 0
 
 /*
  * Handle guest I/O intercept.
  */
 static int
 svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
 {
 	struct vmcb_ctrl *ctrl;
 	struct vmcb_state *state;
 	struct svm_regctx *regs;
 	struct vm_inout_str *vis;
 	uint64_t info1;
 	int inout_string;
 
 	state = svm_get_vmcb_state(svm_sc, vcpu);
 	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
 	regs  = svm_get_guest_regctx(svm_sc, vcpu);
 
 	info1 = ctrl->exitinfo1;
 	inout_string = info1 & BIT(2) ? 1 : 0;
 
 	/*
 	 * The effective segment number in EXITINFO1[12:10] is populated
 	 * only if the processor has the DecodeAssist capability.
 	 *
 	 * XXX this is not specified explicitly in APMv2 but can be verified
 	 * empirically.
 	 */
 	if (inout_string && !decode_assist())
 		return (UNHANDLED);
 
 	vmexit->exitcode 	= VM_EXITCODE_INOUT;
 	vmexit->u.inout.in 	= (info1 & BIT(0)) ? 1 : 0;
 	vmexit->u.inout.string 	= inout_string;
 	vmexit->u.inout.rep 	= (info1 & BIT(3)) ? 1 : 0;
 	vmexit->u.inout.bytes 	= (info1 >> 4) & 0x7;
 	vmexit->u.inout.port 	= (uint16_t)(info1 >> 16);
 	vmexit->u.inout.eax 	= (uint32_t)(state->rax);
 
 	if (inout_string) {
 		vmexit->exitcode = VM_EXITCODE_INOUT_STR;
 		vis = &vmexit->u.inout_str;
 		svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
 		vis->rflags = state->rflags;
 		vis->cr0 = state->cr0;
 		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
 		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
 		vis->addrsize = svm_inout_str_addrsize(info1);
 		svm_inout_str_seginfo(svm_sc, vcpu, info1,
 		    vmexit->u.inout.in, vis);
 	}
 
 	return (UNHANDLED);
 }
 
 static int
 npf_fault_type(uint64_t exitinfo1)
 {
 
 	if (exitinfo1 & VMCB_NPF_INFO1_W)
 		return (VM_PROT_WRITE);
 	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
 		return (VM_PROT_EXECUTE);
 	else
 		return (VM_PROT_READ);
 }
 
 static bool
 svm_npf_emul_fault(uint64_t exitinfo1)
 {
 	
 	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
 		return (false);
 	}
 
 	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
 		return (false);
 	}
 
 	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
 		return (false);
 	}
 
 	return (true);	
 }
 
 static void
 svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
 {
 	struct vm_guest_paging *paging;
 	struct vmcb_segment seg;
 	struct vmcb_ctrl *ctrl;
 	char *inst_bytes;
 	int error, inst_len;
 
 	ctrl = &vmcb->ctrl;
 	paging = &vmexit->u.inst_emul.paging;
 
 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 	vmexit->u.inst_emul.gpa = gpa;
 	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
 	svm_paging_info(vmcb, paging);
 
 	error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
 	KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
 
 	switch(paging->cpu_mode) {
 	case CPU_MODE_REAL:
 		vmexit->u.inst_emul.cs_base = seg.base;
 		vmexit->u.inst_emul.cs_d = 0;
 		break;
 	case CPU_MODE_PROTECTED:
 	case CPU_MODE_COMPATIBILITY:
 		vmexit->u.inst_emul.cs_base = seg.base;
 
 		/*
 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
 		 */
 		vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
 		    1 : 0;
 		break;
 	default:
 		vmexit->u.inst_emul.cs_base = 0;
 		vmexit->u.inst_emul.cs_d = 0;
 		break;	
 	}
 
 	/*
 	 * Copy the instruction bytes into 'vie' if available.
 	 */
 	if (decode_assist() && !disable_npf_assist) {
 		inst_len = ctrl->inst_len;
 		inst_bytes = ctrl->inst_bytes;
 	} else {
 		inst_len = 0;
 		inst_bytes = NULL;
 	}
 	vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
 }
 
 #ifdef KTR
 static const char *
 intrtype_to_str(int intr_type)
 {
 	switch (intr_type) {
 	case VMCB_EVENTINJ_TYPE_INTR:
 		return ("hwintr");
 	case VMCB_EVENTINJ_TYPE_NMI:
 		return ("nmi");
 	case VMCB_EVENTINJ_TYPE_INTn:
 		return ("swintr");
 	case VMCB_EVENTINJ_TYPE_EXCEPTION:
 		return ("exception");
 	default:
 		panic("%s: unknown intr_type %d", __func__, intr_type);
 	}
 }
 #endif
 
 /*
  * Inject an event into the vcpu as described in section 15.20, "Event injection".
  */
 static void
 svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
 		 uint32_t error, bool ec_valid)
 {
 	struct vmcb_ctrl *ctrl;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 
 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
 	    ("%s: event already pending %#lx", __func__, ctrl->eventinj));
 
 	KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
 	    __func__, vector));
 
 	switch (intr_type) {
 	case VMCB_EVENTINJ_TYPE_INTR:
 	case VMCB_EVENTINJ_TYPE_NMI:
 	case VMCB_EVENTINJ_TYPE_INTn:
 		break;
 	case VMCB_EVENTINJ_TYPE_EXCEPTION:
 		if (vector >= 0 && vector <= 31 && vector != 2)
 			break;
 		/* FALLTHROUGH */
 	default:
 		panic("%s: invalid intr_type/vector: %d/%d", __func__,
 		    intr_type, vector);
 	}
 	ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
 	if (ec_valid) {
 		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
 		ctrl->eventinj |= (uint64_t)error << 32;
 		VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
 		    intrtype_to_str(intr_type), vector, error);
 	} else {
 		VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
 		    intrtype_to_str(intr_type), vector);
 	}
 }
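
As a concrete instance of the encoding assembled above, injecting a general protection fault with an error code composes the field as follows (symbolic, using only the macros referenced in the function):

	ctrl->eventinj = IDT_GP |
	    (VMCB_EVENTINJ_TYPE_EXCEPTION << 8) |
	    VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_EC_VALID |
	    ((uint64_t)error << 32);
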
 
 static void
 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
 {
 	struct vm *vm;
 	struct vlapic *vlapic;
 	struct vmcb_ctrl *ctrl;
 	int pending;
 
 	vm = sc->vm;
 	vlapic = vm_lapic(vm, vcpu);
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 
 	/* Update %cr8 in the emulated vlapic */
 	vlapic_set_cr8(vlapic, ctrl->v_tpr);
 
 	/*
 	 * If V_IRQ indicates that the interrupt injection attempted on the
 	 * last VMRUN was successful then update the vlapic accordingly.
 	 */
 	if (ctrl->v_intr_vector != 0) {
 		pending = ctrl->v_irq;
 		KASSERT(ctrl->v_intr_vector >= 16, ("%s: invalid "
 		    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
 		KASSERT(!ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
 		VCPU_CTR2(vm, vcpu, "v_intr_vector %d %s", ctrl->v_intr_vector,
 		    pending ? "pending" : "accepted");
 		if (!pending)
 			vlapic_intr_accepted(vlapic, ctrl->v_intr_vector);
 	}
 }
 
 static void
 svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
 {
 	struct vmcb_ctrl *ctrl;
 	uint64_t intinfo;
 
 	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
 	intinfo = ctrl->exitintinfo;	
 	if (!VMCB_EXITINTINFO_VALID(intinfo))
 		return;
 
 	/*
 	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
 	 *
 	 * If a #VMEXIT happened during event delivery then record the event
 	 * that was being delivered.
 	 */
 	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
 		intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
 	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
 	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
 }
 
 static __inline int
 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
 {
 
 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 	    VMCB_INTCPT_VINTR));
 }
 
 static __inline void
 enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
 {
 	struct vmcb_ctrl *ctrl;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 
 	if (ctrl->v_irq && ctrl->v_intr_vector == 0) {
 		KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
 		KASSERT(vintr_intercept_enabled(sc, vcpu),
 		    ("%s: vintr intercept should be enabled", __func__));
 		return;
 	}
 
 	VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
 	ctrl->v_irq = 1;
 	ctrl->v_ign_tpr = 1;
 	ctrl->v_intr_vector = 0;
 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 }
 
 static __inline void
 disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
 {
 	struct vmcb_ctrl *ctrl;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 
 	if (!ctrl->v_irq && ctrl->v_intr_vector == 0) {
 		KASSERT(!vintr_intercept_enabled(sc, vcpu),
 		    ("%s: vintr intercept should be disabled", __func__));
 		return;
 	}
 
 #ifdef KTR
 	if (ctrl->v_intr_vector == 0)
 		VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
 	else
 		VCPU_CTR0(sc->vm, vcpu, "Clearing V_IRQ interrupt injection");
 #endif
 	ctrl->v_irq = 0;
 	ctrl->v_intr_vector = 0;
 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 }
 
 static int
 svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
 {
 	struct vmcb_ctrl *ctrl;
 	int oldval, newval;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 	oldval = ctrl->intr_shadow;
 	newval = val ? 1 : 0;
 	if (newval != oldval) {
 		ctrl->intr_shadow = newval;
 		VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval);
 	}
 	return (0);
 }
 
 static int
 svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
 {
 	struct vmcb_ctrl *ctrl;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 	*val = ctrl->intr_shadow;
 	return (0);
 }
 
 /*
  * Once an NMI is injected it blocks delivery of further NMIs until the handler
  * executes an IRET. The IRET intercept is enabled when an NMI is injected
  * to track when the vcpu is done handling the NMI.
  */
 static int
 nmi_blocked(struct svm_softc *sc, int vcpu)
 {
 	int blocked;
 
 	blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 	    VMCB_INTCPT_IRET);
 	return (blocked);
 }
 
 static void
 enable_nmi_blocking(struct svm_softc *sc, int vcpu)
 {
 
 	KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked"));
 	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
 }
 
 static void
 clear_nmi_blocking(struct svm_softc *sc, int vcpu)
 {
 	int error;
 
 	KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
 	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
 	/*
 	 * When the IRET intercept is cleared the vcpu will attempt to execute
 	 * the "iret" when it runs next. However, it is possible to inject
 	 * another NMI into the vcpu before the "iret" has actually executed.
 	 *
 	 * For example, if the "iret" encounters a #NPF when accessing the stack
 	 * it will trap back into the hypervisor. If an NMI is pending for
 	 * the vcpu it will be injected into the guest.
 	 *
 	 * XXX this needs to be fixed
 	 */
 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
 
 	/*
 	 * Set 'intr_shadow' to prevent an NMI from being injected on the
 	 * immediate VMRUN.
 	 */
 	error = svm_modify_intr_shadow(sc, vcpu, 1);
 	KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error));
 }
 
 #define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
 
 static int
 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu)
 {
 	struct vm_exit *vme;
 	struct vmcb_state *state;
 	uint64_t changed, lma, oldval;
 	int error;
 
 	state = svm_get_vmcb_state(sc, vcpu);
 
 	oldval = state->efer;
 	VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval);
 
 	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
 	changed = oldval ^ newval;
 
 	if (newval & EFER_MBZ_BITS)
 		goto gpf;
 
 	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
 	if (changed & EFER_LME) {
 		if (state->cr0 & CR0_PG)
 			goto gpf;
 	}
 
 	/* EFER.LMA = EFER.LME & CR0.PG */
 	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
 		lma = EFER_LMA;
 	else
 		lma = 0;
 
 	if ((newval & EFER_LMA) != lma)
 		goto gpf;
 
 	if (newval & EFER_NXE) {
 		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
 			goto gpf;
 	}
 
 	/*
 	 * XXX bhyve does not enforce segment limits in 64-bit mode. Until
 	 * this is fixed, flag a guest attempt to set EFER_LMSLE as an error.
 	 */
 	if (newval & EFER_LMSLE) {
 		vme = vm_exitinfo(sc->vm, vcpu);
 		vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
 		*retu = true;
 		return (0);
 	}
 
 	if (newval & EFER_FFXSR) {
 		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
 			goto gpf;
 	}
 
 	if (newval & EFER_TCE) {
 		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
 			goto gpf;
 	}
 
 	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
 	KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
 	return (0);
 gpf:
 	vm_inject_gp(sc->vm, vcpu);
 	return (0);
 }
 
 static int
 emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
     bool *retu)
 {
 	int error;
 
 	if (lapic_msr(num))
 		error = lapic_wrmsr(sc->vm, vcpu, num, val, retu);
 	else if (num == MSR_EFER)
 		error = svm_write_efer(sc, vcpu, val, retu);
 	else
 		error = svm_wrmsr(sc, vcpu, num, val, retu);
 
 	return (error);
 }
 
 static int
 emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu)
 {
 	struct vmcb_state *state;
 	struct svm_regctx *ctx;
 	uint64_t result;
 	int error;
 
 	if (lapic_msr(num))
 		error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu);
 	else
 		error = svm_rdmsr(sc, vcpu, num, &result, retu);
 
 	if (error == 0) {
 		state = svm_get_vmcb_state(sc, vcpu);
 		ctx = svm_get_guest_regctx(sc, vcpu);
 		state->rax = result & 0xffffffff;
 		ctx->sctx_rdx = result >> 32;
 	}
 
 	return (error);
 }
 
 #ifdef KTR
 static const char *
 exit_reason_to_str(uint64_t reason)
 {
 	static char reasonbuf[32];
 
 	switch (reason) {
 	case VMCB_EXIT_INVALID:
 		return ("invalvmcb");
 	case VMCB_EXIT_SHUTDOWN:
 		return ("shutdown");
 	case VMCB_EXIT_NPF:
 		return ("nptfault");
 	case VMCB_EXIT_PAUSE:
 		return ("pause");
 	case VMCB_EXIT_HLT:
 		return ("hlt");
 	case VMCB_EXIT_CPUID:
 		return ("cpuid");
 	case VMCB_EXIT_IO:
 		return ("inout");
 	case VMCB_EXIT_MC:
 		return ("mchk");
 	case VMCB_EXIT_INTR:
 		return ("extintr");
 	case VMCB_EXIT_NMI:
 		return ("nmi");
 	case VMCB_EXIT_VINTR:
 		return ("vintr");
 	case VMCB_EXIT_MSR:
 		return ("msr");
 	case VMCB_EXIT_IRET:
 		return ("iret");
 	case VMCB_EXIT_MONITOR:
 		return ("monitor");
 	case VMCB_EXIT_MWAIT:
 		return ("mwait");
 	default:
 		snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
 		return (reasonbuf);
 	}
 }
 #endif	/* KTR */
 
 /*
  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
  * that are due to instruction intercepts as well as MSR and IOIO intercepts
  * and exceptions caused by INT3, INTO and BOUND instructions.
  *
  * Return 1 if the nRIP is valid and 0 otherwise.
  */
 static int
 nrip_valid(uint64_t exitcode)
 {
 	switch (exitcode) {
 	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
 	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
 	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
 	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
 	case 0x43:		/* INT3 */
 	case 0x44:		/* INTO */
 	case 0x45:		/* BOUND */
 	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
 	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 static int
 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
 {
 	struct vmcb *vmcb;
 	struct vmcb_state *state;
 	struct vmcb_ctrl *ctrl;
 	struct svm_regctx *ctx;
 	uint64_t code, info1, info2, val;
 	uint32_t eax, ecx, edx;
 	int error, errcode_valid, handled, idtvec, reflect;
 	bool retu;
 
 	ctx = svm_get_guest_regctx(svm_sc, vcpu);
 	vmcb = svm_get_vmcb(svm_sc, vcpu);
 	state = &vmcb->state;
 	ctrl = &vmcb->ctrl;
 
 	handled = 0;
 	code = ctrl->exitcode;
 	info1 = ctrl->exitinfo1;
 	info2 = ctrl->exitinfo2;
 
 	vmexit->exitcode = VM_EXITCODE_BOGUS;
 	vmexit->rip = state->rip;
 	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
 
 	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
 
 	/*
 	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
 	 * in an inconsistent state and can trigger assertions that would
 	 * never happen otherwise.
 	 */
 	if (code == VMCB_EXIT_INVALID) {
 		vm_exit_svm(vmexit, code, info1, info2);
 		return (0);
 	}
 
 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
 	    "injection valid bit is set %#lx", __func__, ctrl->eventinj));
 
 	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
 	    ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
 	    vmexit->inst_length, code, info1, info2));
 
 	svm_update_virqinfo(svm_sc, vcpu);
 	svm_save_intinfo(svm_sc, vcpu);
 
 	switch (code) {
 	case VMCB_EXIT_IRET:
 		/*
 		 * Restart execution at "iret" but with the intercept cleared.
 		 */
 		vmexit->inst_length = 0;
 		clear_nmi_blocking(svm_sc, vcpu);
 		handled = 1;
 		break;
 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
 		handled = 1;
 		break;
 	case VMCB_EXIT_INTR:	/* external interrupt */
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
 		handled = 1;
 		break;
 	case VMCB_EXIT_NMI:	/* external NMI */
 		handled = 1;
 		break;
 	case 0x40 ... 0x5F:
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
 		reflect = 1;
 		idtvec = code - 0x40;
 		switch (idtvec) {
 		case IDT_MC:
 			/*
 			 * Call the machine check handler by hand. Also don't
 			 * reflect the machine check back into the guest.
 			 */
 			reflect = 0;
 			VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler");
 			__asm __volatile("int $18");
 			break;
 		case IDT_PF:
 			error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
 			    info2);
 			KASSERT(error == 0, ("%s: error %d updating cr2",
 			    __func__, error));
 			/* fallthru */
 		case IDT_NP:
 		case IDT_SS:
 		case IDT_GP:
 		case IDT_AC:
 		case IDT_TS:
 			errcode_valid = 1;
 			break;
 
 		case IDT_DF:
 			errcode_valid = 1;
 			info1 = 0;
 			break;
 
 		case IDT_BP:
 		case IDT_OF:
 		case IDT_BR:
 			/*
 			 * The 'nrip' field is populated for INT3, INTO and
 			 * BOUND exceptions and this also implies that
 			 * 'inst_length' is non-zero.
 			 *
 			 * Reset 'inst_length' to zero so the guest %rip at
 			 * event injection is identical to what it was when
 			 * the exception originally happened.
 			 */
 			VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d "
 			    "to zero before injecting exception %d",
 			    vmexit->inst_length, idtvec);
 			vmexit->inst_length = 0;
 			/* fallthru */
 		default:
 			errcode_valid = 0;
 			info1 = 0;
 			break;
 		}
 		KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
 		    "when reflecting exception %d into guest",
 		    vmexit->inst_length, idtvec));
 
 		if (reflect) {
 			/* Reflect the exception back into the guest */
 			VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
 			    "%d/%#x into the guest", idtvec, (int)info1);
 			error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
 			    errcode_valid, info1, 0);
 			KASSERT(error == 0, ("%s: vm_inject_exception error %d",
 			    __func__, error));
 		}
 		handled = 1;
 		break;
 	case VMCB_EXIT_MSR:	/* MSR access. */
 		eax = state->rax;
 		ecx = ctx->sctx_rcx;
 		edx = ctx->sctx_rdx;
 		retu = false;	
 
 		if (info1) {
 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
 			val = (uint64_t)edx << 32 | eax;
 			VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
 			    ecx, val);
 			if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) {
 				vmexit->exitcode = VM_EXITCODE_WRMSR;
 				vmexit->u.msr.code = ecx;
 				vmexit->u.msr.wval = val;
 			} else if (!retu) {
 				handled = 1;
 			} else {
 				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 				    ("emulate_wrmsr retu with bogus exitcode"));
 			}
 		} else {
 			VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
 			if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) {
 				vmexit->exitcode = VM_EXITCODE_RDMSR;
 				vmexit->u.msr.code = ecx;
 			} else if (!retu) {
 				handled = 1;
 			} else {
 				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 				    ("emulate_rdmsr retu with bogus exitcode"));
 			}
 		}
 		break;
 	case VMCB_EXIT_IO:
 		handled = svm_handle_io(svm_sc, vcpu, vmexit);
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
 		break;
 	case VMCB_EXIT_CPUID:
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
 		handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
 		    (uint32_t *)&state->rax,
 		    (uint32_t *)&ctx->sctx_rbx,
 		    (uint32_t *)&ctx->sctx_rcx,
 		    (uint32_t *)&ctx->sctx_rdx);
 		break;
 	case VMCB_EXIT_HLT:
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
 		vmexit->exitcode = VM_EXITCODE_HLT;
 		vmexit->u.hlt.rflags = state->rflags;
 		break;
 	case VMCB_EXIT_PAUSE:
 		vmexit->exitcode = VM_EXITCODE_PAUSE;
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
 		break;
 	case VMCB_EXIT_NPF:
 		/* EXITINFO2 contains the faulting guest physical address */
 		if (info1 & VMCB_NPF_INFO1_RSV) {
 			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
 			    "reserved bits set: info1(%#lx) info2(%#lx)",
 			    info1, info2);
 		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->u.paging.gpa = info2;
 			vmexit->u.paging.fault_type = npf_fault_type(info1);
 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
 			VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
 			    "on gpa %#lx/%#lx at rip %#lx",
 			    info2, info1, state->rip);
 		} else if (svm_npf_emul_fault(info1)) {
 			svm_handle_inst_emul(vmcb, info2, vmexit);
 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
 			VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
 			    "for gpa %#lx/%#lx at rip %#lx",
 			    info2, info1, state->rip);
 		}
 		break;
 	case VMCB_EXIT_MONITOR:
 		vmexit->exitcode = VM_EXITCODE_MONITOR;
 		break;
 	case VMCB_EXIT_MWAIT:
 		vmexit->exitcode = VM_EXITCODE_MWAIT;
 		break;
 	default:
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
 		break;
 	}	
 
 	VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d",
 	    handled ? "handled" : "unhandled", exit_reason_to_str(code),
 	    vmexit->rip, vmexit->inst_length);
 
 	if (handled) {
 		vmexit->rip += vmexit->inst_length;
 		vmexit->inst_length = 0;
 		state->rip = vmexit->rip;
 	} else {
 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
 			/*
 			 * If this VM exit was not claimed by anybody then
 			 * treat it as a generic SVM exit.
 			 */
 			vm_exit_svm(vmexit, code, info1, info2);
 		} else {
 			/*
 			 * The exitcode and collateral have been populated.
 			 * The VM exit will be processed further in userland.
 			 */
 		}
 	}
 	return (handled);
 }
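
The MSR intercept above reconstructs the 64-bit WRMSR operand from the guest's EDX:EAX pair and, on RDMSR, the emulation splits the result back the same way. A minimal standalone sketch of that convention (hypothetical helper names, not part of bhyve):

#include <stdint.h>

/* Combine the guest's EDX:EAX pair into the 64-bit value written by WRMSR. */
static inline uint64_t
msr_value_from_edx_eax(uint32_t edx, uint32_t eax)
{
	return ((uint64_t)edx << 32 | eax);
}

/* Split a 64-bit RDMSR result back into the EDX:EAX pair. */
static inline void
msr_value_to_edx_eax(uint64_t val, uint32_t *edx, uint32_t *eax)
{
	*edx = (uint32_t)(val >> 32);
	*eax = (uint32_t)val;
}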
 
 static void
 svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
 {
 	uint64_t intinfo;
 
 	if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo))
 		return;
 
 	KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not "
 	    "valid: %#lx", __func__, intinfo));
 
 	svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo),
 		VMCB_EXITINTINFO_VECTOR(intinfo),
 		VMCB_EXITINTINFO_EC(intinfo),
 		VMCB_EXITINTINFO_EC_VALID(intinfo));
 	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
 	VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo);
 }
 
 /*
  * Inject event to virtual cpu.
  */
 static void
 svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
 {
 	struct vmcb_ctrl *ctrl;
 	struct vmcb_state *state;
 	struct svm_vcpu *vcpustate;
 	uint8_t v_tpr;
 	int vector, need_intr_window, pending_apic_vector;
 
 	state = svm_get_vmcb_state(sc, vcpu);
 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
 	vcpustate = svm_get_vcpu(sc, vcpu);
 
 	need_intr_window = 0;
 	pending_apic_vector = 0;
 
 	if (vcpustate->nextrip != state->rip) {
 		ctrl->intr_shadow = 0;
 		VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking "
 		    "cleared due to rip change: %#lx/%#lx",
 		    vcpustate->nextrip, state->rip);
 	}
 
 	/*
 	 * Inject pending events or exceptions for this vcpu.
 	 *
 	 * An event might be pending because the previous #VMEXIT happened
 	 * during event delivery (i.e. ctrl->exitintinfo).
 	 *
 	 * An event might also be pending because an exception was injected
 	 * by the hypervisor (e.g. #PF during instruction emulation).
 	 */
 	svm_inj_intinfo(sc, vcpu);
 
 	/* NMI event has priority over interrupts. */
 	if (vm_nmi_pending(sc->vm, vcpu)) {
 		if (nmi_blocked(sc, vcpu)) {
 			/*
 			 * Can't inject another NMI if the guest has not
 			 * yet executed an "iret" after the last NMI.
 			 */
 			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
 			    "to NMI-blocking");
 		} else if (ctrl->intr_shadow) {
 			/*
 			 * Can't inject an NMI if the vcpu is in an intr_shadow.
 			 */
 			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to "
 			    "interrupt shadow");
 			need_intr_window = 1;
 			goto done;
 		} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
 			/*
 			 * If there is already an exception/interrupt pending
 			 * then defer the NMI until after that.
 			 */
 			VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
 			    "eventinj %#lx", ctrl->eventinj);
 
 			/*
 			 * Use self-IPI to trigger a VM-exit as soon as
 			 * possible after the event injection is completed.
 			 *
 			 * This works only if the external interrupt exiting
 			 * is at a lower priority than the event injection.
 			 *
 			 * Although not explicitly specified in APMv2, the
 			 * relative priorities were verified empirically.
 			 */
-			ipi_cpu(curcpu, IPI_AST);	/* XXX vmm_ipinum? */
+			ipi_cpu(curcpu, vmm_ipinum);
 		} else {
 			vm_nmi_clear(sc->vm, vcpu);
 
 			/* Inject NMI, vector number is not used */
 			svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
 			    IDT_NMI, 0, false);
 
 			/* virtual NMI blocking is now in effect */
 			enable_nmi_blocking(sc, vcpu);
 
 			VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
 		}
 	}
 
 	if (!vm_extint_pending(sc->vm, vcpu)) {
 		/*
 		 * APIC interrupts are delivered using the V_IRQ offload.
 		 *
 		 * The primary benefit is that the hypervisor doesn't need to
 		 * deal with the various conditions that inhibit interrupts.
 		 * It also means that TPR changes via CR8 will be handled
 		 * without any hypervisor involvement.
 		 *
 		 * Note that the APIC vector must remain pending in the vIRR
 		 * until it is confirmed that it was delivered to the guest.
 		 * This can be confirmed based on the value of V_IRQ at the
 		 * next #VMEXIT (1 = pending, 0 = delivered).
 		 *
 		 * Also note that it is possible that another higher priority
 		 * vector can become pending before this vector is delivered
 		 * to the guest. This is alright because vcpu_notify_event()
 		 * will send an IPI and force the vcpu to trap back into the
 		 * hypervisor. The higher priority vector will be injected on
 		 * the next VMRUN.
 		 */
 		if (vlapic_pending_intr(vlapic, &vector)) {
 			KASSERT(vector >= 16 && vector <= 255,
 			    ("invalid vector %d from local APIC", vector));
 			pending_apic_vector = vector;
 		}
 		goto done;
 	}
 
 	/* Ask the legacy pic for a vector to inject */
 	vatpic_pending_intr(sc->vm, &vector);
 	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR",
 	    vector));
 
 	/*
 	 * If the guest has disabled interrupts or is in an interrupt shadow
 	 * then we cannot inject the pending interrupt.
 	 */
 	if ((state->rflags & PSL_I) == 0) {
 		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
 		    "rflags %#lx", vector, state->rflags);
 		need_intr_window = 1;
 		goto done;
 	}
 
 	if (ctrl->intr_shadow) {
 		VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
 		    "interrupt shadow", vector);
 		need_intr_window = 1;
 		goto done;
 	}
 
 	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
 		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
 		    "eventinj %#lx", vector, ctrl->eventinj);
 		need_intr_window = 1;
 		goto done;
 	}
 
 	/*
 	 * Legacy PIC interrupts are delivered via the event injection
 	 * mechanism.
 	 */
 	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);
 
 	vm_extint_clear(sc->vm, vcpu);
 	vatpic_intr_accepted(sc->vm, vector);
 
 	/*
 	 * Force a VM-exit as soon as the vcpu is ready to accept another
 	 * interrupt. This is done because the PIC might have another vector
 	 * that it wants to inject. Also, if the APIC has a pending interrupt
 	 * that was preempted by the ExtInt then it allows us to inject the
 	 * APIC vector as soon as possible.
 	 */
 	need_intr_window = 1;
 done:
 	/*
 	 * The guest can modify the TPR by writing to %CR8. In guest mode
 	 * the processor reflects this write to V_TPR without hypervisor
 	 * intervention.
 	 *
 	 * The guest can also modify the TPR by writing to it via the memory
 	 * mapped APIC page. In this case, the write will be emulated by the
 	 * hypervisor. For this reason V_TPR must be updated before every
 	 * VMRUN.
 	 */
 	v_tpr = vlapic_get_cr8(vlapic);
 	KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
 	if (ctrl->v_tpr != v_tpr) {
 		VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x",
 		    ctrl->v_tpr, v_tpr);
 		ctrl->v_tpr = v_tpr;
 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 	}
 
 	if (pending_apic_vector) {
 		/*
 		 * If an APIC vector is being injected then interrupt window
 		 * exiting is not possible on this VMRUN.
 		 */
 		KASSERT(!need_intr_window, ("intr_window exiting impossible"));
 		VCPU_CTR1(sc->vm, vcpu, "Injecting vector %d using V_IRQ",
 		    pending_apic_vector);
 
 		ctrl->v_irq = 1;
 		ctrl->v_ign_tpr = 0;
 		ctrl->v_intr_vector = pending_apic_vector;
 		ctrl->v_intr_prio = pending_apic_vector >> 4;
 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 	} else if (need_intr_window) {
 		/*
 		 * We use V_IRQ in conjunction with the VINTR intercept to
 		 * trap into the hypervisor as soon as a virtual interrupt
 		 * can be delivered.
 		 *
 		 * Since injected events are not subject to intercept checks,
 		 * we need to ensure that the V_IRQ is not actually going to
 		 * be delivered on VM entry. The KASSERT below enforces this.
 		 */
 		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
 		    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
 		    ("Bogus intr_window_exiting: eventinj (%#lx), "
 		    "intr_shadow (%u), rflags (%#lx)",
 		    ctrl->eventinj, ctrl->intr_shadow, state->rflags));
 		enable_intr_window_exiting(sc, vcpu);
 	} else {
 		disable_intr_window_exiting(sc, vcpu);
 	}
 }
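
The V_TPR and V_INTR_PRIO programming above relies on the APIC rule that a vector's priority class is its upper nibble and that delivery requires the class to be strictly greater than the task priority class. A short sketch of that comparison (illustrative only; with V_IRQ the hardware performs this check itself):

#include <stdbool.h>
#include <stdint.h>

/* The priority class of an APIC vector is its upper nibble. */
static inline uint8_t
apic_priority_class(uint8_t vector)
{
	return (vector >> 4);
}

/*
 * A pending vector is deliverable only when its priority class is
 * strictly greater than the 4-bit task priority class (what V_TPR holds).
 */
static inline bool
apic_vector_deliverable(uint8_t vector, uint8_t tpr_class)
{
	return (apic_priority_class(vector) > tpr_class);
}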
 
 static __inline void
 restore_host_tss(void)
 {
 	struct system_segment_descriptor *tss_sd;
 
 	/*
 	 * The TSS descriptor was in use prior to launching the guest so it
 	 * has been marked busy.
 	 *
 	 * 'ltr' requires the descriptor to be marked available so change the
 	 * type to "64-bit available TSS".
 	 */
 	tss_sd = PCPU_GET(tss);
 	tss_sd->sd_type = SDT_SYSTSS;
 	ltr(GSEL(GPROC0_SEL, SEL_KPL));
 }
 
 static void
 check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
 {
 	struct svm_vcpu *vcpustate;
 	struct vmcb_ctrl *ctrl;
 	long eptgen;
 	bool alloc_asid;
 
 	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
 	    "active on cpu %u", __func__, thiscpu));
 
 	vcpustate = svm_get_vcpu(sc, vcpuid);
 	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
 
 	/*
 	 * The TLB entries associated with the vcpu's ASID are not valid
 	 * if either of the following conditions is true:
 	 *
 	 * 1. The vcpu's ASID generation is different than the host cpu's
 	 *    ASID generation. This happens when the vcpu migrates to a new
 	 *    host cpu. It can also happen when the number of vcpus executing
 	 *    on a host cpu is greater than the number of ASIDs available.
 	 *
 	 * 2. The pmap generation number is different than the value cached in
 	 *    the 'vcpustate'. This happens when the host invalidates pages
 	 *    belonging to the guest.
 	 *
 	 *	asidgen		eptgen	      Action
 	 *	mismatch	mismatch
 	 *	   0		   0		(a)
 	 *	   0		   1		(b1) or (b2)
 	 *	   1		   0		(c)
 	 *	   1		   1		(d)
 	 *
 	 * (a) There is no mismatch in eptgen or ASID generation and therefore
 	 *     no further action is needed.
 	 *
 	 * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
 	 *      retained and the TLB entries associated with this ASID
 	 *      are flushed by VMRUN.
 	 *
 	 * (b2) If the cpu does not support FlushByAsid then a new ASID is
 	 *      allocated.
 	 *
 	 * (c) A new ASID is allocated.
 	 *
 	 * (d) A new ASID is allocated.
 	 */
 
 	alloc_asid = false;
 	eptgen = pmap->pm_eptgen;
 	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
 
 	if (vcpustate->asid.gen != asid[thiscpu].gen) {
 		alloc_asid = true;	/* (c) and (d) */
 	} else if (vcpustate->eptgen != eptgen) {
 		if (flush_by_asid())
 			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;	/* (b1) */
 		else
 			alloc_asid = true;			/* (b2) */
 	} else {
 		/*
 		 * This is the common case (a).
 		 */
 		KASSERT(!alloc_asid, ("ASID allocation not necessary"));
 		KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
 		    ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
 	}
 
 	if (alloc_asid) {
 		if (++asid[thiscpu].num >= nasid) {
 			asid[thiscpu].num = 1;
 			if (++asid[thiscpu].gen == 0)
 				asid[thiscpu].gen = 1;
 			/*
 			 * If this cpu does not support "flush-by-asid"
 			 * then flush the entire TLB on a generation
 			 * bump. Subsequent ASID allocation in this
 			 * generation can be done without a TLB flush.
 			 */
 			if (!flush_by_asid())
 				ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
 		}
 		vcpustate->asid.gen = asid[thiscpu].gen;
 		vcpustate->asid.num = asid[thiscpu].num;
 
 		ctrl->asid = vcpustate->asid.num;
 		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
 		/*
 		 * If this cpu supports "flush-by-asid" then the TLB
 		 * was not flushed after the generation bump. The TLB
 		 * is flushed selectively after every new ASID allocation.
 		 */
 		if (flush_by_asid())
 			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
 	}
 	vcpustate->eptgen = eptgen;
 
 	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
 	KASSERT(ctrl->asid == vcpustate->asid.num,
 	    ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
 }
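
check_asid() above is essentially a generation-tagged allocator: each host cpu hands out ASIDs sequentially, bumps its generation when the ASID space wraps, and any vcpu whose cached generation is stale must take a fresh ASID. A stripped-down sketch of that allocation step, using hypothetical structures and ignoring the flush-by-asid refinements:

#include <stdint.h>

struct asid_state {
	uint64_t gen;		/* current generation, never 0 once set up */
	uint32_t num;		/* last ASID handed out on this cpu */
	uint32_t nasid;		/* number of ASIDs supported by the cpu */
};

struct vcpu_asid {
	uint64_t gen;		/* generation when 'num' was allocated */
	uint32_t num;		/* ASID assigned to the vcpu */
};

/*
 * Ensure 'va' holds a valid ASID for this cpu.  Returns 1 if the ASID
 * space wrapped and the caller must flush the entire TLB, 0 otherwise.
 */
static int
asid_refresh(struct asid_state *as, struct vcpu_asid *va)
{
	int flush_all = 0;

	if (va->gen == as->gen)
		return (0);		/* cached ASID is still valid */

	if (++as->num >= as->nasid) {
		as->num = 1;		/* ASID 0 is reserved for the host */
		if (++as->gen == 0)
			as->gen = 1;	/* generation 0 means "never valid" */
		flush_all = 1;		/* previously issued ASIDs may recur */
	}
	va->gen = as->gen;
	va->num = as->num;
	return (flush_all);
}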
 
 static __inline void
 disable_gintr(void)
 {
 
 	__asm __volatile("clgi");
 }
 
 static __inline void
 enable_gintr(void)
 {
 
 	__asm __volatile("stgi");
 }
 
 /*
  * Start vcpu with specified RIP.
  */
 static int
 svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, 
 	struct vm_eventinfo *evinfo)
 {
 	struct svm_regctx *gctx;
 	struct svm_softc *svm_sc;
 	struct svm_vcpu *vcpustate;
 	struct vmcb_state *state;
 	struct vmcb_ctrl *ctrl;
 	struct vm_exit *vmexit;
 	struct vlapic *vlapic;
 	struct vm *vm;
 	uint64_t vmcb_pa;
 	int handled;
 
 	svm_sc = arg;
 	vm = svm_sc->vm;
 
 	vcpustate = svm_get_vcpu(svm_sc, vcpu);
 	state = svm_get_vmcb_state(svm_sc, vcpu);
 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
 	vmexit = vm_exitinfo(vm, vcpu);
 	vlapic = vm_lapic(vm, vcpu);
 
 	gctx = svm_get_guest_regctx(svm_sc, vcpu);
 	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
 
 	if (vcpustate->lastcpu != curcpu) {
 		/*
 		 * Force new ASID allocation by invalidating the generation.
 		 */
 		vcpustate->asid.gen = 0;
 
 		/*
 		 * Invalidate the VMCB state cache by marking all fields dirty.
 		 */
 		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
 
 		/*
 		 * XXX
 		 * Setting 'vcpustate->lastcpu' here is a bit premature because
 		 * we may return from this function without actually executing
 		 * the VMRUN instruction. This could happen if a rendezvous
 		 * or an AST is pending on the first time through the loop.
 		 *
 		 * This works for now but any new side-effects of vcpu
 		 * migration should take this case into account.
 		 */
 		vcpustate->lastcpu = curcpu;
 		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
 	}
 
 	svm_msr_guest_enter(svm_sc, vcpu);
 
 	/* Update Guest RIP */
 	state->rip = rip;
 
 	do {
 		/*
 		 * Disable global interrupts to guarantee atomicity during
 		 * loading of guest state. This includes not only the state
 		 * loaded by the "vmrun" instruction but also software state
 		 * maintained by the hypervisor: suspended and rendezvous
 		 * state, NPT generation number, vlapic interrupts etc.
 		 */
 		disable_gintr();
 
 		if (vcpu_suspended(evinfo)) {
 			enable_gintr();
 			vm_exit_suspended(vm, vcpu, state->rip);
 			break;
 		}
 
 		if (vcpu_rendezvous_pending(evinfo)) {
 			enable_gintr();
 			vm_exit_rendezvous(vm, vcpu, state->rip);
 			break;
 		}
 
 		if (vcpu_reqidle(evinfo)) {
 			enable_gintr();
 			vm_exit_reqidle(vm, vcpu, state->rip);
 			break;
 		}
 
 		/* The scheduler has asked us to yield the cpu. */
 		if (vcpu_should_yield(vm, vcpu)) {
 			enable_gintr();
 			vm_exit_astpending(vm, vcpu, state->rip);
 			break;
 		}
 
 		svm_inj_interrupts(svm_sc, vcpu, vlapic);
 
 		/* Activate the nested pmap on 'curcpu' */
 		CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active);
 
 		/*
 		 * Check the pmap generation and the ASID generation to
 		 * ensure that the vcpu does not use stale TLB mappings.
 		 */
 		check_asid(svm_sc, vcpu, pmap, curcpu);
 
 		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
 		vcpustate->dirty = 0;
 		VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
 
 		/* Launch Virtual Machine. */
 		VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
 		svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]);
 
 		CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
 
 		/*
 		 * The host GDTR and IDTR are saved by VMRUN and restored
 		 * automatically on #VMEXIT. However, the host TSS needs
 		 * to be restored explicitly.
 		 */
 		restore_host_tss();
 
 		/* #VMEXIT disables interrupts so re-enable them here. */
 		enable_gintr();
 
 		/* Update 'nextrip' */
 		vcpustate->nextrip = state->rip;
 
 		/* Handle #VMEXIT and if required return to user space. */
 		handled = svm_vmexit(svm_sc, vcpu, vmexit);
 	} while (handled);
 
 	svm_msr_guest_exit(svm_sc, vcpu);
 
 	return (0);
 }
 
 static void
 svm_vmcleanup(void *arg)
 {
 	struct svm_softc *sc = arg;
 
 	contigfree(sc, sizeof (*sc), M_SVM);
 }
 
 static register_t *
 swctx_regptr(struct svm_regctx *regctx, int reg)
 {
 
 	switch (reg) {
 	case VM_REG_GUEST_RBX:
 		return (&regctx->sctx_rbx);
 	case VM_REG_GUEST_RCX:
 		return (&regctx->sctx_rcx);
 	case VM_REG_GUEST_RDX:
 		return (&regctx->sctx_rdx);
 	case VM_REG_GUEST_RDI:
 		return (&regctx->sctx_rdi);
 	case VM_REG_GUEST_RSI:
 		return (&regctx->sctx_rsi);
 	case VM_REG_GUEST_RBP:
 		return (&regctx->sctx_rbp);
 	case VM_REG_GUEST_R8:
 		return (&regctx->sctx_r8);
 	case VM_REG_GUEST_R9:
 		return (&regctx->sctx_r9);
 	case VM_REG_GUEST_R10:
 		return (&regctx->sctx_r10);
 	case VM_REG_GUEST_R11:
 		return (&regctx->sctx_r11);
 	case VM_REG_GUEST_R12:
 		return (&regctx->sctx_r12);
 	case VM_REG_GUEST_R13:
 		return (&regctx->sctx_r13);
 	case VM_REG_GUEST_R14:
 		return (&regctx->sctx_r14);
 	case VM_REG_GUEST_R15:
 		return (&regctx->sctx_r15);
 	default:
 		return (NULL);
 	}
 }
 
 static int
 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
 {
 	struct svm_softc *svm_sc;
 	register_t *reg;
 
 	svm_sc = arg;
 
 	if (ident == VM_REG_GUEST_INTR_SHADOW) {
 		return (svm_get_intr_shadow(svm_sc, vcpu, val));
 	}
 
 	if (vmcb_read(svm_sc, vcpu, ident, val) == 0) {
 		return (0);
 	}
 
 	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
 
 	if (reg != NULL) {
 		*val = *reg;
 		return (0);
 	}
 
 	VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident);
 	return (EINVAL);
 }
 
 static int
 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
 {
 	struct svm_softc *svm_sc;
 	register_t *reg;
 
 	svm_sc = arg;
 
 	if (ident == VM_REG_GUEST_INTR_SHADOW) {
 		return (svm_modify_intr_shadow(svm_sc, vcpu, val));
 	}
 
 	if (vmcb_write(svm_sc, vcpu, ident, val) == 0) {
 		return (0);
 	}
 
 	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
 
 	if (reg != NULL) {
 		*reg = val;
 		return (0);
 	}
 
 	/*
 	 * XXX deal with CR3 and invalidate TLB entries tagged with the
 	 * vcpu's ASID. This needs to be treated differently depending on
 	 * whether 'running' is true/false.
 	 */
 
 	VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident);
 	return (EINVAL);
 }
 
 static int
 svm_setcap(void *arg, int vcpu, int type, int val)
 {
 	struct svm_softc *sc;
 	int error;
 
 	sc = arg;
 	error = 0;
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_HLT, val);
 		break;
 	case VM_CAP_PAUSE_EXIT:
 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_PAUSE, val);
 		break;
 	case VM_CAP_UNRESTRICTED_GUEST:
 		/* Unrestricted guest execution cannot be disabled in SVM */
 		if (val == 0)
 			error = EINVAL;
 		break;
 	default:
 		error = ENOENT;
 		break;
 	}
 	return (error);
 }
 
 static int
 svm_getcap(void *arg, int vcpu, int type, int *retval)
 {
 	struct svm_softc *sc;
 	int error;
 
 	sc = arg;
 	error = 0;
 
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_HLT);
 		break;
 	case VM_CAP_PAUSE_EXIT:
 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_PAUSE);
 		break;
 	case VM_CAP_UNRESTRICTED_GUEST:
 		*retval = 1;	/* unrestricted guest is always enabled */
 		break;
 	default:
 		error = ENOENT;
 		break;
 	}
 	return (error);
 }
 
 static struct vlapic *
 svm_vlapic_init(void *arg, int vcpuid)
 {
 	struct svm_softc *svm_sc;
 	struct vlapic *vlapic;
 
 	svm_sc = arg;
 	vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
 	vlapic->vm = svm_sc->vm;
 	vlapic->vcpuid = vcpuid;
 	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
 
 	vlapic_init(vlapic);
 
 	return (vlapic);
 }
 
 static void
 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
 {
 
 	vlapic_cleanup(vlapic);
 	free(vlapic, M_SVM_VLAPIC);
 }
 
 struct vmm_ops vmm_ops_amd = {
 	svm_init,
 	svm_cleanup,
 	svm_restore,
 	svm_vminit,
 	svm_vmrun,
 	svm_vmcleanup,
 	svm_getreg,
 	svm_setreg,
 	vmcb_getdesc,
 	vmcb_setdesc,
 	svm_getcap,
 	svm_setcap,
 	svm_npt_alloc,
 	svm_npt_free,
 	svm_vlapic_init,
 	svm_vlapic_cleanup	
 };
Index: head/sys/amd64/vmm/vmm.c
===================================================================
--- head/sys/amd64/vmm/vmm.c	(revision 315958)
+++ head/sys/amd64/vmm/vmm.c	(revision 315959)
@@ -1,2590 +1,2589 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <x86/psl.h>
 #include <x86/apicreg.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
 
 #include "vmm_ioport.h"
 #include "vmm_ktr.h"
 #include "vmm_host.h"
 #include "vmm_mem.h"
 #include "vmm_util.h"
 #include "vatpic.h"
 #include "vatpit.h"
 #include "vhpet.h"
 #include "vioapic.h"
 #include "vlapic.h"
 #include "vpmtmr.h"
 #include "vrtc.h"
 #include "vmm_stat.h"
 #include "vmm_lapic.h"
 
 #include "io/ppt.h"
 #include "io/iommu.h"
 
 struct vlapic;
 
 /*
  * Initialization:
  * (a) allocated when vcpu is created
  * (i) initialized when vcpu is created and when it is reinitialized
  * (o) initialized the first time the vcpu is created
  * (x) initialized before use
  */
 struct vcpu {
 	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
 	enum vcpu_state	state;		/* (o) vcpu state */
 	int		hostcpu;	/* (o) vcpu's host cpu */
 	int		reqidle;	/* (i) request vcpu to idle */
 	struct vlapic	*vlapic;	/* (i) APIC device model */
 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
 	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
 	int		nmi_pending;	/* (i) NMI pending */
 	int		extint_pending;	/* (i) INTR pending */
 	int	exception_pending;	/* (i) exception pending */
 	int	exc_vector;		/* (x) exception collateral */
 	int	exc_errcode_valid;
 	uint32_t exc_errcode;
 	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
 	void		*stats;		/* (a,i) statistics */
 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
 	uint64_t	nextrip;	/* (x) next instruction to execute */
 };
 
 #define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
 #define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
 
 struct mem_seg {
 	size_t	len;
 	bool	sysmem;
 	struct vm_object *object;
 };
 #define	VM_MAX_MEMSEGS	3
 
 struct mem_map {
 	vm_paddr_t	gpa;
 	size_t		len;
 	vm_ooffset_t	segoff;
 	int		segid;
 	int		prot;
 	int		flags;
 };
 #define	VM_MAX_MEMMAPS	4
 
 /*
  * Initialization:
  * (o) initialized the first time the VM is created
  * (i) initialized when VM is created and when it is reinitialized
  * (x) initialized before use
  */
 struct vm {
 	void		*cookie;		/* (i) cpu-specific data */
 	void		*iommu;			/* (x) iommu-specific data */
 	struct vhpet	*vhpet;			/* (i) virtual HPET */
 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
 	struct vatpic	*vatpic;		/* (i) virtual atpic */
 	struct vatpit	*vatpit;		/* (i) virtual atpit */
 	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
 	struct vrtc	*vrtc;			/* (o) virtual RTC */
 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
 	int		suspend;		/* (i) stop VM execution */
 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
 	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
 	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
 	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
 	vm_rendezvous_func_t rendezvous_func;
 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 	struct vmspace	*vmspace;		/* (o) guest's address space */
 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
 };
 
 static int vmm_initialized;
 
 static struct vmm_ops *ops;
 #define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
 #define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
 #define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
 
 #define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap) : NULL)
 #define	VMRUN(vmi, vcpu, rip, pmap, evinfo) \
 	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
 #define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
 #define	VMSPACE_ALLOC(min, max) \
 	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
 #define	VMSPACE_FREE(vmspace) \
 	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
 #define	VMGETREG(vmi, vcpu, num, retval)		\
 	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
 #define	VMSETREG(vmi, vcpu, num, val)		\
 	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
 #define	VMGETDESC(vmi, vcpu, num, desc)		\
 	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
 #define	VMSETDESC(vmi, vcpu, num, desc)		\
 	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
 #define	VMGETCAP(vmi, vcpu, num, retval)	\
 	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
 #define	VMSETCAP(vmi, vcpu, num, val)		\
 	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
 #define	VLAPIC_INIT(vmi, vcpu)			\
 	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
 #define	VLAPIC_CLEANUP(vmi, vlapic)		\
 	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
 
 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
 #define	fpu_stop_emulating()	clts()
 
 static MALLOC_DEFINE(M_VM, "vm", "vm");
 
 /* statistics */
 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 
 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
 
 /*
  * Halt the guest if all vcpus are executing a HLT instruction with
  * interrupts disabled.
  */
 static int halt_detection_enabled = 1;
 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
     &halt_detection_enabled, 0,
     "Halt VM if all vcpus execute HLT with interrupts disabled");
 
-static int vmm_ipinum;
 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
     "IPI vector used for vcpu notifications");
 
 static int trace_guest_exceptions;
 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
     &trace_guest_exceptions, 0,
     "Trap into hypervisor on all guest exceptions and reflect them back");
 
 static void vm_free_memmap(struct vm *vm, int ident);
 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
 
 #ifdef KTR
 static const char *
 vcpu_state2str(enum vcpu_state state)
 {
 
 	switch (state) {
 	case VCPU_IDLE:
 		return ("idle");
 	case VCPU_FROZEN:
 		return ("frozen");
 	case VCPU_RUNNING:
 		return ("running");
 	case VCPU_SLEEPING:
 		return ("sleeping");
 	default:
 		return ("unknown");
 	}
 }
 #endif
 
 static void
 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 {
 	struct vcpu *vcpu = &vm->vcpu[i];
 
 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
 	if (destroy) {
 		vmm_stat_free(vcpu->stats);	
 		fpu_save_area_free(vcpu->guestfpu);
 	}
 }
 
 static void
 vcpu_init(struct vm *vm, int vcpu_id, bool create)
 {
 	struct vcpu *vcpu;
 
 	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
 	  
 	vcpu = &vm->vcpu[vcpu_id];
 
 	if (create) {
 		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
 		    "initialized", vcpu_id));
 		vcpu_lock_init(vcpu);
 		vcpu->state = VCPU_IDLE;
 		vcpu->hostcpu = NOCPU;
 		vcpu->guestfpu = fpu_save_area_alloc();
 		vcpu->stats = vmm_stat_alloc();
 	}
 
 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
 	vcpu->reqidle = 0;
 	vcpu->exitintinfo = 0;
 	vcpu->nmi_pending = 0;
 	vcpu->extint_pending = 0;
 	vcpu->exception_pending = 0;
 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
 	fpu_save_area_reset(vcpu->guestfpu);
 	vmm_stat_init(vcpu->stats);
 }
 
 int
 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
 {
 
 	return (trace_guest_exceptions);
 }
 
 struct vm_exit *
 vm_exitinfo(struct vm *vm, int cpuid)
 {
 	struct vcpu *vcpu;
 
 	if (cpuid < 0 || cpuid >= VM_MAXCPU)
 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
 
 	vcpu = &vm->vcpu[cpuid];
 
 	return (&vcpu->exitinfo);
 }
 
 static void
 vmm_resume(void)
 {
 	VMM_RESUME();
 }
 
 static int
 vmm_init(void)
 {
 	int error;
 
 	vmm_host_state_init();
 
 	vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn));
 	if (vmm_ipinum < 0)
 		vmm_ipinum = IPI_AST;
 
 	error = vmm_mem_init();
 	if (error)
 		return (error);
 	
 	if (vmm_is_intel())
 		ops = &vmm_ops_intel;
 	else if (vmm_is_amd())
 		ops = &vmm_ops_amd;
 	else
 		return (ENXIO);
 
 	vmm_resume_p = vmm_resume;
 
 	return (VMM_INIT(vmm_ipinum));
 }
 
 static int
 vmm_handler(module_t mod, int what, void *arg)
 {
 	int error;
 
 	switch (what) {
 	case MOD_LOAD:
 		vmmdev_init();
 		error = vmm_init();
 		if (error == 0)
 			vmm_initialized = 1;
 		break;
 	case MOD_UNLOAD:
 		error = vmmdev_cleanup();
 		if (error == 0) {
 			vmm_resume_p = NULL;
 			iommu_cleanup();
 			if (vmm_ipinum != IPI_AST)
 				lapic_ipi_free(vmm_ipinum);
 			error = VMM_CLEANUP();
 			/*
 			 * Something bad happened - prevent new
 			 * VMs from being created
 			 */
 			if (error)
 				vmm_initialized = 0;
 		}
 		break;
 	default:
 		error = 0;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t vmm_kmod = {
 	"vmm",
 	vmm_handler,
 	NULL
 };
 
 /*
  * vmm initialization has the following dependencies:
  *
  * - VT-x initialization requires smp_rendezvous() and therefore must happen
  *   after SMP is fully functional (after SI_SUB_SMP).
  */
 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
 MODULE_VERSION(vmm, 1);
 
 static void
 vm_init(struct vm *vm, bool create)
 {
 	int i;
 
 	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
 	vm->iommu = NULL;
 	vm->vioapic = vioapic_init(vm);
 	vm->vhpet = vhpet_init(vm);
 	vm->vatpic = vatpic_init(vm);
 	vm->vatpit = vatpit_init(vm);
 	vm->vpmtmr = vpmtmr_init(vm);
 	if (create)
 		vm->vrtc = vrtc_init(vm);
 
 	CPU_ZERO(&vm->active_cpus);
 
 	vm->suspend = 0;
 	CPU_ZERO(&vm->suspended_cpus);
 
 	for (i = 0; i < VM_MAXCPU; i++)
 		vcpu_init(vm, i, create);
 }
 
 int
 vm_create(const char *name, struct vm **retvm)
 {
 	struct vm *vm;
 	struct vmspace *vmspace;
 
 	/*
 	 * If vmm.ko could not be successfully initialized then don't attempt
 	 * to create the virtual machine.
 	 */
 	if (!vmm_initialized)
 		return (ENXIO);
 
 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
 		return (EINVAL);
 
 	vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
 	if (vmspace == NULL)
 		return (ENOMEM);
 
 	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 	strcpy(vm->name, name);
 	vm->vmspace = vmspace;
 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
 
 	vm_init(vm, true);
 
 	*retvm = vm;
 	return (0);
 }
 
 static void
 vm_cleanup(struct vm *vm, bool destroy)
 {
 	struct mem_map *mm;
 	int i;
 
 	ppt_unassign_all(vm);
 
 	if (vm->iommu != NULL)
 		iommu_destroy_domain(vm->iommu);
 
 	if (destroy)
 		vrtc_cleanup(vm->vrtc);
 	else
 		vrtc_reset(vm->vrtc);
 	vpmtmr_cleanup(vm->vpmtmr);
 	vatpit_cleanup(vm->vatpit);
 	vhpet_cleanup(vm->vhpet);
 	vatpic_cleanup(vm->vatpic);
 	vioapic_cleanup(vm->vioapic);
 
 	for (i = 0; i < VM_MAXCPU; i++)
 		vcpu_cleanup(vm, i, destroy);
 
 	VMCLEANUP(vm->cookie);
 
 	/*
 	 * System memory is removed from the guest address space only when
 	 * the VM is destroyed. This is because the mapping remains the same
 	 * across VM reset.
 	 *
 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
 	 * so those mappings are removed on a VM reset.
 	 */
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (destroy || !sysmem_mapping(vm, mm))
 			vm_free_memmap(vm, i);
 	}
 
 	if (destroy) {
 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
 			vm_free_memseg(vm, i);
 
 		VMSPACE_FREE(vm->vmspace);
 		vm->vmspace = NULL;
 	}
 }
 
 void
 vm_destroy(struct vm *vm)
 {
 	vm_cleanup(vm, true);
 	free(vm, M_VM);
 }
 
 int
 vm_reinit(struct vm *vm)
 {
 	int error;
 
 	/*
 	 * A virtual machine can be reset only if all vcpus are suspended.
 	 */
 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 		vm_cleanup(vm, false);
 		vm_init(vm, false);
 		error = 0;
 	} else {
 		error = EBUSY;
 	}
 
 	return (error);
 }
 
 const char *
 vm_name(struct vm *vm)
 {
 	return (vm->name);
 }
 
 int
 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 {
 	vm_object_t obj;
 
 	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
 		return (ENOMEM);
 	else
 		return (0);
 }
 
 int
 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 {
 
 	vmm_mmio_free(vm->vmspace, gpa, len);
 	return (0);
 }
 
 /*
  * Return 'true' if 'gpa' is allocated in the guest address space.
  *
  * This function is called in the context of a running vcpu which acts as
  * an implicit lock on 'vm->mem_maps[]'.
  */
 bool
 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
 {
 	struct mem_map *mm;
 	int i;
 
 #ifdef INVARIANTS
 	int hostcpu, state;
 	state = vcpu_get_state(vm, vcpuid, &hostcpu);
 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
 #endif
 
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
 			return (true);		/* 'gpa' is sysmem or devmem */
 	}
 
 	if (ppt_is_mmio(vm, gpa))
 		return (true);			/* 'gpa' is pci passthru mmio */
 
 	return (false);
 }
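
The lookup above treats each memory map entry as a half-open interval [gpa, gpa + len). A tiny sketch of that containment test with hypothetical types:

#include <stdbool.h>
#include <stdint.h>

/* Does 'addr' fall inside the half-open range [base, base + len)? */
static bool
range_contains(uint64_t base, uint64_t len, uint64_t addr)
{
	/* Subtracting first avoids any overflow in 'base + len'. */
	return (len != 0 && addr >= base && addr - base < len);
}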
 
 int
 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
 {
 	struct mem_seg *seg;
 	vm_object_t obj;
 
 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 
 	if (len == 0 || (len & PAGE_MASK))
 		return (EINVAL);
 
 	seg = &vm->mem_segs[ident];
 	if (seg->object != NULL) {
 		if (seg->len == len && seg->sysmem == sysmem)
 			return (EEXIST);
 		else
 			return (EINVAL);
 	}
 
 	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
 	if (obj == NULL)
 		return (ENOMEM);
 
 	seg->len = len;
 	seg->object = obj;
 	seg->sysmem = sysmem;
 	return (0);
 }
 
 int
 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
     vm_object_t *objptr)
 {
 	struct mem_seg *seg;
 
 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 
 	seg = &vm->mem_segs[ident];
 	if (len)
 		*len = seg->len;
 	if (sysmem)
 		*sysmem = seg->sysmem;
 	if (objptr)
 		*objptr = seg->object;
 	return (0);
 }
 
 void
 vm_free_memseg(struct vm *vm, int ident)
 {
 	struct mem_seg *seg;
 
 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
 	    ("%s: invalid memseg ident %d", __func__, ident));
 
 	seg = &vm->mem_segs[ident];
 	if (seg->object != NULL) {
 		vm_object_deallocate(seg->object);
 		bzero(seg, sizeof(struct mem_seg));
 	}
 }
 
 int
 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
     size_t len, int prot, int flags)
 {
 	struct mem_seg *seg;
 	struct mem_map *m, *map;
 	vm_ooffset_t last;
 	int i, error;
 
 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
 		return (EINVAL);
 
 	if (flags & ~VM_MEMMAP_F_WIRED)
 		return (EINVAL);
 
 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 
 	seg = &vm->mem_segs[segid];
 	if (seg->object == NULL)
 		return (EINVAL);
 
 	last = first + len;
 	if (first < 0 || first >= last || last > seg->len)
 		return (EINVAL);
 
 	if ((gpa | first | last) & PAGE_MASK)
 		return (EINVAL);
 
 	map = NULL;
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		m = &vm->mem_maps[i];
 		if (m->len == 0) {
 			map = m;
 			break;
 		}
 	}
 
 	if (map == NULL)
 		return (ENOSPC);
 
 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
 	if (error != KERN_SUCCESS)
 		return (EFAULT);
 
 	vm_object_reference(seg->object);
 
 	if (flags & VM_MEMMAP_F_WIRED) {
 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 		if (error != KERN_SUCCESS) {
 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
 			return (EFAULT);
 		}
 	}
 
 	map->gpa = gpa;
 	map->len = len;
 	map->segoff = first;
 	map->segid = segid;
 	map->prot = prot;
 	map->flags = flags;
 	return (0);
 }
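
The alignment test above ORs the guest physical address, the segment offset, and the end offset together before masking, so a single comparison rejects the mapping if any of the three is not page aligned. A small illustration of the idiom, assuming a 4 KB page size:

#include <stdbool.h>
#include <stdint.h>

#define	EXAMPLE_PAGE_SIZE	4096UL
#define	EXAMPLE_PAGE_MASK	(EXAMPLE_PAGE_SIZE - 1)

/*
 * OR-ing the values merges their low-order bits, so one mask test
 * catches a misalignment in any of them.
 */
static bool
all_page_aligned(uint64_t gpa, uint64_t first, uint64_t last)
{
	return (((gpa | first | last) & EXAMPLE_PAGE_MASK) == 0);
}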
 
 int
 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
 {
 	struct mem_map *mm, *mmnext;
 	int i;
 
 	mmnext = NULL;
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (mm->len == 0 || mm->gpa < *gpa)
 			continue;
 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
 			mmnext = mm;
 	}
 
 	if (mmnext != NULL) {
 		*gpa = mmnext->gpa;
 		if (segid)
 			*segid = mmnext->segid;
 		if (segoff)
 			*segoff = mmnext->segoff;
 		if (len)
 			*len = mmnext->len;
 		if (prot)
 			*prot = mmnext->prot;
 		if (flags)
 			*flags = mmnext->flags;
 		return (0);
 	} else {
 		return (ENOENT);
 	}
 }
 
 static void
 vm_free_memmap(struct vm *vm, int ident)
 {
 	struct mem_map *mm;
 	int error;
 
 	mm = &vm->mem_maps[ident];
 	if (mm->len) {
 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
 		    mm->gpa + mm->len);
 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
 		    __func__, error));
 		bzero(mm, sizeof(struct mem_map));
 	}
 }
 
 static __inline bool
 sysmem_mapping(struct vm *vm, struct mem_map *mm)
 {
 
 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
 		return (true);
 	else
 		return (false);
 }
 
 static vm_paddr_t
 sysmem_maxaddr(struct vm *vm)
 {
 	struct mem_map *mm;
 	vm_paddr_t maxaddr;
 	int i;
 
 	maxaddr = 0;
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (sysmem_mapping(vm, mm)) {
 			if (maxaddr < mm->gpa + mm->len)
 				maxaddr = mm->gpa + mm->len;
 		}
 	}
 	return (maxaddr);
 }
 
 static void
 vm_iommu_modify(struct vm *vm, boolean_t map)
 {
 	int i, sz;
 	vm_paddr_t gpa, hpa;
 	struct mem_map *mm;
 	void *vp, *cookie, *host_domain;
 
 	sz = PAGE_SIZE;
 	host_domain = iommu_host_domain();
 
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (!sysmem_mapping(vm, mm))
 			continue;
 
 		if (map) {
 			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
 			    ("iommu map found invalid memmap %#lx/%#lx/%#x",
 			    mm->gpa, mm->len, mm->flags));
 			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
 				continue;
 			mm->flags |= VM_MEMMAP_F_IOMMU;
 		} else {
 			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
 				continue;
 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
 			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
 			    ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
 			    mm->gpa, mm->len, mm->flags));
 		}
 
 		gpa = mm->gpa;
 		while (gpa < mm->gpa + mm->len) {
 			vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
 					 &cookie);
 			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
 			    vm_name(vm), gpa));
 
 			vm_gpa_release(cookie);
 
 			hpa = DMAP_TO_PHYS((uintptr_t)vp);
 			if (map) {
 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
 				iommu_remove_mapping(host_domain, hpa, sz);
 			} else {
 				iommu_remove_mapping(vm->iommu, gpa, sz);
 				iommu_create_mapping(host_domain, hpa, hpa, sz);
 			}
 
 			gpa += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * Invalidate the cached translations associated with the domain
 	 * from which pages were removed.
 	 */
 	if (map)
 		iommu_invalidate_tlb(host_domain);
 	else
 		iommu_invalidate_tlb(vm->iommu);
 }
 
 #define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
 #define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
 
 int
 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 	int error;
 
 	error = ppt_unassign_device(vm, bus, slot, func);
 	if (error)
 		return (error);
 
 	if (ppt_assigned_devices(vm) == 0)
 		vm_iommu_unmap(vm);
 
 	return (0);
 }
 
 int
 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 	int error;
 	vm_paddr_t maxaddr;
 
 	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
 	if (ppt_assigned_devices(vm) == 0) {
 		KASSERT(vm->iommu == NULL,
 		    ("vm_assign_pptdev: iommu must be NULL"));
 		maxaddr = sysmem_maxaddr(vm);
 		vm->iommu = iommu_create_domain(maxaddr);
 		if (vm->iommu == NULL)
 			return (ENXIO);
 		vm_iommu_map(vm);
 	}
 
 	error = ppt_assign_device(vm, bus, slot, func);
 	return (error);
 }
 
 void *
 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
 	    void **cookie)
 {
 	int i, count, pageoff;
 	struct mem_map *mm;
 	vm_page_t m;
 #ifdef INVARIANTS
 	/*
 	 * All vcpus are frozen by ioctls that modify the memory map
 	 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->mem_maps[]' stability is
 	 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
 	 */
 	int state;
 	KASSERT(vcpuid >= -1 && vcpuid < VM_MAXCPU, ("%s: invalid vcpuid %d",
 	    __func__, vcpuid));
 	for (i = 0; i < VM_MAXCPU; i++) {
 		if (vcpuid != -1 && vcpuid != i)
 			continue;
 		state = vcpu_get_state(vm, i, NULL);
 		KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
 		    __func__, state));
 	}
 #endif
 	pageoff = gpa & PAGE_MASK;
 	if (len > PAGE_SIZE - pageoff)
 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
 
 	count = 0;
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
 		    gpa < mm->gpa + mm->len) {
 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
 			break;
 		}
 	}
 
 	if (count == 1) {
 		*cookie = m;
 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
 	} else {
 		*cookie = NULL;
 		return (NULL);
 	}
 }
 
 void
 vm_gpa_release(void *cookie)
 {
 	vm_page_t m = cookie;
 
 	vm_page_lock(m);
 	vm_page_unhold(m);
 	vm_page_unlock(m);
 }
 
 int
 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	return (VMGETREG(vm->cookie, vcpu, reg, retval));
 }
 
 int
 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
 {
 	struct vcpu *vcpu;
 	int error;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	error = VMSETREG(vm->cookie, vcpuid, reg, val);
 	if (error || reg != VM_REG_GUEST_RIP)
 		return (error);
 
 	/* Set 'nextrip' to match the value of %rip */
 	VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu->nextrip = val;
 	return (0);
 }
 
 static boolean_t
 is_descriptor_table(int reg)
 {
 
 	switch (reg) {
 	case VM_REG_GUEST_IDTR:
 	case VM_REG_GUEST_GDTR:
 		return (TRUE);
 	default:
 		return (FALSE);
 	}
 }
 
 static boolean_t
 is_segment_register(int reg)
 {
 	
 	switch (reg) {
 	case VM_REG_GUEST_ES:
 	case VM_REG_GUEST_CS:
 	case VM_REG_GUEST_SS:
 	case VM_REG_GUEST_DS:
 	case VM_REG_GUEST_FS:
 	case VM_REG_GUEST_GS:
 	case VM_REG_GUEST_TR:
 	case VM_REG_GUEST_LDTR:
 		return (TRUE);
 	default:
 		return (FALSE);
 	}
 }
 
 int
 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 		struct seg_desc *desc)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
 		return (EINVAL);
 
 	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
 }
 
 int
 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 		struct seg_desc *desc)
 {
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
 		return (EINVAL);
 
 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
 }
 
 static void
 restore_guest_fpustate(struct vcpu *vcpu)
 {
 
 	/* flush host state to the pcb */
 	fpuexit(curthread);
 
 	/* restore guest FPU state */
 	fpu_stop_emulating();
 	fpurestore(vcpu->guestfpu);
 
 	/* restore guest XCR0 if XSAVE is enabled in the host */
 	if (rcr4() & CR4_XSAVE)
 		load_xcr(0, vcpu->guest_xcr0);
 
 	/*
 	 * The FPU is now "dirty" with the guest's state so turn on emulation
 	 * to trap any access to the FPU by the host.
 	 */
 	fpu_start_emulating();
 }
 
 static void
 save_guest_fpustate(struct vcpu *vcpu)
 {
 
 	if ((rcr0() & CR0_TS) == 0)
 		panic("fpu emulation not enabled in host!");
 
 	/* save guest XCR0 and restore host XCR0 */
 	if (rcr4() & CR4_XSAVE) {
 		vcpu->guest_xcr0 = rxcr(0);
 		load_xcr(0, vmm_get_host_xcr0());
 	}
 
 	/* save guest FPU state */
 	fpu_stop_emulating();
 	fpusave(vcpu->guestfpu);
 	fpu_start_emulating();
 }
 
 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
 
 static int
 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
     bool from_idle)
 {
 	struct vcpu *vcpu;
 	int error;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu_assert_locked(vcpu);
 
 	/*
 	 * State transitions from the vmmdev_ioctl() must always begin from
 	 * the VCPU_IDLE state. This guarantees that there is only a single
 	 * ioctl() operating on a vcpu at any point.
 	 */
 	if (from_idle) {
 		while (vcpu->state != VCPU_IDLE) {
 			vcpu->reqidle = 1;
 			vcpu_notify_event_locked(vcpu, false);
 			VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
 			    "idle requested", vcpu_state2str(vcpu->state));
 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
 		}
 	} else {
 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
 		    "vcpu idle state"));
 	}
 
 	if (vcpu->state == VCPU_RUNNING) {
 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
 	} else {
 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
 		    "vcpu that is not running", vcpu->hostcpu));
 	}
 
 	/*
 	 * The following state transitions are allowed:
 	 * IDLE -> FROZEN -> IDLE
 	 * FROZEN -> RUNNING -> FROZEN
 	 * FROZEN -> SLEEPING -> FROZEN
 	 */
 	switch (vcpu->state) {
 	case VCPU_IDLE:
 	case VCPU_RUNNING:
 	case VCPU_SLEEPING:
 		error = (newstate != VCPU_FROZEN);
 		break;
 	case VCPU_FROZEN:
 		error = (newstate == VCPU_FROZEN);
 		break;
 	default:
 		error = 1;
 		break;
 	}
 
 	if (error)
 		return (EBUSY);
 
 	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
 	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
 
 	vcpu->state = newstate;
 	if (newstate == VCPU_RUNNING)
 		vcpu->hostcpu = curcpu;
 	else
 		vcpu->hostcpu = NOCPU;
 
 	if (newstate == VCPU_IDLE)
 		wakeup(&vcpu->state);
 
 	return (0);
 }
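
The switch above encodes the transition diagram from the comment: FROZEN is the hub state, and every legal edge either enters or leaves it. A hedged sketch of the same rule as a standalone predicate (the enum constants are illustrative stand-ins for the kernel's vcpu_state values):

#include <stdbool.h>

enum example_vcpu_state {
	EX_IDLE,
	EX_FROZEN,
	EX_RUNNING,
	EX_SLEEPING,
};

/*
 * Legal transitions:
 *	IDLE   -> FROZEN -> IDLE
 *	FROZEN -> RUNNING -> FROZEN
 *	FROZEN -> SLEEPING -> FROZEN
 * i.e. from FROZEN any other state may be entered, and from any other
 * state the only legal destination is FROZEN.
 */
static bool
vcpu_transition_ok(enum example_vcpu_state cur, enum example_vcpu_state next)
{
	if (cur == EX_FROZEN)
		return (next != EX_FROZEN);
	return (next == EX_FROZEN);
}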
 
 static void
 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
 		panic("Error %d setting state to %d\n", error, newstate);
 }
 
 static void
 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
 		panic("Error %d setting state to %d", error, newstate);
 }
 
 static void
 vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
 {
 
 	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
 
 	/*
 	 * Update 'rendezvous_func' and execute a write memory barrier to
 	 * ensure that it is visible across all host cpus. This is not needed
 	 * for correctness but it does ensure that all the vcpus will notice
 	 * that the rendezvous is requested immediately.
 	 */
 	vm->rendezvous_func = func;
 	wmb();
 }
 
 #define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
 	do {								\
 		if (vcpuid >= 0)					\
 			VCPU_CTR0(vm, vcpuid, fmt);			\
 		else							\
 			VM_CTR0(vm, fmt);				\
 	} while (0)
 
 static void
 vm_handle_rendezvous(struct vm *vm, int vcpuid)
 {
 
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
 	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
 
 	mtx_lock(&vm->rendezvous_mtx);
 	while (vm->rendezvous_func != NULL) {
 		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
 		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
 
 		if (vcpuid != -1 &&
 		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
 		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
 			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
 			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
 			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
 		}
 		if (CPU_CMP(&vm->rendezvous_req_cpus,
 		    &vm->rendezvous_done_cpus) == 0) {
 			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
 			vm_set_rendezvous_func(vm, NULL);
 			wakeup(&vm->rendezvous_func);
 			break;
 		}
 		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
 		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
 		    "vmrndv", 0);
 	}
 	mtx_unlock(&vm->rendezvous_mtx);
 }
 
 /*
  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
  */
 static int
 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
 {
 	struct vcpu *vcpu;
 	const char *wmesg;
 	int t, vcpu_halted, vm_halted;
 
 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
 
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu_halted = 0;
 	vm_halted = 0;
 
 	vcpu_lock(vcpu);
 	while (1) {
 		/*
 		 * Do a final check for pending NMI or interrupts before
 		 * really putting this thread to sleep. Also check for
 		 * software events that would cause this vcpu to wakeup.
 		 *
 		 * These interrupts/events could have happened after the
 		 * vcpu returned from VMRUN() and before it acquired the
 		 * vcpu lock above.
 		 */
 		if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
 			break;
 		if (vm_nmi_pending(vm, vcpuid))
 			break;
 		if (!intr_disabled) {
 			if (vm_extint_pending(vm, vcpuid) ||
 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
 				break;
 			}
 		}
 
 		/* Don't go to sleep if the vcpu thread needs to yield */
 		if (vcpu_should_yield(vm, vcpuid))
 			break;
 
 		/*
 		 * Some Linux guests implement "halt" by having all vcpus
 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
 		 * track of the vcpus that have entered this state. When all
 		 * vcpus enter the halted state the virtual machine is halted.
 		 */
 		if (intr_disabled) {
 			wmesg = "vmhalt";
 			VCPU_CTR0(vm, vcpuid, "Halted");
 			if (!vcpu_halted && halt_detection_enabled) {
 				vcpu_halted = 1;
 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
 			}
 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
 				vm_halted = 1;
 				break;
 			}
 		} else {
 			wmesg = "vmidle";
 		}
 
 		t = ticks;
 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
 		/*
 		 * XXX msleep_spin() cannot be interrupted by signals so
 		 * wake up periodically to check pending signals.
 		 */
 		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
 		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
 	}
 
 	if (vcpu_halted)
 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
 
 	vcpu_unlock(vcpu);
 
 	if (vm_halted)
 		vm_suspend(vm, VM_SUSPEND_HALT);
 
 	return (0);
 }
 
 static int
 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
 {
 	int rv, ftype;
 	struct vm_map *map;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
 	    __func__, vme->inst_length));
 
 	ftype = vme->u.paging.fault_type;
 	KASSERT(ftype == VM_PROT_READ ||
 	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
 	    ("vm_handle_paging: invalid fault_type %d", ftype));
 
 	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
 		    vme->u.paging.gpa, ftype);
 		if (rv == 0) {
 			VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
 			    ftype == VM_PROT_READ ? "accessed" : "dirty",
 			    vme->u.paging.gpa);
 			goto done;
 		}
 	}
 
 	map = &vm->vmspace->vm_map;
 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
 
 	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
 	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
 
 	if (rv != KERN_SUCCESS)
 		return (EFAULT);
 done:
 	return (0);
 }
 
 static int
 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vie *vie;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
 	uint64_t gla, gpa, cs_base;
 	struct vm_guest_paging *paging;
 	mem_region_read_t mread;
 	mem_region_write_t mwrite;
 	enum vm_cpu_mode cpu_mode;
 	int cs_d, error, fault;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
 	    __func__, vme->inst_length));
 
 	gla = vme->u.inst_emul.gla;
 	gpa = vme->u.inst_emul.gpa;
 	cs_base = vme->u.inst_emul.cs_base;
 	cs_d = vme->u.inst_emul.cs_d;
 	vie = &vme->u.inst_emul.vie;
 	paging = &vme->u.inst_emul.paging;
 	cpu_mode = paging->cpu_mode;
 
 	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
 
 	/* Fetch, decode and emulate the faulting instruction */
 	if (vie->num_valid == 0) {
 		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
 		    cs_base, VIE_INST_SIZE, vie, &fault);
 	} else {
 		/*
 		 * The instruction bytes have already been copied into 'vie'
 		 */
 		error = fault = 0;
 	}
 	if (error || fault)
 		return (error);
 
 	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
 		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
 		    vme->rip + cs_base);
 		*retu = true;	    /* dump instruction bytes in userspace */
 		return (0);
 	}
 
 	/*
 	 * Update 'nextrip' based on the length of the emulated instruction.
 	 */
 	vme->inst_length = vie->num_processed;
 	vcpu->nextrip += vie->num_processed;
 	VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
 	    "decoding", vcpu->nextrip);
  
 	/* return to userland unless this is an in-kernel emulated device */
 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
 		mread = lapic_mmio_read;
 		mwrite = lapic_mmio_write;
 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
 		mread = vioapic_mmio_read;
 		mwrite = vioapic_mmio_write;
 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
 		mread = vhpet_mmio_read;
 		mwrite = vhpet_mmio_write;
 	} else {
 		*retu = true;
 		return (0);
 	}
 
 	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
 	    mread, mwrite, retu);
 
 	return (error);
 }
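 
 /*
  * Note (illustrative, not part of this file): the three GPA ranges checked
  * above cover the devices emulated in the kernel -- the local APIC page,
  * the I/O APIC window and the HPET registers.  A faulting access to any
  * other GPA, e.g. a device BAR emulated by the bhyve process, is handed
  * back to userspace by setting 'retu'.
  */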
 
 static int
 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
 {
 	int i, done;
 	struct vcpu *vcpu;
 
 	done = 0;
 	vcpu = &vm->vcpu[vcpuid];
 
 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
 
 	/*
 	 * Wait until all 'active_cpus' have suspended themselves.
 	 *
 	 * Since a VM may be suspended at any time including when one or
 	 * more vcpus are doing a rendezvous we need to call the rendezvous
 	 * handler while we are waiting to prevent a deadlock.
 	 */
 	vcpu_lock(vcpu);
 	while (1) {
 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
 			break;
 		}
 
 		if (vm->rendezvous_func == NULL) {
 			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
 			vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
 			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
 			vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
 		} else {
 			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
 			vcpu_unlock(vcpu);
 			vm_handle_rendezvous(vm, vcpuid);
 			vcpu_lock(vcpu);
 		}
 	}
 	vcpu_unlock(vcpu);
 
 	/*
 	 * Wakeup the other sleeping vcpus and return to userspace.
 	 */
 	for (i = 0; i < VM_MAXCPU; i++) {
 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
 			vcpu_notify_event(vm, i, false);
 		}
 	}
 
 	*retu = true;
 	return (0);
 }
 
 static int
 vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
 	vcpu->reqidle = 0;
 	vcpu_unlock(vcpu);
 	*retu = true;
 	return (0);
 }
 
 int
 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 {
 	int i;
 
 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
 		return (EINVAL);
 
 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
 		    vm->suspend, how);
 		return (EALREADY);
 	}
 
 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
 
 	/*
 	 * Notify all active vcpus that they are now suspended.
 	 */
 	for (i = 0; i < VM_MAXCPU; i++) {
 		if (CPU_ISSET(i, &vm->active_cpus))
 			vcpu_notify_event(vm, i, false);
 	}
 
 	return (0);
 }
 
 void
 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vmexit;
 
 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
 
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->rip = rip;
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
 	vmexit->u.suspended.how = vm->suspend;
 }
 
 void
 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vmexit;
 
 	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
 
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->rip = rip;
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
 }
 
 void
 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vmexit;
 
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->rip = rip;
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_REQIDLE;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
 }
 
 void
 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vmexit;
 
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->rip = rip;
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_BOGUS;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
 }
 
 int
 vm_run(struct vm *vm, struct vm_run *vmrun)
 {
 	struct vm_eventinfo evinfo;
 	int error, vcpuid;
 	struct vcpu *vcpu;
 	struct pcb *pcb;
 	uint64_t tscval;
 	struct vm_exit *vme;
 	bool retu, intr_disabled;
 	pmap_t pmap;
 
 	vcpuid = vmrun->cpuid;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 		return (EINVAL);
 
 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
 		return (EINVAL);
 
 	pmap = vmspace_pmap(vm->vmspace);
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 	evinfo.rptr = &vm->rendezvous_func;
 	evinfo.sptr = &vm->suspend;
 	evinfo.iptr = &vcpu->reqidle;
 restart:
 	critical_enter();
 
 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 	    ("vm_run: absurd pm_active"));
 
 	tscval = rdtsc();
 
 	pcb = PCPU_GET(curpcb);
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 
 	restore_guest_fpustate(vcpu);
 
 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
 	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
 	save_guest_fpustate(vcpu);
 
 	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
 
 	critical_exit();
 
 	if (error == 0) {
 		retu = false;
 		vcpu->nextrip = vme->rip + vme->inst_length;
 		switch (vme->exitcode) {
 		case VM_EXITCODE_REQIDLE:
 			error = vm_handle_reqidle(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_SUSPENDED:
 			error = vm_handle_suspend(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_IOAPIC_EOI:
 			vioapic_process_eoi(vm, vcpuid,
 			    vme->u.ioapic_eoi.vector);
 			break;
 		case VM_EXITCODE_RENDEZVOUS:
 			vm_handle_rendezvous(vm, vcpuid);
 			error = 0;
 			break;
 		case VM_EXITCODE_HLT:
 			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
 			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
 			break;
 		case VM_EXITCODE_PAGING:
 			error = vm_handle_paging(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INST_EMUL:
 			error = vm_handle_inst_emul(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INOUT:
 		case VM_EXITCODE_INOUT_STR:
 			error = vm_handle_inout(vm, vcpuid, vme, &retu);
 			break;
 		case VM_EXITCODE_MONITOR:
 		case VM_EXITCODE_MWAIT:
 			vm_inject_ud(vm, vcpuid);
 			break;
 		default:
 			retu = true;	/* handled in userland */
 			break;
 		}
 	}
 
 	if (error == 0 && retu == false)
 		goto restart;
 
 	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
 
 	/* copy the exit information */
 	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
 	return (error);
 }
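 
 /*
  * A minimal sketch (not part of this file) of how a caller drives vm_run():
  * the loop above stays in the kernel until an exit must be completed
  * outside it ('retu'), at which point the exit reason is in 'vm_exit'.
  * 'vm' is the usual struct vm pointer and 'handle_userspace_exit' is a
  * hypothetical helper that returns 0 to resume the vcpu.
  */
 #if 0	/* example only, not compiled */
 	struct vm_run vmrun;
 	int error;
 
 	vmrun.cpuid = 0;
 	do {
 		error = vm_run(vm, &vmrun);
 	} while (error == 0 && handle_userspace_exit(&vmrun.vm_exit) == 0);
 #endif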
 
 int
 vm_restart_instruction(void *arg, int vcpuid)
 {
 	struct vm *vm;
 	struct vcpu *vcpu;
 	enum vcpu_state state;
 	uint64_t rip;
 	int error;
 
 	vm = arg;
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 	state = vcpu_get_state(vm, vcpuid, NULL);
 	if (state == VCPU_RUNNING) {
 		/*
 		 * When a vcpu is "running" the next instruction is determined
 		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
 		 * Thus setting 'inst_length' to zero will cause the current
 		 * instruction to be restarted.
 		 */
 		vcpu->exitinfo.inst_length = 0;
 		VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
 		    "setting inst_length to zero", vcpu->exitinfo.rip);
 	} else if (state == VCPU_FROZEN) {
 		/*
 		 * When a vcpu is "frozen" it is outside the critical section
 		 * around VMRUN() and 'nextrip' points to the next instruction.
 		 * Thus instruction restart is achieved by setting 'nextrip'
 		 * to the vcpu's %rip.
 		 */
 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
 		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
 		VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
 		    "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
 		vcpu->nextrip = rip;
 	} else {
 		panic("%s: invalid state %d", __func__, state);
 	}
 	return (0);
 }
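 
 /*
  * Worked example (illustrative, not part of this file): a frozen vcpu that
  * exited on a 3-byte instruction at %rip 0x1000 has 'nextrip' == 0x1003.
  * vm_restart_instruction() reads %rip (still 0x1000) and rewinds 'nextrip'
  * to it, so the next VMRUN() re-executes the same instruction.
  */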
 
 int
 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
 {
 	struct vcpu *vcpu;
 	int type, vector;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (info & VM_INTINFO_VALID) {
 		type = info & VM_INTINFO_TYPE;
 		vector = info & 0xff;
 		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
 			return (EINVAL);
 		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
 			return (EINVAL);
 		if (info & VM_INTINFO_RSVD)
 			return (EINVAL);
 	} else {
 		info = 0;
 	}
 	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
 	vcpu->exitintinfo = info;
 	return (0);
 }
 
 enum exc_class {
 	EXC_BENIGN,
 	EXC_CONTRIBUTORY,
 	EXC_PAGEFAULT
 };
 
 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
 
 static enum exc_class
 exception_class(uint64_t info)
 {
 	int type, vector;
 
 	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
 	type = info & VM_INTINFO_TYPE;
 	vector = info & 0xff;
 
 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
 	switch (type) {
 	case VM_INTINFO_HWINTR:
 	case VM_INTINFO_SWINTR:
 	case VM_INTINFO_NMI:
 		return (EXC_BENIGN);
 	default:
 		/*
 		 * Hardware exception.
 		 *
 		 * SVM and VT-x use identical type values to represent NMI,
 		 * hardware interrupt and software interrupt.
 		 *
 		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
 		 * for exceptions except #BP and #OF. #BP and #OF use a type
 		 * value of '5' or '6'. Therefore we don't check for explicit
 		 * values of 'type' to classify 'intinfo' into a hardware
 		 * exception.
 		 */
 		break;
 	}
 
 	switch (vector) {
 	case IDT_PF:
 	case IDT_VE:
 		return (EXC_PAGEFAULT);
 	case IDT_DE:
 	case IDT_TS:
 	case IDT_NP:
 	case IDT_SS:
 	case IDT_GP:
 		return (EXC_CONTRIBUTORY);
 	default:
 		return (EXC_BENIGN);
 	}
 }
 
 static int
 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
     uint64_t *retinfo)
 {
 	enum exc_class exc1, exc2;
 	int type1, vector1;
 
 	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
 	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
 
 	/*
 	 * If an exception occurs while attempting to call the double-fault
 	 * handler the processor enters shutdown mode (aka triple fault).
 	 */
 	type1 = info1 & VM_INTINFO_TYPE;
 	vector1 = info1 & 0xff;
 	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
 		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
 		    info1, info2);
 		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
 		*retinfo = 0;
 		return (0);
 	}
 
 	/*
 	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
 	 */
 	exc1 = exception_class(info1);
 	exc2 = exception_class(info2);
 	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
 	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
 		/* Convert nested fault into a double fault. */
 		*retinfo = IDT_DF;
 		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 		*retinfo |= VM_INTINFO_DEL_ERRCODE;
 	} else {
 		/* Handle exceptions serially */
 		*retinfo = info2;
 	}
 	return (1);
 }
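 
 /*
  * Worked example (illustrative, not part of this file): a #GP raised while
  * delivering a #PF matches the second condition above (page-fault class
  * followed by a non-benign exception) and is folded into a #DF:
  *
  *   info1 (being delivered): VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION | IDT_PF
  *   info2 (new exception)  : VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION | IDT_GP
  *   *retinfo               : VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION |
  *                            VM_INTINFO_DEL_ERRCODE | IDT_DF
  */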
 
 static uint64_t
 vcpu_exception_intinfo(struct vcpu *vcpu)
 {
 	uint64_t info = 0;
 
 	if (vcpu->exception_pending) {
 		info = vcpu->exc_vector & 0xff;
 		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 		if (vcpu->exc_errcode_valid) {
 			info |= VM_INTINFO_DEL_ERRCODE;
 			info |= (uint64_t)vcpu->exc_errcode << 32;
 		}
 	}
 	return (info);
 }
 
 int
 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
 {
 	struct vcpu *vcpu;
 	uint64_t info1, info2;
 	int valid;
 
 	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	info1 = vcpu->exitintinfo;
 	vcpu->exitintinfo = 0;
 
 	info2 = 0;
 	if (vcpu->exception_pending) {
 		info2 = vcpu_exception_intinfo(vcpu);
 		vcpu->exception_pending = 0;
 		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
 		    vcpu->exc_vector, info2);
 	}
 
 	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
 		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
 	} else if (info1 & VM_INTINFO_VALID) {
 		*retinfo = info1;
 		valid = 1;
 	} else if (info2 & VM_INTINFO_VALID) {
 		*retinfo = info2;
 		valid = 1;
 	} else {
 		valid = 0;
 	}
 
 	if (valid) {
 		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
 		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
 	}
 
 	return (valid);
 }
 
 int
 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 	*info1 = vcpu->exitintinfo;
 	*info2 = vcpu_exception_intinfo(vcpu);
 	return (0);
 }
 
 int
 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
     uint32_t errcode, int restart_instruction)
 {
 	struct vcpu *vcpu;
 	uint64_t regval;
 	int error;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (vector < 0 || vector >= 32)
 		return (EINVAL);
 
 	/*
 	 * A double fault exception should never be injected directly into
 	 * the guest. It is a derived exception that results from specific
 	 * combinations of nested faults.
 	 */
 	if (vector == IDT_DF)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->exception_pending) {
 		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
 		    "pending exception %d", vector, vcpu->exc_vector);
 		return (EBUSY);
 	}
 
 	if (errcode_valid) {
 		/*
 		 * Exceptions don't deliver an error code in real mode.
 		 */
 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
 		KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
 		if (!(regval & CR0_PE))
 			errcode_valid = 0;
 	}
 
 	/*
 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
 	 *
 	 * Event blocking by "STI" or "MOV SS" is cleared after the guest executes
 	 * one instruction or incurs an exception.
 	 */
 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
 	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
 	    __func__, error));
 
 	if (restart_instruction)
 		vm_restart_instruction(vm, vcpuid);
 
 	vcpu->exception_pending = 1;
 	vcpu->exc_vector = vector;
 	vcpu->exc_errcode = errcode;
 	vcpu->exc_errcode_valid = errcode_valid;
 	VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
 	return (0);
 }
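 
 /*
  * Usage sketch (illustrative, not part of this file): queue a #UD for the
  * vcpu; passing a non-zero 'restart_instruction' makes the exception be
  * reported at the instruction that raised it rather than at the following
  * one.
  */
 #if 0	/* example only, not compiled */
 	error = vm_inject_exception(vm, vcpuid, IDT_UD, 0, 0, 1);
 #endif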
 
 void
 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
     int errcode)
 {
 	struct vm *vm;
 	int error, restart_instruction;
 
 	vm = vmarg;
 	restart_instruction = 1;
 
 	error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
 	    errcode, restart_instruction);
 	KASSERT(error == 0, ("vm_inject_exception error %d", error));
 }
 
 void
 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
 {
 	struct vm *vm;
 	int error;
 
 	vm = vmarg;
 	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
 	    error_code, cr2);
 
 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
 	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
 
 	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
 }
 
 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
 
 int
 vm_inject_nmi(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu->nmi_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 	return (0);
 }
 
 int
 vm_nmi_pending(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	return (vcpu->nmi_pending);
 }
 
 void
 vm_nmi_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->nmi_pending == 0)
 		panic("vm_nmi_clear: inconsistent nmi_pending state");
 
 	vcpu->nmi_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
 }
 
 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
 
 int
 vm_inject_extint(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu->extint_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 	return (0);
 }
 
 int
 vm_extint_pending(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	return (vcpu->extint_pending);
 }
 
 void
 vm_extint_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->extint_pending == 0)
 		panic("vm_extint_clear: inconsistent extint_pending state");
 
 	vcpu->extint_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
 }
 
 int
 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
 {
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (VMGETCAP(vm->cookie, vcpu, type, retval));
 }
 
 int
 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
 {
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (VMSETCAP(vm->cookie, vcpu, type, val));
 }
 
 struct vlapic *
 vm_lapic(struct vm *vm, int cpu)
 {
 	return (vm->vcpu[cpu].vlapic);
 }
 
 struct vioapic *
 vm_ioapic(struct vm *vm)
 {
 
 	return (vm->vioapic);
 }
 
 struct vhpet *
 vm_hpet(struct vm *vm)
 {
 
 	return (vm->vhpet);
 }
 
 boolean_t
 vmm_is_pptdev(int bus, int slot, int func)
 {
 	int found, i, n;
 	int b, s, f;
 	char *val, *cp, *cp2;
 
 	/*
 	 * XXX
 	 * The length of an environment variable is limited to 128 bytes which
 	 * puts an upper limit on the number of passthru devices that may be
 	 * specified using a single environment variable.
 	 *
 	 * Work around this by scanning multiple environment variable
 	 * names instead of a single one - yuck!
 	 */
 	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
 
 	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
 	found = 0;
 	for (i = 0; names[i] != NULL && !found; i++) {
 		cp = val = kern_getenv(names[i]);
 		while (cp != NULL && *cp != '\0') {
 			if ((cp2 = strchr(cp, ' ')) != NULL)
 				*cp2 = '\0';
 
 			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
 			if (n == 3 && bus == b && slot == s && func == f) {
 				found = 1;
 				break;
 			}
 		
 			if (cp2 != NULL)
 				*cp2++ = ' ';
 
 			cp = cp2;
 		}
 		freeenv(val);
 	}
 	return (found);
 }
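 
 /*
  * Example (illustrative, not part of this file): with the loader tunables
  *
  *   pptdevs="2/0/0 4/10/0"
  *   pptdevs2="6/0/0"
  *
  * vmm_is_pptdev(2, 0, 0) and vmm_is_pptdev(6, 0, 0) return true while
  * vmm_is_pptdev(3, 0, 0) returns false.
  */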
 
 void *
 vm_iommu_domain(struct vm *vm)
 {
 
 	return (vm->iommu);
 }
 
 int
 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
     bool from_idle)
 {
 	int error;
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
 	vcpu_unlock(vcpu);
 
 	return (error);
 }
 
 enum vcpu_state
 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
 {
 	struct vcpu *vcpu;
 	enum vcpu_state state;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	state = vcpu->state;
 	if (hostcpu != NULL)
 		*hostcpu = vcpu->hostcpu;
 	vcpu_unlock(vcpu);
 
 	return (state);
 }
 
 int
 vm_activate_cpu(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (CPU_ISSET(vcpuid, &vm->active_cpus))
 		return (EBUSY);
 
 	VCPU_CTR0(vm, vcpuid, "activated");
 	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
 	return (0);
 }
 
 cpuset_t
 vm_active_cpus(struct vm *vm)
 {
 
 	return (vm->active_cpus);
 }
 
 cpuset_t
 vm_suspended_cpus(struct vm *vm)
 {
 
 	return (vm->suspended_cpus);
 }
 
 void *
 vcpu_stats(struct vm *vm, int vcpuid)
 {
 
 	return (vm->vcpu[vcpuid].stats);
 }
 
 int
 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
 {
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	*state = vm->vcpu[vcpuid].x2apic_state;
 
 	return (0);
 }
 
 int
 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
 {
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (state >= X2APIC_STATE_LAST)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].x2apic_state = state;
 
 	vlapic_set_x2apic_state(vm, vcpuid, state);
 
 	return (0);
 }
 
 /*
  * This function is called to ensure that a vcpu "sees" a pending event
  * as soon as possible:
  * - If the vcpu thread is sleeping then it is woken up.
  * - If the vcpu is running on a different host_cpu then an IPI will be directed
  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
  */
 static void
 vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
 {
 	int hostcpu;
 
 	hostcpu = vcpu->hostcpu;
 	if (vcpu->state == VCPU_RUNNING) {
 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
 		if (hostcpu != curcpu) {
 			if (lapic_intr) {
 				vlapic_post_intr(vcpu->vlapic, hostcpu,
 				    vmm_ipinum);
 			} else {
 				ipi_cpu(hostcpu, vmm_ipinum);
 			}
 		} else {
 			/*
 			 * If the 'vcpu' is running on 'curcpu' then it must
 			 * be sending a notification to itself (e.g. SELF_IPI).
 			 * The pending event will be picked up when the vcpu
 			 * transitions back to guest context.
 			 */
 		}
 	} else {
 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
 		    "with hostcpu %d", vcpu->state, hostcpu));
 		if (vcpu->state == VCPU_SLEEPING)
 			wakeup_one(vcpu);
 	}
 }
 
 void
 vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
 {
 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	vcpu_notify_event_locked(vcpu, lapic_intr);
 	vcpu_unlock(vcpu);
 }
 
 struct vmspace *
 vm_get_vmspace(struct vm *vm)
 {
 
 	return (vm->vmspace);
 }
 
 int
 vm_apicid2vcpuid(struct vm *vm, int apicid)
 {
 	/*
 	 * XXX apic id is assumed to be numerically identical to vcpu id
 	 */
 	return (apicid);
 }
 
 void
 vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg)
 {
 	int i;
 
 	/*
 	 * Enforce that this function is called without any locks
 	 */
 	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
 	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
 
 restart:
 	mtx_lock(&vm->rendezvous_mtx);
 	if (vm->rendezvous_func != NULL) {
 		/*
 		 * If a rendezvous is already in progress then we need to
 		 * call the rendezvous handler in case this 'vcpuid' is one
 		 * of the targets of the rendezvous.
 		 */
 		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
 		mtx_unlock(&vm->rendezvous_mtx);
 		vm_handle_rendezvous(vm, vcpuid);
 		goto restart;
 	}
 	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
 	    "rendezvous is still in progress"));
 
 	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
 	vm->rendezvous_req_cpus = dest;
 	CPU_ZERO(&vm->rendezvous_done_cpus);
 	vm->rendezvous_arg = arg;
 	vm_set_rendezvous_func(vm, func);
 	mtx_unlock(&vm->rendezvous_mtx);
 
 	/*
 	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
 	 * vcpus so they handle the rendezvous as soon as possible.
 	 */
 	for (i = 0; i < VM_MAXCPU; i++) {
 		if (CPU_ISSET(i, &dest))
 			vcpu_notify_event(vm, i, false);
 	}
 
 	vm_handle_rendezvous(vm, vcpuid);
 }
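 
 /*
  * Usage sketch (illustrative, not part of this file): run a callback on
  * every active vcpu and wait for all of them to finish.  'example_func'
  * and 'example_arg' are hypothetical; the callback must have the
  * vm_rendezvous_func_t signature.
  */
 #if 0	/* example only, not compiled */
 	cpuset_t dest;
 
 	dest = vm_active_cpus(vm);
 	vm_smp_rendezvous(vm, vcpuid, dest, example_func, example_arg);
 #endif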
 
 struct vatpic *
 vm_atpic(struct vm *vm)
 {
 	return (vm->vatpic);
 }
 
 struct vatpit *
 vm_atpit(struct vm *vm)
 {
 	return (vm->vatpit);
 }
 
 struct vpmtmr *
 vm_pmtmr(struct vm *vm)
 {
 
 	return (vm->vpmtmr);
 }
 
 struct vrtc *
 vm_rtc(struct vm *vm)
 {
 
 	return (vm->vrtc);
 }
 
 enum vm_reg_name
 vm_segment_name(int seg)
 {
 	static enum vm_reg_name seg_names[] = {
 		VM_REG_GUEST_ES,
 		VM_REG_GUEST_CS,
 		VM_REG_GUEST_SS,
 		VM_REG_GUEST_DS,
 		VM_REG_GUEST_FS,
 		VM_REG_GUEST_GS
 	};
 
 	KASSERT(seg >= 0 && seg < nitems(seg_names),
 	    ("%s: invalid segment encoding %d", __func__, seg));
 	return (seg_names[seg]);
 }
 
 void
 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     int num_copyinfo)
 {
 	int idx;
 
 	for (idx = 0; idx < num_copyinfo; idx++) {
 		if (copyinfo[idx].cookie != NULL)
 			vm_gpa_release(copyinfo[idx].cookie);
 	}
 	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
 }
 
 int
 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
     int num_copyinfo, int *fault)
 {
 	int error, idx, nused;
 	size_t n, off, remaining;
 	void *hva, *cookie;
 	uint64_t gpa;
 
 	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
 
 	nused = 0;
 	remaining = len;
 	while (remaining > 0) {
 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
 		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
 		if (error || *fault)
 			return (error);
 		off = gpa & PAGE_MASK;
 		n = min(remaining, PAGE_SIZE - off);
 		copyinfo[nused].gpa = gpa;
 		copyinfo[nused].len = n;
 		remaining -= n;
 		gla += n;
 		nused++;
 	}
 
 	for (idx = 0; idx < nused; idx++) {
 		hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
 		    copyinfo[idx].len, prot, &cookie);
 		if (hva == NULL)
 			break;
 		copyinfo[idx].hva = hva;
 		copyinfo[idx].cookie = cookie;
 	}
 
 	if (idx != nused) {
 		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
 		return (EFAULT);
 	} else {
 		*fault = 0;
 		return (0);
 	}
 }
 
 void
 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
     size_t len)
 {
 	char *dst;
 	int idx;
 	
 	dst = kaddr;
 	idx = 0;
 	while (len > 0) {
 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
 		len -= copyinfo[idx].len;
 		dst += copyinfo[idx].len;
 		idx++;
 	}
 }
 
 void
 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
     struct vm_copyinfo *copyinfo, size_t len)
 {
 	const char *src;
 	int idx;
 
 	src = kaddr;
 	idx = 0;
 	while (len > 0) {
 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
 		len -= copyinfo[idx].len;
 		src += copyinfo[idx].len;
 		idx++;
 	}
 }
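 
 /*
  * Usage sketch (illustrative, not part of this file): copy a small,
  * possibly page-crossing buffer out of the guest.  Two vm_copyinfo entries
  * suffice because a 16-byte access spans at most two pages.  'paging' and
  * 'gla' are assumed to describe the faulting access; PROT_READ is an
  * assumption modeled on the instruction-emulation callers of these
  * helpers.
  */
 #if 0	/* example only, not compiled */
 	struct vm_copyinfo copyinfo[2];
 	uint8_t buf[16];
 	int error, fault;
 
 	error = vm_copy_setup(vm, vcpuid, paging, gla, sizeof(buf), PROT_READ,
 	    copyinfo, nitems(copyinfo), &fault);
 	if (error == 0 && !fault) {
 		vm_copyin(vm, vcpuid, copyinfo, buf, sizeof(buf));
 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 	}
 #endif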
 
 /*
  * Return the amount of in-use and wired memory for the VM. Since
  * these are global stats, only return the values for vCPU 0.
  */
 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
 VMM_STAT_DECLARE(VMM_MEM_WIRED);
 
 static void
 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 
 	if (vcpu == 0) {
 		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
 	       	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
 	}	
 }
 
 static void
 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 
 	if (vcpu == 0) {
 		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
 	      	    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
 	}	
 }
 
 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
Index: head/sys/amd64/vmm/vmm_host.h
===================================================================
--- head/sys/amd64/vmm/vmm_host.h	(revision 315958)
+++ head/sys/amd64/vmm/vmm_host.h	(revision 315959)
@@ -1,83 +1,85 @@
 /*-
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_VMM_HOST_H_
 #define	_VMM_HOST_H_
 
 #ifndef	_KERNEL
 #error "no user-serviceable parts inside"
 #endif
 
+extern int vmm_ipinum;
+
 struct xsave_limits {
 	int		xsave_enabled;
 	uint64_t	xcr0_allowed;
 	uint32_t	xsave_max_size;
 };
 
 void vmm_host_state_init(void);
 
 uint64_t vmm_get_host_pat(void);
 uint64_t vmm_get_host_efer(void);
 uint64_t vmm_get_host_cr0(void);
 uint64_t vmm_get_host_cr4(void);
 uint64_t vmm_get_host_xcr0(void);
 uint64_t vmm_get_host_datasel(void);
 uint64_t vmm_get_host_codesel(void);
 uint64_t vmm_get_host_tsssel(void);
 uint64_t vmm_get_host_fsbase(void);
 uint64_t vmm_get_host_idtrbase(void);
 const struct xsave_limits *vmm_get_xsave_limits(void);
 
 /*
  * Inline access to host state that is used on every VM entry
  */
 static __inline uint64_t
 vmm_get_host_trbase(void)
 {
 
 	return ((uint64_t)PCPU_GET(tssp));
 }
 
 static __inline uint64_t
 vmm_get_host_gdtrbase(void)
 {
 
 	return ((uint64_t)&gdt[NGDT * curcpu]);
 }
 
 struct pcpu;
 extern struct pcpu __pcpu[];
 
 static __inline uint64_t
 vmm_get_host_gsbase(void)
 {
 
 	return ((uint64_t)&__pcpu[curcpu]);
 }
 
 #endif
Index: head/sys/i386/i386/apic_vector.s
===================================================================
--- head/sys/i386/i386/apic_vector.s	(revision 315958)
+++ head/sys/i386/i386/apic_vector.s	(revision 315959)
@@ -1,320 +1,317 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: vector.s, 386BSD 0.1 unknown origin
  * $FreeBSD$
  */
 
 /*
  * Interrupt entry points for external interrupts triggered by I/O APICs
  * as well as IPI handlers.
  */
 
 #include "opt_smp.h"
 
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
 #include "assym.s"
 
-	.text
-	SUPERALIGN_TEXT
-	/* End Of Interrupt to APIC */
-as_lapic_eoi:
-	cmpl	$0,x2apic_mode
-	jne	1f
-	movl	lapic_map,%eax
-	movl	$0,LA_EOI(%eax)
-	ret
-1:
-	movl	$MSR_APIC_EOI,%ecx
-	xorl	%eax,%eax
-	xorl	%edx,%edx
-	wrmsr
-	ret
-
 /*
  * I/O Interrupt Entry Point.  Rather than having one entry point for
  * each interrupt source, we use one entry point for each 32-bit word
  * in the ISR.  The handler determines the highest bit set in the ISR,
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
 #define	ISR_VEC(index, vec_name)					\
 	.text ;								\
 	SUPERALIGN_TEXT ;						\
 IDTVEC(vec_name) ;							\
 	PUSH_FRAME ;							\
 	SET_KERNEL_SREGS ;						\
 	cld ;								\
 	FAKE_MCOUNT(TF_EIP(%esp)) ;					\
 	cmpl	$0,x2apic_mode ;					\
 	je	1f ;							\
 	movl	$(MSR_APIC_ISR0 + index),%ecx ;				\
 	rdmsr ;								\
 	jmp	2f ;							\
 1: ;									\
 	movl	lapic_map, %edx ;/* pointer to local APIC */		\
 	movl	LA_ISR + 16 * (index)(%edx), %eax ;	/* load ISR */	\
 2: ;									\
 	bsrl	%eax, %eax ;	/* index of highest set bit in ISR */	\
 	jz	3f ;							\
 	addl	$(32 * index),%eax ;					\
 	pushl	%esp		;                                       \
 	pushl	%eax ;		/* pass the IRQ */			\
 	call	lapic_handle_intr ;					\
 	addl	$8, %esp ;	/* discard parameter */			\
 3: ;									\
 	MEXITCOUNT ;							\
 	jmp	doreti
 
 /*
  * Handle "spurious INTerrupts".
  * Notes:
  *  This is different than the "spurious INTerrupt" generated by an
  *   8259 PIC for missing INTs.  See the APIC documentation for details.
  *  This routine should NOT do an 'EOI' cycle.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
 
 	/* No EOI cycle used here */
 
 	iret
 
 	ISR_VEC(1, apic_isr1)
 	ISR_VEC(2, apic_isr2)
 	ISR_VEC(3, apic_isr3)
 	ISR_VEC(4, apic_isr4)
 	ISR_VEC(5, apic_isr5)
 	ISR_VEC(6, apic_isr6)
 	ISR_VEC(7, apic_isr7)
 
 /*
  * Local APIC periodic timer handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(timerint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	FAKE_MCOUNT(TF_EIP(%esp))
 	pushl	%esp
 	call	lapic_handle_timer
 	add	$4, %esp
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC CMCI handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cmcint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	FAKE_MCOUNT(TF_EIP(%esp))
 	call	lapic_handle_cmc
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC error interrupt handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(errorint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	FAKE_MCOUNT(TF_EIP(%esp))
 	call	lapic_handle_error
 	MEXITCOUNT
 	jmp	doreti
 
 #ifdef XENHVM
 /*
  * Xen event channel upcall interrupt handler.
  * Only used when the hypervisor supports direct vector callbacks.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(xen_intr_upcall)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	FAKE_MCOUNT(TF_EIP(%esp))
 	pushl	%esp
 	call	xen_intr_handle_upcall
 	add	$4, %esp
 	MEXITCOUNT
 	jmp	doreti
 #endif
 
 #ifdef SMP
 /*
  * Global address space TLB shootdown.
  */
 	.text
 	SUPERALIGN_TEXT
 invltlb_ret:
-	call	as_lapic_eoi
+	call	native_lapic_eoi
+	add	$4, %esp
 	POP_FRAME
 	iret
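 
 /*
  * Note: native_lapic_eoi() now takes the vector being acknowledged as a
  * stack argument on i386.  Each of the TLB shootdown handlers below pushes
  * its IPI vector constant before jumping to invltlb_ret, which calls
  * native_lapic_eoi and then discards the argument with "add $4, %esp".
  */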
 
 	SUPERALIGN_TEXT
 IDTVEC(invltlb)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 
 	call	invltlb_handler
 
+	pushl	$IPI_INVLTLB
 	jmp	invltlb_ret
 
 /*
  * Single page TLB shootdown
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlpg)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 
 	call	invlpg_handler
 
+	pushl	$IPI_INVLPG
 	jmp	invltlb_ret
 
 /*
  * Page range TLB shootdown.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlrng)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 
 	call	invlrng_handler
 
+	pushl	$IPI_INVLRNG
 	jmp	invltlb_ret
 
 /*
  * Invalidate cache.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlcache)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 
 	call	invlcache_handler
 
+	pushl	$IPI_INVLCACHE
 	jmp	invltlb_ret
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(ipi_intr_bitmap_handler)	
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 
-	call	as_lapic_eoi
-	
+	pushl	$IPI_BITMAP_VECTOR
+	call	native_lapic_eoi
+	add	$4, %esp
+
 	FAKE_MCOUNT(TF_EIP(%esp))
 
 	call	ipi_bitmap_handler
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cpustop)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 
-	call	as_lapic_eoi
+	pushl	$IPI_STOP
+	call	native_lapic_eoi
+	add	$4, %esp
 	call	cpustop_handler
 
 	POP_FRAME
 	iret
 
 /*
  * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cpususpend)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 
-	call	as_lapic_eoi
+	pushl	$IPI_SUSPEND
+	call	native_lapic_eoi
+	add	$4, %esp
 	call	cpususpend_handler
 
 	POP_FRAME
 	jmp	doreti_iret
 
 /*
  * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
  *
  * - Calls the generic rendezvous action function.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(rendezvous)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movl	ipi_rendezvous_counts(,%eax,4), %eax
 	incl	(%eax)
 #endif
 	call	smp_rendezvous_action
 
-	call	as_lapic_eoi
+	pushl	$IPI_RENDEZVOUS
+	call	native_lapic_eoi
+	add	$4, %esp
 	POP_FRAME
 	iret
 	
 #endif /* SMP */
Index: head/sys/i386/i386/genassym.c
===================================================================
--- head/sys/i386/i386/genassym.c	(revision 315958)
+++ head/sys/i386/i386/genassym.c	(revision 315959)
@@ -1,234 +1,246 @@
 /*-
  * Copyright (c) 1982, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)genassym.c	5.11 (Berkeley) 5/10/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_apic.h"
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_kstack_pages.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/assym.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/socket.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/ucontext.h>
 #include <machine/bootinfo.h>
 #include <machine/tss.h>
 #include <sys/vmmeter.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/proc.h>
 #include <net/if.h>
 #include <netinet/in.h>
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfs/nfsdiskless.h>
 #ifdef DEV_APIC
+#include <sys/bus.h>
+#include <machine/intr_machdep.h>
 #include <x86/apicreg.h>
+#include <x86/apicvar.h>
 #endif
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/sigframe.h>
 #include <machine/vm86.h>
 #include <machine/proc.h>
 
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
 ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
 
 ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
 ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
 ASSYM(TD_PCB, offsetof(struct thread, td_pcb));
 ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags));
 ASSYM(TD_PROC, offsetof(struct thread, td_proc));
 ASSYM(TD_MD, offsetof(struct thread, td_md));
 ASSYM(TD_TID, offsetof(struct thread, td_tid));
 
 ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN);
 
 ASSYM(P_MD, offsetof(struct proc, p_md));
 ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
 
 ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
 ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
 
 ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap));
 ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall));
 ASSYM(V_INTR, offsetof(struct vmmeter, v_intr));
 ASSYM(TD0_KSTACK_PAGES, TD0_KSTACK_PAGES);
 ASSYM(PAGE_SIZE, PAGE_SIZE);
 ASSYM(NPTEPG, NPTEPG);
 ASSYM(NPDEPG, NPDEPG);
 ASSYM(NPDEPTD, NPDEPTD);
 ASSYM(NPGPTD, NPGPTD);
 ASSYM(PDESIZE, sizeof(pd_entry_t));
 ASSYM(PTESIZE, sizeof(pt_entry_t));
 ASSYM(PDESHIFT, PDESHIFT);
 ASSYM(PTESHIFT, PTESHIFT);
 ASSYM(PAGE_SHIFT, PAGE_SHIFT);
 ASSYM(PAGE_MASK, PAGE_MASK);
 ASSYM(PDRSHIFT, PDRSHIFT);
 ASSYM(PDRMASK, PDRMASK);
 ASSYM(USRSTACK, USRSTACK);
 ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS);
 ASSYM(KERNBASE, KERNBASE);
 ASSYM(KERNLOAD, KERNLOAD);
 ASSYM(MCLBYTES, MCLBYTES);
 ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0));
 ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2));
 ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3));
 ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4));
 ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi));
 ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi));
 ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp));
 ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp));
 ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx));
 ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip));
 ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0));
 
 ASSYM(PCB_DS, offsetof(struct pcb, pcb_ds));
 ASSYM(PCB_ES, offsetof(struct pcb, pcb_es));
 ASSYM(PCB_FS, offsetof(struct pcb, pcb_fs));
 ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs));
 ASSYM(PCB_SS, offsetof(struct pcb, pcb_ss));
 ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0));
 ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1));
 ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2));
 ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3));
 ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6));
 ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7));
 ASSYM(PCB_DBREGS, PCB_DBREGS);
 ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext));
 
 ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd));
 ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd));
 ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
 ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
 
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_VM86CALL, PCB_VM86CALL);
 
 ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt));
 ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt));
 ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt));
 ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
 
 ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno));
 ASSYM(TF_ERR, offsetof(struct trapframe, tf_err));
 ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip));
 ASSYM(TF_CS, offsetof(struct trapframe, tf_cs));
 ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags));
 ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
 #ifdef COMPAT_43
 ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc));
 #endif
 ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
 #ifdef COMPAT_FREEBSD4
 ASSYM(SIGF_UC4, offsetof(struct sigframe4, sf_uc));
 #endif
 #ifdef COMPAT_43
 ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps));
 ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs));
 ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs));
 ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno));
 #endif
 #ifdef COMPAT_FREEBSD4
 ASSYM(UC4_EFLAGS, offsetof(struct ucontext4, uc_mcontext.mc_eflags));
 ASSYM(UC4_GS, offsetof(struct ucontext4, uc_mcontext.mc_gs));
 #endif
 ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags));
 ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs));
 ASSYM(ENOENT, ENOENT);
 ASSYM(EFAULT, EFAULT);
 ASSYM(ENAMETOOLONG, ENAMETOOLONG);
 ASSYM(MAXCOMLEN, MAXCOMLEN);
 ASSYM(MAXPATHLEN, MAXPATHLEN);
 ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo));
 ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version));
 ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname));
 ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless));
 ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon));
 ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless));
 ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size));
 ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab));
 ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab));
 ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend));
 ASSYM(PC_SIZEOF, sizeof(struct pcpu));
 ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace));
 ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
 ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread));
 ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread));
 ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
 ASSYM(PC_COMMON_TSS, offsetof(struct pcpu, pc_common_tss));
 ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd));
 ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt));
 ASSYM(PC_FSGS_GDT, offsetof(struct pcpu, pc_fsgs_gdt));
 ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt));
 ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
 ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
 ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss));
 
 #ifdef DEV_APIC
 ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
 ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL);
+
+ASSYM(IPI_INVLTLB, IPI_INVLTLB);
+ASSYM(IPI_INVLPG, IPI_INVLPG);
+ASSYM(IPI_INVLRNG, IPI_INVLRNG);
+ASSYM(IPI_INVLCACHE, IPI_INVLCACHE);
+ASSYM(IPI_BITMAP_VECTOR, IPI_BITMAP_VECTOR);
+ASSYM(IPI_STOP, IPI_STOP);
+ASSYM(IPI_SUSPEND, IPI_SUSPEND);
+ASSYM(IPI_RENDEZVOUS, IPI_RENDEZVOUS);
 #endif
 
 ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL));
 ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL));
 ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL));
 
 ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL));
 ASSYM(GPROC0_SEL, GPROC0_SEL);
 ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame));
 
 #ifdef	HWPMC_HOOKS
 ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN);
 #endif
Index: head/sys/x86/include/apicvar.h
===================================================================
--- head/sys/x86/include/apicvar.h	(revision 315958)
+++ head/sys/x86/include/apicvar.h	(revision 315959)
@@ -1,485 +1,486 @@
 /*-
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _X86_APICVAR_H_
 #define _X86_APICVAR_H_
 
 /*
  * Local && I/O APIC variable definitions.
  */
 
 /*
  * Layout of local APIC interrupt vectors:
  *
  *	0xff (255)  +-------------+
  *                  |             | 15 (Spurious / IPIs / Local Interrupts)
  *	0xf0 (240)  +-------------+
  *                  |             | 14 (I/O Interrupts / Timer)
  *	0xe0 (224)  +-------------+
  *                  |             | 13 (I/O Interrupts)
  *	0xd0 (208)  +-------------+
  *                  |             | 12 (I/O Interrupts)
  *	0xc0 (192)  +-------------+
  *                  |             | 11 (I/O Interrupts)
  *	0xb0 (176)  +-------------+
  *                  |             | 10 (I/O Interrupts)
  *	0xa0 (160)  +-------------+
  *                  |             | 9 (I/O Interrupts)
  *	0x90 (144)  +-------------+
  *                  |             | 8 (I/O Interrupts / System Calls)
  *	0x80 (128)  +-------------+
  *                  |             | 7 (I/O Interrupts)
  *	0x70 (112)  +-------------+
  *                  |             | 6 (I/O Interrupts)
  *	0x60 (96)   +-------------+
  *                  |             | 5 (I/O Interrupts)
  *	0x50 (80)   +-------------+
  *                  |             | 4 (I/O Interrupts)
  *	0x40 (64)   +-------------+
  *                  |             | 3 (I/O Interrupts)
  *	0x30 (48)   +-------------+
  *                  |             | 2 (ATPIC Interrupts)
  *	0x20 (32)   +-------------+
  *                  |             | 1 (Exceptions, traps, faults, etc.)
  *	0x10 (16)   +-------------+
  *                  |             | 0 (Exceptions, traps, faults, etc.)
  *	0x00 (0)    +-------------+
  *
  * Note: 0x80 needs to be handled specially and not allocated to an
  * I/O device!
  */
 
 #define	MAX_APIC_ID	0xfe
 #define	APIC_ID_ALL	0xff
 
 /* I/O Interrupts are used for external devices such as ISA, PCI, etc. */
 #define	APIC_IO_INTS	(IDT_IO_INTS + 16)
 #define	APIC_NUM_IOINTS	191
 
 /* The timer interrupt is used for clock handling and drives hardclock, etc. */
 #define	APIC_TIMER_INT	(APIC_IO_INTS + APIC_NUM_IOINTS)
 
 /*  
  ********************* !!! WARNING !!! ******************************
  * Each local apic has an interrupt receive fifo that is two entries deep
  * for each interrupt priority class (higher 4 bits of interrupt vector).
  * Once the fifo is full the APIC can no longer receive interrupts for this
  * class and sending IPIs from other CPUs will be blocked.
  * To avoid deadlocks there should be no more than two IPI interrupts
  * pending at the same time.
  * Currently this is guaranteed by dividing the IPIs into two groups, each of
  * which has at most one IPI interrupt pending.  The first group is protected
  * by the smp_ipi_mtx and waits for the completion of the IPI (only one IPI
  * user at a time).  The second group uses a single interrupt and a bitmap to
  * avoid redundant IPI interrupts.
  */ 
 
 /* Interrupts for local APIC LVT entries other than the timer. */
 #define	APIC_LOCAL_INTS	240
 #define	APIC_ERROR_INT	APIC_LOCAL_INTS
 #define	APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
 #define	APIC_CMC_INT	(APIC_LOCAL_INTS + 2)
 #define	APIC_IPI_INTS	(APIC_LOCAL_INTS + 3)
 
 #define	IPI_RENDEZVOUS	(APIC_IPI_INTS)		/* Inter-CPU rendezvous. */
 #define	IPI_INVLTLB	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs */
 #define	IPI_INVLPG	(APIC_IPI_INTS + 2)
 #define	IPI_INVLRNG	(APIC_IPI_INTS + 3)
 #define	IPI_INVLCACHE	(APIC_IPI_INTS + 4)
 /* Vector to handle bitmap based IPIs */
 #define	IPI_BITMAP_VECTOR	(APIC_IPI_INTS + 5) 
 
 /* IPIs handled by IPI_BITMAP_VECTOR */
 #define	IPI_AST		0 	/* Generate software trap. */
 #define IPI_PREEMPT     1
 #define IPI_HARDCLOCK   2
 #define IPI_BITMAP_LAST IPI_HARDCLOCK
 #define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST)
 
 #define	IPI_STOP	(APIC_IPI_INTS + 6)	/* Stop CPU until restarted. */
 #define	IPI_SUSPEND	(APIC_IPI_INTS + 7)	/* Suspend CPU until restarted. */
 #ifdef __i386__
 #define	IPI_LAZYPMAP	(APIC_IPI_INTS + 8)	/* Lazy pmap release. */
 #define	IPI_DYN_FIRST	(APIC_IPI_INTS + 9)
 #else
 #define	IPI_DYN_FIRST	(APIC_IPI_INTS + 8)
 #endif
 #define	IPI_DYN_LAST	(253)			/* IPIs allocated at runtime */
 
 /*
  * IPI_STOP_HARD does not need to occupy a slot in the IPI vector space since
  * it is delivered using an NMI anyways.
  */
 #define	IPI_NMI_FIRST	254
 #define	IPI_TRACE	254			/* Interrupt for tracing. */
 #define	IPI_STOP_HARD	255			/* Stop CPU with a NMI. */
 
 /*
  * The spurious interrupt can share the priority class with the IPIs since
  * it is not a normal interrupt. (Does not use the APIC's interrupt fifo)
  */
 #define	APIC_SPURIOUS_INT 255
 
 #ifndef LOCORE
 
 #define	APIC_IPI_DEST_SELF	-1
 #define	APIC_IPI_DEST_ALL	-2
 #define	APIC_IPI_DEST_OTHERS	-3
 
 #define	APIC_BUS_UNKNOWN	-1
 #define	APIC_BUS_ISA		0
 #define	APIC_BUS_EISA		1
 #define	APIC_BUS_PCI		2
 #define	APIC_BUS_MAX		APIC_BUS_PCI
 
 #define	IRQ_EXTINT		(NUM_IO_INTS + 1)
 #define	IRQ_NMI			(NUM_IO_INTS + 2)
 #define	IRQ_SMI			(NUM_IO_INTS + 3)
 #define	IRQ_DISABLED		(NUM_IO_INTS + 4)
 
 /*
  * An APIC enumerator is a pseudo bus driver that enumerates APICs, including
  * CPUs and I/O APICs.
  */
 struct apic_enumerator {
 	const char *apic_name;
 	int (*apic_probe)(void);
 	int (*apic_probe_cpus)(void);
 	int (*apic_setup_local)(void);
 	int (*apic_setup_io)(void);
 	SLIST_ENTRY(apic_enumerator) apic_next;
 };
 
 inthand_t
 	IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
 	IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
 	IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
 	IDTVEC(spuriousint), IDTVEC(timerint);
 
 extern vm_paddr_t lapic_paddr;
 extern int apic_cpuids[];
 
 void	apic_register_enumerator(struct apic_enumerator *enumerator);
 void	*ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase);
 int	ioapic_disable_pin(void *cookie, u_int pin);
 int	ioapic_get_vector(void *cookie, u_int pin);
 void	ioapic_register(void *cookie);
 int	ioapic_remap_vector(void *cookie, u_int pin, int vector);
 int	ioapic_set_bus(void *cookie, u_int pin, int bus_type);
 int	ioapic_set_extint(void *cookie, u_int pin);
 int	ioapic_set_nmi(void *cookie, u_int pin);
 int	ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol);
 int	ioapic_set_triggermode(void *cookie, u_int pin,
 	    enum intr_trigger trigger);
 int	ioapic_set_smi(void *cookie, u_int pin);
 
 /*
  * Struct containing pointers to APIC functions whose
  * implementation is run time selectable.
  */
 struct apic_ops {
 	void	(*create)(u_int, int);
 	void	(*init)(vm_paddr_t);
 	void	(*xapic_mode)(void);
 	bool	(*is_x2apic)(void);
 	void	(*setup)(int);
 	void	(*dump)(const char *);
 	void	(*disable)(void);
-	void	(*eoi)(void);
+	void	(*eoi)(u_int vector);
 	int	(*id)(void);
 	int	(*intr_pending)(u_int);
 	void	(*set_logical_id)(u_int, u_int, u_int);
 	u_int	(*cpuid)(u_int);
 
 	/* Vectors */
 	u_int	(*alloc_vector)(u_int, u_int);
 	u_int	(*alloc_vectors)(u_int, u_int *, u_int, u_int);
 	void	(*enable_vector)(u_int, u_int);
 	void	(*disable_vector)(u_int, u_int);
 	void	(*free_vector)(u_int, u_int, u_int);
 
 
 	/* PMC */
 	int	(*enable_pmc)(void);
 	void	(*disable_pmc)(void);
 	void	(*reenable_pmc)(void);
 
 	/* CMC */
 	void	(*enable_cmc)(void);
 
 	/* AMD ELVT */
 	int	(*enable_mca_elvt)(void);
 
 	/* IPI */
 	void	(*ipi_raw)(register_t, u_int);
 	void	(*ipi_vectored)(u_int, int);
 	int	(*ipi_wait)(int);
 	int	(*ipi_alloc)(inthand_t *ipifunc);
 	void	(*ipi_free)(int vector);
 
 	/* LVT */
 	int	(*set_lvt_mask)(u_int, u_int, u_char);
 	int	(*set_lvt_mode)(u_int, u_int, u_int32_t);
 	int	(*set_lvt_polarity)(u_int, u_int, enum intr_polarity);
 	int	(*set_lvt_triggermode)(u_int, u_int, enum intr_trigger);
 };
 
 extern struct apic_ops apic_ops;
 
 static inline void
 lapic_create(u_int apic_id, int boot_cpu)
 {
 
 	apic_ops.create(apic_id, boot_cpu);
 }
 
 static inline void
 lapic_init(vm_paddr_t addr)
 {
 
 	apic_ops.init(addr);
 }
 
 static inline void
 lapic_xapic_mode(void)
 {
 
 	apic_ops.xapic_mode();
 }
 
 static inline bool
 lapic_is_x2apic(void)
 {
 
 	return (apic_ops.is_x2apic());
 }
 
 static inline void
 lapic_setup(int boot)
 {
 
 	apic_ops.setup(boot);
 }
 
 static inline void
 lapic_dump(const char *str)
 {
 
 	apic_ops.dump(str);
 }
 
 static inline void
 lapic_disable(void)
 {
 
 	apic_ops.disable();
 }
 
 static inline void
-lapic_eoi(void)
+lapic_eoi(u_int vector)
 {
 
-	apic_ops.eoi();
+	apic_ops.eoi(vector);
 }
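 /*
  * Call sites now name the vector being acknowledged: _ioapic_eoi_source()
  * in io_apic.c below passes src->io_vector, and a local interrupt handler
  * would pass its own IDT vector, e.g. (illustrative)
  * lapic_eoi(APIC_TIMER_INT) from the timer handler.
  */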
 
 static inline int
 lapic_id(void)
 {
 
 	return (apic_ops.id());
 }
 
 static inline int
 lapic_intr_pending(u_int vector)
 {
 
 	return (apic_ops.intr_pending(vector));
 }
 
 /* XXX: UNUSED */
 static inline void
 lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 
 	apic_ops.set_logical_id(apic_id, cluster, cluster_id);
 }
 
 static inline u_int
 apic_cpuid(u_int apic_id)
 {
 
 	return (apic_ops.cpuid(apic_id));
 }
 
 static inline u_int
 apic_alloc_vector(u_int apic_id, u_int irq)
 {
 
 	return (apic_ops.alloc_vector(apic_id, irq));
 }
 
 static inline u_int
 apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 
 	return (apic_ops.alloc_vectors(apic_id, irqs, count, align));
 }
 
 static inline void
 apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	apic_ops.enable_vector(apic_id, vector);
 }
 
 static inline void
 apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	apic_ops.disable_vector(apic_id, vector);
 }
 
 static inline void
 apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 
 	apic_ops.free_vector(apic_id, vector, irq);
 }
 
 static inline int
 lapic_enable_pmc(void)
 {
 
 	return (apic_ops.enable_pmc());
 }
 
 static inline void
 lapic_disable_pmc(void)
 {
 
 	apic_ops.disable_pmc();
 }
 
 static inline void
 lapic_reenable_pmc(void)
 {
 
 	apic_ops.reenable_pmc();
 }
 
 static inline void
 lapic_enable_cmc(void)
 {
 
 	apic_ops.enable_cmc();
 }
 
 static inline int
 lapic_enable_mca_elvt(void)
 {
 
 	return (apic_ops.enable_mca_elvt());
 }
 
 static inline void
 lapic_ipi_raw(register_t icrlo, u_int dest)
 {
 
 	apic_ops.ipi_raw(icrlo, dest);
 }
 
 static inline void
 lapic_ipi_vectored(u_int vector, int dest)
 {
 
 	apic_ops.ipi_vectored(vector, dest);
 }
 
 static inline int
 lapic_ipi_wait(int delay)
 {
 
 	return (apic_ops.ipi_wait(delay));
 }
 
 static inline int
 lapic_ipi_alloc(inthand_t *ipifunc)
 {
 
 	return (apic_ops.ipi_alloc(ipifunc));
 }
 
 static inline void
 lapic_ipi_free(int vector)
 {
 
 	return (apic_ops.ipi_free(vector));
 }
 
 static inline int
 lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
 {
 
 	return (apic_ops.set_lvt_mask(apic_id, lvt, masked));
 }
 
 static inline int
 lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode)
 {
 
 	return (apic_ops.set_lvt_mode(apic_id, lvt, mode));
 }
 
 static inline int
 lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
 {
 
 	return (apic_ops.set_lvt_polarity(apic_id, lvt, pol));
 }
 
 static inline int
 lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger)
 {
 
 	return (apic_ops.set_lvt_triggermode(apic_id, lvt, trigger));
 }
 
+void	native_lapic_eoi(u_int vector);
 void	lapic_handle_cmc(void);
 void	lapic_handle_error(void);
 void	lapic_handle_intr(int vector, struct trapframe *frame);
 void	lapic_handle_timer(struct trapframe *frame);
 
 extern int x2apic_mode;
 extern int lapic_eoi_suppression;
 
 #ifdef _SYS_SYSCTL_H_
 SYSCTL_DECL(_hw_apic);
 #endif
 
 #endif /* !LOCORE */
 #endif /* _X86_APICVAR_H_ */
Index: head/sys/x86/x86/io_apic.c
===================================================================
--- head/sys/x86/x86/io_apic.c	(revision 315958)
+++ head/sys/x86/x86/io_apic.c	(revision 315959)
@@ -1,1138 +1,1138 @@
 /*-
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_isa.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <x86/apicreg.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <machine/resource.h>
 #include <machine/segments.h>
 #include <x86/iommu/iommu_intrmap.h>
 
 #define IOAPIC_ISA_INTS		16
 #define	IOAPIC_MEM_REGION	32
 #define	IOAPIC_REDTBL_LO(i)	(IOAPIC_REDTBL + (i) * 2)
 #define	IOAPIC_REDTBL_HI(i)	(IOAPIC_REDTBL_LO(i) + 1)
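 /*
  * Each redirection table entry is a pair of 32-bit registers, so pin i
  * maps to register indices IOAPIC_REDTBL + 2 * i (low word) and
  * IOAPIC_REDTBL + 2 * i + 1 (high word); e.g. pin 3 uses indices 0x16
  * and 0x17, assuming the usual IOAPIC_REDTBL base of 0x10.
  */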
 
 static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures");
 
 /*
  * I/O APIC interrupt source driver.  Each pin is assigned an IRQ cookie
  * as laid out in the ACPI System Interrupt number model where each I/O
  * APIC has a contiguous chunk of the System Interrupt address space.
  * We assume that IRQs 1 - 15 behave like ISA IRQs and that all other
  * IRQs behave as PCI IRQs by default.  We also assume that the pin for
  * IRQ 0 is actually an ExtINT pin.  The apic enumerators override the
  * configuration of individual pins as indicated by their tables.
  *
  * Documentation for the I/O APIC: "82093AA I/O Advanced Programmable
  * Interrupt Controller (IOAPIC)", May 1996, Intel Corp.
  * ftp://download.intel.com/design/chipsets/datashts/29056601.pdf
  */
 
 struct ioapic_intsrc {
 	struct intsrc io_intsrc;
 	u_int io_irq;
 	u_int io_intpin:8;
 	u_int io_vector:8;
 	u_int io_cpu;
 	u_int io_activehi:1;
 	u_int io_edgetrigger:1;
 	u_int io_masked:1;
 	int io_bus:4;
 	uint32_t io_lowreg;
 	u_int io_remap_cookie;
 };
 
 struct ioapic {
 	struct pic io_pic;
 	u_int io_id:8;			/* logical ID */
 	u_int io_apic_id:4;
 	u_int io_intbase:8;		/* System Interrupt base */
 	u_int io_numintr:8;
 	u_int io_haseoi:1;
 	volatile ioapic_t *io_addr;	/* XXX: should use bus_space */
 	vm_paddr_t io_paddr;
 	STAILQ_ENTRY(ioapic) io_next;
 	struct ioapic_intsrc io_pins[0];
 };
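 /*
  * Note that io_pins[] is a variable-length trailer; ioapic_create() below
  * sizes the allocation to cover it:
  *
  *	io = malloc(sizeof(struct ioapic) +
  *	    numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK);
  */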
 
 static u_int	ioapic_read(volatile ioapic_t *apic, int reg);
 static void	ioapic_write(volatile ioapic_t *apic, int reg, u_int val);
 static const char *ioapic_bus_string(int bus_type);
 static void	ioapic_print_irq(struct ioapic_intsrc *intpin);
 static void	ioapic_enable_source(struct intsrc *isrc);
 static void	ioapic_disable_source(struct intsrc *isrc, int eoi);
 static void	ioapic_eoi_source(struct intsrc *isrc);
 static void	ioapic_enable_intr(struct intsrc *isrc);
 static void	ioapic_disable_intr(struct intsrc *isrc);
 static int	ioapic_vector(struct intsrc *isrc);
 static int	ioapic_source_pending(struct intsrc *isrc);
 static int	ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig,
 		    enum intr_polarity pol);
 static void	ioapic_resume(struct pic *pic, bool suspend_cancelled);
 static int	ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id);
 static void	ioapic_program_intpin(struct ioapic_intsrc *intpin);
 static void	ioapic_reprogram_intpin(struct intsrc *isrc);
 
 static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list);
 struct pic ioapic_template = {
 	.pic_enable_source = ioapic_enable_source,
 	.pic_disable_source = ioapic_disable_source,
 	.pic_eoi_source = ioapic_eoi_source,
 	.pic_enable_intr = ioapic_enable_intr,
 	.pic_disable_intr = ioapic_disable_intr,
 	.pic_vector = ioapic_vector,
 	.pic_source_pending = ioapic_source_pending,
 	.pic_suspend = NULL,
 	.pic_resume = ioapic_resume,
 	.pic_config_intr = ioapic_config_intr,
 	.pic_assign_cpu = ioapic_assign_cpu,
 	.pic_reprogram_pin = ioapic_reprogram_intpin,
 };
 
 static int next_ioapic_base;
 static u_int next_id;
 
 static int enable_extint;
 SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0,
     "Enable the ExtINT pin in the first I/O APIC");
 
 static void
 _ioapic_eoi_source(struct intsrc *isrc, int locked)
 {
 	struct ioapic_intsrc *src;
 	struct ioapic *io;
 	volatile uint32_t *apic_eoi;
 	uint32_t low1;
 
-	lapic_eoi();
+	src = (struct ioapic_intsrc *)isrc;
+	lapic_eoi(src->io_vector);
 	if (!lapic_eoi_suppression)
 		return;
-	src = (struct ioapic_intsrc *)isrc;
 	if (src->io_edgetrigger)
 		return;
 	io = (struct ioapic *)isrc->is_pic;
 
 	/*
 	 * Handle targeted EOI for level-triggered pins, if broadcast
 	 * EOI suppression is supported by LAPICs.
 	 */
 	if (io->io_haseoi) {
 		/*
 		 * If IOAPIC has EOI Register, simply write vector
 		 * number into the reg.
 		 */
 		apic_eoi = (volatile uint32_t *)((volatile char *)
 		    io->io_addr + IOAPIC_EOIR);
 		*apic_eoi = src->io_vector;
 	} else {
 		/*
 		 * Otherwise, if the IO-APIC is too old to provide an EOIR,
 		 * do what Intel did for the Linux kernel: temporarily
 		 * switch the pin to edge-trigger and back, masking
 		 * the pin during the trick.
 		 */
 		if (!locked)
 			mtx_lock_spin(&icu_lock);
 		low1 = src->io_lowreg;
 		low1 &= ~IOART_TRGRLVL;
 		low1 |= IOART_TRGREDG | IOART_INTMSET;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
 		    low1);
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
 		    src->io_lowreg);
 		if (!locked)
 			mtx_unlock_spin(&icu_lock);
 	}
 }
 
 static u_int
 ioapic_read(volatile ioapic_t *apic, int reg)
 {
 
 	mtx_assert(&icu_lock, MA_OWNED);
 	apic->ioregsel = reg;
 	return (apic->iowin);
 }
 
 static void
 ioapic_write(volatile ioapic_t *apic, int reg, u_int val)
 {
 
 	mtx_assert(&icu_lock, MA_OWNED);
 	apic->ioregsel = reg;
 	apic->iowin = val;
 }
 
 static const char *
 ioapic_bus_string(int bus_type)
 {
 
 	switch (bus_type) {
 	case APIC_BUS_ISA:
 		return ("ISA");
 	case APIC_BUS_EISA:
 		return ("EISA");
 	case APIC_BUS_PCI:
 		return ("PCI");
 	default:
 		return ("unknown");
 	}
 }
 
 static void
 ioapic_print_irq(struct ioapic_intsrc *intpin)
 {
 
 	switch (intpin->io_irq) {
 	case IRQ_DISABLED:
 		printf("disabled");
 		break;
 	case IRQ_EXTINT:
 		printf("ExtINT");
 		break;
 	case IRQ_NMI:
 		printf("NMI");
 		break;
 	case IRQ_SMI:
 		printf("SMI");
 		break;
 	default:
 		printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus),
 		    intpin->io_irq);
 	}
 }
 
 static void
 ioapic_enable_source(struct intsrc *isrc)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	struct ioapic *io = (struct ioapic *)isrc->is_pic;
 	uint32_t flags;
 
 	mtx_lock_spin(&icu_lock);
 	if (intpin->io_masked) {
 		flags = intpin->io_lowreg & ~IOART_INTMASK;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    flags);
 		intpin->io_masked = 0;
 	}
 	mtx_unlock_spin(&icu_lock);
 }
 
 static void
 ioapic_disable_source(struct intsrc *isrc, int eoi)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	struct ioapic *io = (struct ioapic *)isrc->is_pic;
 	uint32_t flags;
 
 	mtx_lock_spin(&icu_lock);
 	if (!intpin->io_masked && !intpin->io_edgetrigger) {
 		flags = intpin->io_lowreg | IOART_INTMSET;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    flags);
 		intpin->io_masked = 1;
 	}
 
 	if (eoi == PIC_EOI)
 		_ioapic_eoi_source(isrc, 1);
 
 	mtx_unlock_spin(&icu_lock);
 }
 
 static void
 ioapic_eoi_source(struct intsrc *isrc)
 {
 
 	_ioapic_eoi_source(isrc, 0);
 }
 
 /*
  * Completely program an intpin based on the data in its interrupt source
  * structure.
  */
 static void
 ioapic_program_intpin(struct ioapic_intsrc *intpin)
 {
 	struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic;
 	uint32_t low, high, value;
 #ifdef ACPI_DMAR
 	int error;
 #endif
 
 	/*
 	 * If a pin is completely invalid or if it is valid but hasn't
 	 * been enabled yet, just ensure that the pin is masked.
 	 */
 	mtx_assert(&icu_lock, MA_OWNED);
 	if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS &&
 	    intpin->io_vector == 0)) {
 		low = ioapic_read(io->io_addr,
 		    IOAPIC_REDTBL_LO(intpin->io_intpin));
 		if ((low & IOART_INTMASK) == IOART_INTMCLR)
 			ioapic_write(io->io_addr,
 			    IOAPIC_REDTBL_LO(intpin->io_intpin),
 			    low | IOART_INTMSET);
 #ifdef ACPI_DMAR
 		mtx_unlock_spin(&icu_lock);
 		iommu_unmap_ioapic_intr(io->io_apic_id,
 		    &intpin->io_remap_cookie);
 		mtx_lock_spin(&icu_lock);
 #endif
 		return;
 	}
 
 #ifdef ACPI_DMAR
 	mtx_unlock_spin(&icu_lock);
 	error = iommu_map_ioapic_intr(io->io_apic_id,
 	    intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
 	    intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie,
 	    &high, &low);
 	mtx_lock_spin(&icu_lock);
 	if (error == 0) {
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin),
 		    high);
 		intpin->io_lowreg = low;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    low);
 		return;
 	} else if (error != EOPNOTSUPP) {
 		return;
 	}
 #endif
 
 	/* Set the destination. */
 	low = IOART_DESTPHY;
 	high = intpin->io_cpu << APIC_ID_SHIFT;
 
 	/* Program the rest of the low word. */
 	if (intpin->io_edgetrigger)
 		low |= IOART_TRGREDG;
 	else
 		low |= IOART_TRGRLVL;
 	if (intpin->io_activehi)
 		low |= IOART_INTAHI;
 	else
 		low |= IOART_INTALO;
 	if (intpin->io_masked)
 		low |= IOART_INTMSET;
 	switch (intpin->io_irq) {
 	case IRQ_EXTINT:
 		KASSERT(intpin->io_edgetrigger,
 		    ("ExtINT not edge triggered"));
 		low |= IOART_DELEXINT;
 		break;
 	case IRQ_NMI:
 		KASSERT(intpin->io_edgetrigger,
 		    ("NMI not edge triggered"));
 		low |= IOART_DELNMI;
 		break;
 	case IRQ_SMI:
 		KASSERT(intpin->io_edgetrigger,
 		    ("SMI not edge triggered"));
 		low |= IOART_DELSMI;
 		break;
 	default:
 		KASSERT(intpin->io_vector != 0, ("No vector for IRQ %u",
 		    intpin->io_irq));
 		low |= IOART_DELFIXED | intpin->io_vector;
 	}
 
 	/* Write the values to the APIC. */
 	value = ioapic_read(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin));
 	value &= ~IOART_DEST;
 	value |= high;
 	ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), value);
 	intpin->io_lowreg = low;
 	ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low);
 }
 
 static void
 ioapic_reprogram_intpin(struct intsrc *isrc)
 {
 
 	mtx_lock_spin(&icu_lock);
 	ioapic_program_intpin((struct ioapic_intsrc *)isrc);
 	mtx_unlock_spin(&icu_lock);
 }
 
 static int
 ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	struct ioapic *io = (struct ioapic *)isrc->is_pic;
 	u_int old_vector, new_vector;
 	u_int old_id;
 
 	/*
 	 * On Hyper-V:
 	 * - Stick to the first CPU for all I/O APIC pins.
 	 * - Don't allow destination CPU changes.
 	 */
 	if (vm_guest == VM_GUEST_HV) {
 		if (intpin->io_vector)
 			return (EINVAL);
 		else
 			apic_id = 0;
 	}
 
 	/*
 	 * Keep the first core as the destination for NMIs.
 	 */
 	if (intpin->io_irq == IRQ_NMI)
 		apic_id = 0;
 
 	/*
 	 * Set us up to free the old irq.
 	 */
 	old_vector = intpin->io_vector;
 	old_id = intpin->io_cpu;
 	if (old_vector && apic_id == old_id)
 		return (0);
 
 	/*
 	 * Allocate an APIC vector for this interrupt pin.  Once
 	 * we have a vector we program the interrupt pin.
 	 */
 	new_vector = apic_alloc_vector(apic_id, intpin->io_irq);
 	if (new_vector == 0)
 		return (ENOSPC);
 
 	/*
 	 * Mask the old intpin if it is enabled while it is migrated.
 	 *
 	 * At least some level-triggered interrupts seem to need the
 	 * extra DELAY() to avoid being stuck in a non-EOI'd state.
 	 */
 	mtx_lock_spin(&icu_lock);
 	if (!intpin->io_masked && !intpin->io_edgetrigger) {
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    intpin->io_lowreg | IOART_INTMSET);
 		mtx_unlock_spin(&icu_lock);
 		DELAY(100);
 		mtx_lock_spin(&icu_lock);
 	}
 
 	intpin->io_cpu = apic_id;
 	intpin->io_vector = new_vector;
 	if (isrc->is_handlers > 0)
 		apic_enable_vector(intpin->io_cpu, intpin->io_vector);
 	if (bootverbose) {
 		printf("ioapic%u: routing intpin %u (", io->io_id,
 		    intpin->io_intpin);
 		ioapic_print_irq(intpin);
 		printf(") to lapic %u vector %u\n", intpin->io_cpu,
 		    intpin->io_vector);
 	}
 	ioapic_program_intpin(intpin);
 	mtx_unlock_spin(&icu_lock);
 
 	/*
 	 * Free the old vector after the new one is established.  This is done
 	 * to prevent races where we could miss an interrupt.
 	 */
 	if (old_vector) {
 		if (isrc->is_handlers > 0)
 			apic_disable_vector(old_id, old_vector);
 		apic_free_vector(old_id, old_vector, intpin->io_irq);
 	}
 	return (0);
 }
 
 static void
 ioapic_enable_intr(struct intsrc *isrc)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 
 	if (intpin->io_vector == 0)
 		if (ioapic_assign_cpu(isrc, intr_next_cpu()) != 0)
 			panic("Couldn't find an APIC vector for IRQ %d",
 			    intpin->io_irq);
 	apic_enable_vector(intpin->io_cpu, intpin->io_vector);
 }
 
 
 static void
 ioapic_disable_intr(struct intsrc *isrc)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	u_int vector;
 
 	if (intpin->io_vector != 0) {
 		/* Mask this interrupt pin and free its APIC vector. */
 		vector = intpin->io_vector;
 		apic_disable_vector(intpin->io_cpu, vector);
 		mtx_lock_spin(&icu_lock);
 		intpin->io_masked = 1;
 		intpin->io_vector = 0;
 		ioapic_program_intpin(intpin);
 		mtx_unlock_spin(&icu_lock);
 		apic_free_vector(intpin->io_cpu, vector, intpin->io_irq);
 	}
 }
 
 static int
 ioapic_vector(struct intsrc *isrc)
 {
 	struct ioapic_intsrc *pin;
 
 	pin = (struct ioapic_intsrc *)isrc;
 	return (pin->io_irq);
 }
 
 static int
 ioapic_source_pending(struct intsrc *isrc)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 
 	if (intpin->io_vector == 0)
 		return (0);
 	return (lapic_intr_pending(intpin->io_vector));
 }
 
 static int
 ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig,
     enum intr_polarity pol)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	struct ioapic *io = (struct ioapic *)isrc->is_pic;
 	int changed;
 
 	KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM),
 	    ("%s: Conforming trigger or polarity\n", __func__));
 
 	/*
 	 * EISA interrupts always use active high polarity, so don't allow
 	 * them to be set to active low.
 	 *
 	 * XXX: Should we write to the ELCR if the trigger mode changes for
 	 * an EISA IRQ or an ISA IRQ with the ELCR present?
 	 */
 	mtx_lock_spin(&icu_lock);
 	if (intpin->io_bus == APIC_BUS_EISA)
 		pol = INTR_POLARITY_HIGH;
 	changed = 0;
 	if (intpin->io_edgetrigger != (trig == INTR_TRIGGER_EDGE)) {
 		if (bootverbose)
 			printf("ioapic%u: Changing trigger for pin %u to %s\n",
 			    io->io_id, intpin->io_intpin,
 			    trig == INTR_TRIGGER_EDGE ? "edge" : "level");
 		intpin->io_edgetrigger = (trig == INTR_TRIGGER_EDGE);
 		changed++;
 	}
 	if (intpin->io_activehi != (pol == INTR_POLARITY_HIGH)) {
 		if (bootverbose)
 			printf("ioapic%u: Changing polarity for pin %u to %s\n",
 			    io->io_id, intpin->io_intpin,
 			    pol == INTR_POLARITY_HIGH ? "high" : "low");
 		intpin->io_activehi = (pol == INTR_POLARITY_HIGH);
 		changed++;
 	}
 	if (changed)
 		ioapic_program_intpin(intpin);
 	mtx_unlock_spin(&icu_lock);
 	return (0);
 }
 
 static void
 ioapic_resume(struct pic *pic, bool suspend_cancelled)
 {
 	struct ioapic *io = (struct ioapic *)pic;
 	int i;
 
 	mtx_lock_spin(&icu_lock);
 	for (i = 0; i < io->io_numintr; i++)
 		ioapic_program_intpin(&io->io_pins[i]);
 	mtx_unlock_spin(&icu_lock);
 }
 
 /*
  * Create a plain I/O APIC object.
  */
 void *
 ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase)
 {
 	struct ioapic *io;
 	struct ioapic_intsrc *intpin;
 	volatile ioapic_t *apic;
 	u_int numintr, i;
 	uint32_t value;
 
 	/* Map the register window so we can access the device. */
 	apic = pmap_mapdev(addr, IOAPIC_MEM_REGION);
 	mtx_lock_spin(&icu_lock);
 	value = ioapic_read(apic, IOAPIC_VER);
 	mtx_unlock_spin(&icu_lock);
 
 	/* If its version register doesn't seem to work, punt. */
 	if (value == 0xffffffff) {
 		pmap_unmapdev((vm_offset_t)apic, IOAPIC_MEM_REGION);
 		return (NULL);
 	}
 
 	/* Determine the number of vectors and set the APIC ID. */
 	numintr = ((value & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) + 1;
 	io = malloc(sizeof(struct ioapic) +
 	    numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK);
 	io->io_pic = ioapic_template;
 	mtx_lock_spin(&icu_lock);
 	io->io_id = next_id++;
 	io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT;
 	if (apic_id != -1 && io->io_apic_id != apic_id) {
 		ioapic_write(apic, IOAPIC_ID, apic_id << APIC_ID_SHIFT);
 		mtx_unlock_spin(&icu_lock);
 		io->io_apic_id = apic_id;
 		printf("ioapic%u: Changing APIC ID to %d\n", io->io_id,
 		    apic_id);
 	} else
 		mtx_unlock_spin(&icu_lock);
 	if (intbase == -1) {
 		intbase = next_ioapic_base;
 		printf("ioapic%u: Assuming intbase of %d\n", io->io_id,
 		    intbase);
 	} else if (intbase != next_ioapic_base && bootverbose)
 		printf("ioapic%u: WARNING: intbase %d != expected base %d\n",
 		    io->io_id, intbase, next_ioapic_base);
 	io->io_intbase = intbase;
 	next_ioapic_base = intbase + numintr;
 	io->io_numintr = numintr;
 	io->io_addr = apic;
 	io->io_paddr = addr;
 
 	if (bootverbose) {
 		printf("ioapic%u: ver 0x%02x maxredir 0x%02x\n", io->io_id,
 		    (value & IOART_VER_VERSION), (value & IOART_VER_MAXREDIR)
 		    >> MAXREDIRSHIFT);
 	}
 	/*
 	 * The summary information about IO-APIC versions is taken from
 	 * the Linux kernel source:
 	 *     0Xh     82489DX
 	 *     1Xh     I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
 	 *     2Xh     I/O(x)APIC which is PCI 2.2 Compliant
 	 *     30h-FFh Reserved
 	 * IO-APICs with version >= 0x20 have a working EOIR register.
 	 */
 	io->io_haseoi = (value & IOART_VER_VERSION) >= 0x20;
 
 	/*
 	 * Initialize pins.  Start off with interrupts disabled.  Default
 	 * to active-hi and edge-triggered for ISA interrupts and active-lo
 	 * and level-triggered for all others.
 	 */
 	bzero(io->io_pins, sizeof(struct ioapic_intsrc) * numintr);
 	mtx_lock_spin(&icu_lock);
 	for (i = 0, intpin = io->io_pins; i < numintr; i++, intpin++) {
 		intpin->io_intsrc.is_pic = (struct pic *)io;
 		intpin->io_intpin = i;
 		intpin->io_irq = intbase + i;
 
 		/*
 		 * Assume that pin 0 on the first I/O APIC is an ExtINT pin.
 		 * Assume that pins 1-15 are ISA interrupts and that all
 		 * other pins are PCI interrupts.
 		 */
 		if (intpin->io_irq == 0)
 			ioapic_set_extint(io, i);
 		else if (intpin->io_irq < IOAPIC_ISA_INTS) {
 			intpin->io_bus = APIC_BUS_ISA;
 			intpin->io_activehi = 1;
 			intpin->io_edgetrigger = 1;
 			intpin->io_masked = 1;
 		} else {
 			intpin->io_bus = APIC_BUS_PCI;
 			intpin->io_activehi = 0;
 			intpin->io_edgetrigger = 0;
 			intpin->io_masked = 1;
 		}
 
 		/*
 		 * Route interrupts to the BSP by default.  Interrupts may
 		 * be routed to other CPUs later after they are enabled.
 		 */
 		intpin->io_cpu = PCPU_GET(apic_id);
 		value = ioapic_read(apic, IOAPIC_REDTBL_LO(i));
 		ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET);
 #ifdef ACPI_DMAR
 		/* dummy, but sets cookie */
 		mtx_unlock_spin(&icu_lock);
 		iommu_map_ioapic_intr(io->io_apic_id,
 		    intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
 		    intpin->io_activehi, intpin->io_irq,
 		    &intpin->io_remap_cookie, NULL, NULL);
 		mtx_lock_spin(&icu_lock);
 #endif
 	}
 	mtx_unlock_spin(&icu_lock);
 
 	return (io);
 }
 
 int
 ioapic_get_vector(void *cookie, u_int pin)
 {
 	struct ioapic *io;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr)
 		return (-1);
 	return (io->io_pins[pin].io_irq);
 }
 
 int
 ioapic_disable_pin(void *cookie, u_int pin)
 {
 	struct ioapic *io;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_DISABLED)
 		return (EINVAL);
 	io->io_pins[pin].io_irq = IRQ_DISABLED;
 	if (bootverbose)
 		printf("ioapic%u: intpin %d disabled\n", io->io_id, pin);
 	return (0);
 }
 
 int
 ioapic_remap_vector(void *cookie, u_int pin, int vector)
 {
 	struct ioapic *io;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || vector < 0)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	io->io_pins[pin].io_irq = vector;
 	if (bootverbose)
 		printf("ioapic%u: Routing IRQ %d -> intpin %d\n", io->io_id,
 		    vector, pin);
 	return (0);
 }
 
 int
 ioapic_set_bus(void *cookie, u_int pin, int bus_type)
 {
 	struct ioapic *io;
 
 	if (bus_type < 0 || bus_type > APIC_BUS_MAX)
 		return (EINVAL);
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	if (io->io_pins[pin].io_bus == bus_type)
 		return (0);
 	io->io_pins[pin].io_bus = bus_type;
 	if (bootverbose)
 		printf("ioapic%u: intpin %d bus %s\n", io->io_id, pin,
 		    ioapic_bus_string(bus_type));
 	return (0);
 }
 
 int
 ioapic_set_nmi(void *cookie, u_int pin)
 {
 	struct ioapic *io;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_NMI)
 		return (0);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
 	io->io_pins[pin].io_irq = IRQ_NMI;
 	io->io_pins[pin].io_masked = 0;
 	io->io_pins[pin].io_edgetrigger = 1;
 	io->io_pins[pin].io_activehi = 1;
 	if (bootverbose)
 		printf("ioapic%u: Routing NMI -> intpin %d\n",
 		    io->io_id, pin);
 	return (0);
 }
 
 int
 ioapic_set_smi(void *cookie, u_int pin)
 {
 	struct ioapic *io;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_SMI)
 		return (0);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
 	io->io_pins[pin].io_irq = IRQ_SMI;
 	io->io_pins[pin].io_masked = 0;
 	io->io_pins[pin].io_edgetrigger = 1;
 	io->io_pins[pin].io_activehi = 1;
 	if (bootverbose)
 		printf("ioapic%u: Routing SMI -> intpin %d\n",
 		    io->io_id, pin);
 	return (0);
 }
 
 int
 ioapic_set_extint(void *cookie, u_int pin)
 {
 	struct ioapic *io;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_EXTINT)
 		return (0);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
 	io->io_pins[pin].io_irq = IRQ_EXTINT;
 	if (enable_extint)
 		io->io_pins[pin].io_masked = 0;
 	else
 		io->io_pins[pin].io_masked = 1;
 	io->io_pins[pin].io_edgetrigger = 1;
 	io->io_pins[pin].io_activehi = 1;
 	if (bootverbose)
 		printf("ioapic%u: Routing external 8259A's -> intpin %d\n",
 		    io->io_id, pin);
 	return (0);
 }
 
 int
 ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol)
 {
 	struct ioapic *io;
 	int activehi;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	activehi = (pol == INTR_POLARITY_HIGH);
 	if (io->io_pins[pin].io_activehi == activehi)
 		return (0);
 	io->io_pins[pin].io_activehi = activehi;
 	if (bootverbose)
 		printf("ioapic%u: intpin %d polarity: %s\n", io->io_id, pin,
 		    pol == INTR_POLARITY_HIGH ? "high" : "low");
 	return (0);
 }
 
 int
 ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger)
 {
 	struct ioapic *io;
 	int edgetrigger;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	edgetrigger = (trigger == INTR_TRIGGER_EDGE);
 	if (io->io_pins[pin].io_edgetrigger == edgetrigger)
 		return (0);
 	io->io_pins[pin].io_edgetrigger = edgetrigger;
 	if (bootverbose)
 		printf("ioapic%u: intpin %d trigger: %s\n", io->io_id, pin,
 		    trigger == INTR_TRIGGER_EDGE ? "edge" : "level");
 	return (0);
 }
 
 /*
  * Register a complete I/O APIC object with the interrupt subsystem.
  */
 void
 ioapic_register(void *cookie)
 {
 	struct ioapic_intsrc *pin;
 	struct ioapic *io;
 	volatile ioapic_t *apic;
 	uint32_t flags;
 	int i;
 
 	io = (struct ioapic *)cookie;
 	apic = io->io_addr;
 	mtx_lock_spin(&icu_lock);
 	flags = ioapic_read(apic, IOAPIC_VER) & IOART_VER_VERSION;
 	STAILQ_INSERT_TAIL(&ioapic_list, io, io_next);
 	mtx_unlock_spin(&icu_lock);
 	printf("ioapic%u <Version %u.%u> irqs %u-%u on motherboard\n",
 	    io->io_id, flags >> 4, flags & 0xf, io->io_intbase,
 	    io->io_intbase + io->io_numintr - 1);
 
 	/*
 	 * Reprogram pins to handle special case pins (such as NMI and
 	 * SMI) and register valid pins as interrupt sources.
 	 */
 	intr_register_pic(&io->io_pic);
 	for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) {
 		ioapic_reprogram_intpin(&pin->io_intsrc);
 		if (pin->io_irq < NUM_IO_INTS)
 			intr_register_source(&pin->io_intsrc);
 	}
 }
 
 /* A simple new-bus driver to consume PCI I/O APIC devices. */
 static int
 ioapic_pci_probe(device_t dev)
 {
 
 	if (pci_get_class(dev) == PCIC_BASEPERIPH &&
 	    pci_get_subclass(dev) == PCIS_BASEPERIPH_PIC) {
 		switch (pci_get_progif(dev)) {
 		case PCIP_BASEPERIPH_PIC_IO_APIC:
 			device_set_desc(dev, "IO APIC");
 			break;
 		case PCIP_BASEPERIPH_PIC_IOX_APIC:
 			device_set_desc(dev, "IO(x) APIC");
 			break;
 		default:
 			return (ENXIO);
 		}
 		device_quiet(dev);
 		return (-10000);
 	}
 	return (ENXIO);
 }
 
 static int
 ioapic_pci_attach(device_t dev)
 {
 
 	return (0);
 }
 
 static device_method_t ioapic_pci_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		ioapic_pci_probe),
 	DEVMETHOD(device_attach,	ioapic_pci_attach),
 
 	{ 0, 0 }
 };
 
 DEFINE_CLASS_0(ioapic, ioapic_pci_driver, ioapic_pci_methods, 0);
 
 static devclass_t ioapic_devclass;
 DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0);
 
 /*
  * A new-bus driver to consume the memory resources associated with
  * the APICs in the system.  On some systems ACPI or PnPBIOS system
  * resource devices may already claim these resources.  To keep from
  * breaking those devices, we attach ourselves to the nexus device after
  * legacy0 and acpi0 and ignore any allocation failures.
  */
 static void
 apic_identify(driver_t *driver, device_t parent)
 {
 
 	/*
 	 * Add at order 12.  acpi0 is probed at order 10 and legacy0
 	 * is probed at order 11.
 	 */
 	if (lapic_paddr != 0)
 		BUS_ADD_CHILD(parent, 12, "apic", 0);
 }
 
 static int
 apic_probe(device_t dev)
 {
 
 	device_set_desc(dev, "APIC resources");
 	device_quiet(dev);
 	return (0);
 }
 
 static void
 apic_add_resource(device_t dev, int rid, vm_paddr_t base, size_t length)
 {
 	int error;
 
 	error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length);
 	if (error)
 		panic("apic_add_resource: resource %d failed set with %d", rid,
 		    error);
 	bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0);
 }
 
 static int
 apic_attach(device_t dev)
 {
 	struct ioapic *io;
 	int i;
 
 	/* Reserve the local APIC. */
 	apic_add_resource(dev, 0, lapic_paddr, LAPIC_MEM_REGION);
 	i = 1;
 	STAILQ_FOREACH(io, &ioapic_list, io_next) {
 		apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION);
 		i++;
 	}
 	return (0);
 }
 
 static device_method_t apic_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_identify,	apic_identify),
 	DEVMETHOD(device_probe,		apic_probe),
 	DEVMETHOD(device_attach,	apic_attach),
 
 	{ 0, 0 }
 };
 
 DEFINE_CLASS_0(apic, apic_driver, apic_methods, 0);
 
 static devclass_t apic_devclass;
 DRIVER_MODULE(apic, nexus, apic_driver, apic_devclass, 0, 0);
 
 #include "opt_ddb.h"
 
 #ifdef DDB
 #include <ddb/ddb.h>
 
 static const char *
 ioapic_delivery_mode(uint32_t mode)
 {
 
 	switch (mode) {
 	case IOART_DELFIXED:
 		return ("fixed");
 	case IOART_DELLOPRI:
 		return ("lowestpri");
 	case IOART_DELSMI:
 		return ("SMI");
 	case IOART_DELRSV1:
 		return ("rsrvd1");
 	case IOART_DELNMI:
 		return ("NMI");
 	case IOART_DELINIT:
 		return ("INIT");
 	case IOART_DELRSV2:
 		return ("rsrvd2");
 	case IOART_DELEXINT:
 		return ("ExtINT");
 	default:
 		return ("");
 	}
 }
 
 static u_int
 db_ioapic_read(volatile ioapic_t *apic, int reg)
 {
 
 	apic->ioregsel = reg;
 	return (apic->iowin);
 }
 
 static void
 db_show_ioapic_one(volatile ioapic_t *io_addr)
 {
 	uint32_t r, lo, hi;
 	int mre, i;
 
 	r = db_ioapic_read(io_addr, IOAPIC_VER);
 	mre = (r & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT;
 	db_printf("Id 0x%08x Ver 0x%02x MRE %d\n",
 	    db_ioapic_read(io_addr, IOAPIC_ID), r & IOART_VER_VERSION, mre);
 	for (i = 0; i < mre; i++) {
 		lo = db_ioapic_read(io_addr, IOAPIC_REDTBL_LO(i));
 		hi = db_ioapic_read(io_addr, IOAPIC_REDTBL_HI(i));
 		db_printf("  pin %d Dest %s/%x %smasked Trig %s RemoteIRR %d "
 		    "Polarity %s Status %s DeliveryMode %s Vec %d\n", i,
 		    (lo & IOART_DESTMOD) == IOART_DESTLOG ? "log" : "phy",
 		    (hi & IOART_DEST) >> 24,
 		    (lo & IOART_INTMASK) == IOART_INTMSET ? "" : "not",
 		    (lo & IOART_TRGRMOD) == IOART_TRGRLVL ? "lvl" : "edge",
 		    (lo & IOART_REM_IRR) == IOART_REM_IRR ? 1 : 0,
 		    (lo & IOART_INTPOL) == IOART_INTALO ? "low" : "high",
 		    (lo & IOART_DELIVS) == IOART_DELIVS ? "pend" : "idle",
 		    ioapic_delivery_mode(lo & IOART_DELMOD),
 		    (lo & IOART_INTVEC));
 	  }
 }
 
 DB_SHOW_COMMAND(ioapic, db_show_ioapic)
 {
 	struct ioapic *ioapic;
 	int idx, i;
 
 	if (!have_addr) {
 		db_printf("usage: show ioapic index\n");
 		return;
 	}
 
 	idx = (int)addr;
 	i = 0;
 	STAILQ_FOREACH(ioapic, &ioapic_list, io_next) {
 		if (idx == i) {
 			db_show_ioapic_one(ioapic->io_addr);
 			break;
 		}
 		i++;
 	}
 }
 
 DB_SHOW_ALL_COMMAND(ioapics, db_show_all_ioapics)
 {
 	struct ioapic *ioapic;
 
 	STAILQ_FOREACH(ioapic, &ioapic_list, io_next)
 		db_show_ioapic_one(ioapic->io_addr);
 }
 #endif
Index: head/sys/x86/x86/local_apic.c
===================================================================
--- head/sys/x86/x86/local_apic.c	(revision 315958)
+++ head/sys/x86/x86/local_apic.c	(revision 315959)
@@ -1,2120 +1,2147 @@
 /*-
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  * Copyright (c) 1996, by Steve Passe
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Local APIC support on Pentium and later processors.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atpic.h"
 #include "opt_hwpmc_hooks.h"
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/timeet.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
 #include <machine/cpufunc.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <x86/init.h>
 
 #ifdef DDB
 #include <sys/interrupt.h>
 #include <ddb/ddb.h>
 #endif
 
 #ifdef __amd64__
 #define	SDT_APIC	SDT_SYSIGT
 #define	SDT_APICT	SDT_SYSIGT
 #define	GSEL_APIC	0
 #else
 #define	SDT_APIC	SDT_SYS386IGT
 #define	SDT_APICT	SDT_SYS386TGT
 #define	GSEL_APIC	GSEL(GCODE_SEL, SEL_KPL)
 #endif
 
+#define	INTEL_SEOI	1
+#define	AMD_SEOI	2
+
 /* Sanity checks on IDT vectors. */
 CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT);
 CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS);
 CTASSERT(APIC_LOCAL_INTS == 240);
 CTASSERT(IPI_STOP < APIC_SPURIOUS_INT);
 
 /* Magic IRQ values for the timer and syscalls. */
 #define	IRQ_TIMER	(NUM_IO_INTS + 1)
 #define	IRQ_SYSCALL	(NUM_IO_INTS + 2)
 #define	IRQ_DTRACE_RET	(NUM_IO_INTS + 3)
 #define	IRQ_EVTCHN	(NUM_IO_INTS + 4)
 
 enum lat_timer_mode {
 	LAT_MODE_UNDEF =	0,
 	LAT_MODE_PERIODIC =	1,
 	LAT_MODE_ONESHOT =	2,
 	LAT_MODE_DEADLINE =	3,
 };
 
 /*
  * Support for local APICs.  Local APICs manage interrupts on each
  * individual processor as opposed to I/O APICs which receive interrupts
  * from I/O devices and then forward them on to the local APICs.
  *
  * Local APICs can also send interrupts to each other, thus providing the
  * mechanism for IPIs.
  */
 
 struct lvt {
 	u_int lvt_edgetrigger:1;
 	u_int lvt_activehi:1;
 	u_int lvt_masked:1;
 	u_int lvt_active:1;
 	u_int lvt_mode:16;
 	u_int lvt_vector:8;
 };
 
 struct lapic {
 	struct lvt la_lvts[APIC_LVT_MAX + 1];
 	struct lvt la_elvts[APIC_ELVT_MAX + 1];
 	u_int la_id:8;
 	u_int la_cluster:4;
 	u_int la_cluster_id:2;
 	u_int la_present:1;
 	u_long *la_timer_count;
 	uint64_t la_timer_period;
 	enum lat_timer_mode la_timer_mode;
 	uint32_t lvt_timer_base;
 	uint32_t lvt_timer_last;
 	/* Include IDT_SYSCALL to make indexing easier. */
 	int la_ioint_irqs[APIC_NUM_IOINTS + 1];
 } static lapics[MAX_APIC_ID + 1];
 
 /* Global defaults for local APIC LVT entries. */
 static struct lvt lvts[APIC_LVT_MAX + 1] = {
 	{ 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 },	/* LINT0: masked ExtINT */
 	{ 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 },	/* LINT1: NMI */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT },	/* Timer */
 	{ 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT },	/* Error */
 	{ 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 },	/* PMC */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT },	/* Thermal */
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },	/* CMCI */
 };
 
 /* Global defaults for AMD local APIC ELVT entries. */
 static struct lvt elvts[APIC_ELVT_MAX + 1] = {
 	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
 	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT },
 	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
 	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
 };
 
 static inthand_t *ioint_handlers[] = {
 	NULL,			/* 0 - 31 */
 	IDTVEC(apic_isr1),	/* 32 - 63 */
 	IDTVEC(apic_isr2),	/* 64 - 95 */
 	IDTVEC(apic_isr3),	/* 96 - 127 */
 	IDTVEC(apic_isr4),	/* 128 - 159 */
 	IDTVEC(apic_isr5),	/* 160 - 191 */
 	IDTVEC(apic_isr6),	/* 192 - 223 */
 	IDTVEC(apic_isr7),	/* 224 - 255 */
 };
 
 
 static u_int32_t lapic_timer_divisors[] = {
 	APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
 	APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
 };
 
 extern inthand_t IDTVEC(rsvd);
 
 volatile char *lapic_map;
 vm_paddr_t lapic_paddr;
 int x2apic_mode;
 int lapic_eoi_suppression;
 static int lapic_timer_tsc_deadline;
 static u_long lapic_timer_divisor, count_freq;
 static struct eventtimer lapic_et;
 #ifdef SMP
 static uint64_t lapic_ipi_wait_mult;
 #endif
+#ifdef __amd64__
+/* IPI vector used for VMM VCPU notifications. */
+int vmm_ipinum;
+#endif
 
 SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options");
 SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, "");
 SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD,
     &lapic_eoi_suppression, 0, "");
 SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD,
     &lapic_timer_tsc_deadline, 0, "");
 
 static uint32_t
 lapic_read32(enum LAPIC_REGISTERS reg)
 {
 	uint32_t res;
 
 	if (x2apic_mode) {
 		res = rdmsr32(MSR_APIC_000 + reg);
 	} else {
 		res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL);
 	}
 	return (res);
 }
 
 static void
 lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val)
 {
 
 	if (x2apic_mode) {
 		mfence();
 		wrmsr(MSR_APIC_000 + reg, val);
 	} else {
 		*(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
 	}
 }
 
 static void
 lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val)
 {
 
 	if (x2apic_mode) {
 		wrmsr(MSR_APIC_000 + reg, val);
 	} else {
 		*(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
 	}
 }
 
 #ifdef SMP
 static uint64_t
 lapic_read_icr(void)
 {
 	uint64_t v;
 	uint32_t vhi, vlo;
 
 	if (x2apic_mode) {
 		v = rdmsr(MSR_APIC_000 + LAPIC_ICR_LO);
 	} else {
 		vhi = lapic_read32(LAPIC_ICR_HI);
 		vlo = lapic_read32(LAPIC_ICR_LO);
 		v = ((uint64_t)vhi << 32) | vlo;
 	}
 	return (v);
 }
 
 static uint64_t
 lapic_read_icr_lo(void)
 {
 
 	return (lapic_read32(LAPIC_ICR_LO));
 }
 
 static void
 lapic_write_icr(uint32_t vhi, uint32_t vlo)
 {
 	uint64_t v;
 
 	if (x2apic_mode) {
 		v = ((uint64_t)vhi << 32) | vlo;
 		mfence();
 		wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v);
 	} else {
 		lapic_write32(LAPIC_ICR_HI, vhi);
 		lapic_write32(LAPIC_ICR_LO, vlo);
 	}
 }
 #endif /* SMP */
 
 static void
 native_lapic_enable_x2apic(void)
 {
 	uint64_t apic_base;
 
 	apic_base = rdmsr(MSR_APICBASE);
 	apic_base |= APICBASE_X2APIC | APICBASE_ENABLED;
 	wrmsr(MSR_APICBASE, apic_base);
 }
 
 static bool
 native_lapic_is_x2apic(void)
 {
 	uint64_t apic_base;
 
 	apic_base = rdmsr(MSR_APICBASE);
 	return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) ==
 	    (APICBASE_X2APIC | APICBASE_ENABLED));
 }
 
 static void	lapic_enable(void);
 static void	lapic_resume(struct pic *pic, bool suspend_cancelled);
 static void	lapic_timer_oneshot(struct lapic *);
 static void	lapic_timer_oneshot_nointr(struct lapic *, uint32_t);
 static void	lapic_timer_periodic(struct lapic *);
 static void	lapic_timer_deadline(struct lapic *);
 static void	lapic_timer_stop(struct lapic *);
 static void	lapic_timer_set_divisor(u_int divisor);
 static uint32_t	lvt_mode(struct lapic *la, u_int pin, uint32_t value);
 static int	lapic_et_start(struct eventtimer *et,
 		    sbintime_t first, sbintime_t period);
 static int	lapic_et_stop(struct eventtimer *et);
 static u_int	apic_idt_to_irq(u_int apic_id, u_int vector);
 static void	lapic_set_tpr(u_int vector);
 
 struct pic lapic_pic = { .pic_resume = lapic_resume };
 
 /* Forward declarations for apic_ops */
 static void	native_lapic_create(u_int apic_id, int boot_cpu);
 static void	native_lapic_init(vm_paddr_t addr);
 static void	native_lapic_xapic_mode(void);
 static void	native_lapic_setup(int boot);
 static void	native_lapic_dump(const char *str);
 static void	native_lapic_disable(void);
-static void	native_lapic_eoi(void);
 static int	native_lapic_id(void);
 static int	native_lapic_intr_pending(u_int vector);
 static u_int	native_apic_cpuid(u_int apic_id);
 static u_int	native_apic_alloc_vector(u_int apic_id, u_int irq);
 static u_int	native_apic_alloc_vectors(u_int apic_id, u_int *irqs,
 		    u_int count, u_int align);
 static void 	native_apic_disable_vector(u_int apic_id, u_int vector);
 static void 	native_apic_enable_vector(u_int apic_id, u_int vector);
 static void 	native_apic_free_vector(u_int apic_id, u_int vector, u_int irq);
 static void 	native_lapic_set_logical_id(u_int apic_id, u_int cluster,
 		    u_int cluster_id);
 static int 	native_lapic_enable_pmc(void);
 static void 	native_lapic_disable_pmc(void);
 static void 	native_lapic_reenable_pmc(void);
 static void 	native_lapic_enable_cmc(void);
 static int 	native_lapic_enable_mca_elvt(void);
 static int 	native_lapic_set_lvt_mask(u_int apic_id, u_int lvt,
 		    u_char masked);
 static int 	native_lapic_set_lvt_mode(u_int apic_id, u_int lvt,
 		    uint32_t mode);
 static int 	native_lapic_set_lvt_polarity(u_int apic_id, u_int lvt,
 		    enum intr_polarity pol);
 static int 	native_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
 		    enum intr_trigger trigger);
 #ifdef SMP
 static void 	native_lapic_ipi_raw(register_t icrlo, u_int dest);
 static void 	native_lapic_ipi_vectored(u_int vector, int dest);
 static int 	native_lapic_ipi_wait(int delay);
 #endif /* SMP */
 static int	native_lapic_ipi_alloc(inthand_t *ipifunc);
 static void	native_lapic_ipi_free(int vector);
 
 struct apic_ops apic_ops = {
 	.create			= native_lapic_create,
 	.init			= native_lapic_init,
 	.xapic_mode		= native_lapic_xapic_mode,
 	.is_x2apic		= native_lapic_is_x2apic,
 	.setup			= native_lapic_setup,
 	.dump			= native_lapic_dump,
 	.disable		= native_lapic_disable,
 	.eoi			= native_lapic_eoi,
 	.id			= native_lapic_id,
 	.intr_pending		= native_lapic_intr_pending,
 	.set_logical_id		= native_lapic_set_logical_id,
 	.cpuid			= native_apic_cpuid,
 	.alloc_vector		= native_apic_alloc_vector,
 	.alloc_vectors		= native_apic_alloc_vectors,
 	.enable_vector		= native_apic_enable_vector,
 	.disable_vector		= native_apic_disable_vector,
 	.free_vector		= native_apic_free_vector,
 	.enable_pmc		= native_lapic_enable_pmc,
 	.disable_pmc		= native_lapic_disable_pmc,
 	.reenable_pmc		= native_lapic_reenable_pmc,
 	.enable_cmc		= native_lapic_enable_cmc,
 	.enable_mca_elvt	= native_lapic_enable_mca_elvt,
 #ifdef SMP
 	.ipi_raw		= native_lapic_ipi_raw,
 	.ipi_vectored		= native_lapic_ipi_vectored,
 	.ipi_wait		= native_lapic_ipi_wait,
 #endif
 	.ipi_alloc		= native_lapic_ipi_alloc,
 	.ipi_free		= native_lapic_ipi_free,
 	.set_lvt_mask		= native_lapic_set_lvt_mask,
 	.set_lvt_mode		= native_lapic_set_lvt_mode,
 	.set_lvt_polarity	= native_lapic_set_lvt_polarity,
 	.set_lvt_triggermode	= native_lapic_set_lvt_triggermode,
 };
 
 static uint32_t
 lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value)
 {
 
 	value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM |
 	    APIC_LVT_VECTOR);
 	if (lvt->lvt_edgetrigger == 0)
 		value |= APIC_LVT_TM;
 	if (lvt->lvt_activehi == 0)
 		value |= APIC_LVT_IIPP_INTALO;
 	if (lvt->lvt_masked)
 		value |= APIC_LVT_M;
 	value |= lvt->lvt_mode;
 	switch (lvt->lvt_mode) {
 	case APIC_LVT_DM_NMI:
 	case APIC_LVT_DM_SMI:
 	case APIC_LVT_DM_INIT:
 	case APIC_LVT_DM_EXTINT:
 		if (!lvt->lvt_edgetrigger && bootverbose) {
 			printf("lapic%u: Forcing LINT%u to edge trigger\n",
 			    la->la_id, pin);
 			value &= ~APIC_LVT_TM;
 		}
 		/* Use a vector of 0. */
 		break;
 	case APIC_LVT_DM_FIXED:
 		value |= lvt->lvt_vector;
 		break;
 	default:
 		panic("bad APIC LVT delivery mode: %#x\n", value);
 	}
 	return (value);
 }
 
 static uint32_t
 lvt_mode(struct lapic *la, u_int pin, uint32_t value)
 {
 	struct lvt *lvt;
 
 	KASSERT(pin <= APIC_LVT_MAX,
 	    ("%s: pin %u out of range", __func__, pin));
 	if (la->la_lvts[pin].lvt_active)
 		lvt = &la->la_lvts[pin];
 	else
 		lvt = &lvts[pin];
 
 	return (lvt_mode_impl(la, lvt, pin, value));
 }
 
 static uint32_t
 elvt_mode(struct lapic *la, u_int idx, uint32_t value)
 {
 	struct lvt *elvt;
 
 	KASSERT(idx <= APIC_ELVT_MAX,
 	    ("%s: idx %u out of range", __func__, idx));
 
 	elvt = &la->la_elvts[idx];
 	KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx));
 	KASSERT(elvt->lvt_edgetrigger,
 	    ("%s: ELVT%u is not edge triggered", __func__, idx));
 	KASSERT(elvt->lvt_activehi,
 	    ("%s: ELVT%u is not active high", __func__, idx));
 	return (lvt_mode_impl(la, elvt, idx, value));
 }
 
+static inline uint32_t
+amd_read_ext_features(void)
+{
+	uint32_t version;
+
+	if (cpu_vendor_id != CPU_VENDOR_AMD)
+		return (0);
+	version = lapic_read32(LAPIC_VERSION);
+	if ((version & APIC_VER_AMD_EXT_SPACE) != 0)
+		return (lapic_read32(LAPIC_EXT_FEATURES));
+	return (0);
+}
+
+static inline uint32_t
+amd_read_elvt_count(void)
+{
+	uint32_t extf;
+	uint32_t count;
+
+	extf = amd_read_ext_features();
+	count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT;
+	count = min(count, APIC_ELVT_MAX + 1);
+	return (count);
+}
+
 /*
  * Map the local APIC and setup necessary interrupt vectors.
  */
 static void
 native_lapic_init(vm_paddr_t addr)
 {
 #ifdef SMP
 	uint64_t r, r1, r2, rx;
 #endif
-	uint32_t ver;
+	uint32_t extf, ver;
 	u_int regs[4];
-	int i, arat;
+	int i, arat, seoi_enable;
 
 	/*
 	 * Enable x2APIC mode if possible. Map the local APIC
 	 * registers page.
 	 *
 	 * Keep the LAPIC registers page mapped uncached for x2APIC
 	 * mode too, to have the direct map page attribute set to
 	 * uncached.  This is needed to work around CPU errata present
 	 * on all Intel processors.
 	 */
 	KASSERT(trunc_page(addr) == addr,
 	    ("local APIC not aligned on a page boundary"));
 	lapic_paddr = addr;
 	lapic_map = pmap_mapdev(addr, PAGE_SIZE);
 	if (x2apic_mode) {
 		native_lapic_enable_x2apic();
 		lapic_map = NULL;
 	}
 
 	/* Setup the spurious interrupt handler. */
 	setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL,
 	    GSEL_APIC);
 
 	/* Perform basic initialization of the BSP's local APIC. */
 	lapic_enable();
 
 	/* Set BSP's per-CPU local APIC ID. */
 	PCPU_SET(apic_id, lapic_id());
 
 	/* Local APIC timer interrupt. */
 	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* Local APIC error interrupt. */
 	setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* XXX: Thermal interrupt */
 
 	/* Local APIC CMCI. */
 	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
 
 	if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
 		arat = 0;
 		/* Intel CPUID 0x06 EAX[2] set if APIC timer runs in C3. */
 		if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high >= 6) {
 			do_cpuid(0x06, regs);
 			if ((regs[0] & CPUTPM1_ARAT) != 0)
 				arat = 1;
 		}
 		bzero(&lapic_et, sizeof(lapic_et));
 		lapic_et.et_name = "LAPIC";
 		lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT |
 		    ET_FLAGS_PERCPU;
 		lapic_et.et_quality = 600;
 		if (!arat) {
 			lapic_et.et_flags |= ET_FLAGS_C3STOP;
 			lapic_et.et_quality = 100;
 		}
 		if ((cpu_feature & CPUID_TSC) != 0 &&
 		    (cpu_feature2 & CPUID2_TSCDLT) != 0 &&
 		    tsc_is_invariant && tsc_freq != 0) {
 			lapic_timer_tsc_deadline = 1;
 			TUNABLE_INT_FETCH("hw.lapic_tsc_deadline",
 			    &lapic_timer_tsc_deadline);
 		}
 
 		lapic_et.et_frequency = 0;
 		/* We don't know the frequency yet, so we are guessing. */
 		lapic_et.et_min_period = 0x00001000LL;
 		lapic_et.et_max_period = SBT_1S;
 		lapic_et.et_start = lapic_et_start;
 		lapic_et.et_stop = lapic_et_stop;
 		lapic_et.et_priv = NULL;
 		et_register(&lapic_et);
 	}
 
 	/*
 	 * Set lapic_eoi_suppression after lapic_enable(), so as not to
 	 * enable suppression in the hardware prematurely.  Note that
 	 * we enable suppression by default even when the system has
 	 * only one I/O APIC, since otherwise the EOI is broadcast to
 	 * all APIC agents, including CPUs.
 	 *
 	 * It seems that at least some KVM versions report the
 	 * EOI_SUPPRESSION bit, but auto-EOI does not work.
 	 */
 	ver = lapic_read32(LAPIC_VERSION);
 	if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) {
-		lapic_eoi_suppression = 1;
+		lapic_eoi_suppression = INTEL_SEOI;
+	} else {
+		extf = amd_read_ext_features();
+		if ((extf & APIC_EXTF_SEIO_CAP) != 0)
+			lapic_eoi_suppression = AMD_SEOI;
+	}
+	if (lapic_eoi_suppression != 0) {
+		seoi_enable = 1;
 		if (vm_guest == VM_GUEST_KVM) {
 			if (bootverbose)
 				printf(
 		       "KVM -- disabling lapic eoi suppression\n");
-			lapic_eoi_suppression = 0;
+			seoi_enable = 0;
 		}
 		TUNABLE_INT_FETCH("hw.lapic_eoi_suppression",
-		    &lapic_eoi_suppression);
+		    &seoi_enable);
+		if (seoi_enable == 0)
+			lapic_eoi_suppression = 0;
 	}
+	if (lapic_eoi_suppression != 0)
+		printf("LAPIC specific EOI enabled\n");
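 	/*
 	 * The hw.lapic_eoi_suppression knob fetched above is a loader
 	 * tunable, so setting it to 0 from loader.conf (illustrative)
 	 * force-disables suppression/specific EOI.
 	 */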
 
 #ifdef SMP
 #define	LOOPS	100000
 	/*
 	 * Calibrate the busy loop waiting for IPI ack in xAPIC mode.
 	 * lapic_ipi_wait_mult contains the number of iterations which
 	 * approximately delay execution for 1 microsecond (the
 	 * argument to native_lapic_ipi_wait() is in microseconds).
 	 *
 	 * We assume that TSC is present and already measured.
 	 * Possible TSC frequency jumps are irrelevant to the
 	 * calibration loop below, the CPU clock management code is
 	 * not yet started, and we do not enter sleep states.
 	 */
 	KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0,
 	    ("TSC not initialized"));
 	if (!x2apic_mode) {
 		r = rdtsc();
 		for (rx = 0; rx < LOOPS; rx++) {
 			(void)lapic_read_icr_lo();
 			ia32_pause();
 		}
 		r = rdtsc() - r;
 		r1 = tsc_freq * LOOPS;
 		r2 = r * 1000000;
 		lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1;
 		if (bootverbose) {
 			printf("LAPIC: ipi_wait() us multiplier %ju (r %ju "
 			    "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult,
 			    (uintmax_t)r, (uintmax_t)tsc_freq);
 		}
 	}
 #undef LOOPS
 #endif /* SMP */
 }
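 /*
  * Worked example for the calibration loop above (illustrative numbers):
  * with tsc_freq = 2e9 and the LOOPS = 100000 ICR reads measured at
  * r = 1e7 TSC cycles, r1 = 2e9 * 1e5 = 2e14 and r2 = 1e7 * 1e6 = 1e13,
  * so lapic_ipi_wait_mult = 20, i.e. roughly 20 ICR reads per
  * microsecond (100 cycles per read at a 2 GHz TSC).
  */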
 
 /*
  * Create a local APIC instance.
  */
 static void
 native_lapic_create(u_int apic_id, int boot_cpu)
 {
 	int i;
 
 	if (apic_id > MAX_APIC_ID) {
 		printf("APIC: Ignoring local APIC with ID %d\n", apic_id);
 		if (boot_cpu)
 			panic("Can't ignore BSP");
 		return;
 	}
 	KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u",
 	    apic_id));
 
 	/*
 	 * Assume no local LVT overrides and a cluster of 0 and
 	 * intra-cluster ID of 0.
 	 */
 	lapics[apic_id].la_present = 1;
 	lapics[apic_id].la_id = apic_id;
 	for (i = 0; i <= APIC_LVT_MAX; i++) {
 		lapics[apic_id].la_lvts[i] = lvts[i];
 		lapics[apic_id].la_lvts[i].lvt_active = 0;
 	}
 	for (i = 0; i <= APIC_ELVT_MAX; i++) {
 		lapics[apic_id].la_elvts[i] = elvts[i];
 		lapics[apic_id].la_elvts[i].lvt_active = 0;
 	}
 	for (i = 0; i <= APIC_NUM_IOINTS; i++)
 	    lapics[apic_id].la_ioint_irqs[i] = -1;
 	lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL;
 	lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] =
 	    IRQ_TIMER;
 #ifdef KDTRACE_HOOKS
 	lapics[apic_id].la_ioint_irqs[IDT_DTRACE_RET - APIC_IO_INTS] =
 	    IRQ_DTRACE_RET;
 #endif
 #ifdef XENHVM
 	lapics[apic_id].la_ioint_irqs[IDT_EVTCHN - APIC_IO_INTS] = IRQ_EVTCHN;
 #endif
 
 
 #ifdef SMP
 	cpu_add(apic_id, boot_cpu);
 #endif
 }
 
-static inline uint32_t
-amd_read_ext_features(void)
-{
-	uint32_t version;
-
-	if (cpu_vendor_id != CPU_VENDOR_AMD)
-		return (0);
-	version = lapic_read32(LAPIC_VERSION);
-	if ((version & APIC_VER_AMD_EXT_SPACE) != 0)
-		return (lapic_read32(LAPIC_EXT_FEATURES));
-	else
-		return (0);
-}
-
-static inline uint32_t
-amd_read_elvt_count(void)
-{
-	uint32_t extf;
-	uint32_t count;
-
-	extf = amd_read_ext_features();
-	count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT;
-	count = min(count, APIC_ELVT_MAX + 1);
-	return (count);
-}
-
 /*
  * Dump contents of local APIC registers
  */
 static void
 native_lapic_dump(const char* str)
 {
 	uint32_t version;
 	uint32_t maxlvt;
 	uint32_t extf;
 	int elvt_count;
 	int i;
 
 	version = lapic_read32(LAPIC_VERSION);
 	maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	printf("cpu%d %s:\n", PCPU_GET(cpuid), str);
 	printf("     ID: 0x%08x   VER: 0x%08x LDR: 0x%08x DFR: 0x%08x",
 	    lapic_read32(LAPIC_ID), version,
 	    lapic_read32(LAPIC_LDR), x2apic_mode ? 0 : lapic_read32(LAPIC_DFR));
 	if ((cpu_feature2 & CPUID2_X2APIC) != 0)
 		printf(" x2APIC: %d", x2apic_mode);
 	printf("\n  lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
 	    lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1),
 	    lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR));
 	printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x",
 	    lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL),
 	    lapic_read32(LAPIC_LVT_ERROR));
 	if (maxlvt >= APIC_LVT_PMC)
 		printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT));
 	printf("\n");
 	if (maxlvt >= APIC_LVT_CMCI)
 		printf("   cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI));
 	extf = amd_read_ext_features();
 	if (extf != 0) {
 		printf("   AMD ext features: 0x%08x\n", extf);
+		extf = lapic_read32(LAPIC_EXT_CTRL);
+		printf("    AMD ext control: 0x%08x\n", extf);
 		elvt_count = amd_read_elvt_count();
 		for (i = 0; i < elvt_count; i++)
-			printf("   AMD elvt%d: 0x%08x\n", i,
+			printf("          AMD elvt%d: 0x%08x\n", i,
 			    lapic_read32(LAPIC_EXT_LVT0 + i));
 	}
 }
 
 static void
 native_lapic_xapic_mode(void)
 {
 	register_t saveintr;
 
 	saveintr = intr_disable();
 	if (x2apic_mode)
 		native_lapic_enable_x2apic();
 	intr_restore(saveintr);
 }
 
 static void
 native_lapic_setup(int boot)
 {
 	struct lapic *la;
 	uint32_t version;
 	uint32_t maxlvt;
 	register_t saveintr;
 	char buf[MAXCOMLEN + 1];
 	int elvt_count;
 	int i;
 
 	saveintr = intr_disable();
 
 	la = &lapics[lapic_id()];
 	KASSERT(la->la_present, ("missing APIC structure"));
 	version = lapic_read32(LAPIC_VERSION);
 	maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 
 	/* Initialize the TPR to allow all interrupts. */
 	lapic_set_tpr(0);
 
 	/* Setup spurious vector and enable the local APIC. */
 	lapic_enable();
 
 	/* Program LINT[01] LVT entries. */
 	lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0,
 	    lapic_read32(LAPIC_LVT_LINT0)));
 	lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1,
 	    lapic_read32(LAPIC_LVT_LINT1)));
 
 	/* Program the PMC LVT entry if present. */
 	if (maxlvt >= APIC_LVT_PMC) {
 		lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
 		    LAPIC_LVT_PCINT));
 	}
 
 	/* Program timer LVT and setup handler. */
 	la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER,
 	    lapic_read32(LAPIC_LVT_TIMER));
 	la->lvt_timer_last = la->lvt_timer_base;
 	lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base);
 	if (boot) {
 		snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid));
 		intrcnt_add(buf, &la->la_timer_count);
 	}
 
 	/* Setup the timer if configured. */
 	if (la->la_timer_mode != LAT_MODE_UNDEF) {
 		KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor",
 		    lapic_id()));
 		switch (la->la_timer_mode) {
 		case LAT_MODE_PERIODIC:
 			lapic_timer_set_divisor(lapic_timer_divisor);
 			lapic_timer_periodic(la);
 			break;
 		case LAT_MODE_ONESHOT:
 			lapic_timer_set_divisor(lapic_timer_divisor);
 			lapic_timer_oneshot(la);
 			break;
 		case LAT_MODE_DEADLINE:
 			lapic_timer_deadline(la);
 			break;
 		default:
 			panic("corrupted la_timer_mode %p %d", la,
 			    la->la_timer_mode);
 		}
 	}
 
 	/* Program error LVT and clear any existing errors. */
 	lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR,
 	    lapic_read32(LAPIC_LVT_ERROR)));
 	lapic_write32(LAPIC_ESR, 0);
 
 	/* XXX: Thermal LVT */
 
 	/* Program the CMCI LVT entry if present. */
 	if (maxlvt >= APIC_LVT_CMCI) {
 		lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI,
 		    lapic_read32(LAPIC_LVT_CMCI)));
 	}
 
 	elvt_count = amd_read_elvt_count();
 	for (i = 0; i < elvt_count; i++) {
 		if (la->la_elvts[i].lvt_active)
 			lapic_write32(LAPIC_EXT_LVT0 + i,
 			    elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i)));
 	}
 
 	intr_restore(saveintr);
 }
 
 static void
 native_lapic_reenable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	uint32_t value;
 
 	value = lapic_read32(LAPIC_LVT_PCINT);
 	value &= ~APIC_LVT_M;
 	lapic_write32(LAPIC_LVT_PCINT, value);
 #endif
 }
 
 #ifdef HWPMC_HOOKS
 static void
 lapic_update_pmc(void *dummy)
 {
 	struct lapic *la;
 
 	la = &lapics[lapic_id()];
 	lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
 	    lapic_read32(LAPIC_LVT_PCINT)));
 }
 #endif
 
 static int
 native_lapic_enable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	u_int32_t maxlvt;
 
 	/* Fail if the local APIC is not present. */
 	if (!x2apic_mode && lapic_map == NULL)
 		return (0);
 
 	/* Fail if the PMC LVT is not present. */
 	maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	if (maxlvt < APIC_LVT_PMC)
 		return (0);
 
 	lvts[APIC_LVT_PMC].lvt_masked = 0;
 
 #ifdef EARLY_AP_STARTUP
 	MPASS(mp_ncpus == 1 || smp_started);
 	smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
 #else
 #ifdef SMP
 	/*
 	 * If hwpmc was loaded at boot time then the APs may not be
 	 * started yet.  In that case, don't forward the request to
 	 * them as they will program the lvt when they start.
 	 */
 	if (smp_started)
 		smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
 	else
 #endif
 		lapic_update_pmc(NULL);
 #endif
 	return (1);
 #else
 	return (0);
 #endif
 }
 
 static void
 native_lapic_disable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	u_int32_t maxlvt;
 
 	/* Fail if the local APIC is not present. */
 	if (!x2apic_mode && lapic_map == NULL)
 		return;
 
 	/* Fail if the PMC LVT is not present. */
 	maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	if (maxlvt < APIC_LVT_PMC)
 		return;
 
 	lvts[APIC_LVT_PMC].lvt_masked = 1;
 
 #ifdef SMP
 	/* The APs should always be started when hwpmc is unloaded. */
 	KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early"));
 #endif
 	smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
 #endif
 }
 
 static void
 lapic_calibrate_initcount(struct eventtimer *et, struct lapic *la)
 {
 	u_long value;
 
 	/* Start off with a divisor of 2 (power on reset default). */
 	lapic_timer_divisor = 2;
 	/* Try to calibrate the local APIC timer. */
 	do {
 		lapic_timer_set_divisor(lapic_timer_divisor);
 		lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT);
 		DELAY(1000000);
 		value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER);
 		if (value != APIC_TIMER_MAX_COUNT)
 			break;
 		lapic_timer_divisor <<= 1;
 	} while (lapic_timer_divisor <= 128);
 	if (lapic_timer_divisor > 128)
 		panic("lapic: Divisor too big");
 	if (bootverbose) {
 		printf("lapic: Divisor %lu, Frequency %lu Hz\n",
 		    lapic_timer_divisor, value);
 	}
 	count_freq = value;
 }
 
 static void
 lapic_calibrate_deadline(struct eventtimer *et, struct lapic *la __unused)
 {
 
 	if (bootverbose) {
 		printf("lapic: deadline tsc mode, Frequency %ju Hz\n",
 		    (uintmax_t)tsc_freq);
 	}
 }
 
 static void
 lapic_change_mode(struct eventtimer *et, struct lapic *la,
     enum lat_timer_mode newmode)
 {
 
 	if (la->la_timer_mode == newmode)
 		return;
 	switch (newmode) {
 	case LAT_MODE_PERIODIC:
 		lapic_timer_set_divisor(lapic_timer_divisor);
 		et->et_frequency = count_freq;
 		break;
 	case LAT_MODE_DEADLINE:
 		et->et_frequency = tsc_freq;
 		break;
 	case LAT_MODE_ONESHOT:
 		lapic_timer_set_divisor(lapic_timer_divisor);
 		et->et_frequency = count_freq;
 		break;
 	default:
 		panic("lapic_change_mode %d", newmode);
 	}
 	la->la_timer_mode = newmode;
 	et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
 	et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
 }
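
The period handling above (and in lapic_et_start() just below) is plain 32.32 fixed-point arithmetic on sbintime_t values. A hedged user-space sketch, assuming a 100 MHz count_freq chosen only for the example:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t sbintime_t;             /* FreeBSD's 32.32 fixed-point seconds */
    #define SBT_1S  ((sbintime_t)1 << 32)

    int
    main(void)
    {
        uint64_t freq = 100000000ULL;       /* assumed 100 MHz count_freq */
        sbintime_t period = SBT_1S / 1000;  /* request a 1 ms period */

        /* Same conversion as lapic_et_start(): Hz * seconds(32.32) >> 32. */
        uint32_t ticks = (uint32_t)(((uint64_t)(uint32_t)freq * period) >> 32);

        /* Bounds computed as in lapic_change_mode() above. */
        sbintime_t min_period = (0x00000002LLU << 32) / freq;
        sbintime_t max_period = (0xfffffffeLLU << 32) / freq;

        /* Prints 99999: ~100000 ticks, one lost to fixed-point truncation. */
        printf("1 ms -> %u timer ticks\n", ticks);
        printf("min %jd sbt, max %jd sbt\n",
            (intmax_t)min_period, (intmax_t)max_period);
        return (0);
    }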
 
 static int
 lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
 {
 	struct lapic *la;
 
 	la = &lapics[PCPU_GET(apic_id)];
 	if (et->et_frequency == 0) {
 		lapic_calibrate_initcount(et, la);
 		if (lapic_timer_tsc_deadline)
 			lapic_calibrate_deadline(et, la);
 	}
 	if (period != 0) {
 		lapic_change_mode(et, la, LAT_MODE_PERIODIC);
 		la->la_timer_period = ((uint32_t)et->et_frequency * period) >>
 		    32;
 		lapic_timer_periodic(la);
 	} else if (lapic_timer_tsc_deadline) {
 		lapic_change_mode(et, la, LAT_MODE_DEADLINE);
 		la->la_timer_period = (et->et_frequency * first) >> 32;
 		lapic_timer_deadline(la);
 	} else {
 		lapic_change_mode(et, la, LAT_MODE_ONESHOT);
 		la->la_timer_period = ((uint32_t)et->et_frequency * first) >>
 		    32;
 		lapic_timer_oneshot(la);
 	}
 	return (0);
 }
 
 static int
 lapic_et_stop(struct eventtimer *et)
 {
 	struct lapic *la;
 
 	la = &lapics[PCPU_GET(apic_id)];
 	lapic_timer_stop(la);
 	la->la_timer_mode = LAT_MODE_UNDEF;
 	return (0);
 }
 
 static void
 native_lapic_disable(void)
 {
 	uint32_t value;
 
 	/* Software disable the local APIC. */
 	value = lapic_read32(LAPIC_SVR);
 	value &= ~APIC_SVR_SWEN;
 	lapic_write32(LAPIC_SVR, value);
 }
 
 static void
 lapic_enable(void)
 {
 	uint32_t value;
 
 	/* Program the spurious vector to enable the local APIC. */
 	value = lapic_read32(LAPIC_SVR);
 	value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS);
 	value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT;
-	if (lapic_eoi_suppression)
+	if (lapic_eoi_suppression == INTEL_SEOI)
 		value |= APIC_SVR_EOI_SUPPRESSION;
 	lapic_write32(LAPIC_SVR, value);
+
+	if (lapic_eoi_suppression == AMD_SEOI) {
+		value = lapic_read32(LAPIC_EXT_CTRL);
+		value |= APIC_EXTF_SEIO_CAP;
+		lapic_write32(LAPIC_EXT_CTRL, value);
+	}
 }
 
 /* Reset the local APIC on the BSP during resume. */
 static void
 lapic_resume(struct pic *pic, bool suspend_cancelled)
 {
 
 	lapic_setup(0);
 }
 
 static int
 native_lapic_id(void)
 {
 	uint32_t v;
 
 	KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped"));
 	v = lapic_read32(LAPIC_ID);
 	if (!x2apic_mode)
 		v >>= APIC_ID_SHIFT;
 	return (v);
 }
 
 static int
 native_lapic_intr_pending(u_int vector)
 {
 	uint32_t irr;
 
 	/*
 	 * The IRR registers are an array of registers each of which
 	 * only describes 32 interrupts in the low 32 bits.  Thus, we
 	 * divide the vector by 32 to get the register index.
 	 * Finally, we take the vector modulo 32 to determine the
 	 * individual bit to test.
 	 */
 	irr = lapic_read32(LAPIC_IRR0 + vector / 32);
 	return (irr & 1 << (vector % 32));
 }
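
A tiny worked example of the IRR indexing above, with an arbitrarily chosen vector:

    #include <stdio.h>

    int
    main(void)
    {
        unsigned vector = 0x52;     /* arbitrary example vector */

        /* Prints "IRR register 2, bit 18". */
        printf("IRR register %u, bit %u\n", vector / 32, vector % 32);
        return (0);
    }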
 
 static void
 native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 	struct lapic *la;
 
 	KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist",
 	    __func__, apic_id));
 	KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big",
 	    __func__, cluster));
 	KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID,
 	    ("%s: intra cluster id %u too big", __func__, cluster_id));
 	la = &lapics[apic_id];
 	la->la_cluster = cluster;
 	la->la_cluster_id = cluster_id;
 }
 
 static int
 native_lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
 {
 
 	if (pin > APIC_LVT_MAX)
 		return (EINVAL);
 	if (apic_id == APIC_ID_ALL) {
 		lvts[pin].lvt_masked = masked;
 		if (bootverbose)
 			printf("lapic:");
 	} else {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		lapics[apic_id].la_lvts[pin].lvt_masked = masked;
 		lapics[apic_id].la_lvts[pin].lvt_active = 1;
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	}
 	if (bootverbose)
 		printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked");
 	return (0);
 }
 
 static int
 native_lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
 {
 	struct lvt *lvt;
 
 	if (pin > APIC_LVT_MAX)
 		return (EINVAL);
 	if (apic_id == APIC_ID_ALL) {
 		lvt = &lvts[pin];
 		if (bootverbose)
 			printf("lapic:");
 	} else {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		lvt = &lapics[apic_id].la_lvts[pin];
 		lvt->lvt_active = 1;
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	}
 	lvt->lvt_mode = mode;
 	switch (mode) {
 	case APIC_LVT_DM_NMI:
 	case APIC_LVT_DM_SMI:
 	case APIC_LVT_DM_INIT:
 	case APIC_LVT_DM_EXTINT:
 		lvt->lvt_edgetrigger = 1;
 		lvt->lvt_activehi = 1;
 		if (mode == APIC_LVT_DM_EXTINT)
 			lvt->lvt_masked = 1;
 		else
 			lvt->lvt_masked = 0;
 		break;
 	default:
 		panic("Unsupported delivery mode: 0x%x\n", mode);
 	}
 	if (bootverbose) {
 		printf(" Routing ");
 		switch (mode) {
 		case APIC_LVT_DM_NMI:
 			printf("NMI");
 			break;
 		case APIC_LVT_DM_SMI:
 			printf("SMI");
 			break;
 		case APIC_LVT_DM_INIT:
 			printf("INIT");
 			break;
 		case APIC_LVT_DM_EXTINT:
 			printf("ExtINT");
 			break;
 		}
 		printf(" -> LINT%u\n", pin);
 	}
 	return (0);
 }
 
 static int
 native_lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
 {
 
 	if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM)
 		return (EINVAL);
 	if (apic_id == APIC_ID_ALL) {
 		lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH);
 		if (bootverbose)
 			printf("lapic:");
 	} else {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		lapics[apic_id].la_lvts[pin].lvt_active = 1;
 		lapics[apic_id].la_lvts[pin].lvt_activehi =
 		    (pol == INTR_POLARITY_HIGH);
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	}
 	if (bootverbose)
 		printf(" LINT%u polarity: %s\n", pin,
 		    pol == INTR_POLARITY_HIGH ? "high" : "low");
 	return (0);
 }
 
 static int
 native_lapic_set_lvt_triggermode(u_int apic_id, u_int pin,
      enum intr_trigger trigger)
 {
 
 	if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM)
 		return (EINVAL);
 	if (apic_id == APIC_ID_ALL) {
 		lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE);
 		if (bootverbose)
 			printf("lapic:");
 	} else {
 		KASSERT(lapics[apic_id].la_present,
 		    ("%s: missing APIC %u", __func__, apic_id));
 		lapics[apic_id].la_lvts[pin].lvt_edgetrigger =
 		    (trigger == INTR_TRIGGER_EDGE);
 		lapics[apic_id].la_lvts[pin].lvt_active = 1;
 		if (bootverbose)
 			printf("lapic%u:", apic_id);
 	}
 	if (bootverbose)
 		printf(" LINT%u trigger: %s\n", pin,
 		    trigger == INTR_TRIGGER_EDGE ? "edge" : "level");
 	return (0);
 }
 
 /*
  * Adjust the TPR of the current CPU so that it blocks all interrupts below
  * the passed in vector.
  */
 static void
 lapic_set_tpr(u_int vector)
 {
 #ifdef CHEAP_TPR
 	lapic_write32(LAPIC_TPR, vector);
 #else
 	uint32_t tpr;
 
 	tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO;
 	tpr |= vector;
 	lapic_write32(LAPIC_TPR, tpr);
 #endif
 }
 
-static void
-native_lapic_eoi(void)
+void
+native_lapic_eoi(u_int vector)
 {
 
-	lapic_write32_nofence(LAPIC_EOI, 0);
+	if (lapic_eoi_suppression == AMD_SEOI)
+		lapic_write32(LAPIC_EXT_SEOI, vector);
+	else
+		lapic_write32_nofence(LAPIC_EOI, 0);
 }
 
 void
 lapic_handle_intr(int vector, struct trapframe *frame)
 {
 	struct intsrc *isrc;
 
 	isrc = intr_lookup_source(apic_idt_to_irq(PCPU_GET(apic_id),
 	    vector));
 	intr_execute_handlers(isrc, frame);
 }
 
 void
 lapic_handle_timer(struct trapframe *frame)
 {
 	struct lapic *la;
 	struct trapframe *oldframe;
 	struct thread *td;
 
 	/* Send EOI first thing. */
-	lapic_eoi();
+	lapic_eoi(APIC_TIMER_INT);
 
 #if defined(SMP) && !defined(SCHED_ULE)
 	/*
 	 * Don't do any accounting for the disabled HTT cores, since it
 	 * would provide misleading numbers to userland.
 	 *
 	 * No locking is necessary here, since even if we lose the race
 	 * when hlt_cpus_mask changes it is not a big deal, really.
 	 *
 	 * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask
 	 * and unlike other schedulers it actually schedules threads to
 	 * those CPUs.
 	 */
 	if (CPU_ISSET(PCPU_GET(cpuid), &hlt_cpus_mask))
 		return;
 #endif
 
 	/* Look up our local APIC structure for the tick counters. */
 	la = &lapics[PCPU_GET(apic_id)];
 	(*la->la_timer_count)++;
 	critical_enter();
 	if (lapic_et.et_active) {
 		td = curthread;
 		td->td_intr_nesting_level++;
 		oldframe = td->td_intr_frame;
 		td->td_intr_frame = frame;
 		lapic_et.et_event_cb(&lapic_et, lapic_et.et_arg);
 		td->td_intr_frame = oldframe;
 		td->td_intr_nesting_level--;
 	}
 	critical_exit();
 }
 
 static void
 lapic_timer_set_divisor(u_int divisor)
 {
 
 	KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor));
 	KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors),
 		("lapic: invalid divisor %u", divisor));
 	lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]);
 }
 
 static void
 lapic_timer_oneshot(struct lapic *la)
 {
 	uint32_t value;
 
 	value = la->lvt_timer_base;
 	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
 	value |= APIC_LVTT_TM_ONE_SHOT;
 	la->lvt_timer_last = value;
 	lapic_write32(LAPIC_LVT_TIMER, value);
 	lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
 }
 
 static void
 lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count)
 {
 	uint32_t value;
 
 	value = la->lvt_timer_base;
 	value &= ~APIC_LVTT_TM;
 	value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M;
 	la->lvt_timer_last = value;
 	lapic_write32(LAPIC_LVT_TIMER, value);
 	lapic_write32(LAPIC_ICR_TIMER, count);
 }
 
 static void
 lapic_timer_periodic(struct lapic *la)
 {
 	uint32_t value;
 
 	value = la->lvt_timer_base;
 	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
 	value |= APIC_LVTT_TM_PERIODIC;
 	la->lvt_timer_last = value;
 	lapic_write32(LAPIC_LVT_TIMER, value);
 	lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
 }
 
 static void
 lapic_timer_deadline(struct lapic *la)
 {
 	uint32_t value;
 
 	value = la->lvt_timer_base;
 	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
 	value |= APIC_LVTT_TM_TSCDLT;
 	if (value != la->lvt_timer_last) {
 		la->lvt_timer_last = value;
 		lapic_write32_nofence(LAPIC_LVT_TIMER, value);
 		if (!x2apic_mode)
 			mfence();
 	}
 	wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc());
 }
 
 static void
 lapic_timer_stop(struct lapic *la)
 {
 	uint32_t value;
 
 	if (la->la_timer_mode == LAT_MODE_DEADLINE) {
 		wrmsr(MSR_TSC_DEADLINE, 0);
 		mfence();
 	} else {
 		value = la->lvt_timer_base;
 		value &= ~APIC_LVTT_TM;
 		value |= APIC_LVT_M;
 		la->lvt_timer_last = value;
 		lapic_write32(LAPIC_LVT_TIMER, value);
 	}
 }
 
 void
 lapic_handle_cmc(void)
 {
 
-	lapic_eoi();
+	lapic_eoi(APIC_CMC_INT);
 	cmc_intr();
 }
 
 /*
  * Called from the mca_init() to activate the CMC interrupt if this CPU is
  * responsible for monitoring any MC banks for CMC events.  Since mca_init()
  * is called prior to lapic_setup() during boot, this just needs to unmask
  * this CPU's LVT_CMCI entry.
  */
 static void
 native_lapic_enable_cmc(void)
 {
 	u_int apic_id;
 
 #ifdef DEV_ATPIC
 	if (!x2apic_mode && lapic_map == NULL)
 		return;
 #endif
 	apic_id = PCPU_GET(apic_id);
 	KASSERT(lapics[apic_id].la_present,
 	    ("%s: missing APIC %u", __func__, apic_id));
 	lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_masked = 0;
 	lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_active = 1;
 	if (bootverbose)
 		printf("lapic%u: CMCI unmasked\n", apic_id);
 }
 
 static int
 native_lapic_enable_mca_elvt(void)
 {
 	u_int apic_id;
 	uint32_t value;
 	int elvt_count;
 
 #ifdef DEV_ATPIC
 	if (lapic_map == NULL)
 		return (-1);
 #endif
 
 	apic_id = PCPU_GET(apic_id);
 	KASSERT(lapics[apic_id].la_present,
 	    ("%s: missing APIC %u", __func__, apic_id));
 	elvt_count = amd_read_elvt_count();
 	if (elvt_count <= APIC_ELVT_MCA)
 		return (-1);
 
 	value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA);
 	if ((value & APIC_LVT_M) == 0) {
 		printf("AMD MCE Thresholding Extended LVT is already active\n");
 		return (-1);
 	}
 	lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0;
 	lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1;
 	if (bootverbose)
 		printf("lapic%u: MCE Thresholding ELVT unmasked\n", apic_id);
 	return (APIC_ELVT_MCA);
 }
 
 void
 lapic_handle_error(void)
 {
 	uint32_t esr;
 
 	/*
 	 * Read the contents of the error status register.  Write to
 	 * the register first before reading from it to force the APIC
 	 * to update its value to indicate any errors that have
 	 * occurred since the previous write to the register.
 	 */
 	lapic_write32(LAPIC_ESR, 0);
 	esr = lapic_read32(LAPIC_ESR);
 
 	printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr);
-	lapic_eoi();
+	lapic_eoi(APIC_ERROR_INT);
 }
 
 static u_int
 native_apic_cpuid(u_int apic_id)
 {
 #ifdef SMP
 	return apic_cpuids[apic_id];
 #else
 	return 0;
 #endif
 }
 
 /* Request a free IDT vector to be used by the specified IRQ. */
 static u_int
 native_apic_alloc_vector(u_int apic_id, u_int irq)
 {
 	u_int vector;
 
 	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
 
 	/*
 	 * Search for a free vector.  Currently we just use a very simple
 	 * algorithm to find the first free vector.
 	 */
 	mtx_lock_spin(&icu_lock);
 	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
 		if (lapics[apic_id].la_ioint_irqs[vector] != -1)
 			continue;
 		lapics[apic_id].la_ioint_irqs[vector] = irq;
 		mtx_unlock_spin(&icu_lock);
 		return (vector + APIC_IO_INTS);
 	}
 	mtx_unlock_spin(&icu_lock);
 	return (0);
 }
 
 /*
  * Request 'count' free contiguous IDT vectors to be used by 'count'
  * IRQs.  'count' must be a power of two and the vectors will be
  * aligned on a boundary of 'align'.  If the request cannot be
  * satisfied, 0 is returned.
  */
 static u_int
 native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 	u_int first, run, vector;
 
 	KASSERT(powerof2(count), ("bad count"));
 	KASSERT(powerof2(align), ("bad align"));
 	KASSERT(align >= count, ("align < count"));
 #ifdef INVARIANTS
 	for (run = 0; run < count; run++)
 		KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u",
 		    irqs[run], run));
 #endif
 
 	/*
 	 * Search for 'count' free vectors.  As with apic_alloc_vector(),
 	 * this just uses a simple first fit algorithm.
 	 */
 	run = 0;
 	first = 0;
 	mtx_lock_spin(&icu_lock);
 	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
 
 		/* Vector is in use, end run. */
 		if (lapics[apic_id].la_ioint_irqs[vector] != -1) {
 			run = 0;
 			first = 0;
 			continue;
 		}
 
 		/* Start a new run if run == 0 and vector is aligned. */
 		if (run == 0) {
 			if ((vector & (align - 1)) != 0)
 				continue;
 			first = vector;
 		}
 		run++;
 
 		/* Keep looping if the run isn't long enough yet. */
 		if (run < count)
 			continue;
 
 		/* Found a run, assign IRQs and return the first vector. */
 		for (vector = 0; vector < count; vector++)
 			lapics[apic_id].la_ioint_irqs[first + vector] =
 			    irqs[vector];
 		mtx_unlock_spin(&icu_lock);
 		return (first + APIC_IO_INTS);
 	}
 	mtx_unlock_spin(&icu_lock);
 	printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count);
 	return (0);
 }
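
The alignment requirement exists because a multi-message MSI device selects among its messages by modifying the low bits of the message data, so a group of 'count' vectors must start on an 'align' boundary. A small sketch of the boundary test used above (values invented for the example):

    #include <stdbool.h>
    #include <stdio.h>

    /* A run of 'count' vectors may only start on a multiple of 'align'. */
    static bool
    run_may_start_at(unsigned vector, unsigned align)
    {
        return ((vector & (align - 1)) == 0);
    }

    int
    main(void)
    {
        /* With align = 4: index 0x22 is rejected, index 0x24 may start a run. */
        printf("0x22: %d  0x24: %d\n",
            run_may_start_at(0x22, 4), run_may_start_at(0x24, 4));
        return (0);
    }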
 
 /*
  * Enable a vector for a particular apic_id.  Since all lapics share idt
  * entries and ioint_handlers this enables the vector on all lapics.  lapics
  * which do not have the vector configured would report spurious interrupts
  * should it fire.
  */
 static void
 native_apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
 	KASSERT(ioint_handlers[vector / 32] != NULL,
 	    ("No ISR handler for vector %u", vector));
 #ifdef KDTRACE_HOOKS
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
 	setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL,
 	    GSEL_APIC);
 }
 
 static void
 native_apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
 #ifdef KDTRACE_HOOKS
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
 	KASSERT(ioint_handlers[vector / 32] != NULL,
 	    ("No ISR handler for vector %u", vector));
 #ifdef notyet
 	/*
 	 * We cannot currently clear the IDT entry because other CPUs
 	 * may have a valid vector at this offset.
 	 */
 	setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
 #endif
 }
 
 /* Release an APIC vector when it's no longer in use. */
 static void
 native_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 	struct thread *td;
 
 	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
 	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
 	    ("Vector %u does not map to an IRQ line", vector));
 	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
 	KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] ==
 	    irq, ("IRQ mismatch"));
 #ifdef KDTRACE_HOOKS
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
 
 	/*
 	 * Bind us to the cpu that owned the vector before freeing it so
 	 * we don't lose an interrupt delivery race.
 	 */
 	td = curthread;
 	if (!rebooting) {
 		thread_lock(td);
 		if (sched_is_bound(td))
 			panic("apic_free_vector: Thread already bound.\n");
 		sched_bind(td, apic_cpuid(apic_id));
 		thread_unlock(td);
 	}
 	mtx_lock_spin(&icu_lock);
 	lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1;
 	mtx_unlock_spin(&icu_lock);
 	if (!rebooting) {
 		thread_lock(td);
 		sched_unbind(td);
 		thread_unlock(td);
 	}
 }
 
 /* Map an IDT vector (APIC) to an IRQ (interrupt source). */
 static u_int
 apic_idt_to_irq(u_int apic_id, u_int vector)
 {
 	int irq;
 
 	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
 	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
 	    ("Vector %u does not map to an IRQ line", vector));
 #ifdef KDTRACE_HOOKS
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
 	irq = lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS];
 	if (irq < 0)
 		irq = 0;
 	return (irq);
 }
 
 #ifdef DDB
 /*
  * Dump data about APIC IDT vector mappings.
  */
 DB_SHOW_COMMAND(apic, db_show_apic)
 {
 	struct intsrc *isrc;
 	int i, verbose;
 	u_int apic_id;
 	u_int irq;
 
 	if (strcmp(modif, "vv") == 0)
 		verbose = 2;
 	else if (strcmp(modif, "v") == 0)
 		verbose = 1;
 	else
 		verbose = 0;
 	for (apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) {
 		if (lapics[apic_id].la_present == 0)
 			continue;
 		db_printf("Interrupts bound to lapic %u\n", apic_id);
 		for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) {
 			irq = lapics[apic_id].la_ioint_irqs[i];
 			if (irq == -1 || irq == IRQ_SYSCALL)
 				continue;
 #ifdef KDTRACE_HOOKS
 			if (irq == IRQ_DTRACE_RET)
 				continue;
 #endif
 #ifdef XENHVM
 			if (irq == IRQ_EVTCHN)
 				continue;
 #endif
 			db_printf("vec 0x%2x -> ", i + APIC_IO_INTS);
 			if (irq == IRQ_TIMER)
 				db_printf("lapic timer\n");
 			else if (irq < NUM_IO_INTS) {
 				isrc = intr_lookup_source(irq);
 				if (isrc == NULL || verbose == 0)
 					db_printf("IRQ %u\n", irq);
 				else
 					db_dump_intr_event(isrc->is_event,
 					    verbose == 2);
 			} else
 				db_printf("IRQ %u ???\n", irq);
 		}
 	}
 }
 
 static void
 dump_mask(const char *prefix, uint32_t v, int base)
 {
 	int i, first;
 
 	first = 1;
 	for (i = 0; i < 32; i++)
 		if (v & (1 << i)) {
 			if (first) {
 				db_printf("%s:", prefix);
 				first = 0;
 			}
 			db_printf(" %02x", base + i);
 		}
 	if (!first)
 		db_printf("\n");
 }
 
 /* Show info from the lapic regs for this CPU. */
 DB_SHOW_COMMAND(lapic, db_show_lapic)
 {
 	uint32_t v;
 
 	db_printf("lapic ID = %d\n", lapic_id());
 	v = lapic_read32(LAPIC_VERSION);
 	db_printf("version  = %d.%d\n", (v & APIC_VER_VERSION) >> 4,
 	    v & 0xf);
 	db_printf("max LVT  = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT);
 	v = lapic_read32(LAPIC_SVR);
 	db_printf("SVR      = %02x (%s)\n", v & APIC_SVR_VECTOR,
 	    v & APIC_SVR_ENABLE ? "enabled" : "disabled");
 	db_printf("TPR      = %02x\n", lapic_read32(LAPIC_TPR));
 
 #define dump_field(prefix, regn, index)					\
 	dump_mask(__XSTRING(prefix ## index), 				\
 	    lapic_read32(LAPIC_ ## regn ## index),			\
 	    index * 32)
 
 	db_printf("In-service Interrupts:\n");
 	dump_field(isr, ISR, 0);
 	dump_field(isr, ISR, 1);
 	dump_field(isr, ISR, 2);
 	dump_field(isr, ISR, 3);
 	dump_field(isr, ISR, 4);
 	dump_field(isr, ISR, 5);
 	dump_field(isr, ISR, 6);
 	dump_field(isr, ISR, 7);
 
 	db_printf("TMR Interrupts:\n");
 	dump_field(tmr, TMR, 0);
 	dump_field(tmr, TMR, 1);
 	dump_field(tmr, TMR, 2);
 	dump_field(tmr, TMR, 3);
 	dump_field(tmr, TMR, 4);
 	dump_field(tmr, TMR, 5);
 	dump_field(tmr, TMR, 6);
 	dump_field(tmr, TMR, 7);
 
 	db_printf("IRR Interrupts:\n");
 	dump_field(irr, IRR, 0);
 	dump_field(irr, IRR, 1);
 	dump_field(irr, IRR, 2);
 	dump_field(irr, IRR, 3);
 	dump_field(irr, IRR, 4);
 	dump_field(irr, IRR, 5);
 	dump_field(irr, IRR, 6);
 	dump_field(irr, IRR, 7);
 
 #undef dump_field
 }
 #endif
 
 /*
  * APIC probing support code.  This includes code to manage enumerators.
  */
 
 static SLIST_HEAD(, apic_enumerator) enumerators =
 	SLIST_HEAD_INITIALIZER(enumerators);
 static struct apic_enumerator *best_enum;
 
 void
 apic_register_enumerator(struct apic_enumerator *enumerator)
 {
 #ifdef INVARIANTS
 	struct apic_enumerator *apic_enum;
 
 	SLIST_FOREACH(apic_enum, &enumerators, apic_next) {
 		if (apic_enum == enumerator)
 			panic("%s: Duplicate register of %s", __func__,
 			    enumerator->apic_name);
 	}
 #endif
 	SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next);
 }
 
 /*
  * We have to look for CPUs very early because certain subsystems want
  * to know how many CPUs we have very early on in the boot process.
  */
 static void
 apic_init(void *dummy __unused)
 {
 	struct apic_enumerator *enumerator;
 	int retval, best;
 
 	/* We only support built in local APICs. */
 	if (!(cpu_feature & CPUID_APIC))
 		return;
 
 	/* Don't probe if APIC mode is disabled. */
 	if (resource_disabled("apic", 0))
 		return;
 
 	/* Probe all the enumerators to find the best match. */
 	best_enum = NULL;
 	best = 0;
 	SLIST_FOREACH(enumerator, &enumerators, apic_next) {
 		retval = enumerator->apic_probe();
 		if (retval > 0)
 			continue;
 		if (best_enum == NULL || best < retval) {
 			best_enum = enumerator;
 			best = retval;
 		}
 	}
 	if (best_enum == NULL) {
 		if (bootverbose)
 			printf("APIC: Could not find any APICs.\n");
 #ifndef DEV_ATPIC
 		panic("running without device atpic requires a local APIC");
 #endif
 		return;
 	}
 
 	if (bootverbose)
 		printf("APIC: Using the %s enumerator.\n",
 		    best_enum->apic_name);
 
 #ifdef I686_CPU
 	/*
 	 * To work around an erratum, we disable the local APIC on some
 	 * CPUs during early startup.  We need to turn the local APIC
 	 * back on for such CPUs now.
 	 */
 	ppro_reenable_apic();
 #endif
 
 	/* Probe the CPUs in the system. */
 	retval = best_enum->apic_probe_cpus();
 	if (retval != 0)
 		printf("%s: Failed to probe CPUs: returned %d\n",
 		    best_enum->apic_name, retval);
 
 }
 SYSINIT(apic_init, SI_SUB_TUNABLES - 1, SI_ORDER_SECOND, apic_init, NULL);
 
 /*
  * Setup the local APIC.  We have to do this prior to starting up the APs
  * in the SMP case.
  */
 static void
 apic_setup_local(void *dummy __unused)
 {
 	int retval;
 
 	if (best_enum == NULL)
 		return;
 
 	/* Initialize the local APIC. */
 	retval = best_enum->apic_setup_local();
 	if (retval != 0)
 		printf("%s: Failed to setup the local APIC: returned %d\n",
 		    best_enum->apic_name, retval);
 }
 SYSINIT(apic_setup_local, SI_SUB_CPU, SI_ORDER_SECOND, apic_setup_local, NULL);
 
 /*
  * Setup the I/O APICs.
  */
 static void
 apic_setup_io(void *dummy __unused)
 {
 	int retval;
 
 	if (best_enum == NULL)
 		return;
 
 	/*
 	 * Local APIC must be registered before other PICs and pseudo PICs
 	 * for proper suspend/resume order.
 	 */
 	intr_register_pic(&lapic_pic);
 
 	retval = best_enum->apic_setup_io();
 	if (retval != 0)
 		printf("%s: Failed to setup I/O APICs: returned %d\n",
 		    best_enum->apic_name, retval);
 
 	/*
 	 * Finish setting up the local APIC on the BSP once we know
 	 * how to properly program the LINT pins.  In particular, this
 	 * enables EOI suppression mode if the LAPIC supports it and
 	 * the user did not disable it.
 	 */
 	lapic_setup(1);
 	if (bootverbose)
 		lapic_dump("BSP");
 
 	/* Enable the MSI "pic". */
 	init_ops.msi_init();
 }
 SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL);
 
 #ifdef SMP
 /*
  * Inter Processor Interrupt functions.  The lapic_ipi_*() functions are
  * private to the MD code.  The public interface for the rest of the
  * kernel is defined in mp_machdep.c.
  */
 
 /*
  * Wait delay microseconds for IPI to be sent.  If delay is -1, we
  * wait forever.
  */
 static int
 native_lapic_ipi_wait(int delay)
 {
 	uint64_t rx;
 
 	/* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */
 	if (x2apic_mode)
 		return (1);
 
 	for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) {
 		if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) ==
 		    APIC_DELSTAT_IDLE)
 			return (1);
 		ia32_pause();
 	}
 	return (0);
 }
 
 static void
 native_lapic_ipi_raw(register_t icrlo, u_int dest)
 {
 	uint64_t icr;
 	uint32_t vhi, vlo;
 	register_t saveintr;
 
 	/* XXX: Need more sanity checking of icrlo? */
 	KASSERT(x2apic_mode || lapic_map != NULL,
 	    ("%s called too early", __func__));
 	KASSERT(x2apic_mode ||
 	    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 	    ("%s: invalid dest field", __func__));
 	KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0,
 	    ("%s: reserved bits set in ICR LO register", __func__));
 
 	/* Set destination in ICR HI register if it is being used. */
 	if (!x2apic_mode) {
 		saveintr = intr_disable();
 		icr = lapic_read_icr();
 	}
 
 	if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) {
 		if (x2apic_mode) {
 			vhi = dest;
 		} else {
 			vhi = icr >> 32;
 			vhi &= ~APIC_ID_MASK;
 			vhi |= dest << APIC_ID_SHIFT;
 		}
 	} else {
 		vhi = 0;
 	}
 
 	/* Program the contents of the IPI and dispatch it. */
 	if (x2apic_mode) {
 		vlo = icrlo;
 	} else {
 		vlo = icr;
 		vlo &= APIC_ICRLO_RESV_MASK;
 		vlo |= icrlo;
 	}
 	lapic_write_icr(vhi, vlo);
 	if (!x2apic_mode)
 		intr_restore(saveintr);
 }
 
 #define	BEFORE_SPIN	50000
 #ifdef DETECT_DEADLOCK
 #define	AFTER_SPIN	50
 #endif
 
 static void
 native_lapic_ipi_vectored(u_int vector, int dest)
 {
 	register_t icrlo, destfield;
 
 	KASSERT((vector & ~APIC_VECTOR_MASK) == 0,
 	    ("%s: invalid vector %d", __func__, vector));
 
 	icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT;
 
 	/*
 	 * NMI IPIs are just fake vectors used to send an NMI.  Use the special
 	 * rules for NMIs if one is passed; otherwise specify the vector.
 	 */
 	if (vector >= IPI_NMI_FIRST)
 		icrlo |= APIC_DELMODE_NMI;
 	else
 		icrlo |= vector | APIC_DELMODE_FIXED;
 	destfield = 0;
 	switch (dest) {
 	case APIC_IPI_DEST_SELF:
 		icrlo |= APIC_DEST_SELF;
 		break;
 	case APIC_IPI_DEST_ALL:
 		icrlo |= APIC_DEST_ALLISELF;
 		break;
 	case APIC_IPI_DEST_OTHERS:
 		icrlo |= APIC_DEST_ALLESELF;
 		break;
 	default:
 		KASSERT(x2apic_mode ||
 		    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 		    ("%s: invalid destination 0x%x", __func__, dest));
 		destfield = dest;
 	}
 
 	/* Wait for an earlier IPI to finish. */
 	if (!lapic_ipi_wait(BEFORE_SPIN)) {
 		if (panicstr != NULL)
 			return;
 		else
 			panic("APIC: Previous IPI is stuck");
 	}
 
 	lapic_ipi_raw(icrlo, destfield);
 
 #ifdef DETECT_DEADLOCK
 	/* Wait for IPI to be delivered. */
 	if (!lapic_ipi_wait(AFTER_SPIN)) {
 #ifdef needsattention
 		/*
 		 * XXX FIXME:
 		 *
 		 * The above function waits for the message to actually be
 		 * delivered.  It breaks out after an arbitrary timeout
 		 * since the message should eventually be delivered (at
 		 * least in theory), and if it was not we would catch the
 		 * failure with the check above when the next IPI is
 		 * sent.
 		 *
 		 * We could skip this wait entirely, EXCEPT it probably
 		 * protects us from other routines that assume that the
 		 * message was delivered and acted upon when this function
 		 * returns.
 		 */
 		printf("APIC: IPI might be stuck\n");
 #else /* !needsattention */
 		/* Wait, without a timeout, until the message is sent. */
 		while (lapic_read_icr_lo() & APIC_DELSTAT_PEND)
 			ia32_pause();
 #endif /* needsattention */
 	}
 #endif /* DETECT_DEADLOCK */
 }
 
 #endif /* SMP */
 
 /*
  * Since the IDT is shared by all CPUs the IPI slot update needs to be globally
  * visible.
  *
  * Consider the case where an IPI is generated immediately after allocation:
  *     vector = lapic_ipi_alloc(ipifunc);
  *     ipi_selected(other_cpus, vector);
  *
  * In xAPIC mode a write to ICR_LO has serializing semantics because the
  * APIC page is mapped as an uncached region. In x2APIC mode there is an
  * explicit 'mfence' before the ICR MSR is written. Therefore in both cases
  * the IDT slot update is globally visible before the IPI is delivered.
  */
 static int
 native_lapic_ipi_alloc(inthand_t *ipifunc)
 {
 	struct gate_descriptor *ip;
 	long func;
 	int idx, vector;
 
 	KASSERT(ipifunc != &IDTVEC(rsvd), ("invalid ipifunc %p", ipifunc));
 
 	vector = -1;
 	mtx_lock_spin(&icu_lock);
 	for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
 		ip = &idt[idx];
 		func = (ip->gd_hioffset << 16) | ip->gd_looffset;
 		if (func == (uintptr_t)&IDTVEC(rsvd)) {
 			vector = idx;
 			setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC);
 			break;
 		}
 	}
 	mtx_unlock_spin(&icu_lock);
 	return (vector);
 }
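
A hedged usage sketch of the dynamic IPI interface, following the allocation pattern described in the comment above; the examplefunc entry point and the example_ipi_* names are hypothetical, and the required headers are assumed to be those already used by the existing IPI consumers:

    /* Hypothetical low-level entry point, built like the existing IPI handlers. */
    extern inthand_t IDTVEC(examplefunc);

    static int example_ipi_vector = -1;

    static void
    example_ipi_attach(void)
    {
        example_ipi_vector = lapic_ipi_alloc(&IDTVEC(examplefunc));
        if (example_ipi_vector < 0)
            panic("example: no free dynamic IPI slots");
    }

    static void
    example_ipi_broadcast(void)
    {
        /* The IDT slot update is globally visible before the IPI, see above. */
        ipi_all_but_self(example_ipi_vector);
    }

    static void
    example_ipi_detach(void)
    {
        lapic_ipi_free(example_ipi_vector);
        example_ipi_vector = -1;
    }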
 
 static void
 native_lapic_ipi_free(int vector)
 {
 	struct gate_descriptor *ip;
 	long func;
 
 	KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST,
 	    ("%s: invalid vector %d", __func__, vector));
 
 	mtx_lock_spin(&icu_lock);
 	ip = &idt[vector];
 	func = (ip->gd_hioffset << 16) | ip->gd_looffset;
 	KASSERT(func != (uintptr_t)&IDTVEC(rsvd),
 	    ("invalid idtfunc %#lx", func));
 	setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
 	mtx_unlock_spin(&icu_lock);
 }
Index: head/sys/x86/x86/msi.c
===================================================================
--- head/sys/x86/x86/msi.c	(revision 315958)
+++ head/sys/x86/x86/msi.c	(revision 315959)
@@ -1,730 +1,732 @@
 /*-
  * Copyright (c) 2006 Yahoo!, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Support for PCI Message Signalled Interrupts (MSI).  MSI interrupts on
  * x86 are basically APIC messages that the northbridge delivers directly
  * to the local APICs as if they had come from an I/O APIC.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <x86/apicreg.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <x86/iommu/iommu_intrmap.h>
 #include <machine/specialreg.h>
 #include <dev/pci/pcivar.h>
 
 /* Fields in address for Intel MSI messages. */
 #define	MSI_INTEL_ADDR_DEST		0x000ff000
 #define	MSI_INTEL_ADDR_RH		0x00000008
 # define MSI_INTEL_ADDR_RH_ON		0x00000008
 # define MSI_INTEL_ADDR_RH_OFF		0x00000000
 #define	MSI_INTEL_ADDR_DM		0x00000004
 # define MSI_INTEL_ADDR_DM_PHYSICAL	0x00000000
 # define MSI_INTEL_ADDR_DM_LOGICAL	0x00000004
 
 /* Fields in data for Intel MSI messages. */
 #define	MSI_INTEL_DATA_TRGRMOD		IOART_TRGRMOD	/* Trigger mode. */
 # define MSI_INTEL_DATA_TRGREDG		IOART_TRGREDG
 # define MSI_INTEL_DATA_TRGRLVL		IOART_TRGRLVL
 #define	MSI_INTEL_DATA_LEVEL		0x00004000	/* Polarity. */
 # define MSI_INTEL_DATA_DEASSERT	0x00000000
 # define MSI_INTEL_DATA_ASSERT		0x00004000
 #define	MSI_INTEL_DATA_DELMOD		IOART_DELMOD	/* Delivery mode. */
 # define MSI_INTEL_DATA_DELFIXED	IOART_DELFIXED
 # define MSI_INTEL_DATA_DELLOPRI	IOART_DELLOPRI
 # define MSI_INTEL_DATA_DELSMI		IOART_DELSMI
 # define MSI_INTEL_DATA_DELNMI		IOART_DELNMI
 # define MSI_INTEL_DATA_DELINIT		IOART_DELINIT
 # define MSI_INTEL_DATA_DELEXINT	IOART_DELEXINT
 #define	MSI_INTEL_DATA_INTVEC		IOART_INTVEC	/* Interrupt vector. */
 
 /*
  * Build Intel MSI message and data values from a source.  AMD64 systems
  * seem to be compatible, so we use the same function for both.
  */
 #define	INTEL_ADDR(msi)							\
 	(MSI_INTEL_ADDR_BASE | (msi)->msi_cpu << 12 |			\
 	    MSI_INTEL_ADDR_RH_OFF | MSI_INTEL_ADDR_DM_PHYSICAL)
 #define	INTEL_DATA(msi)							\
 	(MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED | (msi)->msi_vector)
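
A worked example of the INTEL_ADDR()/INTEL_DATA() encoding, rendered as a stand-alone user-space program. It assumes MSI_INTEL_ADDR_BASE is 0xfee00000 and that the edge-trigger and fixed-delivery flags are zero; the APIC ID and vector are invented:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint32_t cpu = 3;           /* invented local APIC ID */
        uint32_t vector = 0x62;     /* invented IDT vector */
        uint64_t addr = 0xfee00000u | (cpu << 12);  /* RH off, physical mode */
        uint32_t data = vector;     /* edge trigger, fixed delivery */

        /* Prints "addr 0xfee03000 data 0x0062". */
        printf("addr 0x%jx data 0x%04x\n", (uintmax_t)addr, data);
        return (0);
    }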
 
 static MALLOC_DEFINE(M_MSI, "msi", "PCI MSI");
 
 /*
  * MSI sources are bunched into groups.  This is because MSI forces
  * all of the messages to share the address and data registers and
  * thus certain properties (such as the local APIC ID target on x86).
  * Each group has a 'first' source that contains information global to
  * the group.  These fields are marked with (g) below.
  *
  * Note that local APIC ID is kind of special.  Each message will be
  * assigned an ID by the system; however, a group will use the ID from
  * the first message.
  *
  * For MSI-X, each message is isolated.
  */
 struct msi_intsrc {
 	struct intsrc msi_intsrc;
 	device_t msi_dev;		/* Owning device. (g) */
 	struct msi_intsrc *msi_first;	/* First source in group. */
 	u_int msi_irq;			/* IRQ cookie. */
 	u_int msi_msix;			/* MSI-X message. */
 	u_int msi_vector:8;		/* IDT vector. */
 	u_int msi_cpu;			/* Local APIC ID. (g) */
 	u_int msi_count:8;		/* Messages in this group. (g) */
 	u_int msi_maxcount:8;		/* Alignment for this group. (g) */
 	int *msi_irqs;			/* Group's IRQ list. (g) */
 	u_int msi_remap_cookie;
 };
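
For context, a hedged sketch of how a PCI driver typically reaches msi_alloc()/msi_map() below through the newbus and pci(4) layers; the mydrv_* names and the softc fields are invented, and error handling is trimmed:

    static int
    mydrv_setup_msi(device_t dev, struct mydrv_softc *sc)
    {
        int count, rid;

        count = pci_msi_count(dev);     /* messages the function advertises */
        if (count < 1 || pci_alloc_msi(dev, &count) != 0)
            return (ENXIO);             /* caller would fall back to INTx */

        rid = 1;                        /* MSI/MSI-X resource IDs start at 1 */
        sc->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
            RF_ACTIVE);
        if (sc->irq_res == NULL)
            return (ENXIO);
        return (bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC | INTR_MPSAFE,
            NULL, mydrv_intr, sc, &sc->irq_cookie));
    }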
 
 static void	msi_create_source(void);
 static void	msi_enable_source(struct intsrc *isrc);
 static void	msi_disable_source(struct intsrc *isrc, int eoi);
 static void	msi_eoi_source(struct intsrc *isrc);
 static void	msi_enable_intr(struct intsrc *isrc);
 static void	msi_disable_intr(struct intsrc *isrc);
 static int	msi_vector(struct intsrc *isrc);
 static int	msi_source_pending(struct intsrc *isrc);
 static int	msi_config_intr(struct intsrc *isrc, enum intr_trigger trig,
 		    enum intr_polarity pol);
 static int	msi_assign_cpu(struct intsrc *isrc, u_int apic_id);
 
 struct pic msi_pic = {
 	.pic_enable_source = msi_enable_source,
 	.pic_disable_source = msi_disable_source,
 	.pic_eoi_source = msi_eoi_source,
 	.pic_enable_intr = msi_enable_intr,
 	.pic_disable_intr = msi_disable_intr,
 	.pic_vector = msi_vector,
 	.pic_source_pending = msi_source_pending,
 	.pic_suspend = NULL,
 	.pic_resume = NULL,
 	.pic_config_intr = msi_config_intr,
 	.pic_assign_cpu = msi_assign_cpu,
 	.pic_reprogram_pin = NULL,
 };
 
 #ifdef SMP
 /**
  * Xen hypervisors prior to 4.6.0 do not properly handle updates to
  * enabled MSI-X table entries.  Allow migration of MSI-X interrupts
  * to be disabled via a tunable. Values have the following meaning:
  *
  * -1: automatic detection by FreeBSD
  *  0: enable migration
  *  1: disable migration
  */
 int msix_disable_migration = -1;
 SYSCTL_INT(_machdep, OID_AUTO, disable_msix_migration, CTLFLAG_RDTUN,
     &msix_disable_migration, 0,
     "Disable migration of MSI-X interrupts between CPUs");
 #endif
 
 static int msi_enabled;
 static int msi_last_irq;
 static struct mtx msi_lock;
 
 static void
 msi_enable_source(struct intsrc *isrc)
 {
 }
 
 static void
 msi_disable_source(struct intsrc *isrc, int eoi)
 {
+	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
 
 	if (eoi == PIC_EOI)
-		lapic_eoi();
+		lapic_eoi(msi->msi_vector);
 }
 
 static void
 msi_eoi_source(struct intsrc *isrc)
 {
+	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
 
-	lapic_eoi();
+	lapic_eoi(msi->msi_vector);
 }
 
 static void
 msi_enable_intr(struct intsrc *isrc)
 {
 	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
 
 	apic_enable_vector(msi->msi_cpu, msi->msi_vector);
 }
 
 static void
 msi_disable_intr(struct intsrc *isrc)
 {
 	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
 
 	apic_disable_vector(msi->msi_cpu, msi->msi_vector);
 }
 
 static int
 msi_vector(struct intsrc *isrc)
 {
 	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
 
 	return (msi->msi_irq);
 }
 
 static int
 msi_source_pending(struct intsrc *isrc)
 {
 
 	return (0);
 }
 
 static int
 msi_config_intr(struct intsrc *isrc, enum intr_trigger trig,
     enum intr_polarity pol)
 {
 
 	return (ENODEV);
 }
 
 static int
 msi_assign_cpu(struct intsrc *isrc, u_int apic_id)
 {
 	struct msi_intsrc *sib, *msi = (struct msi_intsrc *)isrc;
 	int old_vector;
 	u_int old_id;
 	int i, vector;
 
 	/*
 	 * Only allow CPUs to be assigned to the first message for an
 	 * MSI group.
 	 */
 	if (msi->msi_first != msi)
 		return (EINVAL);
 
 #ifdef SMP
 	if (msix_disable_migration && msi->msi_msix)
 		return (EINVAL);
 #endif
 
 	/* Store information to free existing irq. */
 	old_vector = msi->msi_vector;
 	old_id = msi->msi_cpu;
 	if (old_id == apic_id)
 		return (0);
 
 	/* Allocate IDT vectors on this cpu. */
 	if (msi->msi_count > 1) {
 		KASSERT(msi->msi_msix == 0, ("MSI-X message group"));
 		vector = apic_alloc_vectors(apic_id, msi->msi_irqs,
 		    msi->msi_count, msi->msi_maxcount);
 	} else
 		vector = apic_alloc_vector(apic_id, msi->msi_irq);
 	if (vector == 0)
 		return (ENOSPC);
 
 	msi->msi_cpu = apic_id;
 	msi->msi_vector = vector;
 	if (msi->msi_intsrc.is_handlers > 0)
 		apic_enable_vector(msi->msi_cpu, msi->msi_vector);
 	if (bootverbose)
 		printf("msi: Assigning %s IRQ %d to local APIC %u vector %u\n",
 		    msi->msi_msix ? "MSI-X" : "MSI", msi->msi_irq,
 		    msi->msi_cpu, msi->msi_vector);
 	for (i = 1; i < msi->msi_count; i++) {
 		sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]);
 		sib->msi_cpu = apic_id;
 		sib->msi_vector = vector + i;
 		if (sib->msi_intsrc.is_handlers > 0)
 			apic_enable_vector(sib->msi_cpu, sib->msi_vector);
 		if (bootverbose)
 			printf(
 		    "msi: Assigning MSI IRQ %d to local APIC %u vector %u\n",
 			    sib->msi_irq, sib->msi_cpu, sib->msi_vector);
 	}
 	BUS_REMAP_INTR(device_get_parent(msi->msi_dev), msi->msi_dev,
 	    msi->msi_irq);
 
 	/*
 	 * Free the old vector after the new one is established.  This is done
 	 * to prevent races where we could miss an interrupt.
 	 */
 	if (msi->msi_intsrc.is_handlers > 0)
 		apic_disable_vector(old_id, old_vector);
 	apic_free_vector(old_id, old_vector, msi->msi_irq);
 	for (i = 1; i < msi->msi_count; i++) {
 		sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]);
 		if (sib->msi_intsrc.is_handlers > 0)
 			apic_disable_vector(old_id, old_vector + i);
 		apic_free_vector(old_id, old_vector + i, msi->msi_irqs[i]);
 	}
 	return (0);
 }
 
 void
 msi_init(void)
 {
 
 	/* Check if we have a supported CPU. */
 	switch (cpu_vendor_id) {
 	case CPU_VENDOR_INTEL:
 	case CPU_VENDOR_AMD:
 		break;
 	case CPU_VENDOR_CENTAUR:
 		if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 		    CPUID_TO_MODEL(cpu_id) >= 0xf)
 			break;
 		/* FALLTHROUGH */
 	default:
 		return;
 	}
 
 #ifdef SMP
 	if (msix_disable_migration == -1) {
 		/* The default is to allow migration of MSI-X interrupts. */
 		msix_disable_migration = 0;
 	}
 #endif
 
 	msi_enabled = 1;
 	intr_register_pic(&msi_pic);
 	mtx_init(&msi_lock, "msi", NULL, MTX_DEF);
 }
 
 static void
 msi_create_source(void)
 {
 	struct msi_intsrc *msi;
 	u_int irq;
 
 	mtx_lock(&msi_lock);
 	if (msi_last_irq >= NUM_MSI_INTS) {
 		mtx_unlock(&msi_lock);
 		return;
 	}
 	irq = msi_last_irq + FIRST_MSI_INT;
 	msi_last_irq++;
 	mtx_unlock(&msi_lock);
 
 	msi = malloc(sizeof(struct msi_intsrc), M_MSI, M_WAITOK | M_ZERO);
 	msi->msi_intsrc.is_pic = &msi_pic;
 	msi->msi_irq = irq;
 	intr_register_source(&msi->msi_intsrc);
 	nexus_add_irq(irq);
 }
 
 /*
  * Try to allocate 'count' interrupt sources with contiguous IDT values.
  */
 int
 msi_alloc(device_t dev, int count, int maxcount, int *irqs)
 {
 	struct msi_intsrc *msi, *fsrc;
 	u_int cpu;
 	int cnt, i, *mirqs, vector;
 #ifdef ACPI_DMAR
 	u_int cookies[count];
 	int error;
 #endif
 
 	if (!msi_enabled)
 		return (ENXIO);
 
 	if (count > 1)
 		mirqs = malloc(count * sizeof(*mirqs), M_MSI, M_WAITOK);
 	else
 		mirqs = NULL;
 again:
 	mtx_lock(&msi_lock);
 
 	/* Try to find 'count' free IRQs. */
 	cnt = 0;
 	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(i);
 
 		/* End of allocated sources, so break. */
 		if (msi == NULL)
 			break;
 
 		/* If this is a free one, save its IRQ in the array. */
 		if (msi->msi_dev == NULL) {
 			irqs[cnt] = i;
 			cnt++;
 			if (cnt == count)
 				break;
 		}
 	}
 
 	/* Do we need to create some new sources? */
 	if (cnt < count) {
 		/* If we would exceed the max, give up. */
 		if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) {
 			mtx_unlock(&msi_lock);
 			free(mirqs, M_MSI);
 			return (ENXIO);
 		}
 		mtx_unlock(&msi_lock);
 
 		/* We need count - cnt more sources. */
 		while (cnt < count) {
 			msi_create_source();
 			cnt++;
 		}
 		goto again;
 	}
 
 	/* Ok, we now have the IRQs allocated. */
 	KASSERT(cnt == count, ("count mismatch"));
 
 	/* Allocate 'count' IDT vectors. */
 	cpu = intr_next_cpu();
 	vector = apic_alloc_vectors(cpu, irqs, count, maxcount);
 	if (vector == 0) {
 		mtx_unlock(&msi_lock);
 		free(mirqs, M_MSI);
 		return (ENOSPC);
 	}
 
 #ifdef ACPI_DMAR
 	mtx_unlock(&msi_lock);
 	error = iommu_alloc_msi_intr(dev, cookies, count);
 	mtx_lock(&msi_lock);
 	if (error == EOPNOTSUPP)
 		error = 0;
 	if (error != 0) {
 		for (i = 0; i < count; i++)
 			apic_free_vector(cpu, vector + i, irqs[i]);
 		free(mirqs, M_MSI);
 		return (error);
 	}
 	for (i = 0; i < count; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
 		msi->msi_remap_cookie = cookies[i];
 	}
 #endif
 
 	/* Assign IDT vectors and make these messages owned by 'dev'. */
 	fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
 	for (i = 0; i < count; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
 		msi->msi_cpu = cpu;
 		msi->msi_dev = dev;
 		msi->msi_vector = vector + i;
 		if (bootverbose)
 			printf(
 		    "msi: routing MSI IRQ %d to local APIC %u vector %u\n",
 			    msi->msi_irq, msi->msi_cpu, msi->msi_vector);
 		msi->msi_first = fsrc;
 		KASSERT(msi->msi_intsrc.is_handlers == 0,
 		    ("dead MSI has handlers"));
 	}
 	fsrc->msi_count = count;
 	fsrc->msi_maxcount = maxcount;
 	if (count > 1)
 		bcopy(irqs, mirqs, count * sizeof(*mirqs));
 	fsrc->msi_irqs = mirqs;
 	mtx_unlock(&msi_lock);
 	return (0);
 }
 
 int
 msi_release(int *irqs, int count)
 {
 	struct msi_intsrc *msi, *first;
 	int i;
 
 	mtx_lock(&msi_lock);
 	first = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
 	if (first == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENOENT);
 	}
 
 	/* Make sure this isn't an MSI-X message. */
 	if (first->msi_msix) {
 		mtx_unlock(&msi_lock);
 		return (EINVAL);
 	}
 
 	/* Make sure this message is allocated to a group. */
 	if (first->msi_first == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENXIO);
 	}
 
 	/*
 	 * Make sure this is the start of a group and that we are releasing
 	 * the entire group.
 	 */
 	if (first->msi_first != first || first->msi_count != count) {
 		mtx_unlock(&msi_lock);
 		return (EINVAL);
 	}
 	KASSERT(first->msi_dev != NULL, ("unowned group"));
 
 	/* Clear all the extra messages in the group. */
 	for (i = 1; i < count; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
 		KASSERT(msi->msi_first == first, ("message not in group"));
 		KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch"));
 #ifdef ACPI_DMAR
 		iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie);
 #endif
 		msi->msi_first = NULL;
 		msi->msi_dev = NULL;
 		apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
 		msi->msi_vector = 0;
 	}
 
 	/* Clear out the first message. */
 #ifdef ACPI_DMAR
 	mtx_unlock(&msi_lock);
 	iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie);
 	mtx_lock(&msi_lock);
 #endif
 	first->msi_first = NULL;
 	first->msi_dev = NULL;
 	apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq);
 	first->msi_vector = 0;
 	first->msi_count = 0;
 	first->msi_maxcount = 0;
 	free(first->msi_irqs, M_MSI);
 	first->msi_irqs = NULL;
 
 	mtx_unlock(&msi_lock);
 	return (0);
 }
 
 int
 msi_map(int irq, uint64_t *addr, uint32_t *data)
 {
 	struct msi_intsrc *msi;
 	int error;
 #ifdef ACPI_DMAR
 	struct msi_intsrc *msi1;
 	int i, k;
 #endif
 
 	mtx_lock(&msi_lock);
 	msi = (struct msi_intsrc *)intr_lookup_source(irq);
 	if (msi == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENOENT);
 	}
 
 	/* Make sure this message is allocated to a device. */
 	if (msi->msi_dev == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENXIO);
 	}
 
 	/*
 	 * If this message isn't an MSI-X message, make sure it's part
 	 * of a group, and switch to the first message in the
 	 * group.
 	 */
 	if (!msi->msi_msix) {
 		if (msi->msi_first == NULL) {
 			mtx_unlock(&msi_lock);
 			return (ENXIO);
 		}
 		msi = msi->msi_first;
 	}
 
 #ifdef ACPI_DMAR
 	if (!msi->msi_msix) {
 		for (k = msi->msi_count - 1, i = FIRST_MSI_INT; k > 0 &&
 		    i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
 			if (i == msi->msi_irq)
 				continue;
 			msi1 = (struct msi_intsrc *)intr_lookup_source(i);
 			if (!msi1->msi_msix && msi1->msi_first == msi) {
 				mtx_unlock(&msi_lock);
 				iommu_map_msi_intr(msi1->msi_dev,
 				    msi1->msi_cpu, msi1->msi_vector,
 				    msi1->msi_remap_cookie, NULL, NULL);
 				k--;
 				mtx_lock(&msi_lock);
 			}
 		}
 	}
 	mtx_unlock(&msi_lock);
 	error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu,
 	    msi->msi_vector, msi->msi_remap_cookie, addr, data);
 #else
 	mtx_unlock(&msi_lock);
 	error = EOPNOTSUPP;
 #endif
 	if (error == EOPNOTSUPP) {
 		*addr = INTEL_ADDR(msi);
 		*data = INTEL_DATA(msi);
 		error = 0;
 	}
 	return (error);
 }
 
 int
 msix_alloc(device_t dev, int *irq)
 {
 	struct msi_intsrc *msi;
 	u_int cpu;
 	int i, vector;
 #ifdef ACPI_DMAR
 	u_int cookie;
 	int error;
 #endif
 
 	if (!msi_enabled)
 		return (ENXIO);
 
 again:
 	mtx_lock(&msi_lock);
 
 	/* Find a free IRQ. */
 	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(i);
 
 		/* End of allocated sources, so break. */
 		if (msi == NULL)
 			break;
 
 		/* Stop at the first free source. */
 		if (msi->msi_dev == NULL)
 			break;
 	}
 
 	/* Do we need to create a new source? */
 	if (msi == NULL) {
 		/* If we would exceed the max, give up. */
 		if (i + 1 > FIRST_MSI_INT + NUM_MSI_INTS) {
 			mtx_unlock(&msi_lock);
 			return (ENXIO);
 		}
 		mtx_unlock(&msi_lock);
 
 		/* Create a new source. */
 		msi_create_source();
 		goto again;
 	}
 
 	/* Allocate an IDT vector. */
 	cpu = intr_next_cpu();
 	vector = apic_alloc_vector(cpu, i);
 	if (vector == 0) {
 		mtx_unlock(&msi_lock);
 		return (ENOSPC);
 	}
 
 	msi->msi_dev = dev;
 #ifdef ACPI_DMAR
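 	/*
 	 * msi_lock is dropped around the interrupt remapping allocation;
 	 * on failure the source is marked free again and the IDT vector
 	 * is released.
 	 */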
 	mtx_unlock(&msi_lock);
 	error = iommu_alloc_msi_intr(dev, &cookie, 1);
 	mtx_lock(&msi_lock);
 	if (error == EOPNOTSUPP)
 		error = 0;
 	if (error != 0) {
 		msi->msi_dev = NULL;
 		apic_free_vector(cpu, vector, i);
 		mtx_unlock(&msi_lock);
 		return (error);
 	}
 	msi->msi_remap_cookie = cookie;
 #endif
 
 	if (bootverbose)
 		printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n",
 		    msi->msi_irq, cpu, vector);
 
 	/* Setup source. */
 	msi->msi_cpu = cpu;
 	msi->msi_first = msi;
 	msi->msi_vector = vector;
 	msi->msi_msix = 1;
 	msi->msi_count = 1;
 	msi->msi_maxcount = 1;
 	msi->msi_irqs = NULL;
 
 	KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI-X has handlers"));
 	mtx_unlock(&msi_lock);
 
 	*irq = i;
 	return (0);
 }
 
 int
 msix_release(int irq)
 {
 	struct msi_intsrc *msi;
 
 	mtx_lock(&msi_lock);
 	msi = (struct msi_intsrc *)intr_lookup_source(irq);
 	if (msi == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENOENT);
 	}
 
 	/* Make sure this is an MSI-X message. */
 	if (!msi->msi_msix) {
 		mtx_unlock(&msi_lock);
 		return (EINVAL);
 	}
 
 	KASSERT(msi->msi_dev != NULL, ("unowned message"));
 
 	/* Clear out the message. */
 #ifdef ACPI_DMAR
 	mtx_unlock(&msi_lock);
 	iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie);
 	mtx_lock(&msi_lock);
 #endif
 	msi->msi_first = NULL;
 	msi->msi_dev = NULL;
 	apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
 	msi->msi_vector = 0;
 	msi->msi_msix = 0;
 	msi->msi_count = 0;
 	msi->msi_maxcount = 0;
 
 	mtx_unlock(&msi_lock);
 	return (0);
 }
Index: head/sys/x86/xen/xen_apic.c
===================================================================
--- head/sys/x86/xen/xen_apic.c	(revision 315958)
+++ head/sys/x86/xen/xen_apic.c	(revision 315959)
@@ -1,546 +1,546 @@
 /*
  * Copyright (c) 2014 Roger Pau Monné <roger.pau@citrix.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/cpufunc.h>
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <machine/smp.h>
 
 #include <x86/apicreg.h>
 #include <x86/apicvar.h>
 
 #include <xen/xen-os.h>
 #include <xen/features.h>
 #include <xen/gnttab.h>
 #include <xen/hypervisor.h>
 #include <xen/hvm.h>
 #include <xen/xen_intr.h>
 
 #include <xen/interface/vcpu.h>
 
 /*--------------------------------- Macros -----------------------------------*/
 
 #define XEN_APIC_UNSUPPORTED \
 	panic("%s: not available in Xen PV port.", __func__)
 
 
 /*--------------------------- Forward Declarations ---------------------------*/
 #ifdef SMP
 static driver_filter_t xen_smp_rendezvous_action;
 static driver_filter_t xen_invltlb;
 static driver_filter_t xen_invlpg;
 static driver_filter_t xen_invlrng;
 static driver_filter_t xen_invlcache;
 static driver_filter_t xen_ipi_bitmap_handler;
 static driver_filter_t xen_cpustop_handler;
 static driver_filter_t xen_cpususpend_handler;
 static driver_filter_t xen_cpustophard_handler;
 #endif
 
 /*---------------------------------- Macros ----------------------------------*/
 #define	IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
 
 /*--------------------------------- Xen IPIs ---------------------------------*/
 #ifdef SMP
 struct xen_ipi_handler
 {
 	driver_filter_t	*filter;
 	const char	*description;
 };
 
 static struct xen_ipi_handler xen_ipis[] = 
 {
 	[IPI_TO_IDX(IPI_RENDEZVOUS)]	= { xen_smp_rendezvous_action,	"r"   },
 	[IPI_TO_IDX(IPI_INVLTLB)]	= { xen_invltlb,		"itlb"},
 	[IPI_TO_IDX(IPI_INVLPG)]	= { xen_invlpg,			"ipg" },
 	[IPI_TO_IDX(IPI_INVLRNG)]	= { xen_invlrng,		"irg" },
 	[IPI_TO_IDX(IPI_INVLCACHE)]	= { xen_invlcache,		"ic"  },
 	[IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler,	"b"   },
 	[IPI_TO_IDX(IPI_STOP)]		= { xen_cpustop_handler,	"st"  },
 	[IPI_TO_IDX(IPI_SUSPEND)]	= { xen_cpususpend_handler,	"sp"  },
 	[IPI_TO_IDX(IPI_STOP_HARD)]	= { xen_cpustophard_handler,	"sth" },
 };
 #endif
 
 /*------------------------------- Per-CPU Data -------------------------------*/
 #ifdef SMP
 DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
 #endif
 
 /*------------------------------- Xen PV APIC --------------------------------*/
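 /*
  * Most of these hooks are stubs: under Xen PV there is no local APIC to
  * program, so operations that only make sense on bare metal either do
  * nothing or panic via XEN_APIC_UNSUPPORTED.
  */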
 
 static void
 xen_pv_lapic_create(u_int apic_id, int boot_cpu)
 {
 #ifdef SMP
 	cpu_add(apic_id, boot_cpu);
 #endif
 }
 
 static void
 xen_pv_lapic_init(vm_paddr_t addr)
 {
 
 }
 
 static void
 xen_pv_lapic_setup(int boot)
 {
 
 }
 
 static void
 xen_pv_lapic_dump(const char *str)
 {
 
 	printf("cpu%d %s XEN PV LAPIC\n", PCPU_GET(cpuid), str);
 }
 
 static void
 xen_pv_lapic_disable(void)
 {
 
 }
 
 static bool
 xen_pv_lapic_is_x2apic(void)
 {
 
 	return (false);
 }
 
 static void
-xen_pv_lapic_eoi(void)
+xen_pv_lapic_eoi(u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static int
 xen_pv_lapic_id(void)
 {
 
 	return (PCPU_GET(apic_id));
 }
 
 static int
 xen_pv_lapic_intr_pending(u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static u_int
 xen_pv_apic_cpuid(u_int apic_id)
 {
 #ifdef SMP
 	return (apic_cpuids[apic_id]);
 #else
 	return (0);
 #endif
 }
 
 static u_int
 xen_pv_apic_alloc_vector(u_int apic_id, u_int irq)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static u_int
 xen_pv_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static void
 xen_pv_apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static int
 xen_pv_lapic_enable_pmc(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static void
 xen_pv_lapic_disable_pmc(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_lapic_reenable_pmc(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_lapic_enable_cmc(void)
 {
 
 }
 
 #ifdef SMP
 static void
 xen_pv_lapic_ipi_raw(register_t icrlo, u_int dest)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
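 
 /*
  * xen_pv_lapic_ipi_vectored() delivers IPIs by signalling the event
  * channel bound to the requested IPI type on each destination vCPU,
  * rather than by writing to the local APIC ICR.
  */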
 
 static void
 xen_pv_lapic_ipi_vectored(u_int vector, int dest)
 {
 	xen_intr_handle_t *ipi_handle;
 	int ipi_idx, to_cpu, self;
 
 	ipi_idx = IPI_TO_IDX(vector);
 	if (ipi_idx >= nitems(xen_ipis))
 		panic("IPI out of range");
 
 	switch (dest) {
 	case APIC_IPI_DEST_SELF:
 		ipi_handle = DPCPU_GET(ipi_handle);
 		xen_intr_signal(ipi_handle[ipi_idx]);
 		break;
 	case APIC_IPI_DEST_ALL:
 		CPU_FOREACH(to_cpu) {
 			ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
 			xen_intr_signal(ipi_handle[ipi_idx]);
 		}
 		break;
 	case APIC_IPI_DEST_OTHERS:
 		self = PCPU_GET(cpuid);
 		CPU_FOREACH(to_cpu) {
 			if (to_cpu != self) {
 				ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
 				xen_intr_signal(ipi_handle[ipi_idx]);
 			}
 		}
 		break;
 	default:
 		to_cpu = apic_cpuid(dest);
 		ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
 		xen_intr_signal(ipi_handle[ipi_idx]);
 		break;
 	}
 }
 
 static int
 xen_pv_lapic_ipi_wait(int delay)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 #endif	/* SMP */
 
 static int
 xen_pv_lapic_ipi_alloc(inthand_t *ipifunc)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (-1);
 }
 
 static void
 xen_pv_lapic_ipi_free(int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static int
 xen_pv_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static int
 xen_pv_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static int
 xen_pv_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static int
 xen_pv_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
     enum intr_trigger trigger)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 /* Xen apic_ops implementation */
 struct apic_ops xen_apic_ops = {
 	.create			= xen_pv_lapic_create,
 	.init			= xen_pv_lapic_init,
 	.xapic_mode		= xen_pv_lapic_disable,
 	.is_x2apic		= xen_pv_lapic_is_x2apic,
 	.setup			= xen_pv_lapic_setup,
 	.dump			= xen_pv_lapic_dump,
 	.disable		= xen_pv_lapic_disable,
 	.eoi			= xen_pv_lapic_eoi,
 	.id			= xen_pv_lapic_id,
 	.intr_pending		= xen_pv_lapic_intr_pending,
 	.set_logical_id		= xen_pv_lapic_set_logical_id,
 	.cpuid			= xen_pv_apic_cpuid,
 	.alloc_vector		= xen_pv_apic_alloc_vector,
 	.alloc_vectors		= xen_pv_apic_alloc_vectors,
 	.enable_vector		= xen_pv_apic_enable_vector,
 	.disable_vector		= xen_pv_apic_disable_vector,
 	.free_vector		= xen_pv_apic_free_vector,
 	.enable_pmc		= xen_pv_lapic_enable_pmc,
 	.disable_pmc		= xen_pv_lapic_disable_pmc,
 	.reenable_pmc		= xen_pv_lapic_reenable_pmc,
 	.enable_cmc		= xen_pv_lapic_enable_cmc,
 #ifdef SMP
 	.ipi_raw		= xen_pv_lapic_ipi_raw,
 	.ipi_vectored		= xen_pv_lapic_ipi_vectored,
 	.ipi_wait		= xen_pv_lapic_ipi_wait,
 #endif
 	.ipi_alloc		= xen_pv_lapic_ipi_alloc,
 	.ipi_free		= xen_pv_lapic_ipi_free,
 	.set_lvt_mask		= xen_pv_lapic_set_lvt_mask,
 	.set_lvt_mode		= xen_pv_lapic_set_lvt_mode,
 	.set_lvt_polarity	= xen_pv_lapic_set_lvt_polarity,
 	.set_lvt_triggermode	= xen_pv_lapic_set_lvt_triggermode,
 };
 
 #ifdef SMP
 /*---------------------------- XEN PV IPI Handlers ---------------------------*/
 /*
  * These are C clones of the ASM functions found in apic_vector.
  */
 static int
 xen_ipi_bitmap_handler(void *arg)
 {
 	struct trapframe *frame;
 
 	frame = arg;
 	ipi_bitmap_handler(*frame);
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_smp_rendezvous_action(void *arg)
 {
 #ifdef COUNT_IPIS
 	(*ipi_rendezvous_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	smp_rendezvous_action();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_invltlb(void *arg)
 {
 
 	invltlb_handler();
 	return (FILTER_HANDLED);
 }
 
 #ifdef __amd64__
 static int
 xen_invltlb_invpcid(void *arg)
 {
 
 	invltlb_invpcid_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_invltlb_pcid(void *arg)
 {
 
 	invltlb_pcid_handler();
 	return (FILTER_HANDLED);
 }
 #endif
 
 static int
 xen_invlpg(void *arg)
 {
 
 	invlpg_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_invlrng(void *arg)
 {
 
 	invlrng_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_invlcache(void *arg)
 {
 
 	invlcache_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_cpustop_handler(void *arg)
 {
 
 	cpustop_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_cpususpend_handler(void *arg)
 {
 
 	cpususpend_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_cpustophard_handler(void *arg)
 {
 
 	ipi_nmi_handler();
 	return (FILTER_HANDLED);
 }
 
 /*----------------------------- XEN PV IPI setup -----------------------------*/
 /*
  * These functions are provided outside of the Xen PV APIC implementation
  * so that PVHVM guests can also use PV IPIs without having an actual Xen
  * PV APIC: on PVHVM, Xen provides an emulated LAPIC instead.
  */
 static void
 xen_cpu_ipi_init(int cpu)
 {
 	xen_intr_handle_t *ipi_handle;
 	const struct xen_ipi_handler *ipi;
 	int idx, rc;
 
 	ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);
 
 	for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) {
 
 		if (ipi->filter == NULL) {
 			ipi_handle[idx] = NULL;
 			continue;
 		}
 
 		rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter,
 		    INTR_TYPE_TTY, &ipi_handle[idx]);
 		if (rc != 0)
 			panic("Unable to allocate a XEN IPI port");
 		xen_intr_describe(ipi_handle[idx], "%s", ipi->description);
 	}
 }
 
 static void
 xen_setup_cpus(void)
 {
 	int i;
 
 	if (!xen_vector_callback_enabled)
 		return;
 
 #ifdef __amd64__
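 	/*
 	 * Use the PCID-aware TLB shootdown clone (the INVPCID variant
 	 * when available) when PCIDs are in use.
 	 */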
 	if (pmap_pcid_enabled) {
 		xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = invpcid_works ?
 		    xen_invltlb_invpcid : xen_invltlb_pcid;
 	}
 #endif
 	CPU_FOREACH(i)
 		xen_cpu_ipi_init(i);
 
 	/* Set the xen pv ipi ops to replace the native ones */
 	if (xen_hvm_domain())
 		apic_ops.ipi_vectored = xen_pv_lapic_ipi_vectored;
 }
 
 /* We need to set up IPIs before the APs are started. */
 SYSINIT(xen_setup_cpus, SI_SUB_SMP-1, SI_ORDER_FIRST, xen_setup_cpus, NULL);
 #endif /* SMP */