Index: head/sys/amd64/amd64/apic_vector.S
===================================================================
--- head/sys/amd64/amd64/apic_vector.S	(revision 316020)
+++ head/sys/amd64/amd64/apic_vector.S	(revision 316021)
@@ -1,309 +1,331 @@
/*-
 * Copyright (c) 1989, 1990 William F. Jolitz.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: vector.s, 386BSD 0.1 unknown origin
 * $FreeBSD$
 */

/*
 * Interrupt entry points for external interrupts triggered by I/O APICs
 * as well as IPI handlers.
 */

#include "opt_smp.h"

#include
#include
#include

#include "assym.s"

#ifdef SMP
#define LK	lock ;
#else
#define LK
#endif

+	.text
+	SUPERALIGN_TEXT
+	/* End Of Interrupt to APIC */
+as_lapic_eoi:
+	cmpl	$0,x2apic_mode
+	jne	1f
+	movq	lapic_map,%rax
+	movl	$0,LA_EOI(%rax)
+	ret
+1:
+	movl	$MSR_APIC_EOI,%ecx
+	xorl	%eax,%eax
+	xorl	%edx,%edx
+	wrmsr
+	ret
+
/*
 * I/O Interrupt Entry Point.  Rather than having one entry point for
 * each interrupt source, we use one entry point for each 32-bit word
 * in the ISR.  The handler determines the highest bit set in the ISR,
 * translates that into a vector, and passes the vector to the
 * lapic_handle_intr() function.
 */
#define	ISR_VEC(index, vec_name) \
	.text ; \
	SUPERALIGN_TEXT ; \
IDTVEC(vec_name) ; \
	PUSH_FRAME ; \
	FAKE_MCOUNT(TF_RIP(%rsp)) ; \
	cmpl	$0,x2apic_mode ; \
	je	1f ; \
	movl	$(MSR_APIC_ISR0 + index),%ecx ; \
	rdmsr ; \
	jmp	2f ; \
1: ; \
	movq	lapic_map, %rdx ;	/* pointer to local APIC */ \
	movl	LA_ISR + 16 * (index)(%rdx), %eax ;	/* load ISR */ \
2: ; \
	bsrl	%eax, %eax ;	/* index of highest set bit in ISR */ \
	jz	3f ; \
	addl	$(32 * index),%eax ; \
	movq	%rsp, %rsi ; \
	movl	%eax, %edi ;	/* pass the IRQ */ \
	call	lapic_handle_intr ; \
3: ; \
	MEXITCOUNT ; \
	jmp	doreti

/*
 * Handle "spurious INTerrupts".
 * Notes:
 *  This is different than the "spurious INTerrupt" generated by an
 *  8259 PIC for missing INTs.  See the APIC documentation for details.
 *  This routine should NOT do an 'EOI' cycle.
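
The 'EOI' cycle that the other handlers perform via the as_lapic_eoi helper added above corresponds roughly to the following C sketch. It is not part of this change: lapic_eoi_sketch is a made-up name, x2apic_mode and lapic_map are the globals referenced by the assembly, and LA_EOI stands for the assym-generated byte offset of the memory-mapped EOI register.

static void
lapic_eoi_sketch(void)
{
	if (x2apic_mode != 0) {
		/* x2APIC: EOI is an MSR write (%ecx = MSR_APIC_EOI, %edx:%eax = 0). */
		wrmsr(MSR_APIC_EOI, 0);
	} else {
		/* xAPIC: 32-bit store of 0 into the memory-mapped EOI register. */
		*(volatile uint32_t *)((uintptr_t)lapic_map + LA_EOI) = 0;
	}
}
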
*/ .text SUPERALIGN_TEXT IDTVEC(spuriousint) /* No EOI cycle used here */ jmp doreti_iret ISR_VEC(1, apic_isr1) ISR_VEC(2, apic_isr2) ISR_VEC(3, apic_isr3) ISR_VEC(4, apic_isr4) ISR_VEC(5, apic_isr5) ISR_VEC(6, apic_isr6) ISR_VEC(7, apic_isr7) /* * Local APIC periodic timer handler. */ .text SUPERALIGN_TEXT IDTVEC(timerint) PUSH_FRAME FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call lapic_handle_timer MEXITCOUNT jmp doreti /* * Local APIC CMCI handler. */ .text SUPERALIGN_TEXT IDTVEC(cmcint) PUSH_FRAME FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_cmc MEXITCOUNT jmp doreti /* * Local APIC error interrupt handler. */ .text SUPERALIGN_TEXT IDTVEC(errorint) PUSH_FRAME FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_error MEXITCOUNT jmp doreti #ifdef XENHVM /* * Xen event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. */ .text SUPERALIGN_TEXT IDTVEC(xen_intr_upcall) PUSH_FRAME FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call xen_intr_handle_upcall MEXITCOUNT jmp doreti #endif #ifdef SMP /* * Global address space TLB shootdown. */ .text SUPERALIGN_TEXT invltlb_ret: - call native_lapic_eoi + call as_lapic_eoi POP_FRAME jmp doreti_iret SUPERALIGN_TEXT IDTVEC(invltlb) PUSH_FRAME call invltlb_handler - movl $IPI_INVLTLB, %edi jmp invltlb_ret IDTVEC(invltlb_pcid) PUSH_FRAME call invltlb_pcid_handler - movl $IPI_INVLTLB, %edi jmp invltlb_ret IDTVEC(invltlb_invpcid) PUSH_FRAME call invltlb_invpcid_handler - movl $IPI_INVLTLB, %edi jmp invltlb_ret /* * Single page TLB shootdown */ .text SUPERALIGN_TEXT IDTVEC(invlpg) PUSH_FRAME call invlpg_handler - movl $IPI_INVLPG, %edi jmp invltlb_ret /* * Page range TLB shootdown. */ .text SUPERALIGN_TEXT IDTVEC(invlrng) PUSH_FRAME call invlrng_handler - movl $IPI_INVLRNG, %edi jmp invltlb_ret /* * Invalidate cache. */ .text SUPERALIGN_TEXT IDTVEC(invlcache) PUSH_FRAME call invlcache_handler - movl $IPI_INVLCACHE, %edi jmp invltlb_ret /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ .text SUPERALIGN_TEXT IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME - movl $IPI_BITMAP_VECTOR, %edi - call native_lapic_eoi + call as_lapic_eoi FAKE_MCOUNT(TF_RIP(%rsp)) call ipi_bitmap_handler MEXITCOUNT jmp doreti /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ .text SUPERALIGN_TEXT IDTVEC(cpustop) PUSH_FRAME - movl $IPI_STOP, %edi - call native_lapic_eoi + call as_lapic_eoi call cpustop_handler jmp doreti /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. */ .text SUPERALIGN_TEXT IDTVEC(cpususpend) PUSH_FRAME call cpususpend_handler - movl $IPI_SUSPEND, %edi - call native_lapic_eoi + call as_lapic_eoi jmp doreti /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. * * - Calls the generic rendezvous action function. */ .text SUPERALIGN_TEXT IDTVEC(rendezvous) PUSH_FRAME #ifdef COUNT_IPIS movl PCPU(CPUID), %eax movq ipi_rendezvous_counts(,%rax,8), %rax incq (%rax) #endif call smp_rendezvous_action - movl $IPI_RENDEZVOUS, %edi - call native_lapic_eoi + call as_lapic_eoi jmp doreti /* * IPI handler whose purpose is to interrupt the CPU with minimum overhead. * This is used by bhyve to force a host cpu executing in guest context to * trap into the hypervisor. + * + * This handler is different from other IPI handlers in the following aspects: + * + * 1. It doesn't push a trapframe on the stack. + * + * This implies that a DDB backtrace involving 'justreturn' will skip the + * function that was interrupted by this handler. + * + * 2. 
It doesn't 'swapgs' when userspace is interrupted. + * + * The 'justreturn' handler does not access any pcpu data so it is not an + * issue. Moreover the 'justreturn' handler can only be interrupted by an NMI + * whose handler already doesn't trust GS.base when kernel code is interrupted. */ .text SUPERALIGN_TEXT IDTVEC(justreturn) - PUSH_FRAME - movl vmm_ipinum, %edi - call native_lapic_eoi - POP_FRAME + pushq %rax + pushq %rcx + pushq %rdx + call as_lapic_eoi + popq %rdx + popq %rcx + popq %rax jmp doreti_iret #endif /* SMP */ Index: head/sys/amd64/amd64/genassym.c =================================================================== --- head/sys/amd64/amd64/genassym.c (revision 316020) +++ head/sys/amd64/amd64/genassym.c (revision 316021) @@ -1,249 +1,237 @@ /*- * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_kstack_pages.h" #include #include #include #include #include -#include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include -#include #include #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); ASSYM(MD_LDT_SD, offsetof(struct mdproc, md_ldt_sd)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_TID, offsetof(struct thread, td_tid)); ASSYM(TD_FRAME, offsetof(struct thread, td_frame)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN); ASSYM(TDP_KTHREAD, TDP_KTHREAD); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(addr_PTmap, addr_PTmap); ASSYM(addr_PDmap, addr_PDmap); ASSYM(addr_PDPmap, addr_PDPmap); ASSYM(addr_PML4map, addr_PML4map); ASSYM(addr_PML4pml4e, addr_PML4pml4e); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(PDPSHIFT, PDPSHIFT); ASSYM(PML4SHIFT, PML4SHIFT); ASSYM(val_KPDPI, KPDPI); ASSYM(val_KPML4I, KPML4I); ASSYM(val_PML4PML4I, PML4PML4I); ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(DMAP_MIN_ADDRESS, DMAP_MIN_ADDRESS); ASSYM(DMAP_MAX_ADDRESS, DMAP_MAX_ADDRESS); ASSYM(MCLBYTES, MCLBYTES); ASSYM(PCB_R15, offsetof(struct pcb, pcb_r15)); ASSYM(PCB_R14, offsetof(struct pcb, pcb_r14)); ASSYM(PCB_R13, offsetof(struct pcb, pcb_r13)); ASSYM(PCB_R12, offsetof(struct pcb, pcb_r12)); ASSYM(PCB_RBP, offsetof(struct pcb, pcb_rbp)); ASSYM(PCB_RSP, offsetof(struct pcb, pcb_rsp)); ASSYM(PCB_RBX, offsetof(struct pcb, pcb_rbx)); ASSYM(PCB_RIP, offsetof(struct pcb, pcb_rip)); ASSYM(PCB_FSBASE, offsetof(struct pcb, pcb_fsbase)); ASSYM(PCB_GSBASE, offsetof(struct pcb, pcb_gsbase)); ASSYM(PCB_KGSBASE, offsetof(struct pcb, pcb_kgsbase)); ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0)); ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2)); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt)); ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt)); ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt)); ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp)); ASSYM(PCB_SAVEFPU, offsetof(struct 
pcb, pcb_save)); ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer)); ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star)); ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar)); ASSYM(PCB_CSTAR, offsetof(struct pcb, pcb_cstar)); ASSYM(PCB_SFMASK, offsetof(struct pcb, pcb_sfmask)); ASSYM(PCB_SIZE, sizeof(struct pcb)); ASSYM(PCB_FULL_IRET, PCB_FULL_IRET); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_32BIT, PCB_32BIT); ASSYM(COMMON_TSS_RSP0, offsetof(struct amd64tss, tss_rsp0)); ASSYM(TF_R15, offsetof(struct trapframe, tf_r15)); ASSYM(TF_R14, offsetof(struct trapframe, tf_r14)); ASSYM(TF_R13, offsetof(struct trapframe, tf_r13)); ASSYM(TF_R12, offsetof(struct trapframe, tf_r12)); ASSYM(TF_R11, offsetof(struct trapframe, tf_r11)); ASSYM(TF_R10, offsetof(struct trapframe, tf_r10)); ASSYM(TF_R9, offsetof(struct trapframe, tf_r9)); ASSYM(TF_R8, offsetof(struct trapframe, tf_r8)); ASSYM(TF_RDI, offsetof(struct trapframe, tf_rdi)); ASSYM(TF_RSI, offsetof(struct trapframe, tf_rsi)); ASSYM(TF_RBP, offsetof(struct trapframe, tf_rbp)); ASSYM(TF_RBX, offsetof(struct trapframe, tf_rbx)); ASSYM(TF_RDX, offsetof(struct trapframe, tf_rdx)); ASSYM(TF_RCX, offsetof(struct trapframe, tf_rcx)); ASSYM(TF_RAX, offsetof(struct trapframe, tf_rax)); ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ADDR, offsetof(struct trapframe, tf_addr)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_RIP, offsetof(struct trapframe, tf_rip)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_RFLAGS, offsetof(struct trapframe, tf_rflags)); ASSYM(TF_RSP, offsetof(struct trapframe, tf_rsp)); ASSYM(TF_SS, offsetof(struct trapframe, tf_ss)); ASSYM(TF_DS, offsetof(struct trapframe, tf_ds)); ASSYM(TF_ES, offsetof(struct trapframe, tf_es)); ASSYM(TF_FS, offsetof(struct trapframe, tf_fs)); ASSYM(TF_GS, offsetof(struct trapframe, tf_gs)); ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags)); ASSYM(TF_SIZE, sizeof(struct trapframe)); ASSYM(TF_HASSEGS, TF_HASSEGS); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp)); ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0)); ASSYM(PC_FS32P, offsetof(struct pcpu, pc_fs32p)); ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p)); ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt)); ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp)); ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss)); ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt)); - + ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL); ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL); - -ASSYM(IPI_INVLTLB, IPI_INVLTLB); -ASSYM(IPI_INVLPG, IPI_INVLPG); -ASSYM(IPI_INVLRNG, IPI_INVLRNG); -ASSYM(IPI_INVLCACHE, IPI_INVLCACHE); -ASSYM(IPI_BITMAP_VECTOR, IPI_BITMAP_VECTOR); -ASSYM(IPI_STOP, IPI_STOP); -ASSYM(IPI_SUSPEND, IPI_SUSPEND); -ASSYM(IPI_RENDEZVOUS, 
IPI_RENDEZVOUS); ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KUCSEL, GSEL(GUCODE_SEL, SEL_UPL)); ASSYM(KUDSEL, GSEL(GUDATA_SEL, SEL_UPL)); ASSYM(KUC32SEL, GSEL(GUCODE32_SEL, SEL_UPL)); ASSYM(KUF32SEL, GSEL(GUFS32_SEL, SEL_UPL)); ASSYM(KUG32SEL, GSEL(GUGS32_SEL, SEL_UPL)); ASSYM(TSSSEL, GSEL(GPROC0_SEL, SEL_KPL)); ASSYM(LDTSEL, GSEL(GUSERLDT_SEL, SEL_KPL)); ASSYM(SEL_RPL_MASK, SEL_RPL_MASK); ASSYM(__FreeBSD_version, __FreeBSD_version); #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif Index: head/sys/amd64/vmm/amd/svm.c =================================================================== --- head/sys/amd64/vmm/amd/svm.c (revision 316020) +++ head/sys/amd64/vmm/amd/svm.c (revision 316021) @@ -1,2247 +1,2246 @@ /*- * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include "vmm_host.h" #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_ktr.h" #include "vmm_ioport.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "x86.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" #include "svm_msr.h" #include "npt.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); /* * SVM CPUID function 0x8000_000A, edx bit decoding. */ #define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ #define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ #define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ #define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ #define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ #define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ #define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ #define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ #define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. 
*/ #define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ #define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ #define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ VMCB_CACHE_IOPM | \ VMCB_CACHE_I | \ VMCB_CACHE_TPR | \ VMCB_CACHE_CR2 | \ VMCB_CACHE_CR | \ VMCB_CACHE_DT | \ VMCB_CACHE_SEG | \ VMCB_CACHE_NP) static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, 0, NULL); static MALLOC_DEFINE(M_SVM, "svm", "svm"); static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); /* Per-CPU context area. */ extern struct pcpu __pcpu[]; static uint32_t svm_feature = ~0U; /* AMD SVM features. */ SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, "SVM features advertised by CPUID.8000000AH:EDX"); static int disable_npf_assist; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, &disable_npf_assist, 0, NULL); /* Maximum ASIDs supported by the processor */ static uint32_t nasid; SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, "Number of ASIDs supported by this processor"); /* Current ASID generation for each host cpu */ static struct asid asid[MAXCPU]; /* * SVM host state saved area of size 4KB for each core. */ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); static __inline int flush_by_asid(void) { return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); } static __inline int decode_assist(void) { return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); } static void svm_disable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer &= ~EFER_SVM; wrmsr(MSR_EFER, efer); } /* * Disable SVM on all CPUs. */ static int svm_cleanup(void) { smp_rendezvous(NULL, svm_disable, NULL, NULL); return (0); } /* * Verify that all the features required by bhyve are available. */ static int check_svm_features(void) { u_int regs[4]; /* CPUID Fn8000_000A is for SVM */ do_cpuid(0x8000000A, regs); svm_feature &= regs[3]; /* * The number of ASIDs can be configured to be less than what is * supported by the hardware but not more. */ if (nasid == 0 || nasid > regs[1]) nasid = regs[1]; KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); /* bhyve requires the Nested Paging feature */ if (!(svm_feature & AMD_CPUID_SVM_NP)) { printf("SVM: Nested Paging feature not available.\n"); return (ENXIO); } /* bhyve requires the NRIP Save feature */ if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { printf("SVM: NRIP Save feature not available.\n"); return (ENXIO); } return (0); } static void svm_enable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer |= EFER_SVM; wrmsr(MSR_EFER, efer); wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); } /* * Return 1 if SVM is enabled on this processor and 0 otherwise. */ static int svm_available(void) { uint64_t msr; /* Section 15.4 Enabling SVM from APM2. 
*/ if ((amd_feature2 & AMDID2_SVM) == 0) { printf("SVM: not available.\n"); return (0); } msr = rdmsr(MSR_VM_CR); if ((msr & VM_CR_SVMDIS) != 0) { printf("SVM: disabled by BIOS.\n"); return (0); } return (1); } static int svm_init(int ipinum) { int error, cpu; if (!svm_available()) return (ENXIO); error = check_svm_features(); if (error) return (error); vmcb_clean &= VMCB_CACHE_DEFAULT; for (cpu = 0; cpu < MAXCPU; cpu++) { /* * Initialize the host ASIDs to their "highest" valid values. * * The next ASID allocation will rollover both 'gen' and 'num' * and start off the sequence at {1,1}. */ asid[cpu].gen = ~0UL; asid[cpu].num = nasid - 1; } svm_msr_init(); svm_npt_init(ipinum); /* Enable SVM on all CPUs */ smp_rendezvous(NULL, svm_enable, NULL, NULL); return (0); } static void svm_restore(void) { svm_enable(NULL); } /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 #define MSR_PENTIUM_END 0x1FFF /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000UL #define MSR_AMD6TH_END 0xC0001FFFUL /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000UL #define MSR_AMD7TH_END 0xC0011FFFUL /* * Get the index and bit position for a MSR in permission bitmap. * Two bits are used for each MSR: lower bit for read and higher bit for write. */ static int svm_msr_index(uint64_t msr, int *index, int *bit) { uint32_t base, off; *index = -1; *bit = (msr % 4) * 2; base = 0; if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) { *index = msr / 4; return (0); } base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { off = (msr - MSR_AMD6TH_START); *index = (off + base) / 4; return (0); } base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { off = (msr - MSR_AMD7TH_START); *index = (off + base) / 4; return (0); } return (EINVAL); } /* * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. */ static void svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) { int index, bit, error; error = svm_msr_index(msr, &index, &bit); KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, ("%s: invalid index %d for msr %#lx", __func__, index, msr)); KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " "msr %#lx", __func__, bit, msr)); if (read) perm_bitmap[index] &= ~(1UL << bit); if (write) perm_bitmap[index] &= ~(2UL << bit); } static void svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, true); } static void svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, false); } static __inline int svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) { struct vmcb_ctrl *ctrl; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); return (ctrl->intercept[idx] & bitmask ? 
1 : 0); } static __inline void svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, int enabled) { struct vmcb_ctrl *ctrl; uint32_t oldval; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intercept[idx]; if (enabled) ctrl->intercept[idx] |= bitmask; else ctrl->intercept[idx] &= ~bitmask; if (ctrl->intercept[idx] != oldval) { svm_set_dirty(sc, vcpu, VMCB_CACHE_I); VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); } } static __inline void svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 0); } static __inline void svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 1); } static void vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, uint64_t msrpm_base_pa, uint64_t np_pml4) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; uint32_t mask; int n; ctrl = svm_get_vmcb_ctrl(sc, vcpu); state = svm_get_vmcb_state(sc, vcpu); ctrl->iopm_base_pa = iopm_base_pa; ctrl->msrpm_base_pa = msrpm_base_pa; /* Enable nested paging */ ctrl->np_enable = 1; ctrl->n_cr3 = np_pml4; /* * Intercept accesses to the control registers that are not shadowed * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. */ for (n = 0; n < 16; n++) { mask = (BIT(n) << 16) | BIT(n); if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); else svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); } /* * Intercept everything when tracing guest exceptions otherwise * just intercept machine check exception. */ if (vcpu_trace_exceptions(sc->vm, vcpu)) { for (n = 0; n < 32; n++) { /* * Skip unimplemented vectors in the exception bitmap. */ if (n == 2 || n == 9) { continue; } svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); } } else { svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); } /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_FERR_FREEZE); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); /* * From section "Canonicalization and Consistency Checks" in APMv2 * the VMRUN intercept bit must be set to pass the consistency check. */ svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); /* * The ASID will be set to a non-zero value just before VMRUN. */ ctrl->asid = 0; /* * Section 15.21.1, Interrupt Masking in EFLAGS * Section 15.21.2, Virtualizing APIC.TPR * * This must be set for %rflag and %cr8 isolation of guest and host. 
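
As a side note on the control-register intercepts configured a few lines above in vmcb_init(): in the VMCB_CR_INTCPT word, bit n enables the read intercept for CRn and bit n+16 enables the write intercept, which is why a single mask covers both directions. A minimal sketch, not part of this change (cr_intercept_mask is a hypothetical helper):

static uint32_t
cr_intercept_mask(int n)
{
	/* read intercept for CRn in the low half, write intercept in the high half */
	return ((1u << n) | (1u << (n + 16)));
}
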
*/ ctrl->v_intr_masking = 1; /* Enable Last Branch Record aka LBR for debugging */ ctrl->lbr_virt_en = 1; state->dbgctl = BIT(0); /* EFER_SVM must always be set when the guest is executing */ state->efer = EFER_SVM; /* Set up the PAT to power-on state */ state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); } /* * Initialize a virtual machine. */ static void * svm_vminit(struct vm *vm, pmap_t pmap) { struct svm_softc *svm_sc; struct svm_vcpu *vcpu; vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; int i; svm_sc = contigmalloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); svm_sc->vm = vm; svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); /* * Intercept read and write accesses to all MSRs. */ memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap)); /* * Access to the following MSRs is redirected to the VMCB when the * guest is executing. Therefore it is safe to allow the guest to * read/write these MSRs directly without hypervisor involvement. */ svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); /* * Intercept writes to make sure that the EFER_SVM bit is not cleared. */ svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); /* Intercept access to all I/O ports. */ memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap)); iopm_pa = vtophys(svm_sc->iopm_bitmap); msrpm_pa = vtophys(svm_sc->msr_bitmap); pml4_pa = svm_sc->nptp; for (i = 0; i < VM_MAXCPU; i++) { vcpu = svm_get_vcpu(svm_sc, i); vcpu->nextrip = ~0; vcpu->lastcpu = NOCPU; vcpu->vmcb_pa = vtophys(&vcpu->vmcb); vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); svm_msr_guest_init(svm_sc, i); } return (svm_sc); } /* * Collateral for a generic SVM VM-exit. */ static void vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) { vme->exitcode = VM_EXITCODE_SVM; vme->u.svm.exitcode = code; vme->u.svm.exitinfo1 = info1; vme->u.svm.exitinfo2 = info2; } static int svm_cpl(struct vmcb_state *state) { /* * From APMv2: * "Retrieve the CPL from the CPL field in the VMCB, not * from any segment DPL" */ return (state->cpl); } static enum vm_cpu_mode svm_vcpu_mode(struct vmcb *vmcb) { struct vmcb_segment seg; struct vmcb_state *state; int error; state = &vmcb->state; if (state->efer & EFER_LMA) { error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, error)); /* * Section 4.8.1 for APM2, check if Code Segment has * Long attribute set in descriptor. 
*/ if (seg.attrib & VMCB_CS_ATTRIB_L) return (CPU_MODE_64BIT); else return (CPU_MODE_COMPATIBILITY); } else if (state->cr0 & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) { if ((cr0 & CR0_PG) == 0) return (PAGING_MODE_FLAT); if ((cr4 & CR4_PAE) == 0) return (PAGING_MODE_32); if (efer & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } /* * ins/outs utility routines */ static uint64_t svm_inout_str_index(struct svm_regctx *regs, int in) { uint64_t val; val = in ? regs->sctx_rdi : regs->sctx_rsi; return (val); } static uint64_t svm_inout_str_count(struct svm_regctx *regs, int rep) { uint64_t val; val = rep ? regs->sctx_rcx : 1; return (val); } static void svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { /* The segment field has standard encoding */ s = (info1 >> 10) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); } static int svm_inout_str_addrsize(uint64_t info1) { uint32_t size; size = (info1 >> 7) & 0x7; switch (size) { case 1: return (2); /* 16 bit */ case 2: return (4); /* 32 bit */ case 4: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) { struct vmcb_state *state; state = &vmcb->state; paging->cr3 = state->cr3; paging->cpl = svm_cpl(state); paging->cpu_mode = svm_vcpu_mode(vmcb); paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, state->efer); } #define UNHANDLED 0 /* * Handle guest I/O intercept. */ static int svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_regctx *regs; struct vm_inout_str *vis; uint64_t info1; int inout_string; state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); regs = svm_get_guest_regctx(svm_sc, vcpu); info1 = ctrl->exitinfo1; inout_string = info1 & BIT(2) ? 1 : 0; /* * The effective segment number in EXITINFO1[12:10] is populated * only if the processor has the DecodeAssist capability. * * XXX this is not specified explicitly in APMv2 but can be verified * empirically. */ if (inout_string && !decode_assist()) return (UNHANDLED); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; vmexit->u.inout.string = inout_string; vmexit->u.inout.rep = (info1 & BIT(3)) ? 
1 : 0; vmexit->u.inout.bytes = (info1 >> 4) & 0x7; vmexit->u.inout.port = (uint16_t)(info1 >> 16); vmexit->u.inout.eax = (uint32_t)(state->rax); if (inout_string) { vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); vis->rflags = state->rflags; vis->cr0 = state->cr0; vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); vis->addrsize = svm_inout_str_addrsize(info1); svm_inout_str_seginfo(svm_sc, vcpu, info1, vmexit->u.inout.in, vis); } return (UNHANDLED); } static int npf_fault_type(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_W) return (VM_PROT_WRITE); else if (exitinfo1 & VMCB_NPF_INFO1_ID) return (VM_PROT_EXECUTE); else return (VM_PROT_READ); } static bool svm_npf_emul_fault(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_ID) { return (false); } if (exitinfo1 & VMCB_NPF_INFO1_GPT) { return (false); } if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { return (false); } return (true); } static void svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) { struct vm_guest_paging *paging; struct vmcb_segment seg; struct vmcb_ctrl *ctrl; char *inst_bytes; int error, inst_len; ctrl = &vmcb->ctrl; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = VIE_INVALID_GLA; svm_paging_info(vmcb, paging); error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); switch(paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = seg.base; vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = seg.base; /* * Section 4.8.1 of APM2, Default Operand Size or D bit. */ vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? 1 : 0; break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } /* * Copy the instruction bytes into 'vie' if available. */ if (decode_assist() && !disable_npf_assist) { inst_len = ctrl->inst_len; inst_bytes = ctrl->inst_bytes; } else { inst_len = 0; inst_bytes = NULL; } vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); } #ifdef KTR static const char * intrtype_to_str(int intr_type) { switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: return ("hwintr"); case VMCB_EVENTINJ_TYPE_NMI: return ("nmi"); case VMCB_EVENTINJ_TYPE_INTn: return ("swintr"); case VMCB_EVENTINJ_TYPE_EXCEPTION: return ("exception"); default: panic("%s: unknown intr_type %d", __func__, intr_type); } } #endif /* * Inject an event to vcpu as described in section 15.20, "Event injection". 
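
The eventinj field written by svm_eventinject() just below packs the vector into the low byte, the event type into bits 10:8, an optional error code into the upper 32 bits, plus the EC-valid and valid flags. A compact sketch of that composition, not part of this change (eventinj_encode is a made-up name; the VMCB_EVENTINJ_* macros are the ones used in this file):

static uint64_t
eventinj_encode(int intr_type, int vector, uint32_t error, bool ec_valid)
{
	uint64_t ev;

	ev = (uint64_t)vector | ((uint64_t)intr_type << 8) | VMCB_EVENTINJ_VALID;
	if (ec_valid)
		ev |= VMCB_EVENTINJ_EC_VALID | ((uint64_t)error << 32);
	return (ev);
}
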
*/ static void svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, uint32_t error, bool ec_valid) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event already pending %#lx", __func__, ctrl->eventinj)); KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", __func__, vector)); switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: case VMCB_EVENTINJ_TYPE_NMI: case VMCB_EVENTINJ_TYPE_INTn: break; case VMCB_EVENTINJ_TYPE_EXCEPTION: if (vector >= 0 && vector <= 31 && vector != 2) break; /* FALLTHROUGH */ default: panic("%s: invalid intr_type/vector: %d/%d", __func__, intr_type, vector); } ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; if (ec_valid) { ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; ctrl->eventinj |= (uint64_t)error << 32; VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", intrtype_to_str(intr_type), vector, error); } else { VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", intrtype_to_str(intr_type), vector); } } static void svm_update_virqinfo(struct svm_softc *sc, int vcpu) { struct vm *vm; struct vlapic *vlapic; struct vmcb_ctrl *ctrl; int pending; vm = sc->vm; vlapic = vm_lapic(vm, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); /* Update %cr8 in the emulated vlapic */ vlapic_set_cr8(vlapic, ctrl->v_tpr); /* * If V_IRQ indicates that the interrupt injection attempted on then * last VMRUN was successful then update the vlapic accordingly. */ if (ctrl->v_intr_vector != 0) { pending = ctrl->v_irq; KASSERT(ctrl->v_intr_vector >= 16, ("%s: invalid " "v_intr_vector %d", __func__, ctrl->v_intr_vector)); KASSERT(!ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); VCPU_CTR2(vm, vcpu, "v_intr_vector %d %s", ctrl->v_intr_vector, pending ? "pending" : "accepted"); if (!pending) vlapic_intr_accepted(vlapic, ctrl->v_intr_vector); } } static void svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) { struct vmcb_ctrl *ctrl; uint64_t intinfo; ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); intinfo = ctrl->exitintinfo; if (!VMCB_EXITINTINFO_VALID(intinfo)) return; /* * From APMv2, Section "Intercepts during IDT interrupt delivery" * * If a #VMEXIT happened during event delivery then record the event * that was being delivered. 
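
In other words, the step described above amounts to the following sketch (not part of this change; save_pending_event_sketch is a made-up name): the event is stashed with vm_exit_intinfo() so the generic vmm code hands it back via vm_entry_intinfo() and svm_inj_intinfo() re-injects it on the next VM entry.

static void
save_pending_event_sketch(struct svm_softc *sc, int vcpu, struct vmcb_ctrl *ctrl)
{
	uint64_t intinfo = ctrl->exitintinfo;

	if (VMCB_EXITINTINFO_VALID(intinfo))
		vm_exit_intinfo(sc->vm, vcpu, intinfo);
}
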
*/ VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); } static __inline int vintr_intercept_enabled(struct svm_softc *sc, int vcpu) { return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR)); } static __inline void enable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); KASSERT(vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be enabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); ctrl->v_irq = 1; ctrl->v_ign_tpr = 1; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static __inline void disable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(!vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be disabled", __func__)); return; } #ifdef KTR if (ctrl->v_intr_vector == 0) VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); else VCPU_CTR0(sc->vm, vcpu, "Clearing V_IRQ interrupt injection"); #endif ctrl->v_irq = 0; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static int svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val) { struct vmcb_ctrl *ctrl; int oldval, newval; ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intr_shadow; newval = val ? 1 : 0; if (newval != oldval) { ctrl->intr_shadow = newval; VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval); } return (0); } static int svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); *val = ctrl->intr_shadow; return (0); } /* * Once an NMI is injected it blocks delivery of further NMIs until the handler * executes an IRET. The IRET intercept is enabled when an NMI is injected to * to track when the vcpu is done handling the NMI. */ static int nmi_blocked(struct svm_softc *sc, int vcpu) { int blocked; blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); return (blocked); } static void enable_nmi_blocking(struct svm_softc *sc, int vcpu) { KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); } static void clear_nmi_blocking(struct svm_softc *sc, int vcpu) { int error; KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); /* * When the IRET intercept is cleared the vcpu will attempt to execute * the "iret" when it runs next. However, it is possible to inject * another NMI into the vcpu before the "iret" has actually executed. * * For e.g. if the "iret" encounters a #NPF when accessing the stack * it will trap back into the hypervisor. If an NMI is pending for * the vcpu it will be injected into the guest. * * XXX this needs to be fixed */ svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); /* * Set 'intr_shadow' to prevent an NMI from being injected on the * immediate VMRUN. 
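
Taken together, the helpers above give the injection path a simple gate. The following sketch summarizes the checks that svm_inj_interrupts() performs further down; it is not part of this change and can_inject_nmi_sketch is a made-up name:

static bool
can_inject_nmi_sketch(struct svm_softc *sc, int vcpu, struct vmcb_ctrl *ctrl)
{
	if (nmi_blocked(sc, vcpu))		/* guest has not executed "iret" yet */
		return (false);
	if (ctrl->intr_shadow)			/* one-instruction interrupt shadow */
		return (false);
	if (ctrl->eventinj & VMCB_EVENTINJ_VALID)	/* injection slot already in use */
		return (false);
	return (true);
}
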
*/ error = svm_modify_intr_shadow(sc, vcpu, 1); KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); } #define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL static int svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) { struct vm_exit *vme; struct vmcb_state *state; uint64_t changed, lma, oldval; int error; state = svm_get_vmcb_state(sc, vcpu); oldval = state->efer; VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ changed = oldval ^ newval; if (newval & EFER_MBZ_BITS) goto gpf; /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ if (changed & EFER_LME) { if (state->cr0 & CR0_PG) goto gpf; } /* EFER.LMA = EFER.LME & CR0.PG */ if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) lma = EFER_LMA; else lma = 0; if ((newval & EFER_LMA) != lma) goto gpf; if (newval & EFER_NXE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) goto gpf; } /* * XXX bhyve does not enforce segment limits in 64-bit mode. Until * this is fixed flag guest attempt to set EFER_LMSLE as an error. */ if (newval & EFER_LMSLE) { vme = vm_exitinfo(sc->vm, vcpu); vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); *retu = true; return (0); } if (newval & EFER_FFXSR) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) goto gpf; } if (newval & EFER_TCE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) goto gpf; } error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); return (0); gpf: vm_inject_gp(sc->vm, vcpu); return (0); } static int emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); else if (num == MSR_EFER) error = svm_write_efer(sc, vcpu, val, retu); else error = svm_wrmsr(sc, vcpu, num, val, retu); return (error); } static int emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) { struct vmcb_state *state; struct svm_regctx *ctx; uint64_t result; int error; if (lapic_msr(num)) error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); else error = svm_rdmsr(sc, vcpu, num, &result, retu); if (error == 0) { state = svm_get_vmcb_state(sc, vcpu); ctx = svm_get_guest_regctx(sc, vcpu); state->rax = result & 0xffffffff; ctx->sctx_rdx = result >> 32; } return (error); } #ifdef KTR static const char * exit_reason_to_str(uint64_t reason) { static char reasonbuf[32]; switch (reason) { case VMCB_EXIT_INVALID: return ("invalvmcb"); case VMCB_EXIT_SHUTDOWN: return ("shutdown"); case VMCB_EXIT_NPF: return ("nptfault"); case VMCB_EXIT_PAUSE: return ("pause"); case VMCB_EXIT_HLT: return ("hlt"); case VMCB_EXIT_CPUID: return ("cpuid"); case VMCB_EXIT_IO: return ("inout"); case VMCB_EXIT_MC: return ("mchk"); case VMCB_EXIT_INTR: return ("extintr"); case VMCB_EXIT_NMI: return ("nmi"); case VMCB_EXIT_VINTR: return ("vintr"); case VMCB_EXIT_MSR: return ("msr"); case VMCB_EXIT_IRET: return ("iret"); case VMCB_EXIT_MONITOR: return ("monitor"); case VMCB_EXIT_MWAIT: return ("mwait"); default: snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); return (reasonbuf); } } #endif /* KTR */ /* * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs * that are due to instruction intercepts as well as MSR and IOIO intercepts * and exceptions caused by INT3, INTO and BOUND instructions. * * Return 1 if the nRIP is valid and 0 otherwise. */ static int nrip_valid(uint64_t exitcode) { switch (exitcode) { case 0x00 ... 
0x0F: /* read of CR0 through CR15 */ case 0x10 ... 0x1F: /* write of CR0 through CR15 */ case 0x20 ... 0x2F: /* read of DR0 through DR15 */ case 0x30 ... 0x3F: /* write of DR0 through DR15 */ case 0x43: /* INT3 */ case 0x44: /* INTO */ case 0x45: /* BOUND */ case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ return (1); default: return (0); } } static int svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct svm_regctx *ctx; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; int error, errcode_valid, handled, idtvec, reflect; bool retu; ctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb = svm_get_vmcb(svm_sc, vcpu); state = &vmcb->state; ctrl = &vmcb->ctrl; handled = 0; code = ctrl->exitcode; info1 = ctrl->exitinfo1; info2 = ctrl->exitinfo2; vmexit->exitcode = VM_EXITCODE_BOGUS; vmexit->rip = state->rip; vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); /* * #VMEXIT(INVALID) needs to be handled early because the VMCB is * in an inconsistent state and can trigger assertions that would * never happen otherwise. */ if (code == VMCB_EXIT_INVALID) { vm_exit_svm(vmexit, code, info1, info2); return (0); } KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " "injection valid bit is set %#lx", __func__, ctrl->eventinj)); KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", vmexit->inst_length, code, info1, info2)); svm_update_virqinfo(svm_sc, vcpu); svm_save_intinfo(svm_sc, vcpu); switch (code) { case VMCB_EXIT_IRET: /* * Restart execution at "iret" but with the intercept cleared. */ vmexit->inst_length = 0; clear_nmi_blocking(svm_sc, vcpu); handled = 1; break; case VMCB_EXIT_VINTR: /* interrupt window exiting */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); handled = 1; break; case VMCB_EXIT_INTR: /* external interrupt */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); handled = 1; break; case VMCB_EXIT_NMI: /* external NMI */ handled = 1; break; case 0x40 ... 0x5F: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); reflect = 1; idtvec = code - 0x40; switch (idtvec) { case IDT_MC: /* * Call the machine check handler by hand. Also don't * reflect the machine check back into the guest. */ reflect = 0; VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); break; case IDT_PF: error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, info2); KASSERT(error == 0, ("%s: error %d updating cr2", __func__, error)); /* fallthru */ case IDT_NP: case IDT_SS: case IDT_GP: case IDT_AC: case IDT_TS: errcode_valid = 1; break; case IDT_DF: errcode_valid = 1; info1 = 0; break; case IDT_BP: case IDT_OF: case IDT_BR: /* * The 'nrip' field is populated for INT3, INTO and * BOUND exceptions and this also implies that * 'inst_length' is non-zero. * * Reset 'inst_length' to zero so the guest %rip at * event injection is identical to what it was when * the exception originally happened. 
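
The error-code handling in the exception-reflection switch above follows the architectural rule that only a handful of vectors deliver an error code, and #DF always delivers zero. A sketch of that rule, not part of this change (exception_has_errcode is a made-up name):

static bool
exception_has_errcode(int idtvec)
{
	switch (idtvec) {
	case IDT_PF: case IDT_NP: case IDT_SS: case IDT_GP:
	case IDT_AC: case IDT_TS: case IDT_DF:
		return (true);
	default:
		return (false);
	}
}
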
*/ VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " "to zero before injecting exception %d", vmexit->inst_length, idtvec); vmexit->inst_length = 0; /* fallthru */ default: errcode_valid = 0; info1 = 0; break; } KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " "when reflecting exception %d into guest", vmexit->inst_length, idtvec)); if (reflect) { /* Reflect the exception back into the guest */ VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " "%d/%#x into the guest", idtvec, (int)info1); error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, errcode_valid, info1, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); } handled = 1; break; case VMCB_EXIT_MSR: /* MSR access. */ eax = state->rax; ecx = ctx->sctx_rcx; edx = ctx->sctx_rdx; retu = false; if (info1) { vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); val = (uint64_t)edx << 32 | eax; VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", ecx, val); if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = val; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } } else { VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } } break; case VMCB_EXIT_IO: handled = svm_handle_io(svm_sc, vcpu, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); break; case VMCB_EXIT_CPUID: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); handled = x86_emulate_cpuid(svm_sc->vm, vcpu, (uint32_t *)&state->rax, (uint32_t *)&ctx->sctx_rbx, (uint32_t *)&ctx->sctx_rcx, (uint32_t *)&ctx->sctx_rdx); break; case VMCB_EXIT_HLT: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = state->rflags; break; case VMCB_EXIT_PAUSE: vmexit->exitcode = VM_EXITCODE_PAUSE; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); break; case VMCB_EXIT_NPF: /* EXITINFO2 contains the faulting guest physical address */ if (info1 & VMCB_NPF_INFO1_RSV) { VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " "reserved bits set: info1(%#lx) info2(%#lx)", info1, info2); } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = info2; vmexit->u.paging.fault_type = npf_fault_type(info1); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " "on gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } else if (svm_npf_emul_fault(info1)) { svm_handle_inst_emul(vmcb, info2, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " "for gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } break; case VMCB_EXIT_MONITOR: vmexit->exitcode = VM_EXITCODE_MONITOR; break; case VMCB_EXIT_MWAIT: vmexit->exitcode = VM_EXITCODE_MWAIT; break; default: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", handled ? 
"handled" : "unhandled", exit_reason_to_str(code), vmexit->rip, vmexit->inst_length); if (handled) { vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; state->rip = vmexit->rip; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic SVM exit. */ vm_exit_svm(vmexit, code, info1, info2); } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } return (handled); } static void svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) { uint64_t intinfo; if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) return; KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " "valid: %#lx", __func__, intinfo)); svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), VMCB_EXITINTINFO_VECTOR(intinfo), VMCB_EXITINTINFO_EC(intinfo), VMCB_EXITINTINFO_EC_VALID(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); } /* * Inject event to virtual cpu. */ static void svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_vcpu *vcpustate; uint8_t v_tpr; int vector, need_intr_window, pending_apic_vector; state = svm_get_vmcb_state(sc, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); vcpustate = svm_get_vcpu(sc, vcpu); need_intr_window = 0; pending_apic_vector = 0; if (vcpustate->nextrip != state->rip) { ctrl->intr_shadow = 0; VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vcpustate->nextrip, state->rip); } /* * Inject pending events or exceptions for this vcpu. * * An event might be pending because the previous #VMEXIT happened * during event delivery (i.e. ctrl->exitintinfo). * * An event might also be pending because an exception was injected * by the hypervisor (e.g. #PF during instruction emulation). */ svm_inj_intinfo(sc, vcpu); /* NMI event has priority over interrupts. */ if (vm_nmi_pending(sc->vm, vcpu)) { if (nmi_blocked(sc, vcpu)) { /* * Can't inject another NMI if the guest has not * yet executed an "iret" after the last NMI. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " "to NMI-blocking"); } else if (ctrl->intr_shadow) { /* * Can't inject an NMI if the vcpu is in an intr_shadow. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " "interrupt shadow"); need_intr_window = 1; goto done; } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { /* * If there is already an exception/interrupt pending * then defer the NMI until after that. */ VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " "eventinj %#lx", ctrl->eventinj); /* * Use self-IPI to trigger a VM-exit as soon as * possible after the event injection is completed. * * This works only if the external interrupt exiting * is at a lower priority than the event injection. * * Although not explicitly specified in APMv2 the * relative priorities were verified empirically. */ - ipi_cpu(curcpu, vmm_ipinum); + ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ } else { vm_nmi_clear(sc->vm, vcpu); /* Inject NMI, vector number is not used */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, IDT_NMI, 0, false); /* virtual NMI blocking is now in effect */ enable_nmi_blocking(sc, vcpu); VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); } } if (!vm_extint_pending(sc->vm, vcpu)) { /* * APIC interrupts are delivered using the V_IRQ offload. 
* * The primary benefit is that the hypervisor doesn't need to * deal with the various conditions that inhibit interrupts. * It also means that TPR changes via CR8 will be handled * without any hypervisor involvement. * * Note that the APIC vector must remain pending in the vIRR * until it is confirmed that it was delivered to the guest. * This can be confirmed based on the value of V_IRQ at the * next #VMEXIT (1 = pending, 0 = delivered). * * Also note that it is possible that another higher priority * vector can become pending before this vector is delivered * to the guest. This is alright because vcpu_notify_event() * will send an IPI and force the vcpu to trap back into the * hypervisor. The higher priority vector will be injected on * the next VMRUN. */ if (vlapic_pending_intr(vlapic, &vector)) { KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); pending_apic_vector = vector; } goto done; } /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(sc->vm, &vector); KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); /* * If the guest has disabled interrupts or is in an interrupt shadow * then we cannot inject the pending interrupt. */ if ((state->rflags & PSL_I) == 0) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, state->rflags); need_intr_window = 1; goto done; } if (ctrl->intr_shadow) { VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " "interrupt shadow", vector); need_intr_window = 1; goto done; } if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "eventinj %#lx", vector, ctrl->eventinj); need_intr_window = 1; goto done; } /* * Legacy PIC interrupts are delivered via the event injection * mechanism. */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); vm_extint_clear(sc->vm, vcpu); vatpic_intr_accepted(sc->vm, vector); /* * Force a VM-exit as soon as the vcpu is ready to accept another * interrupt. This is done because the PIC might have another vector * that it wants to inject. Also, if the APIC has a pending interrupt * that was preempted by the ExtInt then it allows us to inject the * APIC vector as soon as possible. */ need_intr_window = 1; done: /* * The guest can modify the TPR by writing to %CR8. In guest mode * the processor reflects this write to V_TPR without hypervisor * intervention. * * The guest can also modify the TPR by writing to it via the memory * mapped APIC page. In this case, the write will be emulated by the * hypervisor. For this reason V_TPR must be updated before every * VMRUN. */ v_tpr = vlapic_get_cr8(vlapic); KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); if (ctrl->v_tpr != v_tpr) { VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", ctrl->v_tpr, v_tpr); ctrl->v_tpr = v_tpr; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } if (pending_apic_vector) { /* * If an APIC vector is being injected then interrupt window * exiting is not possible on this VMRUN. */ KASSERT(!need_intr_window, ("intr_window exiting impossible")); VCPU_CTR1(sc->vm, vcpu, "Injecting vector %d using V_IRQ", pending_apic_vector); ctrl->v_irq = 1; ctrl->v_ign_tpr = 0; ctrl->v_intr_vector = pending_apic_vector; ctrl->v_intr_prio = pending_apic_vector >> 4; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } else if (need_intr_window) { /* * We use V_IRQ in conjunction with the VINTR intercept to * trap into the hypervisor as soon as a virtual interrupt * can be delivered. 
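 *
 * This path is taken when a pending vector could not be injected
 * above (the guest has RFLAGS.IF clear, is in an interrupt shadow,
 * or already has an event latched in EVENTINJ), or right after an
 * ExtInt injection when another vector may already be waiting. The
 * VINTR intercept then forces a #VMEXIT as soon as the guest can
 * accept an interrupt and the vector is injected on the next pass
 * through this function.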
* * Since injected events are not subject to intercept checks * we need to ensure that the V_IRQ is not actually going to * be delivered on VM entry. The KASSERT below enforces this. */ KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, ("Bogus intr_window_exiting: eventinj (%#lx), " "intr_shadow (%u), rflags (%#lx)", ctrl->eventinj, ctrl->intr_shadow, state->rflags)); enable_intr_window_exiting(sc, vcpu); } else { disable_intr_window_exiting(sc, vcpu); } } static __inline void restore_host_tss(void) { struct system_segment_descriptor *tss_sd; /* * The TSS descriptor was in use prior to launching the guest so it * has been marked busy. * * 'ltr' requires the descriptor to be marked available so change the * type to "64-bit available TSS". */ tss_sd = PCPU_GET(tss); tss_sd->sd_type = SDT_SYSTSS; ltr(GSEL(GPROC0_SEL, SEL_KPL)); } static void check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) { struct svm_vcpu *vcpustate; struct vmcb_ctrl *ctrl; long eptgen; bool alloc_asid; KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " "active on cpu %u", __func__, thiscpu)); vcpustate = svm_get_vcpu(sc, vcpuid); ctrl = svm_get_vmcb_ctrl(sc, vcpuid); /* * The TLB entries associated with the vcpu's ASID are not valid * if either of the following conditions is true: * * 1. The vcpu's ASID generation is different than the host cpu's * ASID generation. This happens when the vcpu migrates to a new * host cpu. It can also happen when the number of vcpus executing * on a host cpu is greater than the number of ASIDs available. * * 2. The pmap generation number is different than the value cached in * the 'vcpustate'. This happens when the host invalidates pages * belonging to the guest. * * asidgen eptgen Action * mismatch mismatch * 0 0 (a) * 0 1 (b1) or (b2) * 1 0 (c) * 1 1 (d) * * (a) There is no mismatch in eptgen or ASID generation and therefore * no further action is needed. * * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is * retained and the TLB entries associated with this ASID * are flushed by VMRUN. * * (b2) If the cpu does not support FlushByAsid then a new ASID is * allocated. * * (c) A new ASID is allocated. * * (d) A new ASID is allocated. */ alloc_asid = false; eptgen = pmap->pm_eptgen; ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; if (vcpustate->asid.gen != asid[thiscpu].gen) { alloc_asid = true; /* (c) and (d) */ } else if (vcpustate->eptgen != eptgen) { if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ else alloc_asid = true; /* (b2) */ } else { /* * This is the common case (a). */ KASSERT(!alloc_asid, ("ASID allocation not necessary")); KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); } if (alloc_asid) { if (++asid[thiscpu].num >= nasid) { asid[thiscpu].num = 1; if (++asid[thiscpu].gen == 0) asid[thiscpu].gen = 1; /* * If this cpu does not support "flush-by-asid" * then flush the entire TLB on a generation * bump. Subsequent ASID allocation in this * generation can be done without a TLB flush. */ if (!flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; } vcpustate->asid.gen = asid[thiscpu].gen; vcpustate->asid.num = asid[thiscpu].num; ctrl->asid = vcpustate->asid.num; svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); /* * If this cpu supports "flush-by-asid" then the TLB * was not flushed after the generation bump. The TLB * is flushed selectively after every new ASID allocation. 
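 *
 * For example, with nasid == 8 the ASIDs 1..7 are handed out in
 * turn; once they are exhausted 'gen' is bumped and ASID 1 is
 * reused. On cpus without FlushByAsid that generation bump is the
 * only point where a full VMCB_TLB_FLUSH_ALL is requested here.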
*/ if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; } vcpustate->eptgen = eptgen; KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); KASSERT(ctrl->asid == vcpustate->asid.num, ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); } static __inline void disable_gintr(void) { __asm __volatile("clgi"); } static __inline void enable_gintr(void) { __asm __volatile("stgi"); } /* * Start vcpu with specified RIP. */ static int svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { struct svm_regctx *gctx; struct svm_softc *svm_sc; struct svm_vcpu *vcpustate; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct vm_exit *vmexit; struct vlapic *vlapic; struct vm *vm; uint64_t vmcb_pa; int handled; svm_sc = arg; vm = svm_sc->vm; vcpustate = svm_get_vcpu(svm_sc, vcpu); state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); vmexit = vm_exitinfo(vm, vcpu); vlapic = vm_lapic(vm, vcpu); gctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; if (vcpustate->lastcpu != curcpu) { /* * Force new ASID allocation by invalidating the generation. */ vcpustate->asid.gen = 0; /* * Invalidate the VMCB state cache by marking all fields dirty. */ svm_set_dirty(svm_sc, vcpu, 0xffffffff); /* * XXX * Setting 'vcpustate->lastcpu' here is bit premature because * we may return from this function without actually executing * the VMRUN instruction. This could happen if a rendezvous * or an AST is pending on the first time through the loop. * * This works for now but any new side-effects of vcpu * migration should take this case into account. */ vcpustate->lastcpu = curcpu; vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); } svm_msr_guest_enter(svm_sc, vcpu); /* Update Guest RIP */ state->rip = rip; do { /* * Disable global interrupts to guarantee atomicity during * loading of guest state. This includes not only the state * loaded by the "vmrun" instruction but also software state * maintained by the hypervisor: suspended and rendezvous * state, NPT generation number, vlapic interrupts etc. */ disable_gintr(); if (vcpu_suspended(evinfo)) { enable_gintr(); vm_exit_suspended(vm, vcpu, state->rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_gintr(); vm_exit_rendezvous(vm, vcpu, state->rip); break; } if (vcpu_reqidle(evinfo)) { enable_gintr(); vm_exit_reqidle(vm, vcpu, state->rip); break; } /* We are asked to give the cpu by scheduler. */ if (vcpu_should_yield(vm, vcpu)) { enable_gintr(); vm_exit_astpending(vm, vcpu, state->rip); break; } svm_inj_interrupts(svm_sc, vcpu, vlapic); /* Activate the nested pmap on 'curcpu' */ CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); /* * Check the pmap generation and the ASID generation to * ensure that the vcpu does not use stale TLB mappings. */ check_asid(svm_sc, vcpu, pmap, curcpu); ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; vcpustate->dirty = 0; VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); /* Launch Virtual Machine. */ VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]); CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); /* * The host GDTR and IDTR is saved by VMRUN and restored * automatically on #VMEXIT. However, the host TSS needs * to be restored explicitly. */ restore_host_tss(); /* #VMEXIT disables interrupts so re-enable them here. */ enable_gintr(); /* Update 'nextrip' */ vcpustate->nextrip = state->rip; /* Handle #VMEXIT and if required return to user space. 
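 *
 * svm_vmexit() returns non-zero when the exit was handled entirely
 * in the kernel, in which case the do/while loop below re-enters
 * the guest without leaving this function. Otherwise the loop
 * terminates, svm_msr_guest_exit() is called and the populated
 * 'vmexit' is processed further by vm_run().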
*/ handled = svm_vmexit(svm_sc, vcpu, vmexit); } while (handled); svm_msr_guest_exit(svm_sc, vcpu); return (0); } static void svm_vmcleanup(void *arg) { struct svm_softc *sc = arg; contigfree(sc, sizeof (*sc), M_SVM); } static register_t * swctx_regptr(struct svm_regctx *regctx, int reg) { switch (reg) { case VM_REG_GUEST_RBX: return (&regctx->sctx_rbx); case VM_REG_GUEST_RCX: return (&regctx->sctx_rcx); case VM_REG_GUEST_RDX: return (&regctx->sctx_rdx); case VM_REG_GUEST_RDI: return (&regctx->sctx_rdi); case VM_REG_GUEST_RSI: return (&regctx->sctx_rsi); case VM_REG_GUEST_RBP: return (&regctx->sctx_rbp); case VM_REG_GUEST_R8: return (&regctx->sctx_r8); case VM_REG_GUEST_R9: return (&regctx->sctx_r9); case VM_REG_GUEST_R10: return (&regctx->sctx_r10); case VM_REG_GUEST_R11: return (&regctx->sctx_r11); case VM_REG_GUEST_R12: return (&regctx->sctx_r12); case VM_REG_GUEST_R13: return (&regctx->sctx_r13); case VM_REG_GUEST_R14: return (&regctx->sctx_r14); case VM_REG_GUEST_R15: return (&regctx->sctx_r15); default: return (NULL); } } static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_get_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *val = *reg; return (0); } VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); return (EINVAL); } static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_modify_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *reg = val; return (0); } /* * XXX deal with CR3 and invalidate TLB entries tagged with the * vcpu's ASID. This needs to be treated differently depending on * whether 'running' is true/false.
*/ VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); return (EINVAL); } static int svm_setcap(void *arg, int vcpu, int type, int val) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT, val); break; case VM_CAP_PAUSE_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE, val); break; case VM_CAP_UNRESTRICTED_GUEST: /* Unrestricted guest execution cannot be disabled in SVM */ if (val == 0) error = EINVAL; break; default: error = ENOENT; break; } return (error); } static int svm_getcap(void *arg, int vcpu, int type, int *retval) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT); break; case VM_CAP_PAUSE_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE); break; case VM_CAP_UNRESTRICTED_GUEST: *retval = 1; /* unrestricted guest is always enabled */ break; default: error = ENOENT; break; } return (error); } static struct vlapic * svm_vlapic_init(void *arg, int vcpuid) { struct svm_softc *svm_sc; struct vlapic *vlapic; svm_sc = arg; vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = svm_sc->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; vlapic_init(vlapic); return (vlapic); } static void svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_SVM_VLAPIC); } struct vmm_ops vmm_ops_amd = { svm_init, svm_cleanup, svm_restore, svm_vminit, svm_vmrun, svm_vmcleanup, svm_getreg, svm_setreg, vmcb_getdesc, vmcb_setdesc, svm_getcap, svm_setcap, svm_npt_alloc, svm_npt_free, svm_vlapic_init, svm_vlapic_cleanup }; Index: head/sys/amd64/vmm/vmm.c =================================================================== --- head/sys/amd64/vmm/vmm.c (revision 316020) +++ head/sys/amd64/vmm/vmm.c (revision 316021) @@ -1,2589 +1,2590 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int hostcpu; /* (o) vcpu's host cpu */ int reqidle; /* (i) request vcpu to idle */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { size_t len; bool sysmem; struct vm_object *object; }; #define VM_MAX_MEMSEGS 3 struct mem_map { vm_paddr_t gpa; size_t len; vm_ooffset_t segoff; int segid; int prot; int flags; }; #define VM_MAX_MEMMAPS 4 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ void *rendezvous_arg; /* (x) rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu 
vcpu[VM_MAXCPU]; /* (i) guest vcpus */ }; static int vmm_initialized; static struct vmm_ops *ops; #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) #define VMRUN(vmi, vcpu, rip, pmap, evinfo) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMSPACE_ALLOC(min, max) \ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) #define VMSPACE_FREE(vmspace) \ (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) #define VMGETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMSETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) #define VLAPIC_INIT(vmi, vcpu) \ (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) #define VLAPIC_CLEANUP(vmi, vlapic) \ (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. 
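 *
 * The detection itself is done in vm_handle_hlt(): each halted vcpu
 * is added to 'halted_cpus' and once that set covers 'active_cpus'
 * the virtual machine is suspended with VM_SUSPEND_HALT.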
*/ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); +static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) { switch (state) { case VCPU_IDLE: return ("idle"); case VCPU_FROZEN: return ("frozen"); case VCPU_RUNNING: return ("running"); case VCPU_SLEEPING: return ("sleeping"); default: return ("unknown"); } } #endif static void vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } } static void vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = &vm->vcpu[vcpu_id]; if (create) { KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " "initialized", vcpu_id)); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); } vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vm *vm, int vcpuid) { return (trace_guest_exceptions); } struct vm_exit * vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; if (cpuid < 0 || cpuid >= VM_MAXCPU) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; return (&vcpu->exitinfo); } static void vmm_resume(void) { VMM_RESUME(); } static int vmm_init(void) { int error; vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_amd()) ops = &vmm_ops_amd; else return (ENXIO); vmm_resume_p = vmm_resume; return (VMM_INIT(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) lapic_ipi_free(vmm_ipinum); error = VMM_CLEANUP(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). 
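 *
 * This is why DECLARE_MODULE() below registers the module at
 * SI_SUB_SMP + 1, so that vmm_init() runs only after all host cpus
 * have been started.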
*/ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); for (i = 0; i < VM_MAXCPU; i++) vcpu_init(vm, i, create); } int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm_init(vm, true); *retvm = vm; return (0); } static void vm_cleanup(struct vm *vm, bool destroy) { struct mem_map *mm; int i; ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (i = 0; i < VM_MAXCPU; i++) vcpu_cleanup(vm, i, destroy); VMCLEANUP(vm->cookie); /* * System memory is removed from the guest address space only when * the VM is destroyed. This is because the mapping remains the same * across VM reset. * * Device memory can be relocated by the guest (e.g. using PCI BARs) * so those mappings are removed on a VM reset. */ for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (destroy || !sysmem_mapping(vm, mm)) vm_free_memmap(vm, i); } if (destroy) { for (i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); VMSPACE_FREE(vm->vmspace); vm->vmspace = NULL; } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } /* * Return 'true' if 'gpa' is allocated in the guest address space. * * This function is called in the context of a running vcpu which acts as * an implicit lock on 'vm->mem_maps[]'. 
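 *
 * The ioctls that modify 'mem_maps[]' (e.g. VM_MMAP_MEMSEG) freeze
 * all vcpus first, so a running vcpu can walk the array without
 * holding a lock (see the INVARIANTS check in vm_gpa_hold()).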
*/ bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) { struct mem_map *mm; int i; #ifdef INVARIANTS int hostcpu, state; state = vcpu_get_state(vm, vcpuid, &hostcpu); KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); #endif for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) return (true); /* 'gpa' is sysmem or devmem */ } if (ppt_is_mmio(vm, gpa)) return (true); /* 'gpa' is pci passthru mmio */ return (false); } int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { struct mem_seg *seg; vm_object_t obj; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); if (len == 0 || (len & PAGE_MASK)) return (EINVAL); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { if (seg->len == len && seg->sysmem == sysmem) return (EEXIST); else return (EINVAL); } obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); if (obj == NULL) return (ENOMEM); seg->len = len; seg->object = obj; seg->sysmem = sysmem; return (0); } int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, vm_object_t *objptr) { struct mem_seg *seg; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[ident]; if (len) *len = seg->len; if (sysmem) *sysmem = seg->sysmem; if (objptr) *objptr = seg->object; return (0); } void vm_free_memseg(struct vm *vm, int ident) { struct mem_seg *seg; KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, ("%s: invalid memseg ident %d", __func__, ident)); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { vm_object_deallocate(seg->object); bzero(seg, sizeof(struct mem_seg)); } } int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, size_t len, int prot, int flags) { struct mem_seg *seg; struct mem_map *m, *map; vm_ooffset_t last; int i, error; if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) return (EINVAL); if (flags & ~VM_MEMMAP_F_WIRED) return (EINVAL); if (segid < 0 || segid >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[segid]; if (seg->object == NULL) return (EINVAL); last = first + len; if (first < 0 || first >= last || last > seg->len) return (EINVAL); if ((gpa | first | last) & PAGE_MASK) return (EINVAL); map = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->len == 0) { map = m; break; } } if (map == NULL) return (ENOSPC); error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, len, 0, VMFS_NO_SPACE, prot, prot, 0); if (error != KERN_SUCCESS) return (EFAULT); vm_object_reference(seg->object); if (flags & VM_MEMMAP_F_WIRED) { error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); return (EFAULT); } } map->gpa = gpa; map->len = len; map->segoff = first; map->segid = segid; map->prot = prot; map->flags = flags; return (0); } int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct mem_map *mm, *mmnext; int i; mmnext = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len == 0 || mm->gpa < *gpa) continue; if (mmnext == NULL || mm->gpa < mmnext->gpa) mmnext = mm; } if (mmnext != NULL) { *gpa = mmnext->gpa; if (segid) *segid = mmnext->segid; if (segoff) *segoff = mmnext->segoff; if (len) *len = mmnext->len; if (prot) *prot = mmnext->prot; if (flags) *flags = mmnext->flags; return (0); } else { 
return (ENOENT); } } static void vm_free_memmap(struct vm *vm, int ident) { struct mem_map *mm; int error; mm = &vm->mem_maps[ident]; if (mm->len) { error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, mm->gpa + mm->len); KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", __func__, error)); bzero(mm, sizeof(struct mem_map)); } } static __inline bool sysmem_mapping(struct vm *vm, struct mem_map *mm) { if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) return (true); else return (false); } static vm_paddr_t sysmem_maxaddr(struct vm *vm) { struct mem_map *mm; vm_paddr_t maxaddr; int i; maxaddr = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm)) { if (maxaddr < mm->gpa + mm->len) maxaddr = mm->gpa + mm->len; } } return (maxaddr); } static void vm_iommu_modify(struct vm *vm, boolean_t map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_map *mm; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (!sysmem_mapping(vm, mm)) continue; if (map) { KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, ("iommu map found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) continue; mm->flags |= VM_MEMMAP_F_IOMMU; } else { if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) continue; mm->flags &= ~VM_MEMMAP_F_IOMMU; KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, ("iommu unmap found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); } gpa = mm->gpa; while (gpa < mm->gpa + mm->len) { vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); iommu_remove_mapping(host_domain, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); iommu_create_mapping(host_domain, hpa, hpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) #define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) vm_iommu_unmap(vm); return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } void * vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int i, count, pageoff; struct mem_map *mm; vm_page_t m; #ifdef INVARIANTS /* * All vcpus are frozen by ioctls that modify the memory map * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is * guaranteed if at least one vcpu is in the VCPU_FROZEN state. 
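 *
 * A 'vcpuid' of -1 (used e.g. by vm_iommu_modify()) means the
 * caller is not tied to a particular vcpu; in that case the loop
 * below requires every vcpu to be frozen.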
*/ int state; KASSERT(vcpuid >= -1 && vcpuid < VM_MAXCPU, ("%s: invalid vcpuid %d", __func__, vcpuid)); for (i = 0; i < VM_MAXCPU; i++) { if (vcpuid != -1 && vcpuid != i) continue; state = vcpu_get_state(vm, i, NULL); KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", __func__, state)); } #endif pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && gpa < mm->gpa + mm->len) { count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); break; } } if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMGETREG(vm->cookie, vcpu, reg, retval)); } int vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) { struct vcpu *vcpu; int error; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); error = VMSETREG(vm->cookie, vcpuid, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); vcpu = &vm->vcpu[vcpuid]; vcpu->nextrip = val; return (0); } static boolean_t is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (TRUE); default: return (FALSE); } } static boolean_t is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (TRUE); default: return (FALSE); } } int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMGETDESC(vm->cookie, vcpu, reg, desc)); } int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. 
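 *
 * fpu_start_emulating() sets CR0.TS, so any FPU access by the host
 * now raises #NM; save_guest_fpustate() relies on this and panics
 * if it finds CR0.TS clear.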
*/ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { struct vcpu *vcpu; int error; vcpu = &vm->vcpu[vcpuid]; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; vcpu_notify_event_locked(vcpu, false); VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } static void vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) { KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); /* * Update 'rendezvous_func' and execute a write memory barrier to * ensure that it is visible across all host cpus. This is not needed * for correctness but it does ensure that all the vcpus will notice * that the rendezvous is requested immediately. 
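 *
 * The readers are the vcpu threads themselves, which poll
 * 'rendezvous_func' locklessly via 'evinfo.rptr' in the VMRUN loop
 * and directly in vm_handle_hlt().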
*/ vm->rendezvous_func = func; wmb(); } #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ do { \ if (vcpuid >= 0) \ VCPU_CTR0(vm, vcpuid, fmt); \ else \ VM_CTR0(vm, fmt); \ } while (0) static void vm_handle_rendezvous(struct vm *vm, int vcpuid) { KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); if (vcpuid != -1 && CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); vm_set_rendezvous_func(vm, NULL); wakeup(&vm->rendezvous_func); break; } RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", 0); } mtx_unlock(&vm->rendezvous_mtx); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { struct vcpu *vcpu; const char *wmesg; int t, vcpu_halted, vm_halted; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu = &vm->vcpu[vcpuid]; vcpu_halted = 0; vm_halted = 0; vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from VMRUN() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vm, vcpuid)) break; if (!intr_disabled) { if (vm_extint_pending(vm, vcpuid) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vm, vcpuid)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VCPU_CTR0(vm, vcpuid, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. 
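 *
 * The 'hz' timeout below bounds that latency to roughly one second;
 * each wakeup re-evaluates the pending NMI/interrupt and yield
 * checks at the top of the loop.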
*/ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) { int rv, ftype; struct vm_map *map; struct vcpu *vcpu; struct vm_exit *vme; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? "accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) { struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, fault; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ error = fault = 0; } if (error || fault) return (error); if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", vme->rip + cs_base); *retu = true; /* dump instruction bytes in userspace */ return (0); } /* * Update 'nextrip' based on the length of the emulated instruction. 
*/ vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " "decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) { int i, done; struct vcpu *vcpu; done = 0; vcpu = &vm->vcpu[vcpuid]; CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (1) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); } else { VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); vcpu_unlock(vcpu); vm_handle_rendezvous(vm, vcpuid); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm, i, false); } } *retu = true; return (0); } static int vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) { struct vcpu *vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); vcpu->reqidle = 0; vcpu_unlock(vcpu); *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. 
*/ for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); } return (0); } void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); } void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_REQIDLE; vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); } void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); } int vm_run(struct vm *vm, struct vm_run *vmrun) { struct vm_eventinfo evinfo; int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; vcpuid = vmrun->cpuid; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_func; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); vcpu_require_state(vm, vcpuid, VCPU_RUNNING); error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vm, vcpuid, &retu); break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vm, vcpuid, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: vm_handle_rendezvous(vm, vcpuid); error = 0; break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vm, vcpuid, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: vm_inject_ud(vm, vcpuid); break; default: retu = true; /* handled in userland */ break; } } if (error == 0 && retu == false) goto restart; VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); /* copy the exit information */ bcopy(vme, 
&vmrun->vm_exit, sizeof(struct vm_exit)); return (error); } int vm_restart_instruction(void *arg, int vcpuid) { struct vm *vm; struct vcpu *vcpu; enum vcpu_state state; uint64_t rip; int error; vm = arg; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; state = vcpu_get_state(vm, vcpuid, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around VMRUN() and 'nextrip' points to the next instruction. * Thus instruction restart is achieved by setting 'nextrip' * to the vcpu's %rip. */ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) { struct vcpu *vcpu; int type, vector; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. */ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). 
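 *
 * Otherwise the two events are classified below: for example, a #GP
 * raised while delivering a #PF collapses into a #DF (with a zero
 * error code), whereas a #PF raised while delivering an external
 * interrupt (a benign event) is simply delivered on its own, i.e.
 * the events are handled serially.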
*/ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. */ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) { struct vcpu *vcpu; uint64_t info1, info2; int valid; KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); vcpu = &vm->vcpu[vcpuid]; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vm, vcpuid, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vcpu *vcpu; uint64_t regval; int error; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. */ if (vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } if (errcode_valid) { /* * Exceptions don't deliver an error code in real mode. */ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval); KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); if (!(regval & CR0_PE)) errcode_valid = 0; } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception.
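 *
 * Injecting an exception is treated the same way here, which is why
 * the interrupt shadow is cleared explicitly just below before the
 * exception is recorded as pending.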
*/ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vm, vcpuid); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); return (0); } void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { struct vm *vm; int error, restart_instruction; vm = vmarg; restart_instruction = 1; error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) { struct vm *vm; int error; vm = vmarg; VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->nmi_pending); } void vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_extint_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->extint_pending); } void vm_extint_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMGETCAP(vm->cookie, vcpu, type, retval)); } int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMSETCAP(vm->cookie, vcpu, type, val)); } struct vlapic * vm_lapic(struct vm *vm, int cpu) { return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } boolean_t vmm_is_pptdev(int bus, int 
slot, int func) { int found, i, n; int b, s, f; char *val, *cp, *cp2; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = 1; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { int error; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) { struct vcpu *vcpu; enum vcpu_state state; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vm *vm, int vcpuid) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->active_cpus)) return (EBUSY); VCPU_CTR0(vm, vcpuid, "activated"); CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); return (0); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } void * vcpu_stats(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid].stats); } int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; return (0); } int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (state >= X2APIC_STATE_LAST) return (EINVAL); vm->vcpu[vcpuid].x2apic_state = state; vlapic_set_x2apic_state(vm, vcpuid, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. 
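 *
 * A minimal sketch of the expected calling pattern (this is what
 * vm_inject_nmi() and vm_inject_extint() above already do): record the
 * pending event first, then notify:
 *
 *	vcpu->nmi_pending = 1;
 *	vcpu_notify_event(vm, vcpuid, false);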
*/ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) { struct vcpu *vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { int i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpuid' is one * of the targets of the rendezvous. */ RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); vm_handle_rendezvous(vm, vcpuid); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm_set_rendezvous_func(vm, func); mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. */ for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm, i, false); } vm_handle_rendezvous(vm, vcpuid); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; 
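		/*
		 * Record the host mapping and the cookie needed to release
		 * it later; vm_copy_teardown() below undoes any partially
		 * populated entries if a page cannot be held.
		 */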
copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); return (EFAULT); } else { *fault = 0; return (0); } } void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. Since * these are global stats, only return the values with for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vm->vmspace)); } } static void vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); Index: head/sys/amd64/vmm/vmm_host.h =================================================================== --- head/sys/amd64/vmm/vmm_host.h (revision 316020) +++ head/sys/amd64/vmm/vmm_host.h (revision 316021) @@ -1,85 +1,83 @@ /*- * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _VMM_HOST_H_ #define _VMM_HOST_H_ #ifndef _KERNEL #error "no user-serviceable parts inside" #endif -extern int vmm_ipinum; - struct xsave_limits { int xsave_enabled; uint64_t xcr0_allowed; uint32_t xsave_max_size; }; void vmm_host_state_init(void); uint64_t vmm_get_host_pat(void); uint64_t vmm_get_host_efer(void); uint64_t vmm_get_host_cr0(void); uint64_t vmm_get_host_cr4(void); uint64_t vmm_get_host_xcr0(void); uint64_t vmm_get_host_datasel(void); uint64_t vmm_get_host_codesel(void); uint64_t vmm_get_host_tsssel(void); uint64_t vmm_get_host_fsbase(void); uint64_t vmm_get_host_idtrbase(void); const struct xsave_limits *vmm_get_xsave_limits(void); /* * Inline access to host state that is used on every VM entry */ static __inline uint64_t vmm_get_host_trbase(void) { return ((uint64_t)PCPU_GET(tssp)); } static __inline uint64_t vmm_get_host_gdtrbase(void) { return ((uint64_t)&gdt[NGDT * curcpu]); } struct pcpu; extern struct pcpu __pcpu[]; static __inline uint64_t vmm_get_host_gsbase(void) { return ((uint64_t)&__pcpu[curcpu]); } #endif Index: head/sys/i386/i386/apic_vector.s =================================================================== --- head/sys/i386/i386/apic_vector.s (revision 316020) +++ head/sys/i386/i386/apic_vector.s (revision 316021) @@ -1,317 +1,320 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: vector.s, 386BSD 0.1 unknown origin * $FreeBSD$ */ /* * Interrupt entry points for external interrupts triggered by I/O APICs * as well as IPI handlers. */ #include "opt_smp.h" #include #include #include #include "assym.s" + .text + SUPERALIGN_TEXT + /* End Of Interrupt to APIC */ +as_lapic_eoi: + cmpl $0,x2apic_mode + jne 1f + movl lapic_map,%eax + movl $0,LA_EOI(%eax) + ret +1: + movl $MSR_APIC_EOI,%ecx + xorl %eax,%eax + xorl %edx,%edx + wrmsr + ret + /* * I/O Interrupt Entry Point. Rather than having one entry point for * each interrupt source, we use one entry point for each 32-bit word * in the ISR. 
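 * (The in-service register is 256 bits wide and is read as eight 32-bit
 * words; only words 1-7 are serviced this way, since vectors 0-31 are
 * reserved for exceptions.  The vector recovered below is 32 * index plus
 * the bit number, so e.g. bit 3 of word 5 corresponds to vector 163.)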
The handler determines the highest bit set in the ISR, * translates that into a vector, and passes the vector to the * lapic_handle_intr() function. */ #define ISR_VEC(index, vec_name) \ .text ; \ SUPERALIGN_TEXT ; \ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ SET_KERNEL_SREGS ; \ cld ; \ FAKE_MCOUNT(TF_EIP(%esp)) ; \ cmpl $0,x2apic_mode ; \ je 1f ; \ movl $(MSR_APIC_ISR0 + index),%ecx ; \ rdmsr ; \ jmp 2f ; \ 1: ; \ movl lapic_map, %edx ;/* pointer to local APIC */ \ movl LA_ISR + 16 * (index)(%edx), %eax ; /* load ISR */ \ 2: ; \ bsrl %eax, %eax ; /* index of highest set bit in ISR */ \ jz 3f ; \ addl $(32 * index),%eax ; \ pushl %esp ; \ pushl %eax ; /* pass the IRQ */ \ call lapic_handle_intr ; \ addl $8, %esp ; /* discard parameter */ \ 3: ; \ MEXITCOUNT ; \ jmp doreti /* * Handle "spurious INTerrupts". * Notes: * This is different than the "spurious INTerrupt" generated by an * 8259 PIC for missing INTs. See the APIC documentation for details. * This routine should NOT do an 'EOI' cycle. */ .text SUPERALIGN_TEXT IDTVEC(spuriousint) /* No EOI cycle used here */ iret ISR_VEC(1, apic_isr1) ISR_VEC(2, apic_isr2) ISR_VEC(3, apic_isr3) ISR_VEC(4, apic_isr4) ISR_VEC(5, apic_isr5) ISR_VEC(6, apic_isr6) ISR_VEC(7, apic_isr7) /* * Local APIC periodic timer handler. */ .text SUPERALIGN_TEXT IDTVEC(timerint) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp call lapic_handle_timer add $4, %esp MEXITCOUNT jmp doreti /* * Local APIC CMCI handler. */ .text SUPERALIGN_TEXT IDTVEC(cmcint) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) call lapic_handle_cmc MEXITCOUNT jmp doreti /* * Local APIC error interrupt handler. */ .text SUPERALIGN_TEXT IDTVEC(errorint) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) call lapic_handle_error MEXITCOUNT jmp doreti #ifdef XENHVM /* * Xen event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. */ .text SUPERALIGN_TEXT IDTVEC(xen_intr_upcall) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp call xen_intr_handle_upcall add $4, %esp MEXITCOUNT jmp doreti #endif #ifdef SMP /* * Global address space TLB shootdown. */ .text SUPERALIGN_TEXT invltlb_ret: - call native_lapic_eoi - add $4, %esp + call as_lapic_eoi POP_FRAME iret SUPERALIGN_TEXT IDTVEC(invltlb) PUSH_FRAME SET_KERNEL_SREGS cld call invltlb_handler - pushl $IPI_INVLTLB jmp invltlb_ret /* * Single page TLB shootdown */ .text SUPERALIGN_TEXT IDTVEC(invlpg) PUSH_FRAME SET_KERNEL_SREGS cld call invlpg_handler - pushl $IPI_INVLPG jmp invltlb_ret /* * Page range TLB shootdown. */ .text SUPERALIGN_TEXT IDTVEC(invlrng) PUSH_FRAME SET_KERNEL_SREGS cld call invlrng_handler - pushl $IPI_INVLRNG jmp invltlb_ret /* * Invalidate cache. */ .text SUPERALIGN_TEXT IDTVEC(invlcache) PUSH_FRAME SET_KERNEL_SREGS cld call invlcache_handler - pushl $IPI_INVLCACHE jmp invltlb_ret /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ .text SUPERALIGN_TEXT IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME SET_KERNEL_SREGS cld - pushl $IPI_BITMAP_VECTOR - call native_lapic_eoi - add $4, %esp - + call as_lapic_eoi + FAKE_MCOUNT(TF_EIP(%esp)) call ipi_bitmap_handler MEXITCOUNT jmp doreti /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ .text SUPERALIGN_TEXT IDTVEC(cpustop) PUSH_FRAME SET_KERNEL_SREGS cld - pushl $IPI_STOP - call native_lapic_eoi - add $4, %esp + call as_lapic_eoi call cpustop_handler POP_FRAME iret /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. 
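 * The IPI is acknowledged up front with as_lapic_eoi (which, unlike the
 * native_lapic_eoi call it replaces, takes no vector argument) before
 * cpususpend_handler runs.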
*/ .text SUPERALIGN_TEXT IDTVEC(cpususpend) PUSH_FRAME SET_KERNEL_SREGS cld - pushl $IPI_SUSPEND - call native_lapic_eoi - add $4, %esp + call as_lapic_eoi call cpususpend_handler POP_FRAME jmp doreti_iret /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. * * - Calls the generic rendezvous action function. */ .text SUPERALIGN_TEXT IDTVEC(rendezvous) PUSH_FRAME SET_KERNEL_SREGS cld #ifdef COUNT_IPIS movl PCPU(CPUID), %eax movl ipi_rendezvous_counts(,%eax,4), %eax incl (%eax) #endif call smp_rendezvous_action - pushl $IPI_RENDEZVOUS - call native_lapic_eoi - add $4, %esp + call as_lapic_eoi POP_FRAME iret #endif /* SMP */ Index: head/sys/i386/i386/genassym.c =================================================================== --- head/sys/i386/i386/genassym.c (revision 316020) +++ head/sys/i386/i386/genassym.c (revision 316021) @@ -1,246 +1,234 @@ /*- * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_kstack_pages.h" #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC -#include -#include #include -#include #endif #include #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_MD, offsetof(struct thread, td_md)); ASSYM(TD_TID, offsetof(struct thread, td_tid)); ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); ASSYM(TD0_KSTACK_PAGES, TD0_KSTACK_PAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(NPDEPTD, NPDEPTD); ASSYM(NPGPTD, NPGPTD); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PDESHIFT, PDESHIFT); ASSYM(PTESHIFT, PTESHIFT); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(PDRMASK, PDRMASK); ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(KERNLOAD, KERNLOAD); ASSYM(MCLBYTES, MCLBYTES); ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0)); ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2)); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); ASSYM(PCB_DS, offsetof(struct pcb, pcb_ds)); ASSYM(PCB_ES, offsetof(struct pcb, pcb_es)); ASSYM(PCB_FS, offsetof(struct pcb, pcb_fs)); ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_SS, offsetof(struct pcb, pcb_ss)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd)); ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd)); ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); ASSYM(PCB_SIZE, sizeof(struct pcb)); ASSYM(PCB_VM86CALL, PCB_VM86CALL); ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt)); ASSYM(PCB_IDT, offsetof(struct pcb, 
pcb_idt)); ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt)); ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr)); ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); #ifdef COMPAT_43 ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); #endif ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); #ifdef COMPAT_FREEBSD4 ASSYM(SIGF_UC4, offsetof(struct sigframe4, sf_uc)); #endif #ifdef COMPAT_43 ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); #endif #ifdef COMPAT_FREEBSD4 ASSYM(UC4_EFLAGS, offsetof(struct ucontext4, uc_mcontext.mc_eflags)); ASSYM(UC4_GS, offsetof(struct ucontext4, uc_mcontext.mc_gs)); #endif ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_COMMON_TSS, offsetof(struct pcpu, pc_common_tss)); ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd)); ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt)); ASSYM(PC_FSGS_GDT, offsetof(struct pcpu, pc_fsgs_gdt)); ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss)); #ifdef DEV_APIC ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL); ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL); - -ASSYM(IPI_INVLTLB, IPI_INVLTLB); -ASSYM(IPI_INVLPG, IPI_INVLPG); -ASSYM(IPI_INVLRNG, IPI_INVLRNG); -ASSYM(IPI_INVLCACHE, IPI_INVLCACHE); -ASSYM(IPI_BITMAP_VECTOR, IPI_BITMAP_VECTOR); -ASSYM(IPI_STOP, IPI_STOP); -ASSYM(IPI_SUSPEND, IPI_SUSPEND); -ASSYM(IPI_RENDEZVOUS, IPI_RENDEZVOUS); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif Index: head/sys/x86/include/apicvar.h =================================================================== --- head/sys/x86/include/apicvar.h (revision 316020) 
+++ head/sys/x86/include/apicvar.h (revision 316021) @@ -1,486 +1,485 @@ /*- * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _X86_APICVAR_H_ #define _X86_APICVAR_H_ /* * Local && I/O APIC variable definitions. */ /* * Layout of local APIC interrupt vectors: * * 0xff (255) +-------------+ * | | 15 (Spurious / IPIs / Local Interrupts) * 0xf0 (240) +-------------+ * | | 14 (I/O Interrupts / Timer) * 0xe0 (224) +-------------+ * | | 13 (I/O Interrupts) * 0xd0 (208) +-------------+ * | | 12 (I/O Interrupts) * 0xc0 (192) +-------------+ * | | 11 (I/O Interrupts) * 0xb0 (176) +-------------+ * | | 10 (I/O Interrupts) * 0xa0 (160) +-------------+ * | | 9 (I/O Interrupts) * 0x90 (144) +-------------+ * | | 8 (I/O Interrupts / System Calls) * 0x80 (128) +-------------+ * | | 7 (I/O Interrupts) * 0x70 (112) +-------------+ * | | 6 (I/O Interrupts) * 0x60 (96) +-------------+ * | | 5 (I/O Interrupts) * 0x50 (80) +-------------+ * | | 4 (I/O Interrupts) * 0x40 (64) +-------------+ * | | 3 (I/O Interrupts) * 0x30 (48) +-------------+ * | | 2 (ATPIC Interrupts) * 0x20 (32) +-------------+ * | | 1 (Exceptions, traps, faults, etc.) * 0x10 (16) +-------------+ * | | 0 (Exceptions, traps, faults, etc.) * 0x00 (0) +-------------+ * * Note: 0x80 needs to be handled specially and not allocated to an * I/O device! */ #define MAX_APIC_ID 0xfe #define APIC_ID_ALL 0xff /* I/O Interrupts are used for external devices such as ISA, PCI, etc. */ #define APIC_IO_INTS (IDT_IO_INTS + 16) #define APIC_NUM_IOINTS 191 /* The timer interrupt is used for clock handling and drives hardclock, etc. */ #define APIC_TIMER_INT (APIC_IO_INTS + APIC_NUM_IOINTS) /* ********************* !!! WARNING !!! ****************************** * Each local apic has an interrupt receive fifo that is two entries deep * for each interrupt priority class (higher 4 bits of interrupt vector). * Once the fifo is full the APIC can no longer receive interrupts for this * class and sending IPIs from other CPUs will be blocked. * To avoid deadlocks there should be no more than two IPI interrupts * pending at the same time. * Currently this is guaranteed by dividing the IPIs in two groups that have * each at most one IPI interrupt pending. 
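 * (All of the IPI vectors defined below sit at 0xf0 or above, i.e. in the
 * single priority class vector >> 4 == 15, which is why the two-entry
 * limit applies to them as a group.)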
The first group is protected by the * smp_ipi_mtx and waits for the completion of the IPI (Only one IPI user * at a time) The second group uses a single interrupt and a bitmap to avoid * redundant IPI interrupts. */ /* Interrupts for local APIC LVT entries other than the timer. */ #define APIC_LOCAL_INTS 240 #define APIC_ERROR_INT APIC_LOCAL_INTS #define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1) #define APIC_CMC_INT (APIC_LOCAL_INTS + 2) #define APIC_IPI_INTS (APIC_LOCAL_INTS + 3) #define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */ #define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */ #define IPI_INVLPG (APIC_IPI_INTS + 2) #define IPI_INVLRNG (APIC_IPI_INTS + 3) #define IPI_INVLCACHE (APIC_IPI_INTS + 4) /* Vector to handle bitmap based IPIs */ #define IPI_BITMAP_VECTOR (APIC_IPI_INTS + 5) /* IPIs handled by IPI_BITMAP_VECTOR */ #define IPI_AST 0 /* Generate software trap. */ #define IPI_PREEMPT 1 #define IPI_HARDCLOCK 2 #define IPI_BITMAP_LAST IPI_HARDCLOCK #define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST) #define IPI_STOP (APIC_IPI_INTS + 6) /* Stop CPU until restarted. */ #define IPI_SUSPEND (APIC_IPI_INTS + 7) /* Suspend CPU until restarted. */ #ifdef __i386__ #define IPI_LAZYPMAP (APIC_IPI_INTS + 8) /* Lazy pmap release. */ #define IPI_DYN_FIRST (APIC_IPI_INTS + 9) #else #define IPI_DYN_FIRST (APIC_IPI_INTS + 8) #endif #define IPI_DYN_LAST (253) /* IPIs allocated at runtime */ /* * IPI_STOP_HARD does not need to occupy a slot in the IPI vector space since * it is delivered using an NMI anyways. */ #define IPI_NMI_FIRST 254 #define IPI_TRACE 254 /* Interrupt for tracing. */ #define IPI_STOP_HARD 255 /* Stop CPU with a NMI. */ /* * The spurious interrupt can share the priority class with the IPIs since * it is not a normal interrupt. (Does not use the APIC's interrupt fifo) */ #define APIC_SPURIOUS_INT 255 #ifndef LOCORE #define APIC_IPI_DEST_SELF -1 #define APIC_IPI_DEST_ALL -2 #define APIC_IPI_DEST_OTHERS -3 #define APIC_BUS_UNKNOWN -1 #define APIC_BUS_ISA 0 #define APIC_BUS_EISA 1 #define APIC_BUS_PCI 2 #define APIC_BUS_MAX APIC_BUS_PCI #define IRQ_EXTINT (NUM_IO_INTS + 1) #define IRQ_NMI (NUM_IO_INTS + 2) #define IRQ_SMI (NUM_IO_INTS + 3) #define IRQ_DISABLED (NUM_IO_INTS + 4) /* * An APIC enumerator is a psuedo bus driver that enumerates APIC's including * CPU's and I/O APIC's. 
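 * Enumerators (for example the ACPI MADT and legacy MP Table parsers)
 * register themselves with apic_register_enumerator(), declared further
 * below, and supply the probe and setup hooks in this structure.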
*/ struct apic_enumerator { const char *apic_name; int (*apic_probe)(void); int (*apic_probe_cpus)(void); int (*apic_setup_local)(void); int (*apic_setup_io)(void); SLIST_ENTRY(apic_enumerator) apic_next; }; inthand_t IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint), IDTVEC(spuriousint), IDTVEC(timerint); extern vm_paddr_t lapic_paddr; extern int apic_cpuids[]; void apic_register_enumerator(struct apic_enumerator *enumerator); void *ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase); int ioapic_disable_pin(void *cookie, u_int pin); int ioapic_get_vector(void *cookie, u_int pin); void ioapic_register(void *cookie); int ioapic_remap_vector(void *cookie, u_int pin, int vector); int ioapic_set_bus(void *cookie, u_int pin, int bus_type); int ioapic_set_extint(void *cookie, u_int pin); int ioapic_set_nmi(void *cookie, u_int pin); int ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol); int ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger); int ioapic_set_smi(void *cookie, u_int pin); /* * Struct containing pointers to APIC functions whose * implementation is run time selectable. */ struct apic_ops { void (*create)(u_int, int); void (*init)(vm_paddr_t); void (*xapic_mode)(void); bool (*is_x2apic)(void); void (*setup)(int); void (*dump)(const char *); void (*disable)(void); - void (*eoi)(u_int vector); + void (*eoi)(void); int (*id)(void); int (*intr_pending)(u_int); void (*set_logical_id)(u_int, u_int, u_int); u_int (*cpuid)(u_int); /* Vectors */ u_int (*alloc_vector)(u_int, u_int); u_int (*alloc_vectors)(u_int, u_int *, u_int, u_int); void (*enable_vector)(u_int, u_int); void (*disable_vector)(u_int, u_int); void (*free_vector)(u_int, u_int, u_int); /* PMC */ int (*enable_pmc)(void); void (*disable_pmc)(void); void (*reenable_pmc)(void); /* CMC */ void (*enable_cmc)(void); /* AMD ELVT */ int (*enable_mca_elvt)(void); /* IPI */ void (*ipi_raw)(register_t, u_int); void (*ipi_vectored)(u_int, int); int (*ipi_wait)(int); int (*ipi_alloc)(inthand_t *ipifunc); void (*ipi_free)(int vector); /* LVT */ int (*set_lvt_mask)(u_int, u_int, u_char); int (*set_lvt_mode)(u_int, u_int, u_int32_t); int (*set_lvt_polarity)(u_int, u_int, enum intr_polarity); int (*set_lvt_triggermode)(u_int, u_int, enum intr_trigger); }; extern struct apic_ops apic_ops; static inline void lapic_create(u_int apic_id, int boot_cpu) { apic_ops.create(apic_id, boot_cpu); } static inline void lapic_init(vm_paddr_t addr) { apic_ops.init(addr); } static inline void lapic_xapic_mode(void) { apic_ops.xapic_mode(); } static inline bool lapic_is_x2apic(void) { return (apic_ops.is_x2apic()); } static inline void lapic_setup(int boot) { apic_ops.setup(boot); } static inline void lapic_dump(const char *str) { apic_ops.dump(str); } static inline void lapic_disable(void) { apic_ops.disable(); } static inline void -lapic_eoi(u_int vector) +lapic_eoi(void) { - apic_ops.eoi(vector); + apic_ops.eoi(); } static inline int lapic_id(void) { return (apic_ops.id()); } static inline int lapic_intr_pending(u_int vector) { return (apic_ops.intr_pending(vector)); } /* XXX: UNUSED */ static inline void lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { apic_ops.set_logical_id(apic_id, cluster, cluster_id); } static inline u_int apic_cpuid(u_int apic_id) { return (apic_ops.cpuid(apic_id)); } static inline u_int apic_alloc_vector(u_int apic_id, u_int irq) { return 
(apic_ops.alloc_vector(apic_id, irq)); } static inline u_int apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { return (apic_ops.alloc_vectors(apic_id, irqs, count, align)); } static inline void apic_enable_vector(u_int apic_id, u_int vector) { apic_ops.enable_vector(apic_id, vector); } static inline void apic_disable_vector(u_int apic_id, u_int vector) { apic_ops.disable_vector(apic_id, vector); } static inline void apic_free_vector(u_int apic_id, u_int vector, u_int irq) { apic_ops.free_vector(apic_id, vector, irq); } static inline int lapic_enable_pmc(void) { return (apic_ops.enable_pmc()); } static inline void lapic_disable_pmc(void) { apic_ops.disable_pmc(); } static inline void lapic_reenable_pmc(void) { apic_ops.reenable_pmc(); } static inline void lapic_enable_cmc(void) { apic_ops.enable_cmc(); } static inline int lapic_enable_mca_elvt(void) { return (apic_ops.enable_mca_elvt()); } static inline void lapic_ipi_raw(register_t icrlo, u_int dest) { apic_ops.ipi_raw(icrlo, dest); } static inline void lapic_ipi_vectored(u_int vector, int dest) { apic_ops.ipi_vectored(vector, dest); } static inline int lapic_ipi_wait(int delay) { return (apic_ops.ipi_wait(delay)); } static inline int lapic_ipi_alloc(inthand_t *ipifunc) { return (apic_ops.ipi_alloc(ipifunc)); } static inline void lapic_ipi_free(int vector) { return (apic_ops.ipi_free(vector)); } static inline int lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked) { return (apic_ops.set_lvt_mask(apic_id, lvt, masked)); } static inline int lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode) { return (apic_ops.set_lvt_mode(apic_id, lvt, mode)); } static inline int lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) { return (apic_ops.set_lvt_polarity(apic_id, lvt, pol)); } static inline int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger) { return (apic_ops.set_lvt_triggermode(apic_id, lvt, trigger)); } -void native_lapic_eoi(u_int vector); void lapic_handle_cmc(void); void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); extern int x2apic_mode; extern int lapic_eoi_suppression; #ifdef _SYS_SYSCTL_H_ SYSCTL_DECL(_hw_apic); #endif #endif /* !LOCORE */ #endif /* _X86_APICVAR_H_ */ Index: head/sys/x86/x86/io_apic.c =================================================================== --- head/sys/x86/x86/io_apic.c (revision 316020) +++ head/sys/x86/x86/io_apic.c (revision 316021) @@ -1,1138 +1,1138 @@ /*- * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IOAPIC_ISA_INTS 16 #define IOAPIC_MEM_REGION 32 #define IOAPIC_REDTBL_LO(i) (IOAPIC_REDTBL + (i) * 2) #define IOAPIC_REDTBL_HI(i) (IOAPIC_REDTBL_LO(i) + 1) static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures"); /* * I/O APIC interrupt source driver. Each pin is assigned an IRQ cookie * as laid out in the ACPI System Interrupt number model where each I/O * APIC has a contiguous chunk of the System Interrupt address space. * We assume that IRQs 1 - 15 behave like ISA IRQs and that all other * IRQs behave as PCI IRQs by default. We also assume that the pin for * IRQ 0 is actually an ExtINT pin. The apic enumerators override the * configuration of individual pins as indicated by their tables. * * Documentation for the I/O APIC: "82093AA I/O Advanced Programmable * Interrupt Controller (IOAPIC)", May 1996, Intel Corp. * ftp://download.intel.com/design/chipsets/datashts/29056601.pdf */ struct ioapic_intsrc { struct intsrc io_intsrc; u_int io_irq; u_int io_intpin:8; u_int io_vector:8; u_int io_cpu; u_int io_activehi:1; u_int io_edgetrigger:1; u_int io_masked:1; int io_bus:4; uint32_t io_lowreg; u_int io_remap_cookie; }; struct ioapic { struct pic io_pic; u_int io_id:8; /* logical ID */ u_int io_apic_id:4; u_int io_intbase:8; /* System Interrupt base */ u_int io_numintr:8; u_int io_haseoi:1; volatile ioapic_t *io_addr; /* XXX: should use bus_space */ vm_paddr_t io_paddr; STAILQ_ENTRY(ioapic) io_next; struct ioapic_intsrc io_pins[0]; }; static u_int ioapic_read(volatile ioapic_t *apic, int reg); static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val); static const char *ioapic_bus_string(int bus_type); static void ioapic_print_irq(struct ioapic_intsrc *intpin); static void ioapic_enable_source(struct intsrc *isrc); static void ioapic_disable_source(struct intsrc *isrc, int eoi); static void ioapic_eoi_source(struct intsrc *isrc); static void ioapic_enable_intr(struct intsrc *isrc); static void ioapic_disable_intr(struct intsrc *isrc); static int ioapic_vector(struct intsrc *isrc); static int ioapic_source_pending(struct intsrc *isrc); static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static void ioapic_resume(struct pic *pic, bool suspend_cancelled); static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id); static void ioapic_program_intpin(struct ioapic_intsrc *intpin); static void ioapic_reprogram_intpin(struct intsrc *isrc); static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list); struct pic ioapic_template = { .pic_enable_source = ioapic_enable_source, .pic_disable_source = ioapic_disable_source, .pic_eoi_source = ioapic_eoi_source, .pic_enable_intr = ioapic_enable_intr, .pic_disable_intr = ioapic_disable_intr, 
.pic_vector = ioapic_vector, .pic_source_pending = ioapic_source_pending, .pic_suspend = NULL, .pic_resume = ioapic_resume, .pic_config_intr = ioapic_config_intr, .pic_assign_cpu = ioapic_assign_cpu, .pic_reprogram_pin = ioapic_reprogram_intpin, }; static int next_ioapic_base; static u_int next_id; static int enable_extint; SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0, "Enable the ExtINT pin in the first I/O APIC"); static void _ioapic_eoi_source(struct intsrc *isrc, int locked) { struct ioapic_intsrc *src; struct ioapic *io; volatile uint32_t *apic_eoi; uint32_t low1; - src = (struct ioapic_intsrc *)isrc; - lapic_eoi(src->io_vector); + lapic_eoi(); if (!lapic_eoi_suppression) return; + src = (struct ioapic_intsrc *)isrc; if (src->io_edgetrigger) return; io = (struct ioapic *)isrc->is_pic; /* * Handle targeted EOI for level-triggered pins, if broadcast * EOI suppression is supported by LAPICs. */ if (io->io_haseoi) { /* * If IOAPIC has EOI Register, simply write vector * number into the reg. */ apic_eoi = (volatile uint32_t *)((volatile char *) io->io_addr + IOAPIC_EOIR); *apic_eoi = src->io_vector; } else { /* * Otherwise, if IO-APIC is too old to provide EOIR, * do what Intel did for the Linux kernel. Temporary * switch the pin to edge-trigger and back, masking * the pin during the trick. */ if (!locked) mtx_lock_spin(&icu_lock); low1 = src->io_lowreg; low1 &= ~IOART_TRGRLVL; low1 |= IOART_TRGREDG | IOART_INTMSET; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin), low1); ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin), src->io_lowreg); if (!locked) mtx_unlock_spin(&icu_lock); } } static u_int ioapic_read(volatile ioapic_t *apic, int reg) { mtx_assert(&icu_lock, MA_OWNED); apic->ioregsel = reg; return (apic->iowin); } static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val) { mtx_assert(&icu_lock, MA_OWNED); apic->ioregsel = reg; apic->iowin = val; } static const char * ioapic_bus_string(int bus_type) { switch (bus_type) { case APIC_BUS_ISA: return ("ISA"); case APIC_BUS_EISA: return ("EISA"); case APIC_BUS_PCI: return ("PCI"); default: return ("unknown"); } } static void ioapic_print_irq(struct ioapic_intsrc *intpin) { switch (intpin->io_irq) { case IRQ_DISABLED: printf("disabled"); break; case IRQ_EXTINT: printf("ExtINT"); break; case IRQ_NMI: printf("NMI"); break; case IRQ_SMI: printf("SMI"); break; default: printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus), intpin->io_irq); } } static void ioapic_enable_source(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; uint32_t flags; mtx_lock_spin(&icu_lock); if (intpin->io_masked) { flags = intpin->io_lowreg & ~IOART_INTMASK; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), flags); intpin->io_masked = 0; } mtx_unlock_spin(&icu_lock); } static void ioapic_disable_source(struct intsrc *isrc, int eoi) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; uint32_t flags; mtx_lock_spin(&icu_lock); if (!intpin->io_masked && !intpin->io_edgetrigger) { flags = intpin->io_lowreg | IOART_INTMSET; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), flags); intpin->io_masked = 1; } if (eoi == PIC_EOI) _ioapic_eoi_source(isrc, 1); mtx_unlock_spin(&icu_lock); } static void ioapic_eoi_source(struct intsrc *isrc) { _ioapic_eoi_source(isrc, 0); } /* * Completely program an intpin based on the data in its interrupt source * 
structure. */ static void ioapic_program_intpin(struct ioapic_intsrc *intpin) { struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic; uint32_t low, high, value; #ifdef ACPI_DMAR int error; #endif /* * If a pin is completely invalid or if it is valid but hasn't * been enabled yet, just ensure that the pin is masked. */ mtx_assert(&icu_lock, MA_OWNED); if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS && intpin->io_vector == 0)) { low = ioapic_read(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin)); if ((low & IOART_INTMASK) == IOART_INTMCLR) ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low | IOART_INTMSET); #ifdef ACPI_DMAR mtx_unlock_spin(&icu_lock); iommu_unmap_ioapic_intr(io->io_apic_id, &intpin->io_remap_cookie); mtx_lock_spin(&icu_lock); #endif return; } #ifdef ACPI_DMAR mtx_unlock_spin(&icu_lock); error = iommu_map_ioapic_intr(io->io_apic_id, intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger, intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie, &high, &low); mtx_lock_spin(&icu_lock); if (error == 0) { ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), high); intpin->io_lowreg = low; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low); return; } else if (error != EOPNOTSUPP) { return; } #endif /* Set the destination. */ low = IOART_DESTPHY; high = intpin->io_cpu << APIC_ID_SHIFT; /* Program the rest of the low word. */ if (intpin->io_edgetrigger) low |= IOART_TRGREDG; else low |= IOART_TRGRLVL; if (intpin->io_activehi) low |= IOART_INTAHI; else low |= IOART_INTALO; if (intpin->io_masked) low |= IOART_INTMSET; switch (intpin->io_irq) { case IRQ_EXTINT: KASSERT(intpin->io_edgetrigger, ("ExtINT not edge triggered")); low |= IOART_DELEXINT; break; case IRQ_NMI: KASSERT(intpin->io_edgetrigger, ("NMI not edge triggered")); low |= IOART_DELNMI; break; case IRQ_SMI: KASSERT(intpin->io_edgetrigger, ("SMI not edge triggered")); low |= IOART_DELSMI; break; default: KASSERT(intpin->io_vector != 0, ("No vector for IRQ %u", intpin->io_irq)); low |= IOART_DELFIXED | intpin->io_vector; } /* Write the values to the APIC. */ value = ioapic_read(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin)); value &= ~IOART_DEST; value |= high; ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), value); intpin->io_lowreg = low; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low); } static void ioapic_reprogram_intpin(struct intsrc *isrc) { mtx_lock_spin(&icu_lock); ioapic_program_intpin((struct ioapic_intsrc *)isrc); mtx_unlock_spin(&icu_lock); } static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; u_int old_vector, new_vector; u_int old_id; /* * On Hyper-V: * - Stick to the first cpu for all I/O APIC pins. * - And don't allow destination cpu changes. */ if (vm_guest == VM_GUEST_HV) { if (intpin->io_vector) return (EINVAL); else apic_id = 0; } /* * keep 1st core as the destination for NMI */ if (intpin->io_irq == IRQ_NMI) apic_id = 0; /* * Set us up to free the old irq. */ old_vector = intpin->io_vector; old_id = intpin->io_cpu; if (old_vector && apic_id == old_id) return (0); /* * Allocate an APIC vector for this interrupt pin. Once * we have a vector we program the interrupt pin. */ new_vector = apic_alloc_vector(apic_id, intpin->io_irq); if (new_vector == 0) return (ENOSPC); /* * Mask the old intpin if it is enabled while it is migrated. 
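 * (The pin is unmasked again, if it was unmasked before, when
 * ioapic_program_intpin() rewrites the redirection entry below;
 * io_masked itself is not modified here.)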
* * At least some level-triggered interrupts seem to need the * extra DELAY() to avoid being stuck in a non-EOI'd state. */ mtx_lock_spin(&icu_lock); if (!intpin->io_masked && !intpin->io_edgetrigger) { ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), intpin->io_lowreg | IOART_INTMSET); mtx_unlock_spin(&icu_lock); DELAY(100); mtx_lock_spin(&icu_lock); } intpin->io_cpu = apic_id; intpin->io_vector = new_vector; if (isrc->is_handlers > 0) apic_enable_vector(intpin->io_cpu, intpin->io_vector); if (bootverbose) { printf("ioapic%u: routing intpin %u (", io->io_id, intpin->io_intpin); ioapic_print_irq(intpin); printf(") to lapic %u vector %u\n", intpin->io_cpu, intpin->io_vector); } ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); /* * Free the old vector after the new one is established. This is done * to prevent races where we could miss an interrupt. */ if (old_vector) { if (isrc->is_handlers > 0) apic_disable_vector(old_id, old_vector); apic_free_vector(old_id, old_vector, intpin->io_irq); } return (0); } static void ioapic_enable_intr(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; if (intpin->io_vector == 0) if (ioapic_assign_cpu(isrc, intr_next_cpu()) != 0) panic("Couldn't find an APIC vector for IRQ %d", intpin->io_irq); apic_enable_vector(intpin->io_cpu, intpin->io_vector); } static void ioapic_disable_intr(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; u_int vector; if (intpin->io_vector != 0) { /* Mask this interrupt pin and free its APIC vector. */ vector = intpin->io_vector; apic_disable_vector(intpin->io_cpu, vector); mtx_lock_spin(&icu_lock); intpin->io_masked = 1; intpin->io_vector = 0; ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); apic_free_vector(intpin->io_cpu, vector, intpin->io_irq); } } static int ioapic_vector(struct intsrc *isrc) { struct ioapic_intsrc *pin; pin = (struct ioapic_intsrc *)isrc; return (pin->io_irq); } static int ioapic_source_pending(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; if (intpin->io_vector == 0) return 0; return (lapic_intr_pending(intpin->io_vector)); } static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; int changed; KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM), ("%s: Conforming trigger or polarity\n", __func__)); /* * EISA interrupts always use active high polarity, so don't allow * them to be set to active low. * * XXX: Should we write to the ELCR if the trigger mode changes for * an EISA IRQ or an ISA IRQ with the ELCR present? */ mtx_lock_spin(&icu_lock); if (intpin->io_bus == APIC_BUS_EISA) pol = INTR_POLARITY_HIGH; changed = 0; if (intpin->io_edgetrigger != (trig == INTR_TRIGGER_EDGE)) { if (bootverbose) printf("ioapic%u: Changing trigger for pin %u to %s\n", io->io_id, intpin->io_intpin, trig == INTR_TRIGGER_EDGE ? "edge" : "level"); intpin->io_edgetrigger = (trig == INTR_TRIGGER_EDGE); changed++; } if (intpin->io_activehi != (pol == INTR_POLARITY_HIGH)) { if (bootverbose) printf("ioapic%u: Changing polarity for pin %u to %s\n", io->io_id, intpin->io_intpin, pol == INTR_POLARITY_HIGH ? 
"high" : "low"); intpin->io_activehi = (pol == INTR_POLARITY_HIGH); changed++; } if (changed) ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); return (0); } static void ioapic_resume(struct pic *pic, bool suspend_cancelled) { struct ioapic *io = (struct ioapic *)pic; int i; mtx_lock_spin(&icu_lock); for (i = 0; i < io->io_numintr; i++) ioapic_program_intpin(&io->io_pins[i]); mtx_unlock_spin(&icu_lock); } /* * Create a plain I/O APIC object. */ void * ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase) { struct ioapic *io; struct ioapic_intsrc *intpin; volatile ioapic_t *apic; u_int numintr, i; uint32_t value; /* Map the register window so we can access the device. */ apic = pmap_mapdev(addr, IOAPIC_MEM_REGION); mtx_lock_spin(&icu_lock); value = ioapic_read(apic, IOAPIC_VER); mtx_unlock_spin(&icu_lock); /* If it's version register doesn't seem to work, punt. */ if (value == 0xffffffff) { pmap_unmapdev((vm_offset_t)apic, IOAPIC_MEM_REGION); return (NULL); } /* Determine the number of vectors and set the APIC ID. */ numintr = ((value & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) + 1; io = malloc(sizeof(struct ioapic) + numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK); io->io_pic = ioapic_template; mtx_lock_spin(&icu_lock); io->io_id = next_id++; io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT; if (apic_id != -1 && io->io_apic_id != apic_id) { ioapic_write(apic, IOAPIC_ID, apic_id << APIC_ID_SHIFT); mtx_unlock_spin(&icu_lock); io->io_apic_id = apic_id; printf("ioapic%u: Changing APIC ID to %d\n", io->io_id, apic_id); } else mtx_unlock_spin(&icu_lock); if (intbase == -1) { intbase = next_ioapic_base; printf("ioapic%u: Assuming intbase of %d\n", io->io_id, intbase); } else if (intbase != next_ioapic_base && bootverbose) printf("ioapic%u: WARNING: intbase %d != expected base %d\n", io->io_id, intbase, next_ioapic_base); io->io_intbase = intbase; next_ioapic_base = intbase + numintr; io->io_numintr = numintr; io->io_addr = apic; io->io_paddr = addr; if (bootverbose) { printf("ioapic%u: ver 0x%02x maxredir 0x%02x\n", io->io_id, (value & IOART_VER_VERSION), (value & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT); } /* * The summary information about IO-APIC versions is taken from * the Linux kernel source: * 0Xh 82489DX * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant * 2Xh I/O(x)APIC which is PCI 2.2 Compliant * 30h-FFh Reserved * IO-APICs with version >= 0x20 have working EOIR register. */ io->io_haseoi = (value & IOART_VER_VERSION) >= 0x20; /* * Initialize pins. Start off with interrupts disabled. Default * to active-hi and edge-triggered for ISA interrupts and active-lo * and level-triggered for all others. */ bzero(io->io_pins, sizeof(struct ioapic_intsrc) * numintr); mtx_lock_spin(&icu_lock); for (i = 0, intpin = io->io_pins; i < numintr; i++, intpin++) { intpin->io_intsrc.is_pic = (struct pic *)io; intpin->io_intpin = i; intpin->io_irq = intbase + i; /* * Assume that pin 0 on the first I/O APIC is an ExtINT pin. * Assume that pins 1-15 are ISA interrupts and that all * other pins are PCI interrupts. */ if (intpin->io_irq == 0) ioapic_set_extint(io, i); else if (intpin->io_irq < IOAPIC_ISA_INTS) { intpin->io_bus = APIC_BUS_ISA; intpin->io_activehi = 1; intpin->io_edgetrigger = 1; intpin->io_masked = 1; } else { intpin->io_bus = APIC_BUS_PCI; intpin->io_activehi = 0; intpin->io_edgetrigger = 0; intpin->io_masked = 1; } /* * Route interrupts to the BSP by default. Interrupts may * be routed to other CPUs later after they are enabled. 
*/ intpin->io_cpu = PCPU_GET(apic_id); value = ioapic_read(apic, IOAPIC_REDTBL_LO(i)); ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET); #ifdef ACPI_DMAR /* dummy, but sets cookie */ mtx_unlock_spin(&icu_lock); iommu_map_ioapic_intr(io->io_apic_id, intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger, intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie, NULL, NULL); mtx_lock_spin(&icu_lock); #endif } mtx_unlock_spin(&icu_lock); return (io); } int ioapic_get_vector(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (-1); return (io->io_pins[pin].io_irq); } int ioapic_disable_pin(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_DISABLED) return (EINVAL); io->io_pins[pin].io_irq = IRQ_DISABLED; if (bootverbose) printf("ioapic%u: intpin %d disabled\n", io->io_id, pin); return (0); } int ioapic_remap_vector(void *cookie, u_int pin, int vector) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || vector < 0) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_irq = vector; if (bootverbose) printf("ioapic%u: Routing IRQ %d -> intpin %d\n", io->io_id, vector, pin); return (0); } int ioapic_set_bus(void *cookie, u_int pin, int bus_type) { struct ioapic *io; if (bus_type < 0 || bus_type > APIC_BUS_MAX) return (EINVAL); io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); if (io->io_pins[pin].io_bus == bus_type) return (0); io->io_pins[pin].io_bus = bus_type; if (bootverbose) printf("ioapic%u: intpin %d bus %s\n", io->io_id, pin, ioapic_bus_string(bus_type)); return (0); } int ioapic_set_nmi(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_NMI) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_NMI; io->io_pins[pin].io_masked = 0; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing NMI -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_smi(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_SMI) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_SMI; io->io_pins[pin].io_masked = 0; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing SMI -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_extint(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_EXTINT) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_EXTINT; if (enable_extint) io->io_pins[pin].io_masked = 0; else io->io_pins[pin].io_masked = 1; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing external 8259A's -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_polarity(void *cookie, u_int pin, enum 
intr_polarity pol) { struct ioapic *io; int activehi; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); activehi = (pol == INTR_POLARITY_HIGH); if (io->io_pins[pin].io_activehi == activehi) return (0); io->io_pins[pin].io_activehi = activehi; if (bootverbose) printf("ioapic%u: intpin %d polarity: %s\n", io->io_id, pin, pol == INTR_POLARITY_HIGH ? "high" : "low"); return (0); } int ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger) { struct ioapic *io; int edgetrigger; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (io->io_pins[pin].io_edgetrigger == edgetrigger) return (0); io->io_pins[pin].io_edgetrigger = edgetrigger; if (bootverbose) printf("ioapic%u: intpin %d trigger: %s\n", io->io_id, pin, trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); return (0); } /* * Register a complete I/O APIC object with the interrupt subsystem. */ void ioapic_register(void *cookie) { struct ioapic_intsrc *pin; struct ioapic *io; volatile ioapic_t *apic; uint32_t flags; int i; io = (struct ioapic *)cookie; apic = io->io_addr; mtx_lock_spin(&icu_lock); flags = ioapic_read(apic, IOAPIC_VER) & IOART_VER_VERSION; STAILQ_INSERT_TAIL(&ioapic_list, io, io_next); mtx_unlock_spin(&icu_lock); printf("ioapic%u irqs %u-%u on motherboard\n", io->io_id, flags >> 4, flags & 0xf, io->io_intbase, io->io_intbase + io->io_numintr - 1); /* * Reprogram pins to handle special case pins (such as NMI and * SMI) and register valid pins as interrupt sources. */ intr_register_pic(&io->io_pic); for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) { ioapic_reprogram_intpin(&pin->io_intsrc); if (pin->io_irq < NUM_IO_INTS) intr_register_source(&pin->io_intsrc); } } /* A simple new-bus driver to consume PCI I/O APIC devices. */ static int ioapic_pci_probe(device_t dev) { if (pci_get_class(dev) == PCIC_BASEPERIPH && pci_get_subclass(dev) == PCIS_BASEPERIPH_PIC) { switch (pci_get_progif(dev)) { case PCIP_BASEPERIPH_PIC_IO_APIC: device_set_desc(dev, "IO APIC"); break; case PCIP_BASEPERIPH_PIC_IOX_APIC: device_set_desc(dev, "IO(x) APIC"); break; default: return (ENXIO); } device_quiet(dev); return (-10000); } return (ENXIO); } static int ioapic_pci_attach(device_t dev) { return (0); } static device_method_t ioapic_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ioapic_pci_probe), DEVMETHOD(device_attach, ioapic_pci_attach), { 0, 0 } }; DEFINE_CLASS_0(ioapic, ioapic_pci_driver, ioapic_pci_methods, 0); static devclass_t ioapic_devclass; DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0); /* * A new-bus driver to consume the memory resources associated with * the APICs in the system. On some systems ACPI or PnPBIOS system * resource devices may already claim these resources. To keep from * breaking those devices, we attach ourself to the nexus device after * legacy0 and acpi0 and ignore any allocation failures. */ static void apic_identify(driver_t *driver, device_t parent) { /* * Add at order 12. acpi0 is probed at order 10 and legacy0 * is probed at order 11. 
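 *
 * Attaching after acpi0 and legacy0 lets those drivers claim the APIC
 * memory windows first; apic_attach() then reserves whatever is left,
 * one SYS_RES_MEMORY rid per APIC (a restatement of the code below, for
 * illustration):
 *
 *    rid 0:     lapic_paddr, LAPIC_MEM_REGION
 *    rid 1..n:  io_paddr, IOAPIC_MEM_REGION (one per I/O APIC)
 *
 * and, as noted above, any allocation failures are simply ignored.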
*/ if (lapic_paddr != 0) BUS_ADD_CHILD(parent, 12, "apic", 0); } static int apic_probe(device_t dev) { device_set_desc(dev, "APIC resources"); device_quiet(dev); return (0); } static void apic_add_resource(device_t dev, int rid, vm_paddr_t base, size_t length) { int error; error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length); if (error) panic("apic_add_resource: resource %d failed set with %d", rid, error); bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0); } static int apic_attach(device_t dev) { struct ioapic *io; int i; /* Reserve the local APIC. */ apic_add_resource(dev, 0, lapic_paddr, LAPIC_MEM_REGION); i = 1; STAILQ_FOREACH(io, &ioapic_list, io_next) { apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION); i++; } return (0); } static device_method_t apic_methods[] = { /* Device interface */ DEVMETHOD(device_identify, apic_identify), DEVMETHOD(device_probe, apic_probe), DEVMETHOD(device_attach, apic_attach), { 0, 0 } }; DEFINE_CLASS_0(apic, apic_driver, apic_methods, 0); static devclass_t apic_devclass; DRIVER_MODULE(apic, nexus, apic_driver, apic_devclass, 0, 0); #include "opt_ddb.h" #ifdef DDB #include static const char * ioapic_delivery_mode(uint32_t mode) { switch (mode) { case IOART_DELFIXED: return ("fixed"); case IOART_DELLOPRI: return ("lowestpri"); case IOART_DELSMI: return ("SMI"); case IOART_DELRSV1: return ("rsrvd1"); case IOART_DELNMI: return ("NMI"); case IOART_DELINIT: return ("INIT"); case IOART_DELRSV2: return ("rsrvd2"); case IOART_DELEXINT: return ("ExtINT"); default: return (""); } } static u_int db_ioapic_read(volatile ioapic_t *apic, int reg) { apic->ioregsel = reg; return (apic->iowin); } static void db_show_ioapic_one(volatile ioapic_t *io_addr) { uint32_t r, lo, hi; int mre, i; r = db_ioapic_read(io_addr, IOAPIC_VER); mre = (r & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT; db_printf("Id 0x%08x Ver 0x%02x MRE %d\n", db_ioapic_read(io_addr, IOAPIC_ID), r & IOART_VER_VERSION, mre); for (i = 0; i < mre; i++) { lo = db_ioapic_read(io_addr, IOAPIC_REDTBL_LO(i)); hi = db_ioapic_read(io_addr, IOAPIC_REDTBL_HI(i)); db_printf(" pin %d Dest %s/%x %smasked Trig %s RemoteIRR %d " "Polarity %s Status %s DeliveryMode %s Vec %d\n", i, (lo & IOART_DESTMOD) == IOART_DESTLOG ? "log" : "phy", (hi & IOART_DEST) >> 24, (lo & IOART_INTMASK) == IOART_INTMSET ? "" : "not", (lo & IOART_TRGRMOD) == IOART_TRGRLVL ? "lvl" : "edge", (lo & IOART_REM_IRR) == IOART_REM_IRR ? 1 : 0, (lo & IOART_INTPOL) == IOART_INTALO ? "low" : "high", (lo & IOART_DELIVS) == IOART_DELIVS ? "pend" : "idle", ioapic_delivery_mode(lo & IOART_DELMOD), (lo & IOART_INTVEC)); } } DB_SHOW_COMMAND(ioapic, db_show_ioapic) { struct ioapic *ioapic; int idx, i; if (!have_addr) { db_printf("usage: show ioapic index\n"); return; } idx = (int)addr; i = 0; STAILQ_FOREACH(ioapic, &ioapic_list, io_next) { if (idx == i) { db_show_ioapic_one(ioapic->io_addr); break; } i++; } } DB_SHOW_ALL_COMMAND(ioapics, db_show_all_ioapics) { struct ioapic *ioapic; STAILQ_FOREACH(ioapic, &ioapic_list, io_next) db_show_ioapic_one(ioapic->io_addr); } #endif Index: head/sys/x86/x86/local_apic.c =================================================================== --- head/sys/x86/x86/local_apic.c (revision 316020) +++ head/sys/x86/x86/local_apic.c (revision 316021) @@ -1,2147 +1,2120 @@ /*- * Copyright (c) 2003 John Baldwin * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Local APIC support on Pentium and later processors. */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #ifdef __amd64__ #define SDT_APIC SDT_SYSIGT #define SDT_APICT SDT_SYSIGT #define GSEL_APIC 0 #else #define SDT_APIC SDT_SYS386IGT #define SDT_APICT SDT_SYS386TGT #define GSEL_APIC GSEL(GCODE_SEL, SEL_KPL) #endif -#define INTEL_SEOI 1 -#define AMD_SEOI 2 - /* Sanity checks on IDT vectors. */ CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT); CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS); CTASSERT(APIC_LOCAL_INTS == 240); CTASSERT(IPI_STOP < APIC_SPURIOUS_INT); /* Magic IRQ values for the timer and syscalls. */ #define IRQ_TIMER (NUM_IO_INTS + 1) #define IRQ_SYSCALL (NUM_IO_INTS + 2) #define IRQ_DTRACE_RET (NUM_IO_INTS + 3) #define IRQ_EVTCHN (NUM_IO_INTS + 4) enum lat_timer_mode { LAT_MODE_UNDEF = 0, LAT_MODE_PERIODIC = 1, LAT_MODE_ONESHOT = 2, LAT_MODE_DEADLINE = 3, }; /* * Support for local APICs. Local APICs manage interrupts on each * individual processor as opposed to I/O APICs which receive interrupts * from I/O devices and then forward them on to the local APICs. * * Local APICs can also send interrupts to each other thus providing the * mechanism for IPIs. */ struct lvt { u_int lvt_edgetrigger:1; u_int lvt_activehi:1; u_int lvt_masked:1; u_int lvt_active:1; u_int lvt_mode:16; u_int lvt_vector:8; }; struct lapic { struct lvt la_lvts[APIC_LVT_MAX + 1]; struct lvt la_elvts[APIC_ELVT_MAX + 1];; u_int la_id:8; u_int la_cluster:4; u_int la_cluster_id:2; u_int la_present:1; u_long *la_timer_count; uint64_t la_timer_period; enum lat_timer_mode la_timer_mode; uint32_t lvt_timer_base; uint32_t lvt_timer_last; /* Include IDT_SYSCALL to make indexing easier. */ int la_ioint_irqs[APIC_NUM_IOINTS + 1]; } static lapics[MAX_APIC_ID + 1]; /* Global defaults for local APIC LVT entries. 
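 *
 * Each initializer below is { lvt_edgetrigger, lvt_activehi, lvt_masked,
 * lvt_active, lvt_mode, lvt_vector }, matching the order of the bitfields
 * in struct lvt above; lvt_mode()/lvt_mode_impl() later fold an entry into
 * the corresponding APIC_LVT_* register bits.  As a worked reading of the
 * table (not a new configuration), the LINT1 entry
 *
 *    { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }
 *
 * describes an edge-triggered, active-high, unmasked NMI with no fixed
 * vector.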
*/ static struct lvt lvts[APIC_LVT_MAX + 1] = { { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 }, /* LINT0: masked ExtINT */ { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT }, /* Timer */ { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */ }; /* Global defaults for AMD local APIC ELVT entries. */ static struct lvt elvts[APIC_ELVT_MAX + 1] = { { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, }; static inthand_t *ioint_handlers[] = { NULL, /* 0 - 31 */ IDTVEC(apic_isr1), /* 32 - 63 */ IDTVEC(apic_isr2), /* 64 - 95 */ IDTVEC(apic_isr3), /* 96 - 127 */ IDTVEC(apic_isr4), /* 128 - 159 */ IDTVEC(apic_isr5), /* 160 - 191 */ IDTVEC(apic_isr6), /* 192 - 223 */ IDTVEC(apic_isr7), /* 224 - 255 */ }; static u_int32_t lapic_timer_divisors[] = { APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16, APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128 }; extern inthand_t IDTVEC(rsvd); volatile char *lapic_map; vm_paddr_t lapic_paddr; int x2apic_mode; int lapic_eoi_suppression; static int lapic_timer_tsc_deadline; static u_long lapic_timer_divisor, count_freq; static struct eventtimer lapic_et; #ifdef SMP static uint64_t lapic_ipi_wait_mult; #endif -#ifdef __amd64__ -/* IPI vector used for VMM VCPU notifications. */ -int vmm_ipinum; -#endif SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options"); SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, ""); SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD, &lapic_eoi_suppression, 0, ""); SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD, &lapic_timer_tsc_deadline, 0, ""); static uint32_t lapic_read32(enum LAPIC_REGISTERS reg) { uint32_t res; if (x2apic_mode) { res = rdmsr32(MSR_APIC_000 + reg); } else { res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL); } return (res); } static void lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val) { if (x2apic_mode) { mfence(); wrmsr(MSR_APIC_000 + reg, val); } else { *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; } } static void lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val) { if (x2apic_mode) { wrmsr(MSR_APIC_000 + reg, val); } else { *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; } } #ifdef SMP static uint64_t lapic_read_icr(void) { uint64_t v; uint32_t vhi, vlo; if (x2apic_mode) { v = rdmsr(MSR_APIC_000 + LAPIC_ICR_LO); } else { vhi = lapic_read32(LAPIC_ICR_HI); vlo = lapic_read32(LAPIC_ICR_LO); v = ((uint64_t)vhi << 32) | vlo; } return (v); } static uint64_t lapic_read_icr_lo(void) { return (lapic_read32(LAPIC_ICR_LO)); } static void lapic_write_icr(uint32_t vhi, uint32_t vlo) { uint64_t v; if (x2apic_mode) { v = ((uint64_t)vhi << 32) | vlo; mfence(); wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v); } else { lapic_write32(LAPIC_ICR_HI, vhi); lapic_write32(LAPIC_ICR_LO, vlo); } } #endif /* SMP */ static void native_lapic_enable_x2apic(void) { uint64_t apic_base; apic_base = rdmsr(MSR_APICBASE); apic_base |= APICBASE_X2APIC | APICBASE_ENABLED; wrmsr(MSR_APICBASE, apic_base); } static bool native_lapic_is_x2apic(void) { uint64_t apic_base; apic_base = rdmsr(MSR_APICBASE); return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) == (APICBASE_X2APIC | APICBASE_ENABLED)); } static void 
lapic_enable(void); static void lapic_resume(struct pic *pic, bool suspend_cancelled); static void lapic_timer_oneshot(struct lapic *); static void lapic_timer_oneshot_nointr(struct lapic *, uint32_t); static void lapic_timer_periodic(struct lapic *); static void lapic_timer_deadline(struct lapic *); static void lapic_timer_stop(struct lapic *); static void lapic_timer_set_divisor(u_int divisor); static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); static int lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period); static int lapic_et_stop(struct eventtimer *et); static u_int apic_idt_to_irq(u_int apic_id, u_int vector); static void lapic_set_tpr(u_int vector); struct pic lapic_pic = { .pic_resume = lapic_resume }; /* Forward declarations for apic_ops */ static void native_lapic_create(u_int apic_id, int boot_cpu); static void native_lapic_init(vm_paddr_t addr); static void native_lapic_xapic_mode(void); static void native_lapic_setup(int boot); static void native_lapic_dump(const char *str); static void native_lapic_disable(void); +static void native_lapic_eoi(void); static int native_lapic_id(void); static int native_lapic_intr_pending(u_int vector); static u_int native_apic_cpuid(u_int apic_id); static u_int native_apic_alloc_vector(u_int apic_id, u_int irq); static u_int native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align); static void native_apic_disable_vector(u_int apic_id, u_int vector); static void native_apic_enable_vector(u_int apic_id, u_int vector); static void native_apic_free_vector(u_int apic_id, u_int vector, u_int irq); static void native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id); static int native_lapic_enable_pmc(void); static void native_lapic_disable_pmc(void); static void native_lapic_reenable_pmc(void); static void native_lapic_enable_cmc(void); static int native_lapic_enable_mca_elvt(void); static int native_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked); static int native_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode); static int native_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol); static int native_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger); #ifdef SMP static void native_lapic_ipi_raw(register_t icrlo, u_int dest); static void native_lapic_ipi_vectored(u_int vector, int dest); static int native_lapic_ipi_wait(int delay); #endif /* SMP */ static int native_lapic_ipi_alloc(inthand_t *ipifunc); static void native_lapic_ipi_free(int vector); struct apic_ops apic_ops = { .create = native_lapic_create, .init = native_lapic_init, .xapic_mode = native_lapic_xapic_mode, .is_x2apic = native_lapic_is_x2apic, .setup = native_lapic_setup, .dump = native_lapic_dump, .disable = native_lapic_disable, .eoi = native_lapic_eoi, .id = native_lapic_id, .intr_pending = native_lapic_intr_pending, .set_logical_id = native_lapic_set_logical_id, .cpuid = native_apic_cpuid, .alloc_vector = native_apic_alloc_vector, .alloc_vectors = native_apic_alloc_vectors, .enable_vector = native_apic_enable_vector, .disable_vector = native_apic_disable_vector, .free_vector = native_apic_free_vector, .enable_pmc = native_lapic_enable_pmc, .disable_pmc = native_lapic_disable_pmc, .reenable_pmc = native_lapic_reenable_pmc, .enable_cmc = native_lapic_enable_cmc, .enable_mca_elvt = native_lapic_enable_mca_elvt, #ifdef SMP .ipi_raw = native_lapic_ipi_raw, .ipi_vectored = native_lapic_ipi_vectored, .ipi_wait = native_lapic_ipi_wait, 
#endif .ipi_alloc = native_lapic_ipi_alloc, .ipi_free = native_lapic_ipi_free, .set_lvt_mask = native_lapic_set_lvt_mask, .set_lvt_mode = native_lapic_set_lvt_mode, .set_lvt_polarity = native_lapic_set_lvt_polarity, .set_lvt_triggermode = native_lapic_set_lvt_triggermode, }; static uint32_t lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value) { value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM | APIC_LVT_VECTOR); if (lvt->lvt_edgetrigger == 0) value |= APIC_LVT_TM; if (lvt->lvt_activehi == 0) value |= APIC_LVT_IIPP_INTALO; if (lvt->lvt_masked) value |= APIC_LVT_M; value |= lvt->lvt_mode; switch (lvt->lvt_mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: if (!lvt->lvt_edgetrigger && bootverbose) { printf("lapic%u: Forcing LINT%u to edge trigger\n", la->la_id, pin); value &= ~APIC_LVT_TM; } /* Use a vector of 0. */ break; case APIC_LVT_DM_FIXED: value |= lvt->lvt_vector; break; default: panic("bad APIC LVT delivery mode: %#x\n", value); } return (value); } static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value) { struct lvt *lvt; KASSERT(pin <= APIC_LVT_MAX, ("%s: pin %u out of range", __func__, pin)); if (la->la_lvts[pin].lvt_active) lvt = &la->la_lvts[pin]; else lvt = &lvts[pin]; return (lvt_mode_impl(la, lvt, pin, value)); } static uint32_t elvt_mode(struct lapic *la, u_int idx, uint32_t value) { struct lvt *elvt; KASSERT(idx <= APIC_ELVT_MAX, ("%s: idx %u out of range", __func__, idx)); elvt = &la->la_elvts[idx]; KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx)); KASSERT(elvt->lvt_edgetrigger, ("%s: ELVT%u is not edge triggered", __func__, idx)); KASSERT(elvt->lvt_activehi, ("%s: ELVT%u is not active high", __func__, idx)); return (lvt_mode_impl(la, elvt, idx, value)); } -static inline uint32_t -amd_read_ext_features(void) -{ - uint32_t version; - - if (cpu_vendor_id != CPU_VENDOR_AMD) - return (0); - version = lapic_read32(LAPIC_VERSION); - if ((version & APIC_VER_AMD_EXT_SPACE) != 0) - return (lapic_read32(LAPIC_EXT_FEATURES)); - return (0); -} - -static inline uint32_t -amd_read_elvt_count(void) -{ - uint32_t extf; - uint32_t count; - - extf = amd_read_ext_features(); - count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT; - count = min(count, APIC_ELVT_MAX + 1); - return (count); -} - /* * Map the local APIC and setup necessary interrupt vectors. */ static void native_lapic_init(vm_paddr_t addr) { #ifdef SMP uint64_t r, r1, r2, rx; #endif - uint32_t extf, ver; + uint32_t ver; u_int regs[4]; - int i, arat, seoi_enable; + int i, arat; /* * Enable x2APIC mode if possible. Map the local APIC * registers page. * * Keep the LAPIC registers page mapped uncached for x2APIC * mode too, to have direct map page attribute set to * uncached. This is needed to work around CPU errata present * on all Intel processors. */ KASSERT(trunc_page(addr) == addr, ("local APIC not aligned on a page boundary")); lapic_paddr = addr; lapic_map = pmap_mapdev(addr, PAGE_SIZE); if (x2apic_mode) { native_lapic_enable_x2apic(); lapic_map = NULL; } /* Setup the spurious interrupt handler. */ setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Perform basic initialization of the BSP's local APIC. */ lapic_enable(); /* Set BSP's per-CPU local APIC ID. */ PCPU_SET(apic_id, lapic_id()); /* Local APIC timer interrupt. */ setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Local APIC error interrupt. 
*/ setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC); /* XXX: Thermal interrupt */ /* Local APIC CMCI. */ setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC); if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) { arat = 0; /* Intel CPUID 0x06 EAX[2] set if APIC timer runs in C3. */ if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high >= 6) { do_cpuid(0x06, regs); if ((regs[0] & CPUTPM1_ARAT) != 0) arat = 1; } bzero(&lapic_et, sizeof(lapic_et)); lapic_et.et_name = "LAPIC"; lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; lapic_et.et_quality = 600; if (!arat) { lapic_et.et_flags |= ET_FLAGS_C3STOP; lapic_et.et_quality = 100; } if ((cpu_feature & CPUID_TSC) != 0 && (cpu_feature2 & CPUID2_TSCDLT) != 0 && tsc_is_invariant && tsc_freq != 0) { lapic_timer_tsc_deadline = 1; TUNABLE_INT_FETCH("hw.lapic_tsc_deadline", &lapic_timer_tsc_deadline); } lapic_et.et_frequency = 0; /* We don't know frequency yet, so trying to guess. */ lapic_et.et_min_period = 0x00001000LL; lapic_et.et_max_period = SBT_1S; lapic_et.et_start = lapic_et_start; lapic_et.et_stop = lapic_et_stop; lapic_et.et_priv = NULL; et_register(&lapic_et); } /* * Set lapic_eoi_suppression after lapic_enable(), to not * enable suppression in the hardware prematurely. Note that * we by default enable suppression even when system only has * one IO-APIC, since EOI is broadcasted to all APIC agents, * including CPUs, otherwise. * * It seems that at least some KVM versions report * EOI_SUPPRESSION bit, but auto-EOI does not work. */ ver = lapic_read32(LAPIC_VERSION); if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) { - lapic_eoi_suppression = INTEL_SEOI; - } else { - extf = amd_read_ext_features(); - if ((extf & APIC_EXTF_SEIO_CAP) != 0) - lapic_eoi_suppression = AMD_SEOI; - } - if (lapic_eoi_suppression != 0) { - seoi_enable = 1; + lapic_eoi_suppression = 1; if (vm_guest == VM_GUEST_KVM) { if (bootverbose) printf( "KVM -- disabling lapic eoi suppression\n"); - seoi_enable = 0; + lapic_eoi_suppression = 0; } TUNABLE_INT_FETCH("hw.lapic_eoi_suppression", - &seoi_enable); - if (seoi_enable == 0) - lapic_eoi_suppression = 0; + &lapic_eoi_suppression); } - if (lapic_eoi_suppression != 0) - printf("LAPIC specific EOI enabled\n"); #ifdef SMP #define LOOPS 100000 /* * Calibrate the busy loop waiting for IPI ack in xAPIC mode. * lapic_ipi_wait_mult contains the number of iterations which * approximately delay execution for 1 microsecond (the * argument to native_lapic_ipi_wait() is in microseconds). * * We assume that TSC is present and already measured. * Possible TSC frequency jumps are irrelevant to the * calibration loop below, the CPU clock management code is * not yet started, and we do not enter sleep states. */ KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0, ("TSC not initialized")); if (!x2apic_mode) { r = rdtsc(); for (rx = 0; rx < LOOPS; rx++) { (void)lapic_read_icr_lo(); ia32_pause(); } r = rdtsc() - r; r1 = tsc_freq * LOOPS; r2 = r * 1000000; lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1; if (bootverbose) { printf("LAPIC: ipi_wait() us multiplier %ju (r %ju " "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult, (uintmax_t)r, (uintmax_t)tsc_freq); } } #undef LOOPS #endif /* SMP */ } /* * Create a local APIC instance. 
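 *
 * A note on the la_ioint_irqs[] bookkeeping initialized here: slot i
 * tracks IDT vector (APIC_IO_INTS + i), with -1 meaning "free".  The
 * magic IRQ_* values keep apic_alloc_vector() away from vectors owned
 * by something other than an interrupt source, e.g. (restating the
 * assignments below, for illustration):
 *
 *    la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS]    = IRQ_SYSCALL;
 *    la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] = IRQ_TIMER;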
*/ static void native_lapic_create(u_int apic_id, int boot_cpu) { int i; if (apic_id > MAX_APIC_ID) { printf("APIC: Ignoring local APIC with ID %d\n", apic_id); if (boot_cpu) panic("Can't ignore BSP"); return; } KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u", apic_id)); /* * Assume no local LVT overrides and a cluster of 0 and * intra-cluster ID of 0. */ lapics[apic_id].la_present = 1; lapics[apic_id].la_id = apic_id; for (i = 0; i <= APIC_LVT_MAX; i++) { lapics[apic_id].la_lvts[i] = lvts[i]; lapics[apic_id].la_lvts[i].lvt_active = 0; } for (i = 0; i <= APIC_ELVT_MAX; i++) { lapics[apic_id].la_elvts[i] = elvts[i]; lapics[apic_id].la_elvts[i].lvt_active = 0; } for (i = 0; i <= APIC_NUM_IOINTS; i++) lapics[apic_id].la_ioint_irqs[i] = -1; lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL; lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] = IRQ_TIMER; #ifdef KDTRACE_HOOKS lapics[apic_id].la_ioint_irqs[IDT_DTRACE_RET - APIC_IO_INTS] = IRQ_DTRACE_RET; #endif #ifdef XENHVM lapics[apic_id].la_ioint_irqs[IDT_EVTCHN - APIC_IO_INTS] = IRQ_EVTCHN; #endif #ifdef SMP cpu_add(apic_id, boot_cpu); #endif } +static inline uint32_t +amd_read_ext_features(void) +{ + uint32_t version; + + if (cpu_vendor_id != CPU_VENDOR_AMD) + return (0); + version = lapic_read32(LAPIC_VERSION); + if ((version & APIC_VER_AMD_EXT_SPACE) != 0) + return (lapic_read32(LAPIC_EXT_FEATURES)); + else + return (0); +} + +static inline uint32_t +amd_read_elvt_count(void) +{ + uint32_t extf; + uint32_t count; + + extf = amd_read_ext_features(); + count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT; + count = min(count, APIC_ELVT_MAX + 1); + return (count); +} + /* * Dump contents of local APIC registers */ static void native_lapic_dump(const char* str) { uint32_t version; uint32_t maxlvt; uint32_t extf; int elvt_count; int i; version = lapic_read32(LAPIC_VERSION); maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; printf("cpu%d %s:\n", PCPU_GET(cpuid), str); printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x", lapic_read32(LAPIC_ID), version, lapic_read32(LAPIC_LDR), x2apic_mode ? 
0 : lapic_read32(LAPIC_DFR)); if ((cpu_feature2 & CPUID2_X2APIC) != 0) printf(" x2APIC: %d", x2apic_mode); printf("\n lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1), lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR)); printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x", lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL), lapic_read32(LAPIC_LVT_ERROR)); if (maxlvt >= APIC_LVT_PMC) printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT)); printf("\n"); if (maxlvt >= APIC_LVT_CMCI) printf(" cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI)); extf = amd_read_ext_features(); if (extf != 0) { printf(" AMD ext features: 0x%08x\n", extf); - extf = lapic_read32(LAPIC_EXT_CTRL); - printf(" AMD ext control: 0x%08x\n", extf); elvt_count = amd_read_elvt_count(); for (i = 0; i < elvt_count; i++) - printf(" AMD elvt%d: 0x%08x\n", i, + printf(" AMD elvt%d: 0x%08x\n", i, lapic_read32(LAPIC_EXT_LVT0 + i)); } } static void native_lapic_xapic_mode(void) { register_t saveintr; saveintr = intr_disable(); if (x2apic_mode) native_lapic_enable_x2apic(); intr_restore(saveintr); } static void native_lapic_setup(int boot) { struct lapic *la; uint32_t version; uint32_t maxlvt; register_t saveintr; char buf[MAXCOMLEN + 1]; int elvt_count; int i; saveintr = intr_disable(); la = &lapics[lapic_id()]; KASSERT(la->la_present, ("missing APIC structure")); version = lapic_read32(LAPIC_VERSION); maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; /* Initialize the TPR to allow all interrupts. */ lapic_set_tpr(0); /* Setup spurious vector and enable the local APIC. */ lapic_enable(); /* Program LINT[01] LVT entries. */ lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0, lapic_read32(LAPIC_LVT_LINT0))); lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1, lapic_read32(LAPIC_LVT_LINT1))); /* Program the PMC LVT entry if present. */ if (maxlvt >= APIC_LVT_PMC) { lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, LAPIC_LVT_PCINT)); } /* Program timer LVT and setup handler. */ la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER, lapic_read32(LAPIC_LVT_TIMER)); la->lvt_timer_last = la->lvt_timer_base; lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base); if (boot) { snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid)); intrcnt_add(buf, &la->la_timer_count); } /* Setup the timer if configured. */ if (la->la_timer_mode != LAT_MODE_UNDEF) { KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor", lapic_id())); switch (la->la_timer_mode) { case LAT_MODE_PERIODIC: lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_periodic(la); break; case LAT_MODE_ONESHOT: lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_oneshot(la); break; case LAT_MODE_DEADLINE: lapic_timer_deadline(la); break; default: panic("corrupted la_timer_mode %p %d", la, la->la_timer_mode); } } /* Program error LVT and clear any existing errors. */ lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR, lapic_read32(LAPIC_LVT_ERROR))); lapic_write32(LAPIC_ESR, 0); /* XXX: Thermal LVT */ /* Program the CMCI LVT entry if present. 
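 *
 * "If present" is decided by maxlvt, read from LAPIC_VERSION above: the
 * CMCI entry is programmed only when the LVT table is long enough to
 * contain it.  The AMD extended LVT entries written just below are
 * handled differently: the elvts[] defaults are created inactive, so an
 * entry is only touched once something has activated it (currently
 * lapic_enable_mca_elvt() for the MCE thresholding entry).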
*/ if (maxlvt >= APIC_LVT_CMCI) { lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI, lapic_read32(LAPIC_LVT_CMCI))); } elvt_count = amd_read_elvt_count(); for (i = 0; i < elvt_count; i++) { if (la->la_elvts[i].lvt_active) lapic_write32(LAPIC_EXT_LVT0 + i, elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i))); } intr_restore(saveintr); } static void native_lapic_reenable_pmc(void) { #ifdef HWPMC_HOOKS uint32_t value; value = lapic_read32(LAPIC_LVT_PCINT); value &= ~APIC_LVT_M; lapic_write32(LAPIC_LVT_PCINT, value); #endif } #ifdef HWPMC_HOOKS static void lapic_update_pmc(void *dummy) { struct lapic *la; la = &lapics[lapic_id()]; lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, lapic_read32(LAPIC_LVT_PCINT))); } #endif static int native_lapic_enable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return (0); /* Fail if the PMC LVT is not present. */ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return (0); lvts[APIC_LVT_PMC].lvt_masked = 0; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); #else #ifdef SMP /* * If hwpmc was loaded at boot time then the APs may not be * started yet. In that case, don't forward the request to * them as they will program the lvt when they start. */ if (smp_started) smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); else #endif lapic_update_pmc(NULL); #endif return (1); #else return (0); #endif } static void native_lapic_disable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return; /* Fail if the PMC LVT is not present. */ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return; lvts[APIC_LVT_PMC].lvt_masked = 1; #ifdef SMP /* The APs should always be started when hwpmc is unloaded. */ KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early")); #endif smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); #endif } static void lapic_calibrate_initcount(struct eventtimer *et, struct lapic *la) { u_long value; /* Start off with a divisor of 2 (power on reset default). */ lapic_timer_divisor = 2; /* Try to calibrate the local APIC timer. 
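 *
 * The approach: arm a masked one-shot with APIC_TIMER_MAX_COUNT, spin
 * for one second with DELAY(1000000) and read the current-count
 * register back.  The ticks consumed in that second,
 *
 *    value = APIC_TIMER_MAX_COUNT - CCR_TIMER,
 *
 * give the timer frequency in Hz at the current divisor; if the counter
 * ran all the way down, the divisor is doubled and the measurement is
 * retried.  (Purely hypothetical numbers: a 100 MHz timer input with
 * divisor 2 would burn through about 50000000 ticks in that second, so
 * count_freq would end up around 50 MHz.)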
*/ do { lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT); DELAY(1000000); value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER); if (value != APIC_TIMER_MAX_COUNT) break; lapic_timer_divisor <<= 1; } while (lapic_timer_divisor <= 128); if (lapic_timer_divisor > 128) panic("lapic: Divisor too big"); if (bootverbose) { printf("lapic: Divisor %lu, Frequency %lu Hz\n", lapic_timer_divisor, value); } count_freq = value; } static void lapic_calibrate_deadline(struct eventtimer *et, struct lapic *la __unused) { if (bootverbose) { printf("lapic: deadline tsc mode, Frequency %ju Hz\n", (uintmax_t)tsc_freq); } } static void lapic_change_mode(struct eventtimer *et, struct lapic *la, enum lat_timer_mode newmode) { if (la->la_timer_mode == newmode) return; switch (newmode) { case LAT_MODE_PERIODIC: lapic_timer_set_divisor(lapic_timer_divisor); et->et_frequency = count_freq; break; case LAT_MODE_DEADLINE: et->et_frequency = tsc_freq; break; case LAT_MODE_ONESHOT: lapic_timer_set_divisor(lapic_timer_divisor); et->et_frequency = count_freq; break; default: panic("lapic_change_mode %d", newmode); } la->la_timer_mode = newmode; et->et_min_period = (0x00000002LLU << 32) / et->et_frequency; et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency; } static int lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { struct lapic *la; la = &lapics[PCPU_GET(apic_id)]; if (et->et_frequency == 0) { lapic_calibrate_initcount(et, la); if (lapic_timer_tsc_deadline) lapic_calibrate_deadline(et, la); } if (period != 0) { lapic_change_mode(et, la, LAT_MODE_PERIODIC); la->la_timer_period = ((uint32_t)et->et_frequency * period) >> 32; lapic_timer_periodic(la); } else if (lapic_timer_tsc_deadline) { lapic_change_mode(et, la, LAT_MODE_DEADLINE); la->la_timer_period = (et->et_frequency * first) >> 32; lapic_timer_deadline(la); } else { lapic_change_mode(et, la, LAT_MODE_ONESHOT); la->la_timer_period = ((uint32_t)et->et_frequency * first) >> 32; lapic_timer_oneshot(la); } return (0); } static int lapic_et_stop(struct eventtimer *et) { struct lapic *la; la = &lapics[PCPU_GET(apic_id)]; lapic_timer_stop(la); la->la_timer_mode = LAT_MODE_UNDEF; return (0); } static void native_lapic_disable(void) { uint32_t value; /* Software disable the local APIC. */ value = lapic_read32(LAPIC_SVR); value &= ~APIC_SVR_SWEN; lapic_write32(LAPIC_SVR, value); } static void lapic_enable(void) { uint32_t value; /* Program the spurious vector to enable the local APIC. */ value = lapic_read32(LAPIC_SVR); value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT; - if (lapic_eoi_suppression == INTEL_SEOI) + if (lapic_eoi_suppression) value |= APIC_SVR_EOI_SUPPRESSION; lapic_write32(LAPIC_SVR, value); - - if (lapic_eoi_suppression == AMD_SEOI) { - value = lapic_read32(LAPIC_EXT_CTRL); - value |= APIC_EXTF_SEIO_CAP; - lapic_write32(LAPIC_EXT_CTRL, value); - } } /* Reset the local APIC on the BSP during resume. */ static void lapic_resume(struct pic *pic, bool suspend_cancelled) { lapic_setup(0); } static int native_lapic_id(void) { uint32_t v; KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped")); v = lapic_read32(LAPIC_ID); if (!x2apic_mode) v >>= APIC_ID_SHIFT; return (v); } static int native_lapic_intr_pending(u_int vector) { uint32_t irr; /* * The IRR registers are an array of registers each of which * only describes 32 interrupts in the low 32 bits. 
Thus, we * divide the vector by 32 to get the register index. * Finally, we modulus the vector by 32 to determine the * individual bit to test. */ irr = lapic_read32(LAPIC_IRR0 + vector / 32); return (irr & 1 << (vector % 32)); } static void native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { struct lapic *la; KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist", __func__, apic_id)); KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big", __func__, cluster)); KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID, ("%s: intra cluster id %u too big", __func__, cluster_id)); la = &lapics[apic_id]; la->la_cluster = cluster; la->la_cluster_id = cluster_id; } static int native_lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked) { if (pin > APIC_LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_masked = masked; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_masked = masked; lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked"); return (0); } static int native_lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode) { struct lvt *lvt; if (pin > APIC_LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvt = &lvts[pin]; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lvt = &lapics[apic_id].la_lvts[pin]; lvt->lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } lvt->lvt_mode = mode; switch (mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: lvt->lvt_edgetrigger = 1; lvt->lvt_activehi = 1; if (mode == APIC_LVT_DM_EXTINT) lvt->lvt_masked = 1; else lvt->lvt_masked = 0; break; default: panic("Unsupported delivery mode: 0x%x\n", mode); } if (bootverbose) { printf(" Routing "); switch (mode) { case APIC_LVT_DM_NMI: printf("NMI"); break; case APIC_LVT_DM_SMI: printf("SMI"); break; case APIC_LVT_DM_INIT: printf("INIT"); break; case APIC_LVT_DM_EXTINT: printf("ExtINT"); break; } printf(" -> LINT%u\n", pin); } return (0); } static int native_lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol) { if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_active = 1; lapics[apic_id].la_lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u polarity: %s\n", pin, pol == INTR_POLARITY_HIGH ? 
"high" : "low"); return (0); } static int native_lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger) { if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u trigger: %s\n", pin, trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); return (0); } /* * Adjust the TPR of the current CPU so that it blocks all interrupts below * the passed in vector. */ static void lapic_set_tpr(u_int vector) { #ifdef CHEAP_TPR lapic_write32(LAPIC_TPR, vector); #else uint32_t tpr; tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO; tpr |= vector; lapic_write32(LAPIC_TPR, tpr); #endif } -void -native_lapic_eoi(u_int vector) +static void +native_lapic_eoi(void) { - if (lapic_eoi_suppression == AMD_SEOI) - lapic_write32(LAPIC_EXT_SEOI, vector); - else - lapic_write32_nofence(LAPIC_EOI, 0); + lapic_write32_nofence(LAPIC_EOI, 0); } void lapic_handle_intr(int vector, struct trapframe *frame) { struct intsrc *isrc; isrc = intr_lookup_source(apic_idt_to_irq(PCPU_GET(apic_id), vector)); intr_execute_handlers(isrc, frame); } void lapic_handle_timer(struct trapframe *frame) { struct lapic *la; struct trapframe *oldframe; struct thread *td; /* Send EOI first thing. */ - lapic_eoi(APIC_TIMER_INT); + lapic_eoi(); #if defined(SMP) && !defined(SCHED_ULE) /* * Don't do any accounting for the disabled HTT cores, since it * will provide misleading numbers for the userland. * * No locking is necessary here, since even if we lose the race * when hlt_cpus_mask changes it is not a big deal, really. * * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask * and unlike other schedulers it actually schedules threads to * those CPUs. */ if (CPU_ISSET(PCPU_GET(cpuid), &hlt_cpus_mask)) return; #endif /* Look up our local APIC structure for the tick counters. 
*/ la = &lapics[PCPU_GET(apic_id)]; (*la->la_timer_count)++; critical_enter(); if (lapic_et.et_active) { td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = frame; lapic_et.et_event_cb(&lapic_et, lapic_et.et_arg); td->td_intr_frame = oldframe; td->td_intr_nesting_level--; } critical_exit(); } static void lapic_timer_set_divisor(u_int divisor) { KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor)); KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors), ("lapic: invalid divisor %u", divisor)); lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]); } static void lapic_timer_oneshot(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_ONE_SHOT; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count) { uint32_t value; value = la->lvt_timer_base; value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, count); } static void lapic_timer_periodic(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_PERIODIC; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void lapic_timer_deadline(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_TSCDLT; if (value != la->lvt_timer_last) { la->lvt_timer_last = value; lapic_write32_nofence(LAPIC_LVT_TIMER, value); if (!x2apic_mode) mfence(); } wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc()); } static void lapic_timer_stop(struct lapic *la) { uint32_t value; if (la->la_timer_mode == LAT_MODE_DEADLINE) { wrmsr(MSR_TSC_DEADLINE, 0); mfence(); } else { value = la->lvt_timer_base; value &= ~APIC_LVTT_TM; value |= APIC_LVT_M; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); } } void lapic_handle_cmc(void) { - lapic_eoi(APIC_CMC_INT); + lapic_eoi(); cmc_intr(); } /* * Called from the mca_init() to activate the CMC interrupt if this CPU is * responsible for monitoring any MC banks for CMC events. Since mca_init() * is called prior to lapic_setup() during boot, this just needs to unmask * this CPU's LVT_CMCI entry. 
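 *
 * In other words, only the per-CPU la_lvts[APIC_LVT_CMCI] state is
 * updated here; the LAPIC_LVT_CMCI register itself is written later by
 * lapic_setup().  A rough sketch of the boot-time ordering, restating
 * the flow described above:
 *
 *    mca_init()
 *        -> lapic_enable_cmc()    records lvt_masked = 0, lvt_active = 1
 *    ...
 *    lapic_setup()                programs LAPIC_LVT_CMCI via lvt_mode()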
*/ static void native_lapic_enable_cmc(void) { u_int apic_id; #ifdef DEV_ATPIC if (!x2apic_mode && lapic_map == NULL) return; #endif apic_id = PCPU_GET(apic_id); KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_masked = 0; lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_active = 1; if (bootverbose) printf("lapic%u: CMCI unmasked\n", apic_id); } static int native_lapic_enable_mca_elvt(void) { u_int apic_id; uint32_t value; int elvt_count; #ifdef DEV_ATPIC if (lapic_map == NULL) return (-1); #endif apic_id = PCPU_GET(apic_id); KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); elvt_count = amd_read_elvt_count(); if (elvt_count <= APIC_ELVT_MCA) return (-1); value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA); if ((value & APIC_LVT_M) == 0) { printf("AMD MCE Thresholding Extended LVT is already active\n"); return (-1); } lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0; lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1; if (bootverbose) printf("lapic%u: MCE Thresholding ELVT unmasked\n", apic_id); return (APIC_ELVT_MCA); } void lapic_handle_error(void) { uint32_t esr; /* * Read the contents of the error status register. Write to * the register first before reading from it to force the APIC * to update its value to indicate any errors that have * occurred since the previous write to the register. */ lapic_write32(LAPIC_ESR, 0); esr = lapic_read32(LAPIC_ESR); printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); - lapic_eoi(APIC_ERROR_INT); + lapic_eoi(); } static u_int native_apic_cpuid(u_int apic_id) { #ifdef SMP return apic_cpuids[apic_id]; #else return 0; #endif } /* Request a free IDT vector to be used by the specified IRQ. */ static u_int native_apic_alloc_vector(u_int apic_id, u_int irq) { u_int vector; KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq)); /* * Search for a free vector. Currently we just use a very simple * algorithm to find the first free vector. */ mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { if (lapics[apic_id].la_ioint_irqs[vector] != -1) continue; lapics[apic_id].la_ioint_irqs[vector] = irq; mtx_unlock_spin(&icu_lock); return (vector + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); return (0); } /* * Request 'count' free contiguous IDT vectors to be used by 'count' * IRQs. 'count' must be a power of two and the vectors will be * aligned on a boundary of 'align'. If the request cannot be * satisfied, 0 is returned. */ static u_int native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { u_int first, run, vector; KASSERT(powerof2(count), ("bad count")); KASSERT(powerof2(align), ("bad align")); KASSERT(align >= count, ("align < count")); #ifdef INVARIANTS for (run = 0; run < count; run++) KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u", irqs[run], run)); #endif /* * Search for 'count' free vectors. As with apic_alloc_vector(), * this just uses a simple first fit algorithm. */ run = 0; first = 0; mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { /* Vector is in use, end run. */ if (lapics[apic_id].la_ioint_irqs[vector] != -1) { run = 0; first = 0; continue; } /* Start a new run if run == 0 and vector is aligned. */ if (run == 0) { if ((vector & (align - 1)) != 0) continue; first = vector; } run++; /* Keep looping if the run isn't long enough yet. */ if (run < count) continue; /* Found a run, assign IRQs and return the first vector. 
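 *
 * As a hypothetical example of the alignment rule enforced above: for
 * count = 4, align = 4 a run may only start at a slot index that is a
 * multiple of 4, so a free range beginning at slot 2 is passed over and
 * the scan keeps looking for a free, aligned starting slot.  Callers
 * that need a naturally aligned contiguous block (for example
 * multi-message MSI) rely on this.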
*/ for (vector = 0; vector < count; vector++) lapics[apic_id].la_ioint_irqs[first + vector] = irqs[vector]; mtx_unlock_spin(&icu_lock); return (first + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count); return (0); } /* * Enable a vector for a particular apic_id. Since all lapics share idt * entries and ioint_handlers this enables the vector on all lapics. lapics * which do not have the vector configured would report spurious interrupts * should it fire. */ static void native_apic_enable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL, GSEL_APIC); } static void native_apic_disable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef notyet /* * We can not currently clear the idt entry because other cpus * may have a valid vector at this offset. */ setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC); #endif } /* Release an APIC vector when it's no longer in use. */ static void native_apic_free_vector(u_int apic_id, u_int vector, u_int irq) { struct thread *td; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq)); KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] == irq, ("IRQ mismatch")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif /* * Bind us to the cpu that owned the vector before freeing it so * we don't lose an interrupt delivery race. */ td = curthread; if (!rebooting) { thread_lock(td); if (sched_is_bound(td)) panic("apic_free_vector: Thread already bound.\n"); sched_bind(td, apic_cpuid(apic_id)); thread_unlock(td); } mtx_lock_spin(&icu_lock); lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1; mtx_unlock_spin(&icu_lock); if (!rebooting) { thread_lock(td); sched_unbind(td); thread_unlock(td); } } /* Map an IDT vector (APIC) to an IRQ (interrupt source). */ static u_int apic_idt_to_irq(u_int apic_id, u_int vector) { int irq; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif irq = lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS]; if (irq < 0) irq = 0; return (irq); } #ifdef DDB /* * Dump data about APIC IDT vector mappings. 
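 *
 * From the DDB prompt this dump is reached with "show apic"; the "v"
 * and "vv" modifiers additionally descend into the interrupt events.
 * The related "show lapic", "show ioapic <index>" and "show all
 * ioapics" commands defined nearby dump the raw registers instead.
 * For example:
 *
 *    db> show apic
 *    db> show ioapic 0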
*/ DB_SHOW_COMMAND(apic, db_show_apic) { struct intsrc *isrc; int i, verbose; u_int apic_id; u_int irq; if (strcmp(modif, "vv") == 0) verbose = 2; else if (strcmp(modif, "v") == 0) verbose = 1; else verbose = 0; for (apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) { if (lapics[apic_id].la_present == 0) continue; db_printf("Interrupts bound to lapic %u\n", apic_id); for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) { irq = lapics[apic_id].la_ioint_irqs[i]; if (irq == -1 || irq == IRQ_SYSCALL) continue; #ifdef KDTRACE_HOOKS if (irq == IRQ_DTRACE_RET) continue; #endif #ifdef XENHVM if (irq == IRQ_EVTCHN) continue; #endif db_printf("vec 0x%2x -> ", i + APIC_IO_INTS); if (irq == IRQ_TIMER) db_printf("lapic timer\n"); else if (irq < NUM_IO_INTS) { isrc = intr_lookup_source(irq); if (isrc == NULL || verbose == 0) db_printf("IRQ %u\n", irq); else db_dump_intr_event(isrc->is_event, verbose == 2); } else db_printf("IRQ %u ???\n", irq); } } } static void dump_mask(const char *prefix, uint32_t v, int base) { int i, first; first = 1; for (i = 0; i < 32; i++) if (v & (1 << i)) { if (first) { db_printf("%s:", prefix); first = 0; } db_printf(" %02x", base + i); } if (!first) db_printf("\n"); } /* Show info from the lapic regs for this CPU. */ DB_SHOW_COMMAND(lapic, db_show_lapic) { uint32_t v; db_printf("lapic ID = %d\n", lapic_id()); v = lapic_read32(LAPIC_VERSION); db_printf("version = %d.%d\n", (v & APIC_VER_VERSION) >> 4, v & 0xf); db_printf("max LVT = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT); v = lapic_read32(LAPIC_SVR); db_printf("SVR = %02x (%s)\n", v & APIC_SVR_VECTOR, v & APIC_SVR_ENABLE ? "enabled" : "disabled"); db_printf("TPR = %02x\n", lapic_read32(LAPIC_TPR)); #define dump_field(prefix, regn, index) \ dump_mask(__XSTRING(prefix ## index), \ lapic_read32(LAPIC_ ## regn ## index), \ index * 32) db_printf("In-service Interrupts:\n"); dump_field(isr, ISR, 0); dump_field(isr, ISR, 1); dump_field(isr, ISR, 2); dump_field(isr, ISR, 3); dump_field(isr, ISR, 4); dump_field(isr, ISR, 5); dump_field(isr, ISR, 6); dump_field(isr, ISR, 7); db_printf("TMR Interrupts:\n"); dump_field(tmr, TMR, 0); dump_field(tmr, TMR, 1); dump_field(tmr, TMR, 2); dump_field(tmr, TMR, 3); dump_field(tmr, TMR, 4); dump_field(tmr, TMR, 5); dump_field(tmr, TMR, 6); dump_field(tmr, TMR, 7); db_printf("IRR Interrupts:\n"); dump_field(irr, IRR, 0); dump_field(irr, IRR, 1); dump_field(irr, IRR, 2); dump_field(irr, IRR, 3); dump_field(irr, IRR, 4); dump_field(irr, IRR, 5); dump_field(irr, IRR, 6); dump_field(irr, IRR, 7); #undef dump_field } #endif /* * APIC probing support code. This includes code to manage enumerators. */ static SLIST_HEAD(, apic_enumerator) enumerators = SLIST_HEAD_INITIALIZER(enumerators); static struct apic_enumerator *best_enum; void apic_register_enumerator(struct apic_enumerator *enumerator) { #ifdef INVARIANTS struct apic_enumerator *apic_enum; SLIST_FOREACH(apic_enum, &enumerators, apic_next) { if (apic_enum == enumerator) panic("%s: Duplicate register of %s", __func__, enumerator->apic_name); } #endif SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next); } /* * We have to look for CPU's very, very early because certain subsystems * want to know how many CPU's we have extremely early on in the boot * process. */ static void apic_init(void *dummy __unused) { struct apic_enumerator *enumerator; int retval, best; /* We only support built in local APICs. */ if (!(cpu_feature & CPUID_APIC)) return; /* Don't probe if APIC mode is disabled. 
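 *
 * resource_disabled("apic", 0) checks the standard device hints, so the
 * usual way to force this path is a loader tunable along the lines of
 * (illustrative):
 *
 *    hint.apic.0.disabled="1"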
*/ if (resource_disabled("apic", 0)) return; /* Probe all the enumerators to find the best match. */ best_enum = NULL; best = 0; SLIST_FOREACH(enumerator, &enumerators, apic_next) { retval = enumerator->apic_probe(); if (retval > 0) continue; if (best_enum == NULL || best < retval) { best_enum = enumerator; best = retval; } } if (best_enum == NULL) { if (bootverbose) printf("APIC: Could not find any APICs.\n"); #ifndef DEV_ATPIC panic("running without device atpic requires a local APIC"); #endif return; } if (bootverbose) printf("APIC: Using the %s enumerator.\n", best_enum->apic_name); #ifdef I686_CPU /* * To work around an errata, we disable the local APIC on some * CPUs during early startup. We need to turn the local APIC back * on on such CPUs now. */ ppro_reenable_apic(); #endif /* Probe the CPU's in the system. */ retval = best_enum->apic_probe_cpus(); if (retval != 0) printf("%s: Failed to probe CPUs: returned %d\n", best_enum->apic_name, retval); } SYSINIT(apic_init, SI_SUB_TUNABLES - 1, SI_ORDER_SECOND, apic_init, NULL); /* * Setup the local APIC. We have to do this prior to starting up the APs * in the SMP case. */ static void apic_setup_local(void *dummy __unused) { int retval; if (best_enum == NULL) return; /* Initialize the local APIC. */ retval = best_enum->apic_setup_local(); if (retval != 0) printf("%s: Failed to setup the local APIC: returned %d\n", best_enum->apic_name, retval); } SYSINIT(apic_setup_local, SI_SUB_CPU, SI_ORDER_SECOND, apic_setup_local, NULL); /* * Setup the I/O APICs. */ static void apic_setup_io(void *dummy __unused) { int retval; if (best_enum == NULL) return; /* * Local APIC must be registered before other PICs and pseudo PICs * for proper suspend/resume order. */ intr_register_pic(&lapic_pic); retval = best_enum->apic_setup_io(); if (retval != 0) printf("%s: Failed to setup I/O APICs: returned %d\n", best_enum->apic_name, retval); /* * Finish setting up the local APIC on the BSP once we know * how to properly program the LINT pins. In particular, this * enables the EOI suppression mode, if LAPIC support it and * user did not disabled the mode. */ lapic_setup(1); if (bootverbose) lapic_dump("BSP"); /* Enable the MSI "pic". */ init_ops.msi_init(); } SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL); #ifdef SMP /* * Inter Processor Interrupt functions. The lapic_ipi_*() functions are * private to the MD code. The public interface for the rest of the * kernel is defined in mp_machdep.c. */ /* * Wait delay microseconds for IPI to be sent. If delay is -1, we * wait forever. */ static int native_lapic_ipi_wait(int delay) { uint64_t rx; /* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */ if (x2apic_mode) return (1); for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) { if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) return (1); ia32_pause(); } return (0); } static void native_lapic_ipi_raw(register_t icrlo, u_int dest) { uint64_t icr; uint32_t vhi, vlo; register_t saveintr; /* XXX: Need more sanity checking of icrlo? */ KASSERT(x2apic_mode || lapic_map != NULL, ("%s called too early", __func__)); KASSERT(x2apic_mode || (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid dest field", __func__)); KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0, ("%s: reserved bits set in ICR LO register", __func__)); /* Set destination in ICR HI register if it is being used. 
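 *
 * Rough sketch of the two layouts handled here (illustrative only): in
 * xAPIC mode the destination is an 8-bit APIC ID kept in bits 24-31 of
 * ICR_HI and must be merged read-modify-write style, while in x2APIC mode
 * the ICR is a single 64-bit MSR (0x830) whose upper half carries the full
 * 32-bit destination:
 *
 *      xAPIC:  vhi = (icr_hi & ~APIC_ID_MASK) | (dest << APIC_ID_SHIFT);
 *      x2APIC: wrmsr(0x830, ((uint64_t)dest << 32) | icrlo);
 *
 * lapic_write_icr() below is what hides this difference from the caller.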
*/ if (!x2apic_mode) { saveintr = intr_disable(); icr = lapic_read_icr(); } if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) { if (x2apic_mode) { vhi = dest; } else { vhi = icr >> 32; vhi &= ~APIC_ID_MASK; vhi |= dest << APIC_ID_SHIFT; } } else { vhi = 0; } /* Program the contents of the IPI and dispatch it. */ if (x2apic_mode) { vlo = icrlo; } else { vlo = icr; vlo &= APIC_ICRLO_RESV_MASK; vlo |= icrlo; } lapic_write_icr(vhi, vlo); if (!x2apic_mode) intr_restore(saveintr); } #define BEFORE_SPIN 50000 #ifdef DETECT_DEADLOCK #define AFTER_SPIN 50 #endif static void native_lapic_ipi_vectored(u_int vector, int dest) { register_t icrlo, destfield; KASSERT((vector & ~APIC_VECTOR_MASK) == 0, ("%s: invalid vector %d", __func__, vector)); icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT; /* * NMI IPIs are just fake vectors used to send a NMI. Use special rules * regarding NMIs if passed, otherwise specify the vector. */ if (vector >= IPI_NMI_FIRST) icrlo |= APIC_DELMODE_NMI; else icrlo |= vector | APIC_DELMODE_FIXED; destfield = 0; switch (dest) { case APIC_IPI_DEST_SELF: icrlo |= APIC_DEST_SELF; break; case APIC_IPI_DEST_ALL: icrlo |= APIC_DEST_ALLISELF; break; case APIC_IPI_DEST_OTHERS: icrlo |= APIC_DEST_ALLESELF; break; default: KASSERT(x2apic_mode || (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid destination 0x%x", __func__, dest)); destfield = dest; } /* Wait for an earlier IPI to finish. */ if (!lapic_ipi_wait(BEFORE_SPIN)) { if (panicstr != NULL) return; else panic("APIC: Previous IPI is stuck"); } lapic_ipi_raw(icrlo, destfield); #ifdef DETECT_DEADLOCK /* Wait for IPI to be delivered. */ if (!lapic_ipi_wait(AFTER_SPIN)) { #ifdef needsattention /* * XXX FIXME: * * The above function waits for the message to actually be * delivered. It breaks out after an arbitrary timeout * since the message should eventually be delivered (at * least in theory) and that if it wasn't we would catch * the failure with the check above when the next IPI is * sent. * * We could skip this wait entirely, EXCEPT it probably * protects us from other routines that assume that the * message was delivered and acted upon when this function * returns. */ printf("APIC: IPI might be stuck\n"); #else /* !needsattention */ /* Wait until mesage is sent without a timeout. */ while (lapic_read_icr_lo() & APIC_DELSTAT_PEND) ia32_pause(); #endif /* needsattention */ } #endif /* DETECT_DEADLOCK */ } #endif /* SMP */ /* * Since the IDT is shared by all CPUs the IPI slot update needs to be globally * visible. * * Consider the case where an IPI is generated immediately after allocation: * vector = lapic_ipi_alloc(ipifunc); * ipi_selected(other_cpus, vector); * * In xAPIC mode a write to ICR_LO has serializing semantics because the * APIC page is mapped as an uncached region. In x2APIC mode there is an * explicit 'mfence' before the ICR MSR is written. Therefore in both cases * the IDT slot update is globally visible before the IPI is delivered. 
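 *
 * A slightly fuller version of that pattern also checks for allocation
 * failure, since the allocator below returns -1 once every slot in
 * IPI_DYN_FIRST..IPI_DYN_LAST is taken (the error value here is just a
 * hypothetical caller's choice):
 *
 *      vector = lapic_ipi_alloc(ipifunc);
 *      if (vector < 0)
 *              return (ENOMEM);
 *      ipi_selected(other_cpus, vector);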
*/ static int native_lapic_ipi_alloc(inthand_t *ipifunc) { struct gate_descriptor *ip; long func; int idx, vector; KASSERT(ipifunc != &IDTVEC(rsvd), ("invalid ipifunc %p", ipifunc)); vector = -1; mtx_lock_spin(&icu_lock); for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) { ip = &idt[idx]; func = (ip->gd_hioffset << 16) | ip->gd_looffset; if (func == (uintptr_t)&IDTVEC(rsvd)) { vector = idx; setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC); break; } } mtx_unlock_spin(&icu_lock); return (vector); } static void native_lapic_ipi_free(int vector) { struct gate_descriptor *ip; long func; KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST, ("%s: invalid vector %d", __func__, vector)); mtx_lock_spin(&icu_lock); ip = &idt[vector]; func = (ip->gd_hioffset << 16) | ip->gd_looffset; KASSERT(func != (uintptr_t)&IDTVEC(rsvd), ("invalid idtfunc %#lx", func)); setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC); mtx_unlock_spin(&icu_lock); } Index: head/sys/x86/x86/msi.c =================================================================== --- head/sys/x86/x86/msi.c (revision 316020) +++ head/sys/x86/x86/msi.c (revision 316021) @@ -1,732 +1,730 @@ /*- * Copyright (c) 2006 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for PCI Message Signalled Interrupts (MSI). MSI interrupts on * x86 are basically APIC messages that the northbridge delivers directly * to the local APICs as if they had come from an I/O APIC. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Fields in address for Intel MSI messages. */ #define MSI_INTEL_ADDR_DEST 0x000ff000 #define MSI_INTEL_ADDR_RH 0x00000008 # define MSI_INTEL_ADDR_RH_ON 0x00000008 # define MSI_INTEL_ADDR_RH_OFF 0x00000000 #define MSI_INTEL_ADDR_DM 0x00000004 # define MSI_INTEL_ADDR_DM_PHYSICAL 0x00000000 # define MSI_INTEL_ADDR_DM_LOGICAL 0x00000004 /* Fields in data for Intel MSI messages. 
*/ #define MSI_INTEL_DATA_TRGRMOD IOART_TRGRMOD /* Trigger mode. */ # define MSI_INTEL_DATA_TRGREDG IOART_TRGREDG # define MSI_INTEL_DATA_TRGRLVL IOART_TRGRLVL #define MSI_INTEL_DATA_LEVEL 0x00004000 /* Polarity. */ # define MSI_INTEL_DATA_DEASSERT 0x00000000 # define MSI_INTEL_DATA_ASSERT 0x00004000 #define MSI_INTEL_DATA_DELMOD IOART_DELMOD /* Delivery mode. */ # define MSI_INTEL_DATA_DELFIXED IOART_DELFIXED # define MSI_INTEL_DATA_DELLOPRI IOART_DELLOPRI # define MSI_INTEL_DATA_DELSMI IOART_DELSMI # define MSI_INTEL_DATA_DELNMI IOART_DELNMI # define MSI_INTEL_DATA_DELINIT IOART_DELINIT # define MSI_INTEL_DATA_DELEXINT IOART_DELEXINT #define MSI_INTEL_DATA_INTVEC IOART_INTVEC /* Interrupt vector. */ /* * Build Intel MSI message and data values from a source. AMD64 systems * seem to be compatible, so we use the same function for both. */ #define INTEL_ADDR(msi) \ (MSI_INTEL_ADDR_BASE | (msi)->msi_cpu << 12 | \ MSI_INTEL_ADDR_RH_OFF | MSI_INTEL_ADDR_DM_PHYSICAL) #define INTEL_DATA(msi) \ (MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED | (msi)->msi_vector) static MALLOC_DEFINE(M_MSI, "msi", "PCI MSI"); /* * MSI sources are bunched into groups. This is because MSI forces * all of the messages to share the address and data registers and * thus certain properties (such as the local APIC ID target on x86). * Each group has a 'first' source that contains information global to * the group. These fields are marked with (g) below. * * Note that local APIC ID is kind of special. Each message will be * assigned an ID by the system; however, a group will use the ID from * the first message. * * For MSI-X, each message is isolated. */ struct msi_intsrc { struct intsrc msi_intsrc; device_t msi_dev; /* Owning device. (g) */ struct msi_intsrc *msi_first; /* First source in group. */ u_int msi_irq; /* IRQ cookie. */ u_int msi_msix; /* MSI-X message. */ u_int msi_vector:8; /* IDT vector. */ u_int msi_cpu; /* Local APIC ID. (g) */ u_int msi_count:8; /* Messages in this group. (g) */ u_int msi_maxcount:8; /* Alignment for this group. (g) */ int *msi_irqs; /* Group's IRQ list. (g) */ u_int msi_remap_cookie; }; static void msi_create_source(void); static void msi_enable_source(struct intsrc *isrc); static void msi_disable_source(struct intsrc *isrc, int eoi); static void msi_eoi_source(struct intsrc *isrc); static void msi_enable_intr(struct intsrc *isrc); static void msi_disable_intr(struct intsrc *isrc); static int msi_vector(struct intsrc *isrc); static int msi_source_pending(struct intsrc *isrc); static int msi_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id); struct pic msi_pic = { .pic_enable_source = msi_enable_source, .pic_disable_source = msi_disable_source, .pic_eoi_source = msi_eoi_source, .pic_enable_intr = msi_enable_intr, .pic_disable_intr = msi_disable_intr, .pic_vector = msi_vector, .pic_source_pending = msi_source_pending, .pic_suspend = NULL, .pic_resume = NULL, .pic_config_intr = msi_config_intr, .pic_assign_cpu = msi_assign_cpu, .pic_reprogram_pin = NULL, }; #ifdef SMP /** * Xen hypervisors prior to 4.6.0 do not properly handle updates to * enabled MSI-X table entries. Allow migration of MSI-X interrupts * to be disabled via a tunable. 
Values have the following meaning: * * -1: automatic detection by FreeBSD * 0: enable migration * 1: disable migration */ int msix_disable_migration = -1; SYSCTL_INT(_machdep, OID_AUTO, disable_msix_migration, CTLFLAG_RDTUN, &msix_disable_migration, 0, "Disable migration of MSI-X interrupts between CPUs"); #endif static int msi_enabled; static int msi_last_irq; static struct mtx msi_lock; static void msi_enable_source(struct intsrc *isrc) { } static void msi_disable_source(struct intsrc *isrc, int eoi) { - struct msi_intsrc *msi = (struct msi_intsrc *)isrc; if (eoi == PIC_EOI) - lapic_eoi(msi->msi_vector); + lapic_eoi(); } static void msi_eoi_source(struct intsrc *isrc) { - struct msi_intsrc *msi = (struct msi_intsrc *)isrc; - lapic_eoi(msi->msi_vector); + lapic_eoi(); } static void msi_enable_intr(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; apic_enable_vector(msi->msi_cpu, msi->msi_vector); } static void msi_disable_intr(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; apic_disable_vector(msi->msi_cpu, msi->msi_vector); } static int msi_vector(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; return (msi->msi_irq); } static int msi_source_pending(struct intsrc *isrc) { return (0); } static int msi_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { return (ENODEV); } static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id) { struct msi_intsrc *sib, *msi = (struct msi_intsrc *)isrc; int old_vector; u_int old_id; int i, vector; /* * Only allow CPUs to be assigned to the first message for an * MSI group. */ if (msi->msi_first != msi) return (EINVAL); #ifdef SMP if (msix_disable_migration && msi->msi_msix) return (EINVAL); #endif /* Store information to free existing irq. */ old_vector = msi->msi_vector; old_id = msi->msi_cpu; if (old_id == apic_id) return (0); /* Allocate IDT vectors on this cpu. */ if (msi->msi_count > 1) { KASSERT(msi->msi_msix == 0, ("MSI-X message group")); vector = apic_alloc_vectors(apic_id, msi->msi_irqs, msi->msi_count, msi->msi_maxcount); } else vector = apic_alloc_vector(apic_id, msi->msi_irq); if (vector == 0) return (ENOSPC); msi->msi_cpu = apic_id; msi->msi_vector = vector; if (msi->msi_intsrc.is_handlers > 0) apic_enable_vector(msi->msi_cpu, msi->msi_vector); if (bootverbose) printf("msi: Assigning %s IRQ %d to local APIC %u vector %u\n", msi->msi_msix ? "MSI-X" : "MSI", msi->msi_irq, msi->msi_cpu, msi->msi_vector); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); sib->msi_cpu = apic_id; sib->msi_vector = vector + i; if (sib->msi_intsrc.is_handlers > 0) apic_enable_vector(sib->msi_cpu, sib->msi_vector); if (bootverbose) printf( "msi: Assigning MSI IRQ %d to local APIC %u vector %u\n", sib->msi_irq, sib->msi_cpu, sib->msi_vector); } BUS_REMAP_INTR(device_get_parent(msi->msi_dev), msi->msi_dev, msi->msi_irq); /* * Free the old vector after the new one is established. This is done * to prevent races where we could miss an interrupt. */ if (msi->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector); apic_free_vector(old_id, old_vector, msi->msi_irq); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); if (sib->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector + i); apic_free_vector(old_id, old_vector + i, msi->msi_irqs[i]); } return (0); } void msi_init(void) { /* Check if we have a supported CPU. 
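 *
 * Note, for illustration: the SMP-only msix_disable_migration knob declared
 * above is left at -1 so this function can pick the default below; since
 * the sysctl is CTLFLAG_RDTUN, an administrator can instead force it at
 * boot, e.g. with
 *
 *      machdep.disable_msix_migration=1
 *
 * in loader.conf.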
*/ switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: case CPU_VENDOR_AMD: break; case CPU_VENDOR_CENTAUR: if (CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf) break; /* FALLTHROUGH */ default: return; } #ifdef SMP if (msix_disable_migration == -1) { /* The default is to allow migration of MSI-X interrupts. */ msix_disable_migration = 0; } #endif msi_enabled = 1; intr_register_pic(&msi_pic); mtx_init(&msi_lock, "msi", NULL, MTX_DEF); } static void msi_create_source(void) { struct msi_intsrc *msi; u_int irq; mtx_lock(&msi_lock); if (msi_last_irq >= NUM_MSI_INTS) { mtx_unlock(&msi_lock); return; } irq = msi_last_irq + FIRST_MSI_INT; msi_last_irq++; mtx_unlock(&msi_lock); msi = malloc(sizeof(struct msi_intsrc), M_MSI, M_WAITOK | M_ZERO); msi->msi_intsrc.is_pic = &msi_pic; msi->msi_irq = irq; intr_register_source(&msi->msi_intsrc); nexus_add_irq(irq); } /* * Try to allocate 'count' interrupt sources with contiguous IDT values. */ int msi_alloc(device_t dev, int count, int maxcount, int *irqs) { struct msi_intsrc *msi, *fsrc; u_int cpu; int cnt, i, *mirqs, vector; #ifdef ACPI_DMAR u_int cookies[count]; int error; #endif if (!msi_enabled) return (ENXIO); if (count > 1) mirqs = malloc(count * sizeof(*mirqs), M_MSI, M_WAITOK); else mirqs = NULL; again: mtx_lock(&msi_lock); /* Try to find 'count' free IRQs. */ cnt = 0; for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ if (msi == NULL) break; /* If this is a free one, save its IRQ in the array. */ if (msi->msi_dev == NULL) { irqs[cnt] = i; cnt++; if (cnt == count) break; } } /* Do we need to create some new sources? */ if (cnt < count) { /* If we would exceed the max, give up. */ if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENXIO); } mtx_unlock(&msi_lock); /* We need count - cnt more sources. */ while (cnt < count) { msi_create_source(); cnt++; } goto again; } /* Ok, we now have the IRQs allocated. */ KASSERT(cnt == count, ("count mismatch")); /* Allocate 'count' IDT vectors. */ cpu = intr_next_cpu(); vector = apic_alloc_vectors(cpu, irqs, count, maxcount); if (vector == 0) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENOSPC); } #ifdef ACPI_DMAR mtx_unlock(&msi_lock); error = iommu_alloc_msi_intr(dev, cookies, count); mtx_lock(&msi_lock); if (error == EOPNOTSUPP) error = 0; if (error != 0) { for (i = 0; i < count; i++) apic_free_vector(cpu, vector + i, irqs[i]); free(mirqs, M_MSI); return (error); } for (i = 0; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); msi->msi_remap_cookie = cookies[i]; } #endif /* Assign IDT vectors and make these messages owned by 'dev'. 
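 *
 * For context (illustrative only): drivers do not call msi_alloc()
 * directly, they normally reach it through the PCI bus code, roughly:
 *
 *      int count = 4;                          (example value)
 *      if (pci_alloc_msi(dev, &count) == 0)
 *              device_printf(dev, "using %d MSI messages\n", count);
 *
 * pci_alloc_msi(9) may negotiate 'count' downward before the request ends
 * up here as the final count/maxcount pair.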
*/ fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]); for (i = 0; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); msi->msi_cpu = cpu; msi->msi_dev = dev; msi->msi_vector = vector + i; if (bootverbose) printf( "msi: routing MSI IRQ %d to local APIC %u vector %u\n", msi->msi_irq, msi->msi_cpu, msi->msi_vector); msi->msi_first = fsrc; KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI has handlers")); } fsrc->msi_count = count; fsrc->msi_maxcount = maxcount; if (count > 1) bcopy(irqs, mirqs, count * sizeof(*mirqs)); fsrc->msi_irqs = mirqs; mtx_unlock(&msi_lock); return (0); } int msi_release(int *irqs, int count) { struct msi_intsrc *msi, *first; int i; mtx_lock(&msi_lock); first = (struct msi_intsrc *)intr_lookup_source(irqs[0]); if (first == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this isn't an MSI-X message. */ if (first->msi_msix) { mtx_unlock(&msi_lock); return (EINVAL); } /* Make sure this message is allocated to a group. */ if (first->msi_first == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } /* * Make sure this is the start of a group and that we are releasing * the entire group. */ if (first->msi_first != first || first->msi_count != count) { mtx_unlock(&msi_lock); return (EINVAL); } KASSERT(first->msi_dev != NULL, ("unowned group")); /* Clear all the extra messages in the group. */ for (i = 1; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); KASSERT(msi->msi_first == first, ("message not in group")); KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch")); #ifdef ACPI_DMAR iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie); #endif msi->msi_first = NULL; msi->msi_dev = NULL; apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq); msi->msi_vector = 0; } /* Clear out the first message. */ #ifdef ACPI_DMAR mtx_unlock(&msi_lock); iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie); mtx_lock(&msi_lock); #endif first->msi_first = NULL; first->msi_dev = NULL; apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq); first->msi_vector = 0; first->msi_count = 0; first->msi_maxcount = 0; free(first->msi_irqs, M_MSI); first->msi_irqs = NULL; mtx_unlock(&msi_lock); return (0); } int msi_map(int irq, uint64_t *addr, uint32_t *data) { struct msi_intsrc *msi; int error; #ifdef ACPI_DMAR struct msi_intsrc *msi1; int i, k; #endif mtx_lock(&msi_lock); msi = (struct msi_intsrc *)intr_lookup_source(irq); if (msi == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this message is allocated to a device. */ if (msi->msi_dev == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } /* * If this message isn't an MSI-X message, make sure it's part * of a group, and switch to the first message in the * group. 
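 *
 * For reference (illustrative expansion of the macros defined earlier in
 * this file): when no interrupt remapping is active, the address/data pair
 * returned to the caller is built straight from the group's target LAPIC ID
 * and IDT vector, i.e. roughly
 *
 *      *addr = MSI_INTEL_ADDR_BASE | (msi->msi_cpu << 12) |
 *          MSI_INTEL_ADDR_RH_OFF | MSI_INTEL_ADDR_DM_PHYSICAL;
 *      *data = MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED |
 *          msi->msi_vector;
 *
 * which is what the INTEL_ADDR()/INTEL_DATA() fallback at the end of this
 * function produces.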
*/ if (!msi->msi_msix) { if (msi->msi_first == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } msi = msi->msi_first; } #ifdef ACPI_DMAR if (!msi->msi_msix) { for (k = msi->msi_count - 1, i = FIRST_MSI_INT; k > 0 && i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { if (i == msi->msi_irq) continue; msi1 = (struct msi_intsrc *)intr_lookup_source(i); if (!msi1->msi_msix && msi1->msi_first == msi) { mtx_unlock(&msi_lock); iommu_map_msi_intr(msi1->msi_dev, msi1->msi_cpu, msi1->msi_vector, msi1->msi_remap_cookie, NULL, NULL); k--; mtx_lock(&msi_lock); } } } mtx_unlock(&msi_lock); error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu, msi->msi_vector, msi->msi_remap_cookie, addr, data); #else mtx_unlock(&msi_lock); error = EOPNOTSUPP; #endif if (error == EOPNOTSUPP) { *addr = INTEL_ADDR(msi); *data = INTEL_DATA(msi); error = 0; } return (error); } int msix_alloc(device_t dev, int *irq) { struct msi_intsrc *msi; u_int cpu; int i, vector; #ifdef ACPI_DMAR u_int cookie; int error; #endif if (!msi_enabled) return (ENXIO); again: mtx_lock(&msi_lock); /* Find a free IRQ. */ for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ if (msi == NULL) break; /* Stop at the first free source. */ if (msi->msi_dev == NULL) break; } /* Do we need to create a new source? */ if (msi == NULL) { /* If we would exceed the max, give up. */ if (i + 1 > FIRST_MSI_INT + NUM_MSI_INTS) { mtx_unlock(&msi_lock); return (ENXIO); } mtx_unlock(&msi_lock); /* Create a new source. */ msi_create_source(); goto again; } /* Allocate an IDT vector. */ cpu = intr_next_cpu(); vector = apic_alloc_vector(cpu, i); if (vector == 0) { mtx_unlock(&msi_lock); return (ENOSPC); } msi->msi_dev = dev; #ifdef ACPI_DMAR mtx_unlock(&msi_lock); error = iommu_alloc_msi_intr(dev, &cookie, 1); mtx_lock(&msi_lock); if (error == EOPNOTSUPP) error = 0; if (error != 0) { msi->msi_dev = NULL; apic_free_vector(cpu, vector, i); return (error); } msi->msi_remap_cookie = cookie; #endif if (bootverbose) printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n", msi->msi_irq, cpu, vector); /* Setup source. */ msi->msi_cpu = cpu; msi->msi_first = msi; msi->msi_vector = vector; msi->msi_msix = 1; msi->msi_count = 1; msi->msi_maxcount = 1; msi->msi_irqs = NULL; KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI-X has handlers")); mtx_unlock(&msi_lock); *irq = i; return (0); } int msix_release(int irq) { struct msi_intsrc *msi; mtx_lock(&msi_lock); msi = (struct msi_intsrc *)intr_lookup_source(irq); if (msi == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this is an MSI-X message. */ if (!msi->msi_msix) { mtx_unlock(&msi_lock); return (EINVAL); } KASSERT(msi->msi_dev != NULL, ("unowned message")); /* Clear out the message. */ #ifdef ACPI_DMAR mtx_unlock(&msi_lock); iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie); mtx_lock(&msi_lock); #endif msi->msi_first = NULL; msi->msi_dev = NULL; apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq); msi->msi_vector = 0; msi->msi_msix = 0; msi->msi_count = 0; msi->msi_maxcount = 0; mtx_unlock(&msi_lock); return (0); } Index: head/sys/x86/xen/xen_apic.c =================================================================== --- head/sys/x86/xen/xen_apic.c (revision 316020) +++ head/sys/x86/xen/xen_apic.c (revision 316021) @@ -1,546 +1,546 @@ /* * Copyright (c) 2014 Roger Pau MonnĂ© * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*--------------------------------- Macros -----------------------------------*/ #define XEN_APIC_UNSUPPORTED \ panic("%s: not available in Xen PV port.", __func__) /*--------------------------- Forward Declarations ---------------------------*/ #ifdef SMP static driver_filter_t xen_smp_rendezvous_action; static driver_filter_t xen_invltlb; static driver_filter_t xen_invlpg; static driver_filter_t xen_invlrng; static driver_filter_t xen_invlcache; static driver_filter_t xen_ipi_bitmap_handler; static driver_filter_t xen_cpustop_handler; static driver_filter_t xen_cpususpend_handler; static driver_filter_t xen_cpustophard_handler; #endif /*---------------------------------- Macros ----------------------------------*/ #define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS) /*--------------------------------- Xen IPIs ---------------------------------*/ #ifdef SMP struct xen_ipi_handler { driver_filter_t *filter; const char *description; }; static struct xen_ipi_handler xen_ipis[] = { [IPI_TO_IDX(IPI_RENDEZVOUS)] = { xen_smp_rendezvous_action, "r" }, [IPI_TO_IDX(IPI_INVLTLB)] = { xen_invltlb, "itlb"}, [IPI_TO_IDX(IPI_INVLPG)] = { xen_invlpg, "ipg" }, [IPI_TO_IDX(IPI_INVLRNG)] = { xen_invlrng, "irg" }, [IPI_TO_IDX(IPI_INVLCACHE)] = { xen_invlcache, "ic" }, [IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler, "b" }, [IPI_TO_IDX(IPI_STOP)] = { xen_cpustop_handler, "st" }, [IPI_TO_IDX(IPI_SUSPEND)] = { xen_cpususpend_handler, "sp" }, [IPI_TO_IDX(IPI_STOP_HARD)] = { xen_cpustophard_handler, "sth" }, }; #endif /*------------------------------- Per-CPU Data -------------------------------*/ #ifdef SMP DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]); #endif /*------------------------------- Xen PV APIC --------------------------------*/ static void xen_pv_lapic_create(u_int apic_id, int boot_cpu) { #ifdef SMP cpu_add(apic_id, boot_cpu); #endif } static void xen_pv_lapic_init(vm_paddr_t addr) { } static void xen_pv_lapic_setup(int boot) { } static void xen_pv_lapic_dump(const char *str) { printf("cpu%d %s XEN PV LAPIC\n", 
PCPU_GET(cpuid), str); } static void xen_pv_lapic_disable(void) { } static bool xen_pv_lapic_is_x2apic(void) { return (false); } static void -xen_pv_lapic_eoi(u_int vector) +xen_pv_lapic_eoi(void) { XEN_APIC_UNSUPPORTED; } static int xen_pv_lapic_id(void) { return (PCPU_GET(apic_id)); } static int xen_pv_lapic_intr_pending(u_int vector) { XEN_APIC_UNSUPPORTED; return (0); } static u_int xen_pv_apic_cpuid(u_int apic_id) { #ifdef SMP return (apic_cpuids[apic_id]); #else return (0); #endif } static u_int xen_pv_apic_alloc_vector(u_int apic_id, u_int irq) { XEN_APIC_UNSUPPORTED; return (0); } static u_int xen_pv_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { XEN_APIC_UNSUPPORTED; return (0); } static void xen_pv_apic_disable_vector(u_int apic_id, u_int vector) { XEN_APIC_UNSUPPORTED; } static void xen_pv_apic_enable_vector(u_int apic_id, u_int vector) { XEN_APIC_UNSUPPORTED; } static void xen_pv_apic_free_vector(u_int apic_id, u_int vector, u_int irq) { XEN_APIC_UNSUPPORTED; } static void xen_pv_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { XEN_APIC_UNSUPPORTED; } static int xen_pv_lapic_enable_pmc(void) { XEN_APIC_UNSUPPORTED; return (0); } static void xen_pv_lapic_disable_pmc(void) { XEN_APIC_UNSUPPORTED; } static void xen_pv_lapic_reenable_pmc(void) { XEN_APIC_UNSUPPORTED; } static void xen_pv_lapic_enable_cmc(void) { } #ifdef SMP static void xen_pv_lapic_ipi_raw(register_t icrlo, u_int dest) { XEN_APIC_UNSUPPORTED; } static void xen_pv_lapic_ipi_vectored(u_int vector, int dest) { xen_intr_handle_t *ipi_handle; int ipi_idx, to_cpu, self; ipi_idx = IPI_TO_IDX(vector); if (ipi_idx >= nitems(xen_ipis)) panic("IPI out of range"); switch(dest) { case APIC_IPI_DEST_SELF: ipi_handle = DPCPU_GET(ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); break; case APIC_IPI_DEST_ALL: CPU_FOREACH(to_cpu) { ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); } break; case APIC_IPI_DEST_OTHERS: self = PCPU_GET(cpuid); CPU_FOREACH(to_cpu) { if (to_cpu != self) { ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); } } break; default: to_cpu = apic_cpuid(dest); ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); break; } } static int xen_pv_lapic_ipi_wait(int delay) { XEN_APIC_UNSUPPORTED; return (0); } #endif /* SMP */ static int xen_pv_lapic_ipi_alloc(inthand_t *ipifunc) { XEN_APIC_UNSUPPORTED; return (-1); } static void xen_pv_lapic_ipi_free(int vector) { XEN_APIC_UNSUPPORTED; } static int xen_pv_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked) { XEN_APIC_UNSUPPORTED; return (0); } static int xen_pv_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode) { XEN_APIC_UNSUPPORTED; return (0); } static int xen_pv_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) { XEN_APIC_UNSUPPORTED; return (0); } static int xen_pv_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger) { XEN_APIC_UNSUPPORTED; return (0); } /* Xen apic_ops implementation */ struct apic_ops xen_apic_ops = { .create = xen_pv_lapic_create, .init = xen_pv_lapic_init, .xapic_mode = xen_pv_lapic_disable, .is_x2apic = xen_pv_lapic_is_x2apic, .setup = xen_pv_lapic_setup, .dump = xen_pv_lapic_dump, .disable = xen_pv_lapic_disable, .eoi = xen_pv_lapic_eoi, .id = xen_pv_lapic_id, .intr_pending = xen_pv_lapic_intr_pending, .set_logical_id = xen_pv_lapic_set_logical_id, .cpuid = xen_pv_apic_cpuid, .alloc_vector = xen_pv_apic_alloc_vector, 
.alloc_vectors = xen_pv_apic_alloc_vectors, .enable_vector = xen_pv_apic_enable_vector, .disable_vector = xen_pv_apic_disable_vector, .free_vector = xen_pv_apic_free_vector, .enable_pmc = xen_pv_lapic_enable_pmc, .disable_pmc = xen_pv_lapic_disable_pmc, .reenable_pmc = xen_pv_lapic_reenable_pmc, .enable_cmc = xen_pv_lapic_enable_cmc, #ifdef SMP .ipi_raw = xen_pv_lapic_ipi_raw, .ipi_vectored = xen_pv_lapic_ipi_vectored, .ipi_wait = xen_pv_lapic_ipi_wait, #endif .ipi_alloc = xen_pv_lapic_ipi_alloc, .ipi_free = xen_pv_lapic_ipi_free, .set_lvt_mask = xen_pv_lapic_set_lvt_mask, .set_lvt_mode = xen_pv_lapic_set_lvt_mode, .set_lvt_polarity = xen_pv_lapic_set_lvt_polarity, .set_lvt_triggermode = xen_pv_lapic_set_lvt_triggermode, }; #ifdef SMP /*---------------------------- XEN PV IPI Handlers ---------------------------*/ /* * These are C clones of the ASM functions found in apic_vector. */ static int xen_ipi_bitmap_handler(void *arg) { struct trapframe *frame; frame = arg; ipi_bitmap_handler(*frame); return (FILTER_HANDLED); } static int xen_smp_rendezvous_action(void *arg) { #ifdef COUNT_IPIS (*ipi_rendezvous_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ smp_rendezvous_action(); return (FILTER_HANDLED); } static int xen_invltlb(void *arg) { invltlb_handler(); return (FILTER_HANDLED); } #ifdef __amd64__ static int xen_invltlb_invpcid(void *arg) { invltlb_invpcid_handler(); return (FILTER_HANDLED); } static int xen_invltlb_pcid(void *arg) { invltlb_pcid_handler(); return (FILTER_HANDLED); } #endif static int xen_invlpg(void *arg) { invlpg_handler(); return (FILTER_HANDLED); } static int xen_invlrng(void *arg) { invlrng_handler(); return (FILTER_HANDLED); } static int xen_invlcache(void *arg) { invlcache_handler(); return (FILTER_HANDLED); } static int xen_cpustop_handler(void *arg) { cpustop_handler(); return (FILTER_HANDLED); } static int xen_cpususpend_handler(void *arg) { cpususpend_handler(); return (FILTER_HANDLED); } static int xen_cpustophard_handler(void *arg) { ipi_nmi_handler(); return (FILTER_HANDLED); } /*----------------------------- XEN PV IPI setup -----------------------------*/ /* * Those functions are provided outside of the Xen PV APIC implementation * so PVHVM guests can also use PV IPIs without having an actual Xen PV APIC, * because on PVHVM there's an emulated LAPIC provided by Xen. */ static void xen_cpu_ipi_init(int cpu) { xen_intr_handle_t *ipi_handle; const struct xen_ipi_handler *ipi; int idx, rc; ipi_handle = DPCPU_ID_GET(cpu, ipi_handle); for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) { if (ipi->filter == NULL) { ipi_handle[idx] = NULL; continue; } rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter, INTR_TYPE_TTY, &ipi_handle[idx]); if (rc != 0) panic("Unable to allocate a XEN IPI port"); xen_intr_describe(ipi_handle[idx], "%s", ipi->description); } } static void xen_setup_cpus(void) { int i; if (!xen_vector_callback_enabled) return; #ifdef __amd64__ if (pmap_pcid_enabled) { xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = invpcid_works ? xen_invltlb_invpcid : xen_invltlb_pcid; } #endif CPU_FOREACH(i) xen_cpu_ipi_init(i); /* Set the xen pv ipi ops to replace the native ones */ if (xen_hvm_domain()) apic_ops.ipi_vectored = xen_pv_lapic_ipi_vectored; } /* We need to setup IPIs before APs are started */ SYSINIT(xen_setup_cpus, SI_SUB_SMP-1, SI_ORDER_FIRST, xen_setup_cpus, NULL); #endif /* SMP */
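/*
 * Illustrative sketch, not part of this change: with the argument-less
 * lapic_eoi() used in the msi.c and xen_apic.c hunks above, an interrupt
 * controller's EOI method no longer needs to recover the vector it is
 * acknowledging.  A hypothetical PIC built on the same ops would simply do:
 *
 *      static void
 *      example_eoi_source(struct intsrc *isrc)
 *      {
 *
 *              lapic_eoi();
 *      }
 *
 * mirroring the simplified msi_eoi_source(); the local APIC acknowledges
 * its highest-priority in-service interrupt, so no per-source vector
 * bookkeeping is required.
 */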