diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 8b6933a2c1ef..4dfb4fe1fe8d 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -1,366 +1,365 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #ifdef _KERNEL #define VM_MAX_NAMELEN 32 struct vm; struct vm_memory_segment; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct pmap; enum x2apic_state; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, struct pmap *pmap, void *rendezvous_cookie); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_inject_event_t)(void *vmi, int vcpu, int type, int vector, uint32_t code, int code_valid); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_inject_event_t vminject; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; }; extern struct vmm_ops vmm_ops_intel; extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); const char *vm_name(struct vm *vm); int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg); int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, vm_offset_t *offset, struct vm_object **object); boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_inject_event(struct vm *vm, int vcpu, int type, int vector, uint32_t error_code, int error_code_valid); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); uint64_t *vm_guest_msrs(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); void vm_activate_cpu(struct vm *vm, int vcpu); cpuset_t vm_active_cpus(struct vm *vm); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * If the rendezvous is being initiated from a vcpu context then the * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); static __inline int vcpu_rendezvous_pending(void *rendezvous_cookie) { return (*(uintptr_t *)rendezvous_cookie != 0); } /* * Return 1 if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return 0 otherwise. */ int vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); #endif /* KERNEL */ #include #define VM_MAXCPU 16 /* maximum virtual cpus */ /* * Identifiers for events that can be injected into the VM */ enum vm_event_type { VM_EVENT_NONE, VM_HW_INTR, VM_NMI, VM_HW_EXCEPTION, VM_SW_INTR, VM_PRIV_SW_EXCEPTION, VM_SW_EXCEPTION, VM_EVENT_MAX }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_LAST }; /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_MAX }; enum x2apic_state { - X2APIC_ENABLED, - X2APIC_AVAILABLE, X2APIC_DISABLED, + X2APIC_ENABLED, X2APIC_STATE_LAST }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_SPINDOWN_CPU, VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_MAX }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; /* out is 0, in is 1 */ uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ } inout; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cr3; enum vie_cpu_mode cpu_mode; enum vie_paging_mode paging_mode; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; } hlt; struct { int vector; } ioapic_eoi; } u; }; #endif /* _VMM_H_ */ diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index f855f730c49c..d1f72341a909 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -1,1544 +1,1577 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ipi.h" #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vlapic.h" #include "vlapic_priv.h" #include "vioapic.h" #define PRIO(x) ((x) >> 4) #define VLAPIC_VERSION (16) #define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) /* * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the * vlapic_callout_handler() and vcpu accesses to: * - timer_freq_bt, timer_period_bt, timer_fire_bt * - timer LVT register */ #define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) #define VLAPIC_BUS_FREQ tsc_freq static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) { if (x2apic(vlapic)) return (vlapic->vcpuid); else return (vlapic->vcpuid << 24); } static uint32_t x2apic_ldr(struct vlapic *vlapic) { int apicid; uint32_t ldr; apicid = vlapic_get_id(vlapic); ldr = 1 << (apicid & 0xf); ldr |= (apicid & 0xffff0) << 12; return (ldr); } void vlapic_dfr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; if (x2apic(vlapic)) { VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", lapic->dfr); lapic->dfr = 0; return; } lapic->dfr &= APIC_DFR_MODEL_MASK; lapic->dfr |= APIC_DFR_RESERVED; if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); else VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); } void vlapic_ldr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; /* LDR is read-only in x2apic mode */ if (x2apic(vlapic)) { VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", lapic->ldr); lapic->ldr = x2apic_ldr(vlapic); } else { lapic->ldr &= ~APIC_LDR_RESERVED; VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); } } void vlapic_id_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; /* * We don't allow the ID register to be modified so reset it back to * its default value. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); } static int vlapic_timer_divisor(uint32_t dcr) { switch (dcr & 0xB) { case APIC_TDCR_1: return (1); case APIC_TDCR_2: return (2); case APIC_TDCR_4: return (4); case APIC_TDCR_8: return (8); case APIC_TDCR_16: return (16); case APIC_TDCR_32: return (32); case APIC_TDCR_64: return (64); case APIC_TDCR_128: return (128); default: panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); } } #if 0 static inline void vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) { printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, *lvt & APIC_LVTT_M); } #endif static uint32_t vlapic_get_ccr(struct vlapic *vlapic) { struct bintime bt_now, bt_rem; struct LAPIC *lapic; uint32_t ccr; ccr = 0; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); if (callout_active(&vlapic->callout)) { /* * If the timer is scheduled to expire in the future then * compute the value of 'ccr' based on the remaining time. */ binuptime(&bt_now); if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { bt_rem = vlapic->timer_fire_bt; bintime_sub(&bt_rem, &bt_now); ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; } } KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " "icr_timer is %#x", ccr, lapic->icr_timer)); VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", ccr, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); return (ccr); } void vlapic_dcr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; int divisor; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); divisor = vlapic_timer_divisor(lapic->dcr_timer); VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", lapic->dcr_timer, divisor); /* * Update the timer frequency and the timer period. * * XXX changes to the frequency divider will not take effect until * the timer is reloaded. */ FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_esr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->esr = vlapic->esr_pending; vlapic->esr_pending = 0; } int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *irrptr, *tmrptr, mask; int idx; KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); lapic = vlapic->apic_page; if (!(lapic->svr & APIC_SVR_ENABLE)) { VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " "interrupt %d", vector); return (0); } if (vector < 16) { vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); } if (vlapic->ops.set_intr_ready) return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); idx = (vector / 32) * 4; mask = 1 << (vector % 32); irrptr = &lapic->irr0; atomic_set_int(&irrptr[idx], mask); /* * Verify that the trigger-mode of the interrupt matches with * the vlapic TMR registers. */ tmrptr = &lapic->tmr0; if ((tmrptr[idx] & mask) != (level ? mask : 0)) { VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " "interrupt is %s-triggered", idx / 4, tmrptr[idx], level ? "level" : "edge"); } VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); return (1); } static __inline uint32_t * vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { struct LAPIC *lapic = vlapic->apic_page; int i; switch (offset) { case APIC_OFFSET_CMCI_LVT: return (&lapic->lvt_cmci); case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; return ((&lapic->lvt_timer) + i);; default: panic("vlapic_get_lvt: invalid LVT\n"); } } static __inline int lvt_off_to_idx(uint32_t offset) { int index; switch (offset) { case APIC_OFFSET_CMCI_LVT: index = APIC_LVT_CMCI; break; case APIC_OFFSET_TIMER_LVT: index = APIC_LVT_TIMER; break; case APIC_OFFSET_THERM_LVT: index = APIC_LVT_THERMAL; break; case APIC_OFFSET_PERF_LVT: index = APIC_LVT_PMC; break; case APIC_OFFSET_LINT0_LVT: index = APIC_LVT_LINT0; break; case APIC_OFFSET_LINT1_LVT: index = APIC_LVT_LINT1; break; case APIC_OFFSET_ERROR_LVT: index = APIC_LVT_ERROR; break; default: index = -1; break; } KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " "invalid lvt index %d for offset %#x", index, offset)); return (index); } static __inline uint32_t vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) { int idx; uint32_t val; idx = lvt_off_to_idx(offset); val = atomic_load_acq_32(&vlapic->lvt_last[idx]); return (val); } void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) { uint32_t *lvtptr, mask, val; struct LAPIC *lapic; int idx; lapic = vlapic->apic_page; lvtptr = vlapic_get_lvtptr(vlapic, offset); val = *lvtptr; idx = lvt_off_to_idx(offset); if (!(lapic->svr & APIC_SVR_ENABLE)) val |= APIC_LVT_M; mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; switch (offset) { case APIC_OFFSET_TIMER_LVT: mask |= APIC_LVTT_TM; break; case APIC_OFFSET_ERROR_LVT: break; case APIC_OFFSET_LINT0_LVT: case APIC_OFFSET_LINT1_LVT: mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; /* FALLTHROUGH */ default: mask |= APIC_LVT_DM; break; } val &= mask; *lvtptr = val; atomic_store_rel_32(&vlapic->lvt_last[idx], val); } static void vlapic_mask_lvts(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; lapic->lvt_cmci |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); lapic->lvt_timer |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); lapic->lvt_thermal |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); lapic->lvt_pcint |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); lapic->lvt_lint0 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); lapic->lvt_lint1 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); lapic->lvt_error |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); } static int vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) { uint32_t vec, mode; if (lvt & APIC_LVT_M) return (0); vec = lvt & APIC_LVT_VECTOR; mode = lvt & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vm, vlapic->vcpuid); break; default: // Other modes ignored return (0); } return (1); } #if 1 static void dump_isrvec_stk(struct vlapic *vlapic) { int i; uint32_t *isrptr; isrptr = &vlapic->apic_page->isr0; for (i = 0; i < 8; i++) printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); for (i = 0; i <= vlapic->isrvec_stk_top; i++) printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); } #endif /* * Algorithm adopted from section "Interrupt, Task and Processor Priority" * in Intel Architecture Manual Vol 3a. */ static void vlapic_update_ppr(struct vlapic *vlapic) { int isrvec, tpr, ppr; /* * Note that the value on the stack at index 0 is always 0. * * This is a placeholder for the value of ISRV when none of the * bits is set in the ISRx registers. */ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; tpr = vlapic->apic_page->tpr; #if 1 { int i, lastprio, curprio, vector, idx; uint32_t *isrptr; if (vlapic->isrvec_stk_top == 0 && isrvec != 0) panic("isrvec_stk is corrupted: %d", isrvec); /* * Make sure that the priority of the nested interrupts is * always increasing. */ lastprio = -1; for (i = 1; i <= vlapic->isrvec_stk_top; i++) { curprio = PRIO(vlapic->isrvec_stk[i]); if (curprio <= lastprio) { dump_isrvec_stk(vlapic); panic("isrvec_stk does not satisfy invariant"); } lastprio = curprio; } /* * Make sure that each bit set in the ISRx registers has a * corresponding entry on the isrvec stack. */ i = 1; isrptr = &vlapic->apic_page->isr0; for (vector = 0; vector < 256; vector++) { idx = (vector / 32) * 4; if (isrptr[idx] & (1 << (vector % 32))) { if (i > vlapic->isrvec_stk_top || vlapic->isrvec_stk[i] != vector) { dump_isrvec_stk(vlapic); panic("ISR and isrvec_stk out of sync"); } i++; } } } #endif if (PRIO(tpr) >= PRIO(isrvec)) ppr = tpr; else ppr = isrvec & 0xf0; vlapic->apic_page->ppr = ppr; VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } static void vlapic_process_eoi(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *isrptr, *tmrptr; int i, idx, bitpos, vector; isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; /* * The x86 architecture reserves the the first 32 vectors for use * by the processor. */ for (i = 7; i > 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { if (vlapic->isrvec_stk_top <= 0) { panic("invalid vlapic isrvec_stk_top %d", vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { vector = i * 32 + bitpos; vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, vector); } return; } } } static __inline int vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) { return (lvt & mask); } static __inline int vlapic_periodic_timer(struct vlapic *vlapic) { uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); } static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); void vlapic_set_error(struct vlapic *vlapic, uint32_t mask) { uint32_t lvt; vlapic->esr_pending |= mask; if (vlapic->esr_firing) return; vlapic->esr_firing = 1; // The error LVT always uses the fixed delivery mode. lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); } vlapic->esr_firing = 0; } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { uint32_t lvt; KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); // The timer LVT always uses the fixed delivery mode. lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); } } static VMM_STAT(VLAPIC_INTR_CMC, "corrected machine check interrupts generated by vlapic"); void vlapic_fire_cmci(struct vlapic *vlapic) { uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); if (vlapic_fire_lvt(vlapic, lvt)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); } } static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, "lvts triggered"); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { uint32_t lvt; switch (vector) { case APIC_LVT_LINT0: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT); break; case APIC_LVT_LINT1: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT); break; case APIC_LVT_TIMER: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); lvt |= APIC_LVT_DM_FIXED; break; case APIC_LVT_ERROR: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); lvt |= APIC_LVT_DM_FIXED; break; case APIC_LVT_PMC: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT); break; case APIC_LVT_THERMAL: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT); break; case APIC_LVT_CMCI: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); break; default: return (EINVAL); } if (vlapic_fire_lvt(vlapic, lvt)) { vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, LVTS_TRIGGERRED, vector, 1); } return (0); } static void vlapic_callout_handler(void *arg) { struct vlapic *vlapic; struct bintime bt, btnow; sbintime_t rem_sbt; vlapic = arg; VLAPIC_TIMER_LOCK(vlapic); if (callout_pending(&vlapic->callout)) /* callout was reset */ goto done; if (!callout_active(&vlapic->callout)) /* callout was stopped */ goto done; callout_deactivate(&vlapic->callout); vlapic_fire_timer(vlapic); if (vlapic_periodic_timer(vlapic)) { binuptime(&btnow); KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, vlapic->timer_fire_bt.frac)); /* * Compute the delta between when the timer was supposed to * fire and the present time. */ bt = btnow; bintime_sub(&bt, &vlapic->timer_fire_bt); rem_sbt = bttosbt(vlapic->timer_period_bt); if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { /* * Adjust the time until the next countdown downward * to account for the lost time. */ rem_sbt -= bttosbt(bt); } else { /* * If the delta is greater than the timer period then * just reset our time base instead of trying to catch * up. */ vlapic->timer_fire_bt = btnow; VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " "usecs, period is %lu usecs - resetting time base", bttosbt(bt) / SBT_1US, bttosbt(vlapic->timer_period_bt) / SBT_1US); } bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); callout_reset_sbt(&vlapic->callout, rem_sbt, 0, vlapic_callout_handler, vlapic, 0); } done: VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_icrtmr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; sbintime_t sbt; uint32_t icr_timer; VLAPIC_TIMER_LOCK(vlapic); lapic = vlapic->apic_page; icr_timer = lapic->icr_timer; vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, icr_timer); if (icr_timer != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); callout_reset_sbt(&vlapic->callout, sbt, 0, vlapic_callout_handler, vlapic, 0); } else callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); } /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ static void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { struct vlapic *vlapic; uint32_t dfr, ldr, ldest, cluster; uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; cpuset_t amask; int vcpuid; if ((x2apic_dest && dest == 0xffffffff) || (!x2apic_dest && dest == 0xff)) { /* * Broadcast in both logical and physical modes. */ *dmask = vm_active_cpus(vm); return; } if (phys) { /* * Physical mode: destination is APIC ID. */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); if (vcpuid < VM_MAXCPU) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide * bitmask. This model is only avilable in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; /* * In the "Cluster Model" the MDA is used to identify a * specific cluster and a set of APICs in that cluster. */ if (x2apic_dest) { mda_cluster_id = dest >> 16; mda_cluster_ldest = dest & 0xffff; } else { mda_cluster_id = (dest >> 4) & 0xf; mda_cluster_ldest = dest & 0xf; } /* * Logical mode: match each APIC that has a bit set * in it's LDR that matches a bit in the ldest. */ CPU_ZERO(dmask); amask = vm_active_cpus(vm); while ((vcpuid = CPU_FFS(&amask)) != 0) { vcpuid--; CPU_CLR(vcpuid, &amask); vlapic = vm_lapic(vm, vcpuid); dfr = vlapic->apic_page->dfr; ldr = vlapic->apic_page->ldr; if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) { ldest = ldr >> 24; mda_ldest = mda_flat_ldest; } else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) { if (x2apic(vlapic)) { cluster = ldr >> 16; ldest = ldr & 0xffff; } else { cluster = ldr >> 28; ldest = (ldr >> 24) & 0xf; } if (cluster != mda_cluster_id) continue; mda_ldest = mda_cluster_ldest; } else { /* * Guest has configured a bad logical * model for this vcpu - skip it. */ VLAPIC_CTR1(vlapic, "vlapic has bad logical " "model %x - cannot deliver interrupt", dfr); continue; } if ((mda_ldest & ldest) != 0) { CPU_SET(vcpuid, dmask); if (lowprio) break; } } } } static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) { int i; bool phys; cpuset_t dmask; uint64_t icrval; uint32_t dest, vec, mode; struct vlapic *vlapic2; struct vm_exit *vmexit; struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; if (x2apic(vlapic)) dest = icrval >> 32; else dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; if (mode == APIC_DELMODE_FIXED && vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { switch (icrval & APIC_DEST_MASK) { case APIC_DEST_DESTFLD: phys = ((icrval & APIC_DESTMODE_LOG) == 0); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic)); break; case APIC_DEST_SELF: CPU_SETOF(vlapic->vcpuid, &dmask); break; case APIC_DEST_ALLISELF: dmask = vm_active_cpus(vlapic->vm); break; case APIC_DEST_ALLESELF: dmask = vm_active_cpus(vlapic->vm); CPU_CLR(vlapic->vcpuid, &dmask); break; default: CPU_ZERO(&dmask); /* satisfy gcc */ break; } while ((i = CPU_FFS(&dmask)) != 0) { i--; CPU_CLR(i, &dmask); if (mode == APIC_DELMODE_FIXED) { lapic_intr_edge(vlapic->vm, i, vec); vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, i, 1); VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " "to vcpuid %d", vec, i); } else { vm_inject_nmi(vlapic->vm, i); VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " "to vcpuid %d", i); } } return (0); /* handled completely in the kernel */ } if (mode == APIC_DELMODE_INIT) { if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) return (0); if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { vlapic2 = vm_lapic(vlapic->vm, dest); /* move from INIT to waiting-for-SIPI state */ if (vlapic2->boot_state == BS_INIT) { vlapic2->boot_state = BS_SIPI; } return (0); } } if (mode == APIC_DELMODE_STARTUP) { if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { vlapic2 = vm_lapic(vlapic->vm, dest); /* * Ignore SIPIs in any state other than wait-for-SIPI */ if (vlapic2->boot_state != BS_SIPI) return (0); /* * XXX this assumes that the startup IPI always succeeds */ vlapic2->boot_state = BS_RUNNING; vm_activate_cpu(vlapic2->vm, dest); *retu = true; vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); vmexit->exitcode = VM_EXITCODE_SPINUP_AP; vmexit->u.spinup_ap.vcpu = dest; vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; return (0); } } /* * This will cause a return to userland. */ return (1); } static void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) { int vec; vec = val & 0xff; lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec); vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, vlapic->vcpuid, 1); VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); } int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) { struct LAPIC *lapic = vlapic->apic_page; int idx, i, bitpos, vector; uint32_t *irrptr, val; if (vlapic->ops.pending_intr) return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); irrptr = &lapic->irr0; /* * The x86 architecture reserves the the first 32 vectors for use * by the processor. */ for (i = 7; i > 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); if (bitpos != 0) { vector = i * 32 + (bitpos - 1); if (PRIO(vector) > PRIO(lapic->ppr)) { VLAPIC_CTR1(vlapic, "pending intr %d", vector); if (vecptr != NULL) *vecptr = vector; return (1); } else break; } } return (0); } void vlapic_intr_accepted(struct vlapic *vlapic, int vector) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *irrptr, *isrptr; int idx, stk_top; if (vlapic->ops.intr_accepted) return ((*vlapic->ops.intr_accepted)(vlapic, vector)); /* * clear the ready bit for vector being accepted in irr * and set the vector as in service in isr. */ idx = (vector / 32) * 4; irrptr = &lapic->irr0; atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); isrptr = &lapic->isr0; isrptr[idx] |= 1 << (vector % 32); VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); /* * Update the PPR */ vlapic->isrvec_stk_top++; stk_top = vlapic->isrvec_stk_top; if (stk_top >= ISRVEC_STK_SIZE) panic("isrvec_stk_top overflow %d", stk_top); vlapic->isrvec_stk[stk_top] = vector; vlapic_update_ppr(vlapic); } void vlapic_svr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; uint32_t old, new, changed; lapic = vlapic->apic_page; new = lapic->svr; old = vlapic->svr_last; vlapic->svr_last = new; changed = old ^ new; if ((changed & APIC_SVR_ENABLE) != 0) { if ((new & APIC_SVR_ENABLE) == 0) { /* * The apic is now disabled so stop the apic timer * and mask all the LVT entries. */ VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); VLAPIC_TIMER_LOCK(vlapic); callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); vlapic_mask_lvts(vlapic); } else { /* * The apic is now enabled so restart the apic timer * if it is configured in periodic mode. */ VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); if (vlapic_periodic_timer(vlapic)) vlapic_icrtmr_write_handler(vlapic); } } } int -vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu) +vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *reg; int i; + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", + offset); + *data = 0; + goto done; + } + + if (!x2apic(vlapic) && !mmio_access) { + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " + "xAPIC mode", offset); + *data = 0; + goto done; + } + if (offset > sizeof(*lapic)) { *data = 0; goto done; } offset &= ~3; switch(offset) { case APIC_OFFSET_ID: *data = lapic->id; break; case APIC_OFFSET_VER: *data = lapic->version; break; case APIC_OFFSET_TPR: *data = lapic->tpr; break; case APIC_OFFSET_APR: *data = lapic->apr; break; case APIC_OFFSET_PPR: *data = lapic->ppr; break; case APIC_OFFSET_EOI: *data = lapic->eoi; break; case APIC_OFFSET_LDR: *data = lapic->ldr; break; case APIC_OFFSET_DFR: *data = lapic->dfr; break; case APIC_OFFSET_SVR: *data = lapic->svr; break; case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: i = (offset - APIC_OFFSET_ISR0) >> 2; reg = &lapic->isr0; *data = *(reg + i); break; case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: i = (offset - APIC_OFFSET_TMR0) >> 2; reg = &lapic->tmr0; *data = *(reg + i); break; case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: i = (offset - APIC_OFFSET_IRR0) >> 2; reg = &lapic->irr0; *data = atomic_load_acq_int(reg + i); break; case APIC_OFFSET_ESR: *data = lapic->esr; break; case APIC_OFFSET_ICR_LOW: *data = lapic->icr_lo; if (x2apic(vlapic)) *data |= (uint64_t)lapic->icr_hi << 32; break; case APIC_OFFSET_ICR_HI: *data = lapic->icr_hi; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: *data = vlapic_get_lvt(vlapic, offset); #ifdef INVARIANTS reg = vlapic_get_lvtptr(vlapic, offset); KASSERT(*data == *reg, ("inconsistent lvt value at " "offset %#lx: %#lx/%#x", offset, *data, *reg)); #endif break; case APIC_OFFSET_TIMER_ICR: *data = lapic->icr_timer; break; case APIC_OFFSET_TIMER_CCR: *data = vlapic_get_ccr(vlapic); break; case APIC_OFFSET_TIMER_DCR: *data = lapic->dcr_timer; break; case APIC_OFFSET_SELF_IPI: /* * XXX generate a GP fault if vlapic is in x2apic mode */ *data = 0; break; case APIC_OFFSET_RRR: default: *data = 0; break; } done: VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); return 0; } int -vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu) +vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *regptr; int retval; KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, ("vlapic_write: invalid offset %#lx", offset)); - VLAPIC_CTR2(vlapic, "vlapic write offset %#x, data %#lx", offset, data); + VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", + offset, data); - if (offset > sizeof(*lapic)) { - return 0; + if (offset > sizeof(*lapic)) + return (0); + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " + "in x2APIC mode", data, offset); + return (0); + } + + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + if (!x2apic(vlapic) && !mmio_access) { + VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " + "in xAPIC mode", data, offset); + return (0); } retval = 0; switch(offset) { case APIC_OFFSET_ID: lapic->id = data; vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_TPR: lapic->tpr = data & 0xff; vlapic_update_ppr(vlapic); break; case APIC_OFFSET_EOI: vlapic_process_eoi(vlapic); break; case APIC_OFFSET_LDR: lapic->ldr = data; vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: lapic->dfr = data; vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: lapic->svr = data; vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: lapic->icr_lo = data; if (x2apic(vlapic)) lapic->icr_hi = data >> 32; retval = vlapic_icrlo_write_handler(vlapic, retu); break; case APIC_OFFSET_ICR_HI: lapic->icr_hi = data; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: regptr = vlapic_get_lvtptr(vlapic, offset); *regptr = data; vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: lapic->icr_timer = data; vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: lapic->dcr_timer = data; vlapic_dcr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_SELF_IPI: if (x2apic(vlapic)) vlapic_self_ipi_handler(vlapic, data); break; case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_TIMER_CCR: default: // Read only. break; } return (retval); } static void vlapic_reset(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; bzero(lapic, sizeof(struct LAPIC)); lapic->id = vlapic_get_id(vlapic); lapic->version = VLAPIC_VERSION; lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); vlapic_reset_tmr(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ else vlapic->boot_state = BS_INIT; /* AP */ vlapic->svr_last = lapic->svr; } void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU, ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); /* * If the vlapic is configured in x2apic mode then it will be * accessed in the critical section via the MSR emulation code. * * Therefore the timer mutex must be a spinlock because blockable * mutexes cannot be acquired in a critical section. */ mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); callout_init(&vlapic->callout, 1); vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vlapic->vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; vlapic_reset(vlapic); } void vlapic_cleanup(struct vlapic *vlapic) { callout_drain(&vlapic->callout); } uint64_t vlapic_get_apicbase(struct vlapic *vlapic) { return (vlapic->msr_apicbase); } -void +int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) { - struct LAPIC *lapic; - enum x2apic_state state; - uint64_t old; - int err; - - err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state); - if (err) - panic("vlapic_set_apicbase: err %d fetching x2apic state", err); - if (state == X2APIC_DISABLED) - new &= ~APICBASE_X2APIC; - - old = vlapic->msr_apicbase; - vlapic->msr_apicbase = new; - - /* - * If the vlapic is switching between xAPIC and x2APIC modes then - * reset the mode-dependent registers. - */ - if ((old ^ new) & APICBASE_X2APIC) { - lapic = vlapic->apic_page; - lapic->id = vlapic_get_id(vlapic); - if (x2apic(vlapic)) { - lapic->ldr = x2apic_ldr(vlapic); - lapic->dfr = 0; - } else { - lapic->ldr = 0; - lapic->dfr = 0xffffffff; - } + if (vlapic->msr_apicbase != new) { + VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " + "not supported", vlapic->msr_apicbase, new); + return (-1); } + + return (0); } void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { struct vlapic *vlapic; + struct LAPIC *lapic; vlapic = vm_lapic(vm, vcpuid); if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; + else + vlapic->msr_apicbase |= APICBASE_X2APIC; + + /* + * Reset the local APIC registers whose values are mode-dependent. + * + * XXX this works because the APIC mode can be changed only at vcpu + * initialization time. + */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); + if (x2apic(vlapic)) { + lapic->ldr = x2apic_ldr(vlapic); + lapic->dfr = 0; + } else { + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + } } void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec) { bool lowprio; int vcpuid; cpuset_t dmask; if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); return; } lowprio = (delmode == APIC_DELMODE_LOWPRIO); /* * We don't provide any virtual interrupt redirection hardware so * all interrupts originating from the ioapic or MSI specify the * 'dest' in the legacy xAPIC format. */ vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); while ((vcpuid = CPU_FFS(&dmask)) != 0) { vcpuid--; CPU_CLR(vcpuid, &dmask); lapic_set_intr(vm, vcpuid, vec, level); } } void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) { /* * Post an interrupt to the vcpu currently running on 'hostcpu'. * * This is done by leveraging features like Posted Interrupts (Intel) * Doorbell MSR (AMD AVIC) that avoid a VM exit. * * If neither of these features are available then fallback to * sending an IPI to 'hostcpu'. */ if (vlapic->ops.post_intr) (*vlapic->ops.post_intr)(vlapic, hostcpu); else ipi_cpu(hostcpu, ipinum); } bool vlapic_enabled(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && (lapic->svr & APIC_SVR_ENABLE) != 0) return (true); else return (false); } static void vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *tmrptr, mask; int idx; lapic = vlapic->apic_page; tmrptr = &lapic->tmr0; idx = (vector / 32) * 4; mask = 1 << (vector % 32); if (level) tmrptr[idx] |= mask; else tmrptr[idx] &= ~mask; if (vlapic->ops.set_tmr != NULL) (*vlapic->ops.set_tmr)(vlapic, vector, level); } void vlapic_reset_tmr(struct vlapic *vlapic) { int vector; VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); for (vector = 0; vector <= 255; vector++) vlapic_set_tmr(vlapic, vector, false); } void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector) { cpuset_t dmask; bool lowprio; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); /* * A level trigger is valid only for fixed and lowprio delivery modes. */ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " "delivery-mode %d", delmode); return; } lowprio = (delmode == APIC_DELMODE_LOWPRIO); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); if (!CPU_ISSET(vlapic->vcpuid, &dmask)) return; VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); vlapic_set_tmr(vlapic, vector, true); } diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index d2fc6d9a6188..b215e57a9711 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -1,105 +1,105 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VLAPIC_H_ #define _VLAPIC_H_ struct vm; enum x2apic_state; -int vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, - bool *retu); -int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, - bool *retu); +int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu); +int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu); /* * Returns 0 if there is no eligible vector that can be delivered to the * guest at this time and non-zero otherwise. * * If an eligible vector number is found and 'vecptr' is not NULL then it will * be stored in the location pointed to by 'vecptr'. * * Note that the vector does not automatically transition to the ISR as a * result of calling this function. */ int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); /* * Transition 'vector' from IRR to ISR. This function is called with the * vector returned by 'vlapic_pending_intr()' when the guest is able to * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that * block interrupt delivery). */ void vlapic_intr_accepted(struct vlapic *vlapic, int vector); /* * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. */ int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); /* * Post an interrupt to the vcpu running on 'hostcpu'. This will use a * hardware assist if available (e.g. Posted Interrupt) or fall back to * sending an 'ipinum' to interrupt the 'hostcpu'. */ void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); void vlapic_fire_cmci(struct vlapic *vlapic); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); uint64_t vlapic_get_apicbase(struct vlapic *vlapic); -void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); +int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); bool vlapic_enabled(struct vlapic *vlapic); void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec); /* Reset the trigger-mode bits for all vectors to be edge-triggered */ void vlapic_reset_tmr(struct vlapic *vlapic); /* * Set the trigger-mode bit associated with 'vector' to level-triggered if * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to * this 'vlapic'. */ void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector); /* APIC write handlers */ void vlapic_id_write_handler(struct vlapic *vlapic); void vlapic_ldr_write_handler(struct vlapic *vlapic); void vlapic_dfr_write_handler(struct vlapic *vlapic); void vlapic_svr_write_handler(struct vlapic *vlapic); void vlapic_esr_write_handler(struct vlapic *vlapic); int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); void vlapic_icrtmr_write_handler(struct vlapic *vlapic); void vlapic_dcr_write_handler(struct vlapic *vlapic); void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 4ae691563fec..80ff5be96691 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1,1547 +1,1547 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vmm_msr.h" #include "vmm_ipi.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; struct vcpu { int flags; enum vcpu_state state; struct mtx mtx; int hostcpu; /* host cpuid this vcpu last ran on */ uint64_t guest_msrs[VMM_MSR_NUM]; struct vlapic *vlapic; int vcpuid; struct savefpu *guestfpu; /* guest fpu state */ uint64_t guest_xcr0; void *stats; struct vm_exit exitinfo; enum x2apic_state x2apic_state; int nmi_pending; }; #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { vm_paddr_t gpa; size_t len; boolean_t wired; vm_object_t object; }; #define VM_MAX_MEMORY_SEGMENTS 2 struct vm { void *cookie; /* processor-specific data */ void *iommu; /* iommu-specific data */ struct vhpet *vhpet; /* virtual HPET */ struct vioapic *vioapic; /* virtual ioapic */ struct vmspace *vmspace; /* guest's address space */ struct vcpu vcpu[VM_MAXCPU]; int num_mem_segs; struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS]; char name[VM_MAX_NAMELEN]; /* * Set of active vcpus. * An active vcpu is one that has been started implicitly (BSP) or * explicitly (AP) by sending it a startup ipi. */ cpuset_t active_cpus; struct mtx rendezvous_mtx; cpuset_t rendezvous_req_cpus; cpuset_t rendezvous_done_cpus; void *rendezvous_arg; vm_rendezvous_func_t rendezvous_func; }; static int vmm_initialized; static struct vmm_ops *ops; #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) #define VMRUN(vmi, vcpu, rip, pmap, rptr) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMSPACE_ALLOC(min, max) \ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) #define VMSPACE_FREE(vmspace) \ (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) #define VMGETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMSETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) #define VLAPIC_INIT(vmi, vcpu) \ (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) #define VLAPIC_CLEANUP(vmi, vlapic) \ (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() static MALLOC_DEFINE(M_VM, "vm", "vm"); CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static void vm_deactivate_cpu(struct vm *vm, int vcpuid); static void vcpu_cleanup(struct vm *vm, int i) { struct vcpu *vcpu = &vm->vcpu[i]; VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } static void vcpu_init(struct vm *vm, uint32_t vcpu_id) { struct vcpu *vcpu; vcpu = &vm->vcpu[vcpu_id]; vcpu_lock_init(vcpu); vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); - vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED); + vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; vcpu->guestfpu = fpu_save_area_alloc(); fpu_save_area_reset(vcpu->guestfpu); vcpu->stats = vmm_stat_alloc(); } struct vm_exit * vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; if (cpuid < 0 || cpuid >= VM_MAXCPU) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; return (&vcpu->exitinfo); } static void vmm_resume(void) { VMM_RESUME(); } static int vmm_init(void) { int error; vmm_host_state_init(); vmm_ipinum = vmm_ipi_alloc(); if (vmm_ipinum == 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_amd()) ops = &vmm_ops_amd; else return (ENXIO); vmm_msr_init(); vmm_resume_p = vmm_resume; return (VMM_INIT(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: vmmdev_init(); if (ppt_avail_devices() > 0) iommu_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) vmm_ipi_free(vmm_ipinum); error = VMM_CLEANUP(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - iommu initialization must happen after the pci passthru driver has had * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). */ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); int vm_create(const char *name, struct vm **retvm) { int i; struct vm *vm; struct vmspace *vmspace; const int BSP = 0; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm->cookie = VMINIT(vm, vmspace_pmap(vmspace)); vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); for (i = 0; i < VM_MAXCPU; i++) { vcpu_init(vm, i); guest_msrs_init(vm, i); } vm_activate_cpu(vm, BSP); *retvm = vm; return (0); } static void vm_free_mem_seg(struct vm *vm, struct mem_seg *seg) { if (seg->object != NULL) vmm_mem_free(vm->vmspace, seg->gpa, seg->len); bzero(seg, sizeof(*seg)); } void vm_destroy(struct vm *vm) { int i; ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); vhpet_cleanup(vm->vhpet); vioapic_cleanup(vm->vioapic); for (i = 0; i < vm->num_mem_segs; i++) vm_free_mem_seg(vm, &vm->mem_segs[i]); vm->num_mem_segs = 0; for (i = 0; i < VM_MAXCPU; i++) vcpu_cleanup(vm, i); VMSPACE_FREE(vm->vmspace); VMCLEANUP(vm->cookie); free(vm, M_VM); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa) { int i; vm_paddr_t gpabase, gpalimit; for (i = 0; i < vm->num_mem_segs; i++) { gpabase = vm->mem_segs[i].gpa; gpalimit = gpabase + vm->mem_segs[i].len; if (gpa >= gpabase && gpa < gpalimit) return (TRUE); /* 'gpa' is regular memory */ } if (ppt_is_mmio(vm, gpa)) return (TRUE); /* 'gpa' is pci passthru mmio */ return (FALSE); } int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) { int available, allocated; struct mem_seg *seg; vm_object_t object; vm_paddr_t g; if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) return (EINVAL); available = allocated = 0; g = gpa; while (g < gpa + len) { if (vm_mem_allocated(vm, g)) allocated++; else available++; g += PAGE_SIZE; } /* * If there are some allocated and some available pages in the address * range then it is an error. */ if (allocated && available) return (EINVAL); /* * If the entire address range being requested has already been * allocated then there isn't anything more to do. */ if (allocated && available == 0) return (0); if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); seg = &vm->mem_segs[vm->num_mem_segs]; if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL) return (ENOMEM); seg->gpa = gpa; seg->len = len; seg->object = object; seg->wired = FALSE; vm->num_mem_segs++; return (0); } static void vm_gpa_unwire(struct vm *vm) { int i, rv; struct mem_seg *seg; for (i = 0; i < vm->num_mem_segs; i++) { seg = &vm->mem_segs[i]; if (!seg->wired) continue; rv = vm_map_unwire(&vm->vmspace->vm_map, seg->gpa, seg->gpa + seg->len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment " "%#lx/%ld could not be unwired: %d", vm_name(vm), seg->gpa, seg->len, rv)); seg->wired = FALSE; } } static int vm_gpa_wire(struct vm *vm) { int i, rv; struct mem_seg *seg; for (i = 0; i < vm->num_mem_segs; i++) { seg = &vm->mem_segs[i]; if (seg->wired) continue; /* XXX rlimits? */ rv = vm_map_wire(&vm->vmspace->vm_map, seg->gpa, seg->gpa + seg->len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (rv != KERN_SUCCESS) break; seg->wired = TRUE; } if (i < vm->num_mem_segs) { /* * Undo the wiring before returning an error. */ vm_gpa_unwire(vm); return (EAGAIN); } return (0); } static void vm_iommu_modify(struct vm *vm, boolean_t map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_seg *seg; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < vm->num_mem_segs; i++) { seg = &vm->mem_segs[i]; KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired", vm_name(vm), seg->gpa, seg->len)); gpa = seg->gpa; while (gpa < seg->gpa + seg->len) { vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); iommu_remove_mapping(host_domain, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); iommu_create_mapping(host_domain, hpa, hpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) #define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) { vm_iommu_unmap(vm); vm_gpa_unwire(vm); } return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* * Virtual machines with pci passthru devices get special treatment: * - the guest physical memory is wired * - the iommu is programmed to do the 'gpa' to 'hpa' translation * * We need to do this before the first pci passthru device is attached. */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vmm_mem_maxaddr(); vm->iommu = iommu_create_domain(maxaddr); error = vm_gpa_wire(vm); if (error) return (error); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } void * vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int count, pageoff; vm_page_t m; pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg) { int i; for (i = 0; i < vm->num_mem_segs; i++) { if (gpabase == vm->mem_segs[i].gpa) { seg->gpa = vm->mem_segs[i].gpa; seg->len = vm->mem_segs[i].len; seg->wired = vm->mem_segs[i].wired; return (0); } } return (-1); } int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, vm_offset_t *offset, struct vm_object **object) { int i; size_t seg_len; vm_paddr_t seg_gpa; vm_object_t seg_obj; for (i = 0; i < vm->num_mem_segs; i++) { if ((seg_obj = vm->mem_segs[i].object) == NULL) continue; seg_gpa = vm->mem_segs[i].gpa; seg_len = vm->mem_segs[i].len; if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) { *offset = gpa - seg_gpa; *object = seg_obj; vm_object_reference(seg_obj); return (0); } } return (EINVAL); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMGETREG(vm->cookie, vcpu, reg, retval)); } int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMSETREG(vm->cookie, vcpu, reg, val)); } static boolean_t is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (TRUE); default: return (FALSE); } } static boolean_t is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (TRUE); default: return (FALSE); } } int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMGETDESC(vm->cookie, vcpu, reg, desc)); } int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); vcpu->state = newstate; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } static void vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) { KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); /* * Update 'rendezvous_func' and execute a write memory barrier to * ensure that it is visible across all host cpus. This is not needed * for correctness but it does ensure that all the vcpus will notice * that the rendezvous is requested immediately. */ vm->rendezvous_func = func; wmb(); } #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ do { \ if (vcpuid >= 0) \ VCPU_CTR0(vm, vcpuid, fmt); \ else \ VM_CTR0(vm, fmt); \ } while (0) static void vm_handle_rendezvous(struct vm *vm, int vcpuid) { KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { if (vcpuid != -1 && CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus)) { VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); vm_set_rendezvous_func(vm, NULL); wakeup(&vm->rendezvous_func); break; } RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", 0); } mtx_unlock(&vm->rendezvous_mtx); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { struct vm_exit *vmexit; struct vcpu *vcpu; int t, timo; vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. * * These interrupts could have happened any time after we * returned from VMRUN() and before we grabbed the vcpu lock. */ if (!vm_nmi_pending(vm, vcpuid) && (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) { t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); if (vlapic_enabled(vcpu->vlapic)) { /* * XXX msleep_spin() is not interruptible so use the * 'timo' to put an upper bound on the sleep time. */ timo = hz; msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo); } else { /* * Spindown the vcpu if the apic is disabled and it * had entered the halted state. */ *retu = true; vmexit = vm_exitinfo(vm, vcpuid); vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; vm_deactivate_cpu(vm, vcpuid); VCPU_CTR0(vm, vcpuid, "spinning down cpu"); } vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } vcpu_unlock(vcpu); return (0); } static int vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) { int rv, ftype; struct vm_map *map; struct vcpu *vcpu; struct vm_exit *vme; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) goto done; } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: /* restart execution at the faulting instruction */ vme->inst_length = 0; return (0); } static int vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) { struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; int error, inst_length; uint64_t rip, gla, gpa, cr3; enum vie_cpu_mode cpu_mode; enum vie_paging_mode paging_mode; mem_region_read_t mread; mem_region_write_t mwrite; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; rip = vme->rip; inst_length = vme->inst_length; gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cr3 = vme->u.inst_emul.cr3; cpu_mode = vme->u.inst_emul.cpu_mode; paging_mode = vme->u.inst_emul.paging_mode; vie = &vme->u.inst_emul.vie; vie_init(vie); /* Fetch, decode and emulate the faulting instruction */ if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, paging_mode, vie) != 0) return (EFAULT); if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, vie) != 0) return (EFAULT); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, retu); return (error); } int vm_run(struct vm *vm, struct vm_run *vmrun) { int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval, rip; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; vcpuid = vmrun->cpuid; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; rip = vmrun->rip; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_msrs(vm, vcpuid); restore_guest_fpustate(vcpu); vcpu_require_state(vm, vcpuid, VCPU_RUNNING); vcpu->hostcpu = curcpu; error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func); vcpu->hostcpu = NOCPU; vcpu_require_state(vm, vcpuid, VCPU_FROZEN); save_guest_fpustate(vcpu); restore_host_msrs(vm, vcpuid); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; switch (vme->exitcode) { case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: vm_handle_rendezvous(vm, vcpuid); error = 0; break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vm, vcpuid, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; default: retu = true; /* handled in userland */ break; } } if (error == 0 && retu == false) { rip = vme->rip + vme->inst_length; goto restart; } /* copy the exit information */ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); return (error); } int vm_inject_event(struct vm *vm, int vcpuid, int type, int vector, uint32_t code, int code_valid) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0) return (EINVAL); if (vector < 0 || vector > 255) return (EINVAL); return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->nmi_pending); } void vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); } int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMGETCAP(vm->cookie, vcpu, type, retval)); } int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMSETCAP(vm->cookie, vcpu, type, val)); } uint64_t * vm_guest_msrs(struct vm *vm, int cpu) { return (vm->vcpu[cpu].guest_msrs); } struct vlapic * vm_lapic(struct vm *vm, int cpu) { return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } boolean_t vmm_is_pptdev(int bus, int slot, int func) { int found, i, n; int b, s, f; char *val, *cp, *cp2; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; for (i = 0; names[i] != NULL && !found; i++) { cp = val = getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = 1; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { int error; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) { struct vcpu *vcpu; enum vcpu_state state; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } void vm_activate_cpu(struct vm *vm, int vcpuid) { if (vcpuid >= 0 && vcpuid < VM_MAXCPU) CPU_SET(vcpuid, &vm->active_cpus); } static void vm_deactivate_cpu(struct vm *vm, int vcpuid) { if (vcpuid >= 0 && vcpuid < VM_MAXCPU) CPU_CLR(vcpuid, &vm->active_cpus); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } void * vcpu_stats(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid].stats); } int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; return (0); } int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (state >= X2APIC_STATE_LAST) return (EINVAL); vm->vcpu[vcpuid].x2apic_state = state; vlapic_set_x2apic_state(vm, vcpuid, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) { int hostcpu; struct vcpu *vcpu; vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); hostcpu = vcpu->hostcpu; if (hostcpu == NOCPU) { if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } else { if (vcpu->state != VCPU_RUNNING) panic("invalid vcpu state %d", vcpu->state); if (hostcpu != curcpu) { if (lapic_intr) vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); else ipi_cpu(hostcpu, vmm_ipinum); } } vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpuid' is one * of the targets of the rendezvous. */ RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); vm_handle_rendezvous(vm, vcpuid); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm_set_rendezvous_func(vm, func); mtx_unlock(&vm->rendezvous_mtx); vm_handle_rendezvous(vm, vcpuid); } diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 47e04da6b8b3..640c779c3d4a 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -1,243 +1,242 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "vmm_ipi.h" #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" /* * Some MSI message definitions */ #define MSI_X86_ADDR_MASK 0xfff00000 #define MSI_X86_ADDR_BASE 0xfee00000 #define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ #define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ int lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) { struct vlapic *vlapic; if (cpu < 0 || cpu >= VM_MAXCPU) return (EINVAL); if (vector < 32 || vector > 255) return (EINVAL); vlapic = vm_lapic(vm, cpu); if (vlapic_set_intr_ready(vlapic, vector, level)) vcpu_notify_event(vm, cpu, true); return (0); } int lapic_set_local_intr(struct vm *vm, int cpu, int vector) { struct vlapic *vlapic; cpuset_t dmask; int error; if (cpu < -1 || cpu >= VM_MAXCPU) return (EINVAL); if (cpu == -1) dmask = vm_active_cpus(vm); else CPU_SETOF(cpu, &dmask); error = 0; while ((cpu = CPU_FFS(&dmask)) != 0) { cpu--; CPU_CLR(cpu, &dmask); vlapic = vm_lapic(vm, cpu); error = vlapic_trigger_lvt(vlapic, vector); if (error) break; } return (error); } int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) { int delmode, vec; uint32_t dest; bool phys; VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); return (-1); } /* * Extract the x86-specific fields from the MSI addr/msg * params according to the Intel Arch spec, Vol3 Ch 10. * * The PCI specification does not support level triggered * MSI/MSI-X so ignore trigger level in 'msg'. * * The 'dest' is interpreted as a logical APIC ID if both * the Redirection Hint and Destination Mode are '1' and * physical otherwise. */ dest = (addr >> 12) & 0xff; phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); delmode = msg & APIC_DELMODE_MASK; vec = msg & 0xff; VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", phys ? "physical" : "logical", dest, vec); vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); return (0); } static boolean_t x2apic_msr(u_int msr) { if (msr >= 0x800 && msr <= 0xBFF) return (TRUE); else return (FALSE); } static u_int x2apic_msr_to_regoff(u_int msr) { return ((msr - 0x800) << 4); } boolean_t lapic_msr(u_int msr) { if (x2apic_msr(msr) || (msr == MSR_APICBASE)) return (TRUE); else return (FALSE); } int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu) { int error; u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); if (msr == MSR_APICBASE) { *rval = vlapic_get_apicbase(vlapic); error = 0; } else { offset = x2apic_msr_to_regoff(msr); - error = vlapic_read(vlapic, offset, rval, retu); + error = vlapic_read(vlapic, 0, offset, rval, retu); } return (error); } int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu) { int error; u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); if (msr == MSR_APICBASE) { - vlapic_set_apicbase(vlapic, val); - error = 0; + error = vlapic_set_apicbase(vlapic, val); } else { offset = x2apic_msr_to_regoff(msr); - error = vlapic_write(vlapic, offset, val, retu); + error = vlapic_write(vlapic, 0, offset, val, retu); } return (error); } int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, void *arg) { int error; uint64_t off; struct vlapic *vlapic; off = gpa - DEFAULT_APIC_BASE; /* * Memory mapped local apic accesses must be 4 bytes wide and * aligned on a 16-byte boundary. */ if (size != 4 || off & 0xf) return (EINVAL); vlapic = vm_lapic(vm, cpu); - error = vlapic_write(vlapic, off, wval, arg); + error = vlapic_write(vlapic, 1, off, wval, arg); return (error); } int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { int error; uint64_t off; struct vlapic *vlapic; off = gpa - DEFAULT_APIC_BASE; /* * Memory mapped local apic accesses must be 4 bytes wide and * aligned on a 16-byte boundary. */ if (size != 4 || off & 0xf) return (EINVAL); vlapic = vm_lapic(vm, cpu); - error = vlapic_read(vlapic, off, rval, arg); + error = vlapic_read(vlapic, 1, off, rval, arg); return (error); } diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 7ae32eca5d69..d3a0248a825a 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -1,337 +1,339 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "vmm_host.h" #include "x86.h" #define CPUID_VM_HIGH 0x40000000 static const char bhyve_id[12] = "bhyve bhyve "; static uint64_t bhyve_xcpuids; int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { const struct xsave_limits *limits; uint64_t cr4; int error, enable_invpcid; unsigned int func, regs[4]; enum x2apic_state x2apic_state; /* * Requests for invalid CPUID levels should map to the highest * available level instead. */ if (cpu_exthigh != 0 && *eax >= 0x80000000) { if (*eax > cpu_exthigh) *eax = cpu_exthigh; } else if (*eax >= 0x40000000) { if (*eax > CPUID_VM_HIGH) *eax = CPUID_VM_HIGH; } else if (*eax > cpu_high) { *eax = cpu_high; } func = *eax; /* * In general the approach used for CPU topology is to * advertise a flat topology where all CPUs are packages with * no multi-core or SMT. */ switch (func) { /* * Pass these through to the guest */ case CPUID_0000_0000: case CPUID_0000_0002: case CPUID_0000_0003: case CPUID_8000_0000: case CPUID_8000_0002: case CPUID_8000_0003: case CPUID_8000_0004: case CPUID_8000_0006: case CPUID_8000_0008: cpuid_count(*eax, *ecx, regs); break; case CPUID_8000_0001: /* * Hide rdtscp/ia32_tsc_aux until we know how * to deal with them. */ cpuid_count(*eax, *ecx, regs); regs[3] &= ~AMDID_RDTSCP; break; case CPUID_8000_0007: cpuid_count(*eax, *ecx, regs); /* * If the host TSCs are not synchronized across * physical cpus then we cannot advertise an * invariant tsc to a vcpu. * * XXX This still falls short because the vcpu * can observe the TSC moving backwards as it * migrates across physical cpus. But at least * it should discourage the guest from using the * TSC to keep track of time. */ if (!smp_tsc) regs[3] &= ~AMDPM_TSC_INVARIANT; break; case CPUID_0000_0001: do_cpuid(1, regs); error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); if (error) { panic("x86_emulate_cpuid: error %d " "fetching x2apic state", error); } /* * Override the APIC ID only in ebx */ regs[1] &= ~(CPUID_LOCAL_APIC_ID); regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); /* * Don't expose VMX, SpeedStep or TME capability. * Advertise x2APIC capability and Hypervisor guest. */ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); regs[2] |= CPUID2_HV; if (x2apic_state != X2APIC_DISABLED) regs[2] |= CPUID2_X2APIC; + else + regs[2] &= ~CPUID2_X2APIC; /* * Only advertise CPUID2_XSAVE in the guest if * the host is using XSAVE. */ if (!(regs[2] & CPUID2_OSXSAVE)) regs[2] &= ~CPUID2_XSAVE; /* * If CPUID2_XSAVE is being advertised and the * guest has set CR4_XSAVE, set * CPUID2_OSXSAVE. */ regs[2] &= ~CPUID2_OSXSAVE; if (regs[2] & CPUID2_XSAVE) { error = vm_get_register(vm, vcpu_id, VM_REG_GUEST_CR4, &cr4); if (error) panic("x86_emulate_cpuid: error %d " "fetching %%cr4", error); if (cr4 & CR4_XSAVE) regs[2] |= CPUID2_OSXSAVE; } /* * Hide monitor/mwait until we know how to deal with * these instructions. */ regs[2] &= ~CPUID2_MON; /* * Hide the performance and debug features. */ regs[2] &= ~CPUID2_PDCM; /* * No TSC deadline support in the APIC yet */ regs[2] &= ~CPUID2_TSCDLT; /* * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); /* * Machine check handling is done in the host. * Hide MTRR capability. */ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); /* * Hide the debug store capability. */ regs[3] &= ~CPUID_DS; /* * Disable multi-core. */ regs[1] &= ~CPUID_HTT_CORES; regs[3] &= ~CPUID_HTT; break; case CPUID_0000_0004: do_cpuid(4, regs); /* * Do not expose topology. */ regs[0] &= 0xffff8000; regs[0] |= 0x04008000; break; case CPUID_0000_0007: regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; /* leaf 0 */ if (*ecx == 0) { error = vm_get_capability(vm, vcpu_id, VM_CAP_ENABLE_INVPCID, &enable_invpcid); if (error == 0 && enable_invpcid) regs[1] |= CPUID_STDEXT_INVPCID; } break; case CPUID_0000_0006: case CPUID_0000_000A: /* * Handle the access, but report 0 for * all options */ regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; case CPUID_0000_000B: /* * Processor topology enumeration */ regs[0] = 0; regs[1] = 0; regs[2] = *ecx & 0xff; regs[3] = vcpu_id; break; case CPUID_0000_000D: limits = vmm_get_xsave_limits(); if (!limits->xsave_enabled) { regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; } cpuid_count(*eax, *ecx, regs); switch (*ecx) { case 0: /* * Only permit the guest to use bits * that are active in the host in * %xcr0. Also, claim that the * maximum save area size is * equivalent to the host's current * save area size. Since this runs * "inside" of vmrun(), it runs with * the guest's xcr0, so the current * save area size is correct as-is. */ regs[0] &= limits->xcr0_allowed; regs[2] = limits->xsave_max_size; regs[3] &= (limits->xcr0_allowed >> 32); break; case 1: /* Only permit XSAVEOPT. */ regs[0] &= CPUID_EXTSTATE_XSAVEOPT; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; default: /* * If the leaf is for a permitted feature, * pass through as-is, otherwise return * all zeroes. */ if (!(limits->xcr0_allowed & (1ul << *ecx))) { regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; } break; } break; case 0x40000000: regs[0] = CPUID_VM_HIGH; bcopy(bhyve_id, ®s[1], 4); bcopy(bhyve_id + 4, ®s[2], 4); bcopy(bhyve_id + 8, ®s[3], 4); break; default: /* * The leaf value has already been clamped so * simply pass this through, keeping count of * how many unhandled leaf values have been seen. */ atomic_add_long(&bhyve_xcpuids, 1); cpuid_count(*eax, *ecx, regs); break; } *eax = regs[0]; *ebx = regs[1]; *ecx = regs[2]; *edx = regs[3]; return (1); } diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index c587aeff29b6..2218622569cb 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,743 +1,741 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "acpi.h" #include "inout.h" #include "dbgport.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_lpc.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */ #define VMEXIT_CONTINUE 1 /* continue from next instruction */ #define VMEXIT_RESTART 2 /* restart current instruction */ #define VMEXIT_ABORT 3 /* abort the vm run loop */ #define VMEXIT_RESET 4 /* guest machine has reset */ #define VMEXIT_POWEROFF 5 /* guest machine has powered off */ #define MB (1024UL * 1024) #define GB (1024UL * MB) typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); char *vmname; int guest_ncpus; static int pincpu = -1; -static int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic; +static int guest_vmexit_on_hlt, guest_vmexit_on_pause; static int virtio_msix = 1; +static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; static int acpi; static char *progname; static const int BSP = 0; static int cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_bogus_switch; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; int io_reset; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; static void usage(int code) { fprintf(stderr, "Usage: %s [-aehwAHIPW] [-g ] [-s ]\n" " %*s [-c vcpus] [-p pincpu] [-m mem] [-l ] \n" - " -a: local apic is in XAPIC mode (default is X2APIC)\n" + " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create an ACPI table\n" " -g: gdb port\n" " -c: # cpus (default 1)\n" " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n" " -H: vmexit from the guest on hlt\n" " -P: vmexit from the guest on pause\n" " -W: force virtio to use single-vector MSI\n" " -e: exit on unhandled I/O access\n" " -h: help\n" " -s: PCI slot config\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" - " -w: ignore unimplemented MSRs\n", + " -w: ignore unimplemented MSRs\n" + " -x: local apic is in x2APIC mode\n", progname, (int)strlen(progname), ""); exit(code); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } -int -fbsdrun_disable_x2apic(void) -{ - - return (disable_x2apic); -} - int fbsdrun_vmexit_on_pause(void) { return (guest_vmexit_on_pause); } int fbsdrun_vmexit_on_hlt(void) { return (guest_vmexit_on_hlt); } int fbsdrun_virtio_msix(void) { return (virtio_msix); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip) { int error; if (cpumask & (1 << vcpu)) { fprintf(stderr, "addcpu: attempting to add existing cpu %d\n", vcpu); exit(1); } atomic_set_int(&cpumask, 1 << vcpu); /* * Set up the vmexit struct to allow execution to start * at the given RIP */ vmexit[vcpu].rip = rip; vmexit[vcpu].inst_length = 0; mt_vmm_info[vcpu].mt_ctx = ctx; mt_vmm_info[vcpu].mt_vcpu = vcpu; error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[vcpu]); assert(error == 0); } static int fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) { if ((cpumask & (1 << vcpu)) == 0) { fprintf(stderr, "addcpu: attempting to delete unknown cpu %d\n", vcpu); exit(1); } atomic_clear_int(&cpumask, 1 << vcpu); return (cpumask == 0); } static int vmexit_catch_reset(void) { stats.io_reset++; return (VMEXIT_RESET); } static int vmexit_catch_inout(void) { return (VMEXIT_ABORT); } static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; uint32_t eax; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; eax = vme->u.inout.eax; in = vme->u.inout.in; out = !in; /* We don't deal with these */ if (vme->u.inout.string || vme->u.inout.rep) return (VMEXIT_ABORT); /* Special case of guest reset */ if (out && port == 0x64 && (uint8_t)eax == 0xFE) return (vmexit_catch_reset()); /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) return (vmexit_handle_notify(ctx, vme, pvcpu, eax)); error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio); if (error == INOUT_OK && in) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); switch (error) { case INOUT_OK: return (VMEXIT_CONTINUE); case INOUT_RESET: return (VMEXIT_RESET); case INOUT_POWEROFF: return (VMEXIT_POWEROFF); default: fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port); return (vmexit_catch_inout()); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (strictmsr) return (VMEXIT_ABORT); } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) return (VMEXIT_ABORT); } return (VMEXIT_CONTINUE); } static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int newcpu; int retval = VMEXIT_CONTINUE; newcpu = spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); return (retval); } static int vmexit_spindown_cpu(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int lastcpu; lastcpu = fbsdrun_deletecpu(ctx, *pvcpu); if (!lastcpu) pthread_exit(NULL); return (vmexit_catch_reset()); } static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_bogus++; return (VMEXIT_RESTART); } static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_mtrap++; return (VMEXIT_RESTART); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { int err; stats.vmexit_inst_emul++; err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, &vmexit->u.inst_emul.vie); if (err) { if (err == EINVAL) { fprintf(stderr, "Failed to emulate instruction at 0x%lx\n", vmexit->rip); } else if (err == ESRCH) { fprintf(stderr, "Unhandled memory access to 0x%lx\n", vmexit->u.inst_emul.gpa); } return (VMEXIT_ABORT); } return (VMEXIT_CONTINUE); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, [VM_EXITCODE_SPINDOWN_CPU] = vmexit_spindown_cpu, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) { cpuset_t mask; int error, rc, prevcpu; enum vm_exitcode exitcode; if (pincpu >= 0) { CPU_ZERO(&mask); CPU_SET(pincpu + vcpu, &mask); error = pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); assert(error == 0); } while (1) { error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]); if (error != 0) break; prevcpu = vcpu; exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(1); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; break; case VMEXIT_RESTART: rip = vmexit[vcpu].rip; break; case VMEXIT_RESET: exit(0); default: exit(1); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ if (error == 0) return (VM_MAXCPU); else return (1); } void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (fbsdrun_vmexit_on_hlt()) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (fbsdrun_vmexit_on_pause()) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } - if (fbsdrun_disable_x2apic()) - err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); - else + if (x2apic_mode) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); + else + err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); } int main(int argc, char *argv[]) { int c, error, gdb_port, err, bvmcons; int max_vcpus; struct vmctx *ctx; uint64_t rip; size_t memsize; bvmcons = 0; progname = basename(argv[0]); gdb_port = 0; guest_ncpus = 1; memsize = 256 * MB; - while ((c = getopt(argc, argv, "abehwAHIPWp:g:c:s:m:l:")) != -1) { + while ((c = getopt(argc, argv, "abehwxAHIPWp:g:c:s:m:l:")) != -1) { switch (c) { case 'a': - disable_x2apic = 1; + x2apic_mode = 0; break; case 'A': acpi = 1; break; case 'b': bvmcons = 1; break; case 'p': pincpu = atoi(optarg); break; case 'c': guest_ncpus = atoi(optarg); break; case 'g': gdb_port = atoi(optarg); break; case 'l': if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; case 's': if (pci_parse_slot(optarg) != 0) exit(1); else break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", optarg); break; case 'H': guest_vmexit_on_hlt = 1; break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': guest_vmexit_on_pause = 1; break; case 'e': strictio = 1; break; case 'w': strictmsr = 0; break; case 'W': virtio_msix = 0; break; + case 'x': + x2apic_mode = 1; + break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc != 1) usage(1); vmname = argv[0]; ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(1); } max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(1); } fbsdrun_set_capabilities(ctx, BSP); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", err); exit(1); } init_mem(); init_inout(); ioapic_init(ctx); rtc_init(ctx); /* * Exit if a device emulation finds an error in it's initilization */ if (init_pci(ctx) != 0) exit(1); if (gdb_port != 0) init_dbgport(gdb_port); if (bvmcons) init_bvmcons(); error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ mptable_build(ctx, guest_ncpus); if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, rip); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(1); }