Index: head/sys/amd64/include/vmm.h =================================================================== --- head/sys/amd64/include/vmm.h (revision 355723) +++ head/sys/amd64/include/vmm.h (revision 355724) @@ -1,723 +1,729 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, + VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. * A prefix and a suffix are added to the name specified by the user. 
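/*
 * Editorial note: a minimal sketch, not part of this change, showing how the
 * VM_INTINFO_* bits above compose an event-info word.  The helper name is
 * hypothetical, and it assumes the error code occupies bits 63:32 as in the
 * EXITINTINFO format referenced later in this header.
 */
static __inline uint64_t
vm_intinfo_pack(int vector, uint64_t type, bool errcode_valid, uint32_t errcode)
{
	uint64_t info;

	info = (vector & 0xff) | type | VM_INTINFO_VALID;
	if (errcode_valid)
		info |= VM_INTINFO_DEL_ERRCODE | ((uint64_t)errcode << 32);
	return (info);
}
/* e.g. vm_intinfo_pack(IDT_GP, VM_INTINFO_HWEXCEPTION, true, 0) */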
* The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. */ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MIN_NAMELEN 6 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vm; struct vm_exception; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; }; extern struct vmm_ops vmm_ops_intel; extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); /* * APIs that modify the guest memory map require all vcpus to be frozen. 
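/*
 * Editorial note (worked example, not part of this change): with the
 * definitions above,
 *
 *	VM_MAX_NAMELEN = SPECNAMELEN - 10 - 15 - 1
 *
 * so SPECNAMELEN = 255 (FreeBSD 13) gives 229 and SPECNAMELEN = 63
 * (FreeBSD 12) gives 37, matching the comment; the smallest supported
 * SPECNAMELEN of 32 still leaves VM_MIN_NAMELEN = 6, which the CTASSERT
 * above verifies at compile time.
 */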
*/ int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); /* * APIs that inspect the guest memory map require only a *single* vcpu to * be frozen. This acts like a read lock on the guest memory map since any * modification requires *all* vcpus to be frozen. */ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object **objptr); vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); int vm_inject_extint(struct vm *vm, int vcpu); int vm_extint_pending(struct vm *vm, int vcpuid); void vm_extint_clear(struct vm *vm, int vcpuid); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); int vm_suspend_cpu(struct vm *vm, int vcpu); int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * If the rendezvous is being initiated from a vcpu context then the * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. 
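/*
 * Editorial note: hypothetical usage sketch for the rendezvous API described
 * above and declared immediately below (the callback and its purpose are
 * illustrative only).  The callback must not sleep and the initiator must
 * not hold any locks.
 */
static void
example_rendezvous_cb(struct vm *vm, int vcpuid, void *arg)
{
	/* Per-vcpu work goes here, e.g. dropping a cached mapping. */
}

/*
 * From a non-vcpu thread, run the callback on every active vcpu:
 *
 *	vm_smp_rendezvous(vm, -1, vm_active_cpus(vm), example_rendezvous_cb,
 *	    NULL);
 */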
*/ typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vm *vm, int vcpuid); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. */ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vm *vm, int vcpu) { if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) return (1); else if (curthread->td_owepreempt) return (1); else return (0); } #endif void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. 
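/*
 * Editorial note (illustrative example, not part of this change): the
 * "combines nested events" rule above follows the usual x86 contributory
 * exception classes.  For instance, if a #PF is pending from a previous exit
 * and delivering it raises a #GP, the two are merged into a #DF with error
 * code zero; a further fault while delivering that #DF escalates to a triple
 * fault, which suspends/resets the virtual machine.
 */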
*/ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vm *vm, int vcpuid); #endif /* KERNEL */ #define VM_MAXCPU 16 /* maximum virtual cpus */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, + VM_CAP_BPT_EXIT, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. 
MOV) */ uint16_t op_flags; }; #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ index:4, base:4; uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ struct vie_op op; /* opcode description */ }; enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, + VM_EXITCODE_BPT, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. 
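/*
 * Editorial note: hedged sketch, not part of this change, of how a consumer
 * of this header could use the breakpoint support added here: enable
 * VM_CAP_BPT_EXIT on a vcpu and dispatch on the new VM_EXITCODE_BPT, whose
 * payload (the 'bpt' member added to this union) reports the length of the
 * #BP instruction.  The helper names are hypothetical.
 */
static int
example_enable_bpt_exit(struct vm *vm, int vcpu)
{
	return (vm_set_capability(vm, vcpu, VM_CAP_BPT_EXIT, 1));
}

static bool
example_exit_is_bpt(const struct vm_exit *vme)
{
	/* vme->u.bpt.inst_length holds the breakpoint instruction length. */
	return (vme->exitcode == VM_EXITCODE_BPT);
}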
*/ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; + struct { + int inst_length; + } bpt; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); } static __inline void vm_inject_gp(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); } static __inline void vm_inject_ac(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); } void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); int vm_restart_instruction(void *vm, int vcpuid); #endif /* _VMM_H_ */ Index: head/sys/amd64/vmm/amd/svm.c =================================================================== --- head/sys/amd64/vmm/amd/svm.c (revision 355723) +++ head/sys/amd64/vmm/amd/svm.c (revision 355724) @@ -1,2299 +1,2304 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_ktr.h" #include "vmm_ioport.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "x86.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" #include "svm_msr.h" #include "npt.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); /* * SVM CPUID function 0x8000_000A, edx bit decoding. 
*/ #define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ #define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ #define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ #define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ #define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ #define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ #define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ #define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ #define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ #define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ #define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ #define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ VMCB_CACHE_IOPM | \ VMCB_CACHE_I | \ VMCB_CACHE_TPR | \ VMCB_CACHE_CR2 | \ VMCB_CACHE_CR | \ VMCB_CACHE_DR | \ VMCB_CACHE_DT | \ VMCB_CACHE_SEG | \ VMCB_CACHE_NP) static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, 0, NULL); static MALLOC_DEFINE(M_SVM, "svm", "svm"); static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); static uint32_t svm_feature = ~0U; /* AMD SVM features. */ SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, "SVM features advertised by CPUID.8000000AH:EDX"); static int disable_npf_assist; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, &disable_npf_assist, 0, NULL); /* Maximum ASIDs supported by the processor */ static uint32_t nasid; SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, "Number of ASIDs supported by this processor"); /* Current ASID generation for each host cpu */ static struct asid asid[MAXCPU]; /* * SVM host state saved area of size 4KB for each core. */ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); static __inline int flush_by_asid(void) { return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); } static __inline int decode_assist(void) { return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); } static void svm_disable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer &= ~EFER_SVM; wrmsr(MSR_EFER, efer); } /* * Disable SVM on all CPUs. */ static int svm_cleanup(void) { smp_rendezvous(NULL, svm_disable, NULL, NULL); return (0); } /* * Verify that all the features required by bhyve are available. */ static int check_svm_features(void) { u_int regs[4]; /* CPUID Fn8000_000A is for SVM */ do_cpuid(0x8000000A, regs); svm_feature &= regs[3]; /* * The number of ASIDs can be configured to be less than what is * supported by the hardware but not more. 
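/*
 * Editorial note (illustrative numbers, not part of this change): CPUID
 * Fn8000_000A reports the supported ASID count in %ebx (regs[1] below).
 * If the hw.vmm.svm.num_asids tunable is 0 or asks for more ASIDs than the
 * hardware reports, nasid is clamped to the hardware value; asking for
 * fewer is honored as-is.
 */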
*/ if (nasid == 0 || nasid > regs[1]) nasid = regs[1]; KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); /* bhyve requires the Nested Paging feature */ if (!(svm_feature & AMD_CPUID_SVM_NP)) { printf("SVM: Nested Paging feature not available.\n"); return (ENXIO); } /* bhyve requires the NRIP Save feature */ if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { printf("SVM: NRIP Save feature not available.\n"); return (ENXIO); } return (0); } static void svm_enable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer |= EFER_SVM; wrmsr(MSR_EFER, efer); wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); } /* * Return 1 if SVM is enabled on this processor and 0 otherwise. */ static int svm_available(void) { uint64_t msr; /* Section 15.4 Enabling SVM from APM2. */ if ((amd_feature2 & AMDID2_SVM) == 0) { printf("SVM: not available.\n"); return (0); } msr = rdmsr(MSR_VM_CR); if ((msr & VM_CR_SVMDIS) != 0) { printf("SVM: disabled by BIOS.\n"); return (0); } return (1); } static int svm_init(int ipinum) { int error, cpu; if (!svm_available()) return (ENXIO); error = check_svm_features(); if (error) return (error); vmcb_clean &= VMCB_CACHE_DEFAULT; for (cpu = 0; cpu < MAXCPU; cpu++) { /* * Initialize the host ASIDs to their "highest" valid values. * * The next ASID allocation will rollover both 'gen' and 'num' * and start off the sequence at {1,1}. */ asid[cpu].gen = ~0UL; asid[cpu].num = nasid - 1; } svm_msr_init(); svm_npt_init(ipinum); /* Enable SVM on all CPUs */ smp_rendezvous(NULL, svm_enable, NULL, NULL); return (0); } static void svm_restore(void) { svm_enable(NULL); } /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 #define MSR_PENTIUM_END 0x1FFF /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000UL #define MSR_AMD6TH_END 0xC0001FFFUL /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000UL #define MSR_AMD7TH_END 0xC0011FFFUL /* * Get the index and bit position for a MSR in permission bitmap. * Two bits are used for each MSR: lower bit for read and higher bit for write. */ static int svm_msr_index(uint64_t msr, int *index, int *bit) { uint32_t base, off; *index = -1; *bit = (msr % 4) * 2; base = 0; if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) { *index = msr / 4; return (0); } base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { off = (msr - MSR_AMD6TH_START); *index = (off + base) / 4; return (0); } base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { off = (msr - MSR_AMD7TH_START); *index = (off + base) / 4; return (0); } return (EINVAL); } /* * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. 
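/*
 * Editorial note (worked example, not part of this change): for MSR_EFER
 * (0xC0000080), svm_msr_index() above computes
 *
 *	off   = 0xC0000080 - MSR_AMD6TH_START = 0x80
 *	index = (0x80 + 0x2000) / 4 = 2080
 *	bit   = (0xC0000080 % 4) * 2 = 0
 *
 * i.e. byte 2080 of the permission bitmap, with bit 0 controlling reads and
 * bit 1 controlling writes.  svm_msr_perm() below grants direct access by
 * *clearing* those bits.
 */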
*/ static void svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) { int index, bit, error; error = svm_msr_index(msr, &index, &bit); KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, ("%s: invalid index %d for msr %#lx", __func__, index, msr)); KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " "msr %#lx", __func__, bit, msr)); if (read) perm_bitmap[index] &= ~(1UL << bit); if (write) perm_bitmap[index] &= ~(2UL << bit); } static void svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, true); } static void svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, false); } static __inline int svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) { struct vmcb_ctrl *ctrl; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); return (ctrl->intercept[idx] & bitmask ? 1 : 0); } static __inline void svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, int enabled) { struct vmcb_ctrl *ctrl; uint32_t oldval; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intercept[idx]; if (enabled) ctrl->intercept[idx] |= bitmask; else ctrl->intercept[idx] &= ~bitmask; if (ctrl->intercept[idx] != oldval) { svm_set_dirty(sc, vcpu, VMCB_CACHE_I); VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); } } static __inline void svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 0); } static __inline void svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 1); } static void vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, uint64_t msrpm_base_pa, uint64_t np_pml4) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; uint32_t mask; int n; ctrl = svm_get_vmcb_ctrl(sc, vcpu); state = svm_get_vmcb_state(sc, vcpu); ctrl->iopm_base_pa = iopm_base_pa; ctrl->msrpm_base_pa = msrpm_base_pa; /* Enable nested paging */ ctrl->np_enable = 1; ctrl->n_cr3 = np_pml4; /* * Intercept accesses to the control registers that are not shadowed * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. */ for (n = 0; n < 16; n++) { mask = (BIT(n) << 16) | BIT(n); if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); else svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); } /* * Intercept everything when tracing guest exceptions otherwise * just intercept machine check exception. */ if (vcpu_trace_exceptions(sc->vm, vcpu)) { for (n = 0; n < 32; n++) { /* * Skip unimplemented vectors in the exception bitmap. */ if (n == 2 || n == 9) { continue; } svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); } } else { svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); } /* Intercept various events (for e.g. 
I/O, MSR and CPUID accesses) */ svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_FERR_FREEZE); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); /* * From section "Canonicalization and Consistency Checks" in APMv2 * the VMRUN intercept bit must be set to pass the consistency check. */ svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); /* * The ASID will be set to a non-zero value just before VMRUN. */ ctrl->asid = 0; /* * Section 15.21.1, Interrupt Masking in EFLAGS * Section 15.21.2, Virtualizing APIC.TPR * * This must be set for %rflag and %cr8 isolation of guest and host. */ ctrl->v_intr_masking = 1; /* Enable Last Branch Record aka LBR for debugging */ ctrl->lbr_virt_en = 1; state->dbgctl = BIT(0); /* EFER_SVM must always be set when the guest is executing */ state->efer = EFER_SVM; /* Set up the PAT to power-on state */ state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Set up DR6/7 to power-on state */ state->dr6 = DBREG_DR6_RESERVED1; state->dr7 = DBREG_DR7_RESERVED1; } /* * Initialize a virtual machine. */ static void * svm_vminit(struct vm *vm, pmap_t pmap) { struct svm_softc *svm_sc; struct svm_vcpu *vcpu; vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; int i; uint16_t maxcpus; svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); if (((uintptr_t)svm_sc & PAGE_MASK) != 0) panic("malloc of svm_softc not aligned on page boundary"); svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); if (svm_sc->msr_bitmap == NULL) panic("contigmalloc of SVM MSR bitmap failed"); svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); if (svm_sc->iopm_bitmap == NULL) panic("contigmalloc of SVM IO bitmap failed"); svm_sc->vm = vm; svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); /* * Intercept read and write accesses to all MSRs. */ memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); /* * Access to the following MSRs is redirected to the VMCB when the * guest is executing. Therefore it is safe to allow the guest to * read/write these MSRs directly without hypervisor involvement. 
*/ svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); /* * Intercept writes to make sure that the EFER_SVM bit is not cleared. */ svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); /* Intercept access to all I/O ports. */ memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); iopm_pa = vtophys(svm_sc->iopm_bitmap); msrpm_pa = vtophys(svm_sc->msr_bitmap); pml4_pa = svm_sc->nptp; maxcpus = vm_get_maxcpus(svm_sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = svm_get_vcpu(svm_sc, i); vcpu->nextrip = ~0; vcpu->lastcpu = NOCPU; vcpu->vmcb_pa = vtophys(&vcpu->vmcb); vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); svm_msr_guest_init(svm_sc, i); } return (svm_sc); } /* * Collateral for a generic SVM VM-exit. */ static void vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) { vme->exitcode = VM_EXITCODE_SVM; vme->u.svm.exitcode = code; vme->u.svm.exitinfo1 = info1; vme->u.svm.exitinfo2 = info2; } static int svm_cpl(struct vmcb_state *state) { /* * From APMv2: * "Retrieve the CPL from the CPL field in the VMCB, not * from any segment DPL" */ return (state->cpl); } static enum vm_cpu_mode svm_vcpu_mode(struct vmcb *vmcb) { struct vmcb_segment seg; struct vmcb_state *state; int error; state = &vmcb->state; if (state->efer & EFER_LMA) { error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, error)); /* * Section 4.8.1 for APM2, check if Code Segment has * Long attribute set in descriptor. */ if (seg.attrib & VMCB_CS_ATTRIB_L) return (CPU_MODE_64BIT); else return (CPU_MODE_COMPATIBILITY); } else if (state->cr0 & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) { if ((cr0 & CR0_PG) == 0) return (PAGING_MODE_FLAT); if ((cr4 & CR4_PAE) == 0) return (PAGING_MODE_32); if (efer & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } /* * ins/outs utility routines */ static uint64_t svm_inout_str_index(struct svm_regctx *regs, int in) { uint64_t val; val = in ? regs->sctx_rdi : regs->sctx_rsi; return (val); } static uint64_t svm_inout_str_count(struct svm_regctx *regs, int rep) { uint64_t val; val = rep ? 
regs->sctx_rcx : 1; return (val); } static void svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { /* The segment field has standard encoding */ s = (info1 >> 10) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); } static int svm_inout_str_addrsize(uint64_t info1) { uint32_t size; size = (info1 >> 7) & 0x7; switch (size) { case 1: return (2); /* 16 bit */ case 2: return (4); /* 32 bit */ case 4: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) { struct vmcb_state *state; state = &vmcb->state; paging->cr3 = state->cr3; paging->cpl = svm_cpl(state); paging->cpu_mode = svm_vcpu_mode(vmcb); paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, state->efer); } #define UNHANDLED 0 /* * Handle guest I/O intercept. */ static int svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_regctx *regs; struct vm_inout_str *vis; uint64_t info1; int inout_string; state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); regs = svm_get_guest_regctx(svm_sc, vcpu); info1 = ctrl->exitinfo1; inout_string = info1 & BIT(2) ? 1 : 0; /* * The effective segment number in EXITINFO1[12:10] is populated * only if the processor has the DecodeAssist capability. * * XXX this is not specified explicitly in APMv2 but can be verified * empirically. */ if (inout_string && !decode_assist()) return (UNHANDLED); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; vmexit->u.inout.string = inout_string; vmexit->u.inout.rep = (info1 & BIT(3)) ? 
1 : 0; vmexit->u.inout.bytes = (info1 >> 4) & 0x7; vmexit->u.inout.port = (uint16_t)(info1 >> 16); vmexit->u.inout.eax = (uint32_t)(state->rax); if (inout_string) { vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); vis->rflags = state->rflags; vis->cr0 = state->cr0; vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); vis->addrsize = svm_inout_str_addrsize(info1); svm_inout_str_seginfo(svm_sc, vcpu, info1, vmexit->u.inout.in, vis); } return (UNHANDLED); } static int npf_fault_type(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_W) return (VM_PROT_WRITE); else if (exitinfo1 & VMCB_NPF_INFO1_ID) return (VM_PROT_EXECUTE); else return (VM_PROT_READ); } static bool svm_npf_emul_fault(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_ID) { return (false); } if (exitinfo1 & VMCB_NPF_INFO1_GPT) { return (false); } if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { return (false); } return (true); } static void svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) { struct vm_guest_paging *paging; struct vmcb_segment seg; struct vmcb_ctrl *ctrl; char *inst_bytes; int error, inst_len; ctrl = &vmcb->ctrl; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = VIE_INVALID_GLA; svm_paging_info(vmcb, paging); error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); switch(paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = seg.base; vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = seg.base; /* * Section 4.8.1 of APM2, Default Operand Size or D bit. */ vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? 1 : 0; break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } /* * Copy the instruction bytes into 'vie' if available. */ if (decode_assist() && !disable_npf_assist) { inst_len = ctrl->inst_len; inst_bytes = ctrl->inst_bytes; } else { inst_len = 0; inst_bytes = NULL; } vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); } #ifdef KTR static const char * intrtype_to_str(int intr_type) { switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: return ("hwintr"); case VMCB_EVENTINJ_TYPE_NMI: return ("nmi"); case VMCB_EVENTINJ_TYPE_INTn: return ("swintr"); case VMCB_EVENTINJ_TYPE_EXCEPTION: return ("exception"); default: panic("%s: unknown intr_type %d", __func__, intr_type); } } #endif /* * Inject an event to vcpu as described in section 15.20, "Event injection". 
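/*
 * Editorial note (worked example, not part of this change): using the
 * expressions in svm_eventinject() below, injecting a #GP with error code
 * 0x10 builds
 *
 *	eventinj = IDT_GP | (VMCB_EVENTINJ_TYPE_EXCEPTION << 8) |
 *	    VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_EC_VALID |
 *	    ((uint64_t)0x10 << 32);
 *
 * i.e. the vector in the low byte, the event type starting at bit 8, and
 * the error code in the upper 32 bits of the field.
 */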
*/ static void svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, uint32_t error, bool ec_valid) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event already pending %#lx", __func__, ctrl->eventinj)); KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", __func__, vector)); switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: case VMCB_EVENTINJ_TYPE_NMI: case VMCB_EVENTINJ_TYPE_INTn: break; case VMCB_EVENTINJ_TYPE_EXCEPTION: if (vector >= 0 && vector <= 31 && vector != 2) break; /* FALLTHROUGH */ default: panic("%s: invalid intr_type/vector: %d/%d", __func__, intr_type, vector); } ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; if (ec_valid) { ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; ctrl->eventinj |= (uint64_t)error << 32; VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", intrtype_to_str(intr_type), vector, error); } else { VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", intrtype_to_str(intr_type), vector); } } static void svm_update_virqinfo(struct svm_softc *sc, int vcpu) { struct vm *vm; struct vlapic *vlapic; struct vmcb_ctrl *ctrl; vm = sc->vm; vlapic = vm_lapic(vm, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); /* Update %cr8 in the emulated vlapic */ vlapic_set_cr8(vlapic, ctrl->v_tpr); /* Virtual interrupt injection is not used. */ KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " "v_intr_vector %d", __func__, ctrl->v_intr_vector)); } static void svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) { struct vmcb_ctrl *ctrl; uint64_t intinfo; ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); intinfo = ctrl->exitintinfo; if (!VMCB_EXITINTINFO_VALID(intinfo)) return; /* * From APMv2, Section "Intercepts during IDT interrupt delivery" * * If a #VMEXIT happened during event delivery then record the event * that was being delivered. 
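/*
 * Editorial note (illustrative scenario, not part of this change): if an
 * external interrupt is being delivered through the guest IDT and the page
 * holding the IDT is not yet mapped in the nested page tables, the resulting
 * #VMEXIT(NPF) arrives with a valid EXITINTINFO.  svm_save_intinfo() here
 * stashes that event via vm_exit_intinfo() so it is re-injected on the next
 * VM entry instead of being lost.
 */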
*/ VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); } #ifdef INVARIANTS static __inline int vintr_intercept_enabled(struct svm_softc *sc, int vcpu) { return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR)); } #endif static __inline void enable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); KASSERT(vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be enabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); ctrl->v_irq = 1; ctrl->v_ign_tpr = 1; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static __inline void disable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(!vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be disabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); ctrl->v_irq = 0; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static int svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val) { struct vmcb_ctrl *ctrl; int oldval, newval; ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intr_shadow; newval = val ? 1 : 0; if (newval != oldval) { ctrl->intr_shadow = newval; VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval); } return (0); } static int svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); *val = ctrl->intr_shadow; return (0); } /* * Once an NMI is injected it blocks delivery of further NMIs until the handler * executes an IRET. The IRET intercept is enabled when an NMI is injected to * to track when the vcpu is done handling the NMI. */ static int nmi_blocked(struct svm_softc *sc, int vcpu) { int blocked; blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); return (blocked); } static void enable_nmi_blocking(struct svm_softc *sc, int vcpu) { KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); } static void clear_nmi_blocking(struct svm_softc *sc, int vcpu) { int error; KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); /* * When the IRET intercept is cleared the vcpu will attempt to execute * the "iret" when it runs next. However, it is possible to inject * another NMI into the vcpu before the "iret" has actually executed. * * For e.g. if the "iret" encounters a #NPF when accessing the stack * it will trap back into the hypervisor. If an NMI is pending for * the vcpu it will be injected into the guest. * * XXX this needs to be fixed */ svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); /* * Set 'intr_shadow' to prevent an NMI from being injected on the * immediate VMRUN. 
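/*
 * Editorial note (summary of the mechanism above, not part of this change):
 * the virtual-NMI blocking window is bracketed as follows.  An NMI is
 * injected and the IRET intercept is enabled (enable_nmi_blocking); when the
 * guest later executes "iret" the #VMEXIT(IRET) is taken, the intercept is
 * cleared and intr_shadow is set (clear_nmi_blocking), so the very next
 * VMRUN cannot inject another NMI before the "iret" has actually executed.
 */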
*/ error = svm_modify_intr_shadow(sc, vcpu, 1); KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); } #define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL static int svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) { struct vm_exit *vme; struct vmcb_state *state; uint64_t changed, lma, oldval; int error; state = svm_get_vmcb_state(sc, vcpu); oldval = state->efer; VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ changed = oldval ^ newval; if (newval & EFER_MBZ_BITS) goto gpf; /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ if (changed & EFER_LME) { if (state->cr0 & CR0_PG) goto gpf; } /* EFER.LMA = EFER.LME & CR0.PG */ if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) lma = EFER_LMA; else lma = 0; if ((newval & EFER_LMA) != lma) goto gpf; if (newval & EFER_NXE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) goto gpf; } /* * XXX bhyve does not enforce segment limits in 64-bit mode. Until * this is fixed flag guest attempt to set EFER_LMSLE as an error. */ if (newval & EFER_LMSLE) { vme = vm_exitinfo(sc->vm, vcpu); vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); *retu = true; return (0); } if (newval & EFER_FFXSR) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) goto gpf; } if (newval & EFER_TCE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) goto gpf; } error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); return (0); gpf: vm_inject_gp(sc->vm, vcpu); return (0); } static int emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); else if (num == MSR_EFER) error = svm_write_efer(sc, vcpu, val, retu); else error = svm_wrmsr(sc, vcpu, num, val, retu); return (error); } static int emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) { struct vmcb_state *state; struct svm_regctx *ctx; uint64_t result; int error; if (lapic_msr(num)) error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); else error = svm_rdmsr(sc, vcpu, num, &result, retu); if (error == 0) { state = svm_get_vmcb_state(sc, vcpu); ctx = svm_get_guest_regctx(sc, vcpu); state->rax = result & 0xffffffff; ctx->sctx_rdx = result >> 32; } return (error); } #ifdef KTR static const char * exit_reason_to_str(uint64_t reason) { static char reasonbuf[32]; switch (reason) { case VMCB_EXIT_INVALID: return ("invalvmcb"); case VMCB_EXIT_SHUTDOWN: return ("shutdown"); case VMCB_EXIT_NPF: return ("nptfault"); case VMCB_EXIT_PAUSE: return ("pause"); case VMCB_EXIT_HLT: return ("hlt"); case VMCB_EXIT_CPUID: return ("cpuid"); case VMCB_EXIT_IO: return ("inout"); case VMCB_EXIT_MC: return ("mchk"); case VMCB_EXIT_INTR: return ("extintr"); case VMCB_EXIT_NMI: return ("nmi"); case VMCB_EXIT_VINTR: return ("vintr"); case VMCB_EXIT_MSR: return ("msr"); case VMCB_EXIT_IRET: return ("iret"); case VMCB_EXIT_MONITOR: return ("monitor"); case VMCB_EXIT_MWAIT: return ("mwait"); default: snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); return (reasonbuf); } } #endif /* KTR */ /* * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs * that are due to instruction intercepts as well as MSR and IOIO intercepts * and exceptions caused by INT3, INTO and BOUND instructions. * * Return 1 if the nRIP is valid and 0 otherwise. */ static int nrip_valid(uint64_t exitcode) { switch (exitcode) { case 0x00 ... 
0x0F: /* read of CR0 through CR15 */ case 0x10 ... 0x1F: /* write of CR0 through CR15 */ case 0x20 ... 0x2F: /* read of DR0 through DR15 */ case 0x30 ... 0x3F: /* write of DR0 through DR15 */ case 0x43: /* INT3 */ case 0x44: /* INTO */ case 0x45: /* BOUND */ case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ return (1); default: return (0); } } static int svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct svm_regctx *ctx; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; int error, errcode_valid, handled, idtvec, reflect; bool retu; ctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb = svm_get_vmcb(svm_sc, vcpu); state = &vmcb->state; ctrl = &vmcb->ctrl; handled = 0; code = ctrl->exitcode; info1 = ctrl->exitinfo1; info2 = ctrl->exitinfo2; vmexit->exitcode = VM_EXITCODE_BOGUS; vmexit->rip = state->rip; vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); /* * #VMEXIT(INVALID) needs to be handled early because the VMCB is * in an inconsistent state and can trigger assertions that would * never happen otherwise. */ if (code == VMCB_EXIT_INVALID) { vm_exit_svm(vmexit, code, info1, info2); return (0); } KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " "injection valid bit is set %#lx", __func__, ctrl->eventinj)); KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", vmexit->inst_length, code, info1, info2)); svm_update_virqinfo(svm_sc, vcpu); svm_save_intinfo(svm_sc, vcpu); switch (code) { case VMCB_EXIT_IRET: /* * Restart execution at "iret" but with the intercept cleared. */ vmexit->inst_length = 0; clear_nmi_blocking(svm_sc, vcpu); handled = 1; break; case VMCB_EXIT_VINTR: /* interrupt window exiting */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); handled = 1; break; case VMCB_EXIT_INTR: /* external interrupt */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); handled = 1; break; case VMCB_EXIT_NMI: /* external NMI */ handled = 1; break; case 0x40 ... 0x5F: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); reflect = 1; idtvec = code - 0x40; switch (idtvec) { case IDT_MC: /* * Call the machine check handler by hand. Also don't * reflect the machine check back into the guest. */ reflect = 0; VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); break; case IDT_PF: error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, info2); KASSERT(error == 0, ("%s: error %d updating cr2", __func__, error)); /* fallthru */ case IDT_NP: case IDT_SS: case IDT_GP: case IDT_AC: case IDT_TS: errcode_valid = 1; break; case IDT_DF: errcode_valid = 1; info1 = 0; break; case IDT_BP: case IDT_OF: case IDT_BR: /* * The 'nrip' field is populated for INT3, INTO and * BOUND exceptions and this also implies that * 'inst_length' is non-zero. * * Reset 'inst_length' to zero so the guest %rip at * event injection is identical to what it was when * the exception originally happened. 
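/*
 * Editorial note (worked example, not part of this change): for a one-byte
 * "int3" at guest %rip 0x1000, nRIP is saved as 0x1001, so the generic code
 * above computes inst_length = 1.  Zeroing inst_length here keeps the
 * "handled" path at the end of svm_vmexit() from advancing %rip, so the #BP
 * is reflected into the guest with %rip still at 0x1000.
 */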
*/ VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " "to zero before injecting exception %d", vmexit->inst_length, idtvec); vmexit->inst_length = 0; /* fallthru */ default: errcode_valid = 0; info1 = 0; break; } KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " "when reflecting exception %d into guest", vmexit->inst_length, idtvec)); if (reflect) { /* Reflect the exception back into the guest */ VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " "%d/%#x into the guest", idtvec, (int)info1); error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, errcode_valid, info1, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); } handled = 1; break; case VMCB_EXIT_MSR: /* MSR access. */ eax = state->rax; ecx = ctx->sctx_rcx; edx = ctx->sctx_rdx; retu = false; if (info1) { vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); val = (uint64_t)edx << 32 | eax; VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", ecx, val); if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = val; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } } else { VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } } break; case VMCB_EXIT_IO: handled = svm_handle_io(svm_sc, vcpu, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); break; case VMCB_EXIT_CPUID: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); handled = x86_emulate_cpuid(svm_sc->vm, vcpu, (uint32_t *)&state->rax, (uint32_t *)&ctx->sctx_rbx, (uint32_t *)&ctx->sctx_rcx, (uint32_t *)&ctx->sctx_rdx); break; case VMCB_EXIT_HLT: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = state->rflags; break; case VMCB_EXIT_PAUSE: vmexit->exitcode = VM_EXITCODE_PAUSE; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); break; case VMCB_EXIT_NPF: /* EXITINFO2 contains the faulting guest physical address */ if (info1 & VMCB_NPF_INFO1_RSV) { VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " "reserved bits set: info1(%#lx) info2(%#lx)", info1, info2); } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = info2; vmexit->u.paging.fault_type = npf_fault_type(info1); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " "on gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } else if (svm_npf_emul_fault(info1)) { svm_handle_inst_emul(vmcb, info2, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " "for gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } break; case VMCB_EXIT_MONITOR: vmexit->exitcode = VM_EXITCODE_MONITOR; break; case VMCB_EXIT_MWAIT: vmexit->exitcode = VM_EXITCODE_MWAIT; break; default: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", handled ? 
"handled" : "unhandled", exit_reason_to_str(code), vmexit->rip, vmexit->inst_length); if (handled) { vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; state->rip = vmexit->rip; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic SVM exit. */ vm_exit_svm(vmexit, code, info1, info2); } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } return (handled); } static void svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) { uint64_t intinfo; if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) return; KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " "valid: %#lx", __func__, intinfo)); svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), VMCB_EXITINTINFO_VECTOR(intinfo), VMCB_EXITINTINFO_EC(intinfo), VMCB_EXITINTINFO_EC_VALID(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); } /* * Inject event to virtual cpu. */ static void svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_vcpu *vcpustate; uint8_t v_tpr; int vector, need_intr_window; int extint_pending; state = svm_get_vmcb_state(sc, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); vcpustate = svm_get_vcpu(sc, vcpu); need_intr_window = 0; if (vcpustate->nextrip != state->rip) { ctrl->intr_shadow = 0; VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vcpustate->nextrip, state->rip); } /* * Inject pending events or exceptions for this vcpu. * * An event might be pending because the previous #VMEXIT happened * during event delivery (i.e. ctrl->exitintinfo). * * An event might also be pending because an exception was injected * by the hypervisor (e.g. #PF during instruction emulation). */ svm_inj_intinfo(sc, vcpu); /* NMI event has priority over interrupts. */ if (vm_nmi_pending(sc->vm, vcpu)) { if (nmi_blocked(sc, vcpu)) { /* * Can't inject another NMI if the guest has not * yet executed an "iret" after the last NMI. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " "to NMI-blocking"); } else if (ctrl->intr_shadow) { /* * Can't inject an NMI if the vcpu is in an intr_shadow. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " "interrupt shadow"); need_intr_window = 1; goto done; } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { /* * If there is already an exception/interrupt pending * then defer the NMI until after that. */ VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " "eventinj %#lx", ctrl->eventinj); /* * Use self-IPI to trigger a VM-exit as soon as * possible after the event injection is completed. * * This works only if the external interrupt exiting * is at a lower priority than the event injection. * * Although not explicitly specified in APMv2 the * relative priorities were verified empirically. */ ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? 
*/ } else { vm_nmi_clear(sc->vm, vcpu); /* Inject NMI, vector number is not used */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, IDT_NMI, 0, false); /* virtual NMI blocking is now in effect */ enable_nmi_blocking(sc, vcpu); VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); } } extint_pending = vm_extint_pending(sc->vm, vcpu); if (!extint_pending) { if (!vlapic_pending_intr(vlapic, &vector)) goto done; KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(sc->vm, &vector); KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* * If the guest has disabled interrupts or is in an interrupt shadow * then we cannot inject the pending interrupt. */ if ((state->rflags & PSL_I) == 0) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, state->rflags); need_intr_window = 1; goto done; } if (ctrl->intr_shadow) { VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " "interrupt shadow", vector); need_intr_window = 1; goto done; } if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "eventinj %#lx", vector, ctrl->eventinj); need_intr_window = 1; goto done; } svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); if (!extint_pending) { vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(sc->vm, vcpu); vatpic_intr_accepted(sc->vm, vector); } /* * Force a VM-exit as soon as the vcpu is ready to accept another * interrupt. This is done because the PIC might have another vector * that it wants to inject. Also, if the APIC has a pending interrupt * that was preempted by the ExtInt then it allows us to inject the * APIC vector as soon as possible. */ need_intr_window = 1; done: /* * The guest can modify the TPR by writing to %CR8. In guest mode * the processor reflects this write to V_TPR without hypervisor * intervention. * * The guest can also modify the TPR by writing to it via the memory * mapped APIC page. In this case, the write will be emulated by the * hypervisor. For this reason V_TPR must be updated before every * VMRUN. */ v_tpr = vlapic_get_cr8(vlapic); KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); if (ctrl->v_tpr != v_tpr) { VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", ctrl->v_tpr, v_tpr); ctrl->v_tpr = v_tpr; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } if (need_intr_window) { /* * We use V_IRQ in conjunction with the VINTR intercept to * trap into the hypervisor as soon as a virtual interrupt * can be delivered. * * Since injected events are not subject to intercept checks * we need to ensure that the V_IRQ is not actually going to * be delivered on VM entry. The KASSERT below enforces this. */ KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, ("Bogus intr_window_exiting: eventinj (%#lx), " "intr_shadow (%u), rflags (%#lx)", ctrl->eventinj, ctrl->intr_shadow, state->rflags)); enable_intr_window_exiting(sc, vcpu); } else { disable_intr_window_exiting(sc, vcpu); } } static __inline void restore_host_tss(void) { struct system_segment_descriptor *tss_sd; /* * The TSS descriptor was in use prior to launching the guest so it * has been marked busy. * * 'ltr' requires the descriptor to be marked available so change the * type to "64-bit available TSS". 
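 *
 * Editor's note (illustrative, not part of this change): only the type
 * field of the descriptor needs fixing; roughly
 *     SDT_SYSBSY  - 64-bit busy TSS, the state of a TSS that is loaded
 *     SDT_SYSTSS  - 64-bit available TSS, the state 'ltr' insists on
 * so the code below rewrites sd_type to SDT_SYSTSS and then reloads the
 * task register with ltr(GSEL(GPROC0_SEL, SEL_KPL)).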
*/ tss_sd = PCPU_GET(tss); tss_sd->sd_type = SDT_SYSTSS; ltr(GSEL(GPROC0_SEL, SEL_KPL)); } static void check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) { struct svm_vcpu *vcpustate; struct vmcb_ctrl *ctrl; long eptgen; bool alloc_asid; KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " "active on cpu %u", __func__, thiscpu)); vcpustate = svm_get_vcpu(sc, vcpuid); ctrl = svm_get_vmcb_ctrl(sc, vcpuid); /* * The TLB entries associated with the vcpu's ASID are not valid * if either of the following conditions is true: * * 1. The vcpu's ASID generation is different than the host cpu's * ASID generation. This happens when the vcpu migrates to a new * host cpu. It can also happen when the number of vcpus executing * on a host cpu is greater than the number of ASIDs available. * * 2. The pmap generation number is different than the value cached in * the 'vcpustate'. This happens when the host invalidates pages * belonging to the guest. * * asidgen eptgen Action * mismatch mismatch * 0 0 (a) * 0 1 (b1) or (b2) * 1 0 (c) * 1 1 (d) * * (a) There is no mismatch in eptgen or ASID generation and therefore * no further action is needed. * * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is * retained and the TLB entries associated with this ASID * are flushed by VMRUN. * * (b2) If the cpu does not support FlushByAsid then a new ASID is * allocated. * * (c) A new ASID is allocated. * * (d) A new ASID is allocated. */ alloc_asid = false; eptgen = pmap->pm_eptgen; ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; if (vcpustate->asid.gen != asid[thiscpu].gen) { alloc_asid = true; /* (c) and (d) */ } else if (vcpustate->eptgen != eptgen) { if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ else alloc_asid = true; /* (b2) */ } else { /* * This is the common case (a). */ KASSERT(!alloc_asid, ("ASID allocation not necessary")); KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); } if (alloc_asid) { if (++asid[thiscpu].num >= nasid) { asid[thiscpu].num = 1; if (++asid[thiscpu].gen == 0) asid[thiscpu].gen = 1; /* * If this cpu does not support "flush-by-asid" * then flush the entire TLB on a generation * bump. Subsequent ASID allocation in this * generation can be done without a TLB flush. */ if (!flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; } vcpustate->asid.gen = asid[thiscpu].gen; vcpustate->asid.num = asid[thiscpu].num; ctrl->asid = vcpustate->asid.num; svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); /* * If this cpu supports "flush-by-asid" then the TLB * was not flushed after the generation bump. The TLB * is flushed selectively after every new ASID allocation. */ if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; } vcpustate->eptgen = eptgen; KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); KASSERT(ctrl->asid == vcpustate->asid.num, ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); } static __inline void disable_gintr(void) { __asm __volatile("clgi"); } static __inline void enable_gintr(void) { __asm __volatile("stgi"); } static __inline void svm_dr_enter_guest(struct svm_regctx *gctx) { /* Save host control debug registers. */ gctx->host_dr7 = rdr7(); gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); /* * Disable debugging in DR7 and DEBUGCTL to avoid triggering * exceptions in the host based on the guest DRx values. The * guest DR6, DR7, and DEBUGCTL are saved/restored in the * VMCB. */ load_dr7(0); wrmsr(MSR_DEBUGCTLMSR, 0); /* Save host debug registers. 
*/ gctx->host_dr0 = rdr0(); gctx->host_dr1 = rdr1(); gctx->host_dr2 = rdr2(); gctx->host_dr3 = rdr3(); gctx->host_dr6 = rdr6(); /* Restore guest debug registers. */ load_dr0(gctx->sctx_dr0); load_dr1(gctx->sctx_dr1); load_dr2(gctx->sctx_dr2); load_dr3(gctx->sctx_dr3); } static __inline void svm_dr_leave_guest(struct svm_regctx *gctx) { /* Save guest debug registers. */ gctx->sctx_dr0 = rdr0(); gctx->sctx_dr1 = rdr1(); gctx->sctx_dr2 = rdr2(); gctx->sctx_dr3 = rdr3(); /* * Restore host debug registers. Restore DR7 and DEBUGCTL * last. */ load_dr0(gctx->host_dr0); load_dr1(gctx->host_dr1); load_dr2(gctx->host_dr2); load_dr3(gctx->host_dr3); load_dr6(gctx->host_dr6); wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); load_dr7(gctx->host_dr7); } /* * Start vcpu with specified RIP. */ static int svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { struct svm_regctx *gctx; struct svm_softc *svm_sc; struct svm_vcpu *vcpustate; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct vm_exit *vmexit; struct vlapic *vlapic; struct vm *vm; uint64_t vmcb_pa; int handled; uint16_t ldt_sel; svm_sc = arg; vm = svm_sc->vm; vcpustate = svm_get_vcpu(svm_sc, vcpu); state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); vmexit = vm_exitinfo(vm, vcpu); vlapic = vm_lapic(vm, vcpu); gctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; if (vcpustate->lastcpu != curcpu) { /* * Force new ASID allocation by invalidating the generation. */ vcpustate->asid.gen = 0; /* * Invalidate the VMCB state cache by marking all fields dirty. */ svm_set_dirty(svm_sc, vcpu, 0xffffffff); /* * XXX * Setting 'vcpustate->lastcpu' here is bit premature because * we may return from this function without actually executing * the VMRUN instruction. This could happen if a rendezvous * or an AST is pending on the first time through the loop. * * This works for now but any new side-effects of vcpu * migration should take this case into account. */ vcpustate->lastcpu = curcpu; vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); } svm_msr_guest_enter(svm_sc, vcpu); /* Update Guest RIP */ state->rip = rip; do { /* * Disable global interrupts to guarantee atomicity during * loading of guest state. This includes not only the state * loaded by the "vmrun" instruction but also software state * maintained by the hypervisor: suspended and rendezvous * state, NPT generation number, vlapic interrupts etc. */ disable_gintr(); if (vcpu_suspended(evinfo)) { enable_gintr(); vm_exit_suspended(vm, vcpu, state->rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_gintr(); vm_exit_rendezvous(vm, vcpu, state->rip); break; } if (vcpu_reqidle(evinfo)) { enable_gintr(); vm_exit_reqidle(vm, vcpu, state->rip); break; } /* We are asked to give the cpu by scheduler. */ if (vcpu_should_yield(vm, vcpu)) { enable_gintr(); vm_exit_astpending(vm, vcpu, state->rip); break; } if (vcpu_debugged(vm, vcpu)) { enable_gintr(); vm_exit_debug(vm, vcpu, state->rip); break; } /* * #VMEXIT resumes the host with the guest LDTR, so * save the current LDT selector so it can be restored * after an exit. The userspace hypervisor probably * doesn't use a LDT, but save and restore it to be * safe. */ ldt_sel = sldt(); svm_inj_interrupts(svm_sc, vcpu, vlapic); /* Activate the nested pmap on 'curcpu' */ CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); /* * Check the pmap generation and the ASID generation to * ensure that the vcpu does not use stale TLB mappings. 
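 *
 * Editor's sketch of the freshness test done by check_asid(), shown here
 * for illustration only (the cases match its (a)..(d) table):
 *     if (vcpustate->asid.gen != asid[curcpu].gen)
 *             allocate a new ASID;                           (c), (d)
 *     else if (vcpustate->eptgen != pmap->pm_eptgen)
 *             flush by ASID, or allocate one if unsupported; (b1), (b2)
 *     else
 *             keep the ASID and skip the TLB flush.          (a)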
check_asid(svm_sc, vcpu, pmap, curcpu); ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; vcpustate->dirty = 0; VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); /* Launch Virtual Machine. */ VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); svm_dr_enter_guest(gctx); svm_launch(vmcb_pa, gctx, get_pcpu()); svm_dr_leave_guest(gctx); CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); /* * The host GDTR and IDTR is saved by VMRUN and restored * automatically on #VMEXIT. However, the host TSS needs * to be restored explicitly. */ restore_host_tss(); /* Restore host LDTR. */ lldt(ldt_sel); /* #VMEXIT disables interrupts so re-enable them here. */ enable_gintr(); /* Update 'nextrip' */ vcpustate->nextrip = state->rip; /* Handle #VMEXIT and if required return to user space. */ handled = svm_vmexit(svm_sc, vcpu, vmexit); } while (handled); svm_msr_guest_exit(svm_sc, vcpu); return (0); } static void svm_vmcleanup(void *arg) { struct svm_softc *sc = arg; contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM); contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM); free(sc, M_SVM); } static register_t * swctx_regptr(struct svm_regctx *regctx, int reg) { switch (reg) { case VM_REG_GUEST_RBX: return (&regctx->sctx_rbx); case VM_REG_GUEST_RCX: return (&regctx->sctx_rcx); case VM_REG_GUEST_RDX: return (&regctx->sctx_rdx); case VM_REG_GUEST_RDI: return (&regctx->sctx_rdi); case VM_REG_GUEST_RSI: return (&regctx->sctx_rsi); case VM_REG_GUEST_RBP: return (&regctx->sctx_rbp); case VM_REG_GUEST_R8: return (&regctx->sctx_r8); case VM_REG_GUEST_R9: return (&regctx->sctx_r9); case VM_REG_GUEST_R10: return (&regctx->sctx_r10); case VM_REG_GUEST_R11: return (&regctx->sctx_r11); case VM_REG_GUEST_R12: return (&regctx->sctx_r12); case VM_REG_GUEST_R13: return (&regctx->sctx_r13); case VM_REG_GUEST_R14: return (&regctx->sctx_r14); case VM_REG_GUEST_R15: return (&regctx->sctx_r15); case VM_REG_GUEST_DR0: return (&regctx->sctx_dr0); case VM_REG_GUEST_DR1: return (&regctx->sctx_dr1); case VM_REG_GUEST_DR2: return (&regctx->sctx_dr2); case VM_REG_GUEST_DR3: return (&regctx->sctx_dr3); default: return (NULL); } } static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_get_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *val = *reg; return (0); } VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); return (EINVAL); } static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_modify_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *reg = val; return (0); } + if (ident == VM_REG_GUEST_ENTRY_INST_LENGTH) { + /* Ignore. */ + return (0); + } + /* * XXX deal with CR3 and invalidate TLB entries tagged with the * vcpu's ASID. This needs to be treated differently depending on * whether 'running' is true/false.
*/ VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); return (EINVAL); } static int svm_setcap(void *arg, int vcpu, int type, int val) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT, val); break; case VM_CAP_PAUSE_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE, val); break; case VM_CAP_UNRESTRICTED_GUEST: /* Unrestricted guest execution cannot be disabled in SVM */ if (val == 0) error = EINVAL; break; default: error = ENOENT; break; } return (error); } static int svm_getcap(void *arg, int vcpu, int type, int *retval) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT); break; case VM_CAP_PAUSE_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE); break; case VM_CAP_UNRESTRICTED_GUEST: *retval = 1; /* unrestricted guest is always enabled */ break; default: error = ENOENT; break; } return (error); } static struct vlapic * svm_vlapic_init(void *arg, int vcpuid) { struct svm_softc *svm_sc; struct vlapic *vlapic; svm_sc = arg; vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = svm_sc->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; vlapic_init(vlapic); return (vlapic); } static void svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_SVM_VLAPIC); } struct vmm_ops vmm_ops_amd = { .init = svm_init, .cleanup = svm_cleanup, .resume = svm_restore, .vminit = svm_vminit, .vmrun = svm_vmrun, .vmcleanup = svm_vmcleanup, .vmgetreg = svm_getreg, .vmsetreg = svm_setreg, .vmgetdesc = vmcb_getdesc, .vmsetdesc = vmcb_setdesc, .vmgetcap = svm_getcap, .vmsetcap = svm_setcap, .vmspace_alloc = svm_npt_alloc, .vmspace_free = svm_npt_free, .vlapic_init = svm_vlapic_init, .vlapic_cleanup = svm_vlapic_cleanup, }; Index: head/sys/amd64/vmm/intel/vmcs.c =================================================================== --- head/sys/amd64/vmm/intel/vmcs.c (revision 355723) +++ head/sys/amd64/vmm/intel/vmcs.c (revision 355724) @@ -1,519 +1,521 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "vmm_host.h" #include "vmx_cpufunc.h" #include "vmcs.h" #include "ept.h" #include "vmx.h" #ifdef DDB #include #endif SYSCTL_DECL(_hw_vmm_vmx); static int no_flush_rsb; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, no_flush_rsb, CTLFLAG_RW, &no_flush_rsb, 0, "Do not flush RSB upon vmexit"); static uint64_t vmcs_fix_regval(uint32_t encoding, uint64_t val) { switch (encoding) { case VMCS_GUEST_CR0: val = vmx_fix_cr0(val); break; case VMCS_GUEST_CR4: val = vmx_fix_cr4(val); break; default: break; } return (val); } static uint32_t vmcs_field_encoding(int ident) { switch (ident) { case VM_REG_GUEST_CR0: return (VMCS_GUEST_CR0); case VM_REG_GUEST_CR3: return (VMCS_GUEST_CR3); case VM_REG_GUEST_CR4: return (VMCS_GUEST_CR4); case VM_REG_GUEST_DR7: return (VMCS_GUEST_DR7); case VM_REG_GUEST_RSP: return (VMCS_GUEST_RSP); case VM_REG_GUEST_RIP: return (VMCS_GUEST_RIP); case VM_REG_GUEST_RFLAGS: return (VMCS_GUEST_RFLAGS); case VM_REG_GUEST_ES: return (VMCS_GUEST_ES_SELECTOR); case VM_REG_GUEST_CS: return (VMCS_GUEST_CS_SELECTOR); case VM_REG_GUEST_SS: return (VMCS_GUEST_SS_SELECTOR); case VM_REG_GUEST_DS: return (VMCS_GUEST_DS_SELECTOR); case VM_REG_GUEST_FS: return (VMCS_GUEST_FS_SELECTOR); case VM_REG_GUEST_GS: return (VMCS_GUEST_GS_SELECTOR); case VM_REG_GUEST_TR: return (VMCS_GUEST_TR_SELECTOR); case VM_REG_GUEST_LDTR: return (VMCS_GUEST_LDTR_SELECTOR); case VM_REG_GUEST_EFER: return (VMCS_GUEST_IA32_EFER); case VM_REG_GUEST_PDPTE0: return (VMCS_GUEST_PDPTE0); case VM_REG_GUEST_PDPTE1: return (VMCS_GUEST_PDPTE1); case VM_REG_GUEST_PDPTE2: return (VMCS_GUEST_PDPTE2); case VM_REG_GUEST_PDPTE3: return (VMCS_GUEST_PDPTE3); + case VM_REG_GUEST_ENTRY_INST_LENGTH: + return (VMCS_ENTRY_INST_LENGTH); default: return (-1); } } static int vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) { switch (seg) { case VM_REG_GUEST_ES: *base = VMCS_GUEST_ES_BASE; *lim = VMCS_GUEST_ES_LIMIT; *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; break; case VM_REG_GUEST_CS: *base = VMCS_GUEST_CS_BASE; *lim = VMCS_GUEST_CS_LIMIT; *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; break; case VM_REG_GUEST_SS: *base = VMCS_GUEST_SS_BASE; *lim = VMCS_GUEST_SS_LIMIT; *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; break; case VM_REG_GUEST_DS: *base = VMCS_GUEST_DS_BASE; *lim = VMCS_GUEST_DS_LIMIT; *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; break; case VM_REG_GUEST_FS: *base = VMCS_GUEST_FS_BASE; *lim = VMCS_GUEST_FS_LIMIT; *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; break; case VM_REG_GUEST_GS: *base = VMCS_GUEST_GS_BASE; *lim = VMCS_GUEST_GS_LIMIT; *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; break; case VM_REG_GUEST_TR: *base = VMCS_GUEST_TR_BASE; *lim = VMCS_GUEST_TR_LIMIT; *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; break; case VM_REG_GUEST_LDTR: *base = VMCS_GUEST_LDTR_BASE; *lim = VMCS_GUEST_LDTR_LIMIT; *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; break; case VM_REG_GUEST_IDTR: *base = VMCS_GUEST_IDTR_BASE; *lim = 
VMCS_GUEST_IDTR_LIMIT; *acc = VMCS_INVALID_ENCODING; break; case VM_REG_GUEST_GDTR: *base = VMCS_GUEST_GDTR_BASE; *lim = VMCS_GUEST_GDTR_LIMIT; *acc = VMCS_INVALID_ENCODING; break; default: return (EINVAL); } return (0); } int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval) { int error; uint32_t encoding; /* * If we need to get at vmx-specific state in the VMCS we can bypass * the translation of 'ident' to 'encoding' by simply setting the * sign bit. As it so happens the upper 16 bits are reserved (i.e * set to 0) in the encodings for the VMCS so we are free to use the * sign bit. */ if (ident < 0) encoding = ident & 0x7fffffff; else encoding = vmcs_field_encoding(ident); if (encoding == (uint32_t)-1) return (EINVAL); if (!running) VMPTRLD(vmcs); error = vmread(encoding, retval); if (!running) VMCLEAR(vmcs); return (error); } int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val) { int error; uint32_t encoding; if (ident < 0) encoding = ident & 0x7fffffff; else encoding = vmcs_field_encoding(ident); if (encoding == (uint32_t)-1) return (EINVAL); val = vmcs_fix_regval(encoding, val); if (!running) VMPTRLD(vmcs); error = vmwrite(encoding, val); if (!running) VMCLEAR(vmcs); return (error); } int vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) { int error; uint32_t base, limit, access; error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); if (error != 0) panic("vmcs_setdesc: invalid segment register %d", seg); if (!running) VMPTRLD(vmcs); if ((error = vmwrite(base, desc->base)) != 0) goto done; if ((error = vmwrite(limit, desc->limit)) != 0) goto done; if (access != VMCS_INVALID_ENCODING) { if ((error = vmwrite(access, desc->access)) != 0) goto done; } done: if (!running) VMCLEAR(vmcs); return (error); } int vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) { int error; uint32_t base, limit, access; uint64_t u64; error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); if (error != 0) panic("vmcs_getdesc: invalid segment register %d", seg); if (!running) VMPTRLD(vmcs); if ((error = vmread(base, &u64)) != 0) goto done; desc->base = u64; if ((error = vmread(limit, &u64)) != 0) goto done; desc->limit = u64; if (access != VMCS_INVALID_ENCODING) { if ((error = vmread(access, &u64)) != 0) goto done; desc->access = u64; } done: if (!running) VMCLEAR(vmcs); return (error); } int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) { int error; VMPTRLD(vmcs); /* * Guest MSRs are saved in the VM-exit MSR-store area. * Guest MSRs are loaded from the VM-entry MSR-load area. * Both areas point to the same location in memory. */ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) goto done; if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) goto done; if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) goto done; if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) goto done; error = 0; done: VMCLEAR(vmcs); return (error); } int vmcs_init(struct vmcs *vmcs) { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; uint64_t pat, fsbase, idtrbase; codesel = vmm_get_host_codesel(); datasel = vmm_get_host_datasel(); tsssel = vmm_get_host_tsssel(); /* * Make sure we have a "current" VMCS to work with. 
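 *
 * Editor's note (illustrative only): every accessor in this file uses the
 * same bracketing when the vcpu is not running, e.g. vmcs_getreg() above:
 *     VMPTRLD(vmcs);                      make this VMCS current
 *     error = vmread(encoding, retval);   or vmwrite() for the setters
 *     VMCLEAR(vmcs);                      flush it back and un-current it
 * vmcs_init() follows the same pattern around the vmwrite() calls below.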
*/ VMPTRLD(vmcs); /* Host state */ /* Initialize host IA32_PAT MSR */ pat = vmm_get_host_pat(); if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) goto done; /* Load the IA32_EFER MSR */ efer = vmm_get_host_efer(); if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) goto done; /* Load the control registers */ cr0 = vmm_get_host_cr0(); if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) goto done; cr4 = vmm_get_host_cr4() | CR4_VMXE; if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) goto done; /* Load the segment selectors */ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) goto done; /* * Load the Base-Address for %fs and idtr. * * Note that we exclude %gs, tss and gdtr here because their base * address is pcpu specific. */ fsbase = vmm_get_host_fsbase(); if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) goto done; idtrbase = vmm_get_host_idtrbase(); if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) goto done; /* instruction pointer */ if (no_flush_rsb) { if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest)) != 0) goto done; } else { if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest_flush_rsb)) != 0) goto done; } /* link pointer */ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) goto done; done: VMCLEAR(vmcs); return (error); } #ifdef DDB extern int vmxon_enabled[]; DB_SHOW_COMMAND(vmcs, db_show_vmcs) { uint64_t cur_vmcs, val; uint32_t exit; if (!vmxon_enabled[curcpu]) { db_printf("VMX not enabled\n"); return; } if (have_addr) { db_printf("Only current VMCS supported\n"); return; } vmptrst(&cur_vmcs); if (cur_vmcs == VMCS_INITIAL) { db_printf("No current VM context\n"); return; } db_printf("VMCS: %jx\n", cur_vmcs); db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID)); db_printf("Activity: "); val = vmcs_read(VMCS_GUEST_ACTIVITY); switch (val) { case 0: db_printf("Active"); break; case 1: db_printf("HLT"); break; case 2: db_printf("Shutdown"); break; case 3: db_printf("Wait for SIPI"); break; default: db_printf("Unknown: %#lx", val); } db_printf("\n"); exit = vmcs_read(VMCS_EXIT_REASON); if (exit & 0x80000000) db_printf("Entry Failure Reason: %u\n", exit & 0xffff); else db_printf("Exit Reason: %u\n", exit & 0xffff); db_printf("Qualification: %#lx\n", vmcs_exit_qualification()); db_printf("Guest Linear Address: %#lx\n", vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)); switch (exit & 0x8000ffff) { case EXIT_REASON_EXCEPTION: case EXIT_REASON_EXT_INTR: val = vmcs_read(VMCS_EXIT_INTR_INFO); db_printf("Interrupt Type: "); switch (val >> 8 & 0x7) { case 0: db_printf("external"); break; case 2: db_printf("NMI"); break; case 3: db_printf("HW exception"); break; case 4: db_printf("SW exception"); break; default: db_printf("?? 
%lu", val >> 8 & 0x7); break; } db_printf(" Vector: %lu", val & 0xff); if (val & 0x800) db_printf(" Error Code: %lx", vmcs_read(VMCS_EXIT_INTR_ERRCODE)); db_printf("\n"); break; case EXIT_REASON_EPT_FAULT: case EXIT_REASON_EPT_MISCONFIG: db_printf("Guest Physical Address: %#lx\n", vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)); break; } db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error()); } #endif Index: head/sys/amd64/vmm/intel/vmx.c =================================================================== --- head/sys/amd64/vmm/intel/vmx.c (revision 355723) +++ head/sys/amd64/vmm/intel/vmx.c (revision 355724) @@ -1,3809 +1,3836 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_host.h" #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "ept.h" #include "vmx_cpufunc.h" #include "vmx.h" #include "vmx_msr.h" #include "x86.h" #include "vmx_controls.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ PINBASED_NMI_EXITING | \ PINBASED_VIRTUAL_NMI) #define PINBASED_CTLS_ZERO_SETTING 0 #define PROCBASED_CTLS_WINDOW_SETTING \ (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ PROCBASED_MWAIT_EXITING | \ PROCBASED_MONITOR_EXITING | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ PROCBASED_IO_BITMAPS) #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ VM_EXIT_ACKNOWLEDGE_INTERRUPT) #define VM_EXIT_CTLS_ZERO_SETTING 0 #define VM_ENTRY_CTLS_ONE_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define HANDLED 1 #define UNHANDLED 0 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; static uint64_t cr0_ones_mask, cr0_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, &cr0_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, &cr0_zeros_mask, 0, NULL); static uint64_t cr4_ones_mask, cr4_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, &cr4_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, &cr4_zeros_mask, 0, NULL); static int vmx_initialized; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, &vmx_initialized, 0, "Intel VMX initialized"); /* * Optional capabilities */ static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); static int cap_halt_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, "HLT triggers a VM-exit"); static int cap_pause_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 0, "PAUSE triggers a VM-exit"); static int cap_unrestricted_guest; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, &cap_unrestricted_guest, 0, "Unrestricted guests"); static int cap_monitor_trap; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, &cap_monitor_trap, 0, "Monitor trap flag"); static int cap_invpcid; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 0, "Guests are allowed to use INVPCID"); static int virtual_interrupt_delivery; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, 
virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); static int pirvec = -1; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); static struct unrhdr *vpid_unr; static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); int guest_l1d_flush; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, &guest_l1d_flush, 0, NULL); int guest_l1d_flush_sw; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, &guest_l1d_flush_sw, 0, NULL); static struct msr_entry msr_load_list[1] __aligned(16); /* * The definitions of SDT probes for VMX. */ SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, return, "struct vmx *", "int", "struct vm_exit *", "int"); /* * Use the last page below 4GB as the APIC access address. This address is * occupied by the boot firmware so it is guaranteed that it will not conflict * with a page in system memory. 
*/ #define APIC_ACCESS_ADDRESS 0xFFFFF000 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); #ifdef KTR static const char * exit_reason_to_str(int reason) { static char reasonbuf[32]; switch (reason) { case EXIT_REASON_EXCEPTION: return "exception"; case EXIT_REASON_EXT_INTR: return "extint"; case EXIT_REASON_TRIPLE_FAULT: return "triplefault"; case EXIT_REASON_INIT: return "init"; case EXIT_REASON_SIPI: return "sipi"; case EXIT_REASON_IO_SMI: return "iosmi"; case EXIT_REASON_SMI: return "smi"; case EXIT_REASON_INTR_WINDOW: return "intrwindow"; case EXIT_REASON_NMI_WINDOW: return "nmiwindow"; case EXIT_REASON_TASK_SWITCH: return "taskswitch"; case EXIT_REASON_CPUID: return "cpuid"; case EXIT_REASON_GETSEC: return "getsec"; case EXIT_REASON_HLT: return "hlt"; case EXIT_REASON_INVD: return "invd"; case EXIT_REASON_INVLPG: return "invlpg"; case EXIT_REASON_RDPMC: return "rdpmc"; case EXIT_REASON_RDTSC: return "rdtsc"; case EXIT_REASON_RSM: return "rsm"; case EXIT_REASON_VMCALL: return "vmcall"; case EXIT_REASON_VMCLEAR: return "vmclear"; case EXIT_REASON_VMLAUNCH: return "vmlaunch"; case EXIT_REASON_VMPTRLD: return "vmptrld"; case EXIT_REASON_VMPTRST: return "vmptrst"; case EXIT_REASON_VMREAD: return "vmread"; case EXIT_REASON_VMRESUME: return "vmresume"; case EXIT_REASON_VMWRITE: return "vmwrite"; case EXIT_REASON_VMXOFF: return "vmxoff"; case EXIT_REASON_VMXON: return "vmxon"; case EXIT_REASON_CR_ACCESS: return "craccess"; case EXIT_REASON_DR_ACCESS: return "draccess"; case EXIT_REASON_INOUT: return "inout"; case EXIT_REASON_RDMSR: return "rdmsr"; case EXIT_REASON_WRMSR: return "wrmsr"; case EXIT_REASON_INVAL_VMCS: return "invalvmcs"; case EXIT_REASON_INVAL_MSR: return "invalmsr"; case EXIT_REASON_MWAIT: return "mwait"; case EXIT_REASON_MTF: return "mtf"; case EXIT_REASON_MONITOR: return "monitor"; case EXIT_REASON_PAUSE: return "pause"; case EXIT_REASON_MCE_DURING_ENTRY: return "mce-during-entry"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: return "apic-access"; case EXIT_REASON_GDTR_IDTR: return "gdtridtr"; case EXIT_REASON_LDTR_TR: return "ldtrtr"; case EXIT_REASON_EPT_FAULT: return "eptfault"; case EXIT_REASON_EPT_MISCONFIG: return "eptmisconfig"; case EXIT_REASON_INVEPT: return "invept"; case EXIT_REASON_RDTSCP: return "rdtscp"; case EXIT_REASON_VMX_PREEMPT: return "vmxpreempt"; case EXIT_REASON_INVVPID: return "invvpid"; case EXIT_REASON_WBINVD: return "wbinvd"; case EXIT_REASON_XSETBV: return "xsetbv"; case EXIT_REASON_APIC_WRITE: return "apic-write"; default: snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); return (reasonbuf); } } #endif /* KTR */ static int vmx_allow_x2apic_msrs(struct vmx *vmx) { int i, error; error = 0; /* * Allow readonly access to the following x2APIC MSRs from the guest. 
*/ error += guest_msr_ro(vmx, MSR_APIC_ID); error += guest_msr_ro(vmx, MSR_APIC_VERSION); error += guest_msr_ro(vmx, MSR_APIC_LDR); error += guest_msr_ro(vmx, MSR_APIC_SVR); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); error += guest_msr_ro(vmx, MSR_APIC_ESR); error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_ICR); /* * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. * * These registers get special treatment described in the section * "Virtualizing MSR-Based APIC Accesses". */ error += guest_msr_rw(vmx, MSR_APIC_TPR); error += guest_msr_rw(vmx, MSR_APIC_EOI); error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); return (error); } u_long vmx_fix_cr0(u_long cr0) { return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); } u_long vmx_fix_cr4(u_long cr4) { return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); } static void vpid_free(int vpid) { if (vpid < 0 || vpid > 0xffff) panic("vpid_free: invalid vpid %d", vpid); /* * VPIDs [0,VM_MAXCPU] are special and are not allocated from * the unit number allocator. */ if (vpid > VM_MAXCPU) free_unr(vpid_unr, vpid); } static void vpid_alloc(uint16_t *vpid, int num) { int i, x; if (num <= 0 || num > VM_MAXCPU) panic("invalid number of vpids requested: %d", num); /* * If the "enable vpid" execution control is not enabled then the * VPID is required to be 0 for all vcpus. */ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { for (i = 0; i < num; i++) vpid[i] = 0; return; } /* * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { x = alloc_unr(vpid_unr); if (x == -1) break; else vpid[i] = x; } if (i < num) { atomic_add_int(&vpid_alloc_failed, 1); /* * If the unit number allocator does not have enough unique * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. * * These VPIDs are not be unique across VMs but this does not * affect correctness because the combined mappings are also * tagged with the EP4TA which is unique for each VM. * * It is still sub-optimal because the invvpid will invalidate * combined mappings for a particular VPID across all EP4TAs. */ while (i-- > 0) vpid_free(vpid[i]); for (i = 0; i < num; i++) vpid[i] = i + 1; } } static void vpid_init(void) { /* * VPID 0 is required when the "enable VPID" execution control is * disabled. * * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the * unit number allocator does not have sufficient unique VPIDs to * satisfy the allocation. * * The remaining VPIDs are managed by the unit number allocator. */ vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } static void vmx_disable(void *arg __unused) { struct invvpid_desc invvpid_desc = { 0 }; struct invept_desc invept_desc = { 0 }; if (vmxon_enabled[curcpu]) { /* * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. * * VMXON or VMXOFF are not required to invalidate any TLB * caching structures. This prevents potential retention of * cached information in the TLB between distinct VMX episodes. 
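 *
 * Editor's note (illustrative): teardown uses the coarse all-contexts
 * flavors with zeroed descriptors,
 *     invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
 *     invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
 * rather than the single-context forms used at runtime, so nothing tagged
 * with any VPID or EP4TA survives past the vmxoff() that follows.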
*/ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); vmxoff(); } load_cr4(rcr4() & ~CR4_VMXE); } static int vmx_cleanup(void) { if (pirvec >= 0) lapic_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } if (nmi_flush_l1d_sw == 1) nmi_flush_l1d_sw = 0; smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); } static void vmx_enable(void *arg __unused) { int error; uint64_t feature_control; feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | IA32_FEATURE_CONTROL_VMX_EN | IA32_FEATURE_CONTROL_LOCK); } load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); error = vmxon(vmxon_region[curcpu]); if (error == 0) vmxon_enabled[curcpu] = 1; } static void vmx_restore(void) { if (vmxon_enabled[curcpu]) vmxon(vmxon_region[curcpu]); } static int vmx_init(int ipinum) { int error, use_tpr_shadow; uint64_t basic, fixed0, fixed1, feature_control; uint32_t tmp, procbased2_vid_bits; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits * are set (bits 0 and 2 respectively). */ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { printf("vmx_init: VMX operation disabled by BIOS\n"); return (ENXIO); } /* * Verify capabilities MSR_VMX_BASIC: * - bit 54 indicates support for INS/OUTS decoding */ basic = rdmsr(MSR_VMX_BASIC); if ((basic & (1UL << 54)) == 0) { printf("vmx_init: processor does not support desired basic " "capabilities\n"); return (EINVAL); } /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); if (error) { printf("vmx_init: processor does not support desired primary " "processor-based controls\n"); return (error); } /* Clear the processor-based ctl bits that are set on demand */ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; /* Check support for secondary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); if (error) { printf("vmx_init: processor does not support desired secondary " "processor-based controls\n"); return (error); } /* Check support for VPID */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_VPID, 0, &tmp); if (error == 0) procbased_ctls2 |= PROCBASED2_ENABLE_VPID; /* Check support for pin-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); if (error) { printf("vmx_init: processor does not support desired " "pin-based controls\n"); return (error); } /* Check support for VM-exit controls */ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &exit_ctls); if (error) { printf("vmx_init: processor does not support desired " "exit controls\n"); return (error); } /* Check support for 
VM-entry controls */ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &entry_ctls); if (error) { printf("vmx_init: processor does not support desired " "entry controls\n"); return (error); } /* * Check support for optional features by testing them * as individual bits */ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_HLT_EXITING, 0, &tmp) == 0); cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_PROCBASED_CTLS, PROCBASED_MTF, 0, &tmp) == 0); cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_PAUSE_EXITING, 0, &tmp) == 0); cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, &tmp) == 0); /* * Check support for virtual interrupt delivery. */ procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | PROCBASED2_VIRTUALIZE_X2APIC_MODE | PROCBASED2_APIC_REGISTER_VIRTUALIZATION | PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, &tmp) == 0); error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, procbased2_vid_bits, 0, &tmp); if (error == 0 && use_tpr_shadow) { virtual_interrupt_delivery = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", &virtual_interrupt_delivery); } if (virtual_interrupt_delivery) { procbased_ctls |= PROCBASED_USE_TPR_SHADOW; procbased_ctls2 |= procbased2_vid_bits; procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; /* * No need to emulate accesses to %CR8 if virtual * interrupt delivery is enabled. */ procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; /* * Check for Posted Interrupts only if Virtual Interrupt * Delivery is enabled. */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, &tmp); if (error == 0) { pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (pirvec < 0) { if (bootverbose) { printf("vmx_init: unable to allocate " "posted interrupt vector\n"); } } else { posted_interrupts = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", &posted_interrupts); } } } if (posted_interrupts) pinbased_ctls |= PINBASED_POSTED_INTERRUPT; /* Initialize EPT */ error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); /* * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when * available. Otherwise fall back to the software flush * method which loads enough data from the kernel text to * flush existing L1D content, both on VMX entry and on NMI * return. 
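 *
 * Editor's sketch (not part of this change): the hardware-assisted path
 * programs a one-entry VM-entry MSR-load list so that every VM entry
 * writes IA32_FLUSH_CMD.L1D on our behalf, roughly
 *     msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
 *     msr_load_list[0].val   = IA32_FLUSH_CMD_L1D;
 * while the software fallback only sets nmi_flush_l1d_sw and relies on
 * the NMI return path to stream kernel text through the L1D.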
*/ if (guest_l1d_flush) { if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { guest_l1d_flush_sw = 1; TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", &guest_l1d_flush_sw); } if (guest_l1d_flush_sw) { if (nmi_flush_l1d_sw <= 1) nmi_flush_l1d_sw = 1; } else { msr_load_list[0].index = MSR_IA32_FLUSH_CMD; msr_load_list[0].val = IA32_FLUSH_CMD_L1D; } } /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); cr0_ones_mask = fixed0 & fixed1; cr0_zeros_mask = ~fixed0 & ~fixed1; /* * CR0_PE and CR0_PG can be set to zero in VMX non-root operation * if unrestricted guest execution is allowed. */ if (cap_unrestricted_guest) cr0_ones_mask &= ~(CR0_PG | CR0_PE); /* * Do not allow the guest to set CR0_NW or CR0_CD. */ cr0_zeros_mask |= (CR0_NW | CR0_CD); fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; vpid_init(); vmx_msr_init(); /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); vmx_initialized = 1; return (0); } static void vmx_trigger_hostintr(int vector) { uintptr_t func; struct gate_descriptor *gd; gd = &idt[vector]; KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " "invalid vector %d", vector)); KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", vector)); KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " "has invalid type %d", vector, gd->gd_type)); KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " "has invalid dpl %d", vector, gd->gd_dpl)); KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " "for vector %d has invalid selector %d", vector, gd->gd_selector)); KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " "IST %d", vector, gd->gd_ist)); func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); vmx_call_isr(func); } static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { int error, mask_ident, shadow_ident; uint64_t mask_value; if (which != 0 && which != 4) panic("vmx_setup_cr_shadow: unknown cr%d", which); if (which == 0) { mask_ident = VMCS_CR0_MASK; mask_value = cr0_ones_mask | cr0_zeros_mask; shadow_ident = VMCS_CR0_SHADOW; } else { mask_ident = VMCS_CR4_MASK; mask_value = cr4_ones_mask | cr4_zeros_mask; shadow_ident = VMCS_CR4_SHADOW; } error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); if (error) return (error); error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); if (error) return (error); return (0); } #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; int i, error; struct vmx *vmx; struct vmcs *vmcs; uint32_t exc_bitmap; uint16_t maxcpus; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { panic("malloc of struct vmx not aligned on %d byte boundary", PAGE_SIZE); } vmx->vm = vm; vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); /* * Clean up EPTP-tagged guest physical and combined mappings * * VMX transitions are not required to invalidate any guest physical * mappings. So, it may be possible for stale guest physical mappings * to be present in the processor TLBs. * * Combined mappings for this EP4TA are also invalidated for all VPIDs. 
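 *
 * Editor's note, a hedged sketch of what ept_invalidate_mappings() boils
 * down to on each host cpu (the descriptor field name is assumed here;
 * see ept.c and vmx_cpufunc.h for the real definition):
 *     struct invept_desc desc = { .eptp = vmx->eptp };
 *     invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
 * i.e. a single-context INVEPT keyed on this VM's EP4TA, so no stale
 * guest-physical or combined mappings survive from an earlier VM.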
*/ ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); /* * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. * The guest FSBASE and GSBASE are saved and restored during * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are * always restored from the vmcs host state area on vm-exit. * * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in * how they are saved/restored so can be directly accessed by the * guest. * * MSR_EFER is saved and restored in the guest VMCS area on a * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * * The TSC MSR is exposed read-only. Writes are disallowed as * that will impact the host TSC. If the guest does a write * the "use TSC offsetting" execution control is enabled and the * difference between the host TSC and the guest TSC is written * into the TSC offset in the VMCS. */ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); vpid_alloc(vpid, VM_MAXCPU); if (virtual_interrupt_delivery) { error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, APIC_ACCESS_ADDRESS); /* XXX this should really return an error to the caller */ KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); } maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", error, i); } vmx_msr_guest_init(vmx, i); error = vmcs_init(vmcs); KASSERT(error == 0, ("vmcs_init error %d", error)); VMPTRLD(vmcs); error = 0; error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); error += vmwrite(VMCS_EPTP, vmx->eptp); error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); error += vmwrite(VMCS_VPID, vpid[i]); if (guest_l1d_flush && !guest_l1d_flush_sw) { vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( (vm_offset_t)&msr_load_list[0])); vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, nitems(msr_load_list)); vmcs_write(VMCS_EXIT_MSR_STORE, 0); vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); } /* exception bitmap */ if (vcpu_trace_exceptions(vm, i)) exc_bitmap = 0xffffffff; else exc_bitmap = 1 << IDT_MC; error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); if (virtual_interrupt_delivery) { error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(&vmx->apic_page[i])); error += vmwrite(VMCS_EOI_EXIT0, 0); error += vmwrite(VMCS_EOI_EXIT1, 0); error += vmwrite(VMCS_EOI_EXIT2, 0); error += vmwrite(VMCS_EOI_EXIT3, 0); } if (posted_interrupts) { error += vmwrite(VMCS_PIR_VECTOR, pirvec); error += vmwrite(VMCS_PIR_DESC, vtophys(&vmx->pir_desc[i])); } VMCLEAR(vmcs); KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; vmx->cap[i].proc_ctls2 = procbased_ctls2; + vmx->cap[i].exc_bitmap = exc_bitmap; vmx->state[i].nextrip = ~0; vmx->state[i].lastcpu = 
NOCPU; vmx->state[i].vpid = vpid[i]; /* * Set up the CR0/4 shadows, and init the read shadow * to the power-on register value from the Intel Sys Arch. * CR0 - 0x60000010 * CR4 - 0 */ error = vmx_setup_cr0_shadow(vmcs, 0x60000010); if (error != 0) panic("vmx_setup_cr0_shadow %d", error); error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); vmx->ctx[i].pmap = pmap; } return (vmx); } static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { int handled, func; func = vmxctx->guest_rax; handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), (uint32_t*)(&vmxctx->guest_rdx)); return (handled); } static __inline void vmx_run_trace(struct vmx *vmx, int vcpu) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); #endif } static __inline void vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, int handled) { #ifdef KTR VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", handled ? "handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif } static __inline void vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); #endif } static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); /* * Invalidate guest mappings identified by its vpid from the TLB. */ static __inline void vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; if (vmxstate->vpid == 0) return; if (!running) { /* * Set the 'lastcpu' to an invalid host cpu. * * This will invalidate TLB entries tagged with the vcpu's * vpid the next time it runs via vmx_set_pcpu_defaults(). */ vmxstate->lastcpu = NOCPU; return; } KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " "critical section", __func__, vcpu)); /* * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated * mappings associated with its 'vpid' during that run. So we must * assume that the mappings associated with 'vpid' on 'curcpu' are * stale and invalidate them. * * Note that we incur this penalty only when the scheduler chooses to * move the thread associated with this vcpu between host cpus. * * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { invvpid_desc._res1 = 0; invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); } else { /* * The invvpid can be skipped if an invept is going to * be performed before entering the guest. The invept * will invalidate combined mappings tagged with * 'vmx->eptp' for all vpids. 
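 *
 * In outline (an illustrative restatement of the logic above, not new
 * behavior): an equal generation count means no invept is queued for
 * this cpu, so the vpid-tagged entries must be flushed here with a
 * single-context invvpid; an unequal count means pmap_invalidate_ept()
 * has already scheduled an invept that supersedes the invvpid.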
*/ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } static void vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { struct vmxstate *vmxstate; vmxstate = &vmx->state[vcpu]; if (vmxstate->lastcpu == curcpu) return; vmxstate->lastcpu = curcpu; vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); vmx_invvpid(vmx, vcpu, pmap, 1); } /* * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. */ CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); static void __inline vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); } } static void __inline vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } static void __inline vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); } } static void __inline vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) { int error; if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); } error = vmwrite(VMCS_TSC_OFFSET, offset); return (error); } #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) static void vmx_inject_nmi(struct vmx *vmx, int vcpu) { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); /* * Inject the virtual NMI. The vector must be the NMI IDT entry * or the VMCS entry check will fail. 
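 *
 * For reference only (field layout per the Intel SDM, restated here as
 * an aid rather than added behavior), the VM-entry interruption
 * information word is laid out as:
 *
 *   bits  7:0   vector             (IDT_NMI == 2 for an NMI)
 *   bits 10:8   interruption type  (VMCS_INTR_T_NMI)
 *   bit  11     deliver error code
 *   bit  31     valid              (VMCS_INTR_VALID)
 *
 * so the value written below is simply vector | type | valid.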
*/ info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; vmcs_write(VMCS_ENTRY_INTR_INFO, info); VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); } static void vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, uint64_t guestrip) { int vector, need_nmi_exiting, extint_pending; uint64_t rflags, entryinfo; uint32_t gi, info; if (vmx->state[vcpu].nextrip != guestrip) { gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vmx->state[vcpu].nextrip, guestrip); gi &= ~HWINTR_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); info = entryinfo; vector = info & 0xff; if (vector == IDT_BP || vector == IDT_OF) { /* * VT-x requires #BP and #OF to be injected as software * exceptions. */ info &= ~VMCS_INTR_T_MASK; info |= VMCS_INTR_T_SWEXCEPTION; } if (info & VMCS_INTR_DEL_ERRCODE) vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); vmcs_write(VMCS_ENTRY_INTR_INFO, info); } if (vm_nmi_pending(vmx->vm, vcpu)) { /* * If there are no conditions blocking NMI injection then * inject it directly here otherwise enable "NMI window * exiting" to inject it as soon as we can. * * We also check for STI_BLOCKING because some implementations * don't allow NMI injection in this case. If we are running * on a processor that doesn't have this restriction it will * immediately exit and the NMI will be injected in the * "NMI window exiting" handler. */ need_nmi_exiting = 1; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { info = vmcs_read(VMCS_ENTRY_INTR_INFO); if ((info & VMCS_INTR_VALID) == 0) { vmx_inject_nmi(vmx, vcpu); need_nmi_exiting = 0; } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " "due to VM-entry intr info %#x", info); } } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " "Guest Interruptibility-state %#x", gi); } if (need_nmi_exiting) vmx_set_nmi_window_exiting(vmx, vcpu); } extint_pending = vm_extint_pending(vmx->vm, vcpu); if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } /* * If interrupt-window exiting is already in effect then don't bother * checking for pending interrupts. This is just an optimization and * not needed for correctness. */ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " "pending int_window_exiting"); return; } if (!extint_pending) { /* Ask the local apic for a vector to inject */ if (!vlapic_pending_intr(vlapic, &vector)) return; /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [16,255] can be delivered * through the local APIC. */ KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(vmx->vm, &vector); /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [0,255] can be delivered * through the INTR pin. 
*/ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* Check RFLAGS.IF and the interruptibility state of the guest */ rflags = vmcs_read(VMCS_GUEST_RFLAGS); if ((rflags & PSL_I) == 0) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, rflags); goto cantinject; } gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "Guest Interruptibility-state %#x", vector, gi); goto cantinject; } info = vmcs_read(VMCS_ENTRY_INTR_INFO); if (info & VMCS_INTR_VALID) { /* * This is expected and could happen for multiple reasons: * - A vectoring VM-entry was aborted due to astpending * - A VM-exit happened during event injection. * - An exception was injected above. * - An NMI was injected above or after "NMI window exiting" */ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "VM-entry intr info %#x", vector, info); goto cantinject; } /* Inject the interrupt */ info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; info |= vector; vmcs_write(VMCS_ENTRY_INTR_INFO, info); if (!extint_pending) { /* Update the Local APIC ISR */ vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(vmx->vm, vcpu); vatpic_intr_accepted(vmx->vm, vector); /* * After we accepted the current ExtINT the PIC may * have posted another one. If that is the case, set * the Interrupt Window Exiting execution control so * we can inject that one too. * * Also, interrupt window exiting allows us to inject any * pending APIC vector that was preempted by the ExtINT * as soon as possible. This applies both for the software * emulated vlapic and the hardware assisted virtual APIC. */ vmx_set_int_window_exiting(vmx, vcpu); } VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); return; cantinject: /* * Set the Interrupt Window Exiting execution control so we can inject * the interrupt as soon as blocking condition goes away. */ vmx_set_int_window_exiting(vmx, vcpu); } /* * If the Virtual NMIs execution control is '1' then the logical processor * tracks virtual-NMI blocking in the Guest Interruptibility-state field of * the VMCS. An IRET instruction in VMX non-root operation will remove any * virtual-NMI blocking. * * This unblocking occurs even if the IRET causes a fault. In this case the * hypervisor needs to restore virtual-NMI blocking before resuming the guest. */ static void vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, ("NMI blocking is not in effect %#x", gi)); } static int vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { struct vmxctx *vmxctx; uint64_t xcrval; const struct xsave_limits *limits; vmxctx = &vmx->ctx[vcpu]; limits = vmm_get_xsave_limits(); /* * Note that the processor raises a GP# fault on its own if * xsetbv is executed for CPL != 0, so we do not have to * emulate that fault here. 
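 *
 * Worked example (illustration only, using the architectural XCR0 bit
 * assignments: x87 = bit 0, SSE = bit 1, AVX = bit 2): a guest that
 * executes xsetbv with %ecx = 0 and %edx:%eax = 0x7 requests
 * x87 | SSE | AVX, which passes the checks below as long as 0x7 is
 * within xcr0_allowed.  A value of 0x5 (AVX without SSE) or 0x6
 * (missing the mandatory x87 bit) is rejected with an injected #GP.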
*/ /* Only xcr0 is supported. */ if (vmxctx->guest_rcx != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { vm_inject_ud(vmx->vm, vcpu); return (HANDLED); } xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); if ((xcrval & ~limits->xcr0_allowed) != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } if (!(xcrval & XFEATURE_ENABLED_X87)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* AVX (YMM_Hi128) requires SSE. */ if (xcrval & XFEATURE_ENABLED_AVX && (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, * ZMM_Hi256, and Hi16_ZMM. */ if (xcrval & XFEATURE_AVX512 && (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != (XFEATURE_AVX512 | XFEATURE_AVX)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * Intel MPX requires both bound register state flags to be * set. */ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * This runs "inside" vmrun() with the guest's FPU state, so * modifying xcr0 directly modifies the guest's xcr0, not the * host's. */ load_xcr(0, xcrval); return (HANDLED); } static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { const struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: return (vmxctx->guest_rax); case 1: return (vmxctx->guest_rcx); case 2: return (vmxctx->guest_rdx); case 3: return (vmxctx->guest_rbx); case 4: return (vmcs_read(VMCS_GUEST_RSP)); case 5: return (vmxctx->guest_rbp); case 6: return (vmxctx->guest_rsi); case 7: return (vmxctx->guest_rdi); case 8: return (vmxctx->guest_r8); case 9: return (vmxctx->guest_r9); case 10: return (vmxctx->guest_r10); case 11: return (vmxctx->guest_r11); case 12: return (vmxctx->guest_r12); case 13: return (vmxctx->guest_r13); case 14: return (vmxctx->guest_r14); case 15: return (vmxctx->guest_r15); default: panic("invalid vmx register %d", ident); } } static void vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) { struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: vmxctx->guest_rax = regval; break; case 1: vmxctx->guest_rcx = regval; break; case 2: vmxctx->guest_rdx = regval; break; case 3: vmxctx->guest_rbx = regval; break; case 4: vmcs_write(VMCS_GUEST_RSP, regval); break; case 5: vmxctx->guest_rbp = regval; break; case 6: vmxctx->guest_rsi = regval; break; case 7: vmxctx->guest_rdi = regval; break; case 8: vmxctx->guest_r8 = regval; break; case 9: vmxctx->guest_r9 = regval; break; case 10: vmxctx->guest_r10 = regval; break; case 11: vmxctx->guest_r11 = regval; break; case 12: vmxctx->guest_r12 = regval; break; case 13: vmxctx->guest_r13 = regval; break; case 14: vmxctx->guest_r14 = regval; break; case 15: vmxctx->guest_r15 = regval; break; default: panic("invalid vmx register %d", ident); } } static int vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr0 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR0_SHADOW, regval); crval = regval | cr0_ones_mask; crval &= ~cr0_zeros_mask; vmcs_write(VMCS_GUEST_CR0, crval); if (regval & CR0_PG) { uint64_t efer, entry_ctls; /* * If CR0.PG is 1 and EFER.LME is 1 
then EFER.LMA and * the "IA-32e mode guest" bit in VM-entry control must be * equal. */ efer = vmcs_read(VMCS_GUEST_IA32_EFER); if (efer & EFER_LME) { efer |= EFER_LMA; vmcs_write(VMCS_GUEST_IA32_EFER, efer); entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); entry_ctls |= VM_ENTRY_GUEST_LMA; vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); } } return (HANDLED); } static int vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr4 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR4_SHADOW, regval); crval = regval | cr4_ones_mask; crval &= ~cr4_zeros_mask; vmcs_write(VMCS_GUEST_CR4, crval); return (HANDLED); } static int vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { struct vlapic *vlapic; uint64_t cr8; int regnum; /* We only handle mov %cr8 to/from a register at this time. */ if ((exitqual & 0xe0) != 0x00) { return (UNHANDLED); } vlapic = vm_lapic(vmx->vm, vcpu); regnum = (exitqual >> 8) & 0xf; if (exitqual & 0x10) { cr8 = vlapic_get_cr8(vlapic); vmx_set_guest_reg(vmx, vcpu, regnum, cr8); } else { cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); vlapic_set_cr8(vlapic, cr8); } return (HANDLED); } /* * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL */ static int vmx_cpl(void) { uint32_t ssar; ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); return ((ssar >> 5) & 0x3); } static enum vm_cpu_mode vmx_cpu_mode(void) { uint32_t csar; if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); if (csar & 0x2000) return (CPU_MODE_64BIT); /* CS.L = 1 */ else return (CPU_MODE_COMPATIBILITY); } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode vmx_paging_mode(void) { if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) return (PAGING_MODE_FLAT); if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) return (PAGING_MODE_32); if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } static uint64_t inout_str_index(struct vmx *vmx, int vcpuid, int in) { uint64_t val; int error; enum vm_reg_name reg; reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; error = vmx_getreg(vmx, vcpuid, reg, &val); KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); return (val); } static uint64_t inout_str_count(struct vmx *vmx, int vcpuid, int rep) { uint64_t val; int error; if (rep) { error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); } else { val = 1; } return (val); } static int inout_str_addrsize(uint32_t inst_info) { uint32_t size; size = (inst_info >> 7) & 0x7; switch (size) { case 0: return (2); /* 16 bit */ case 1: return (4); /* 32 bit */ case 2: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { s = (inst_info >> 15) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); } static void vmx_paging_info(struct vm_guest_paging *paging) { paging->cr3 = vmcs_guest_cr3(); paging->cpl = vmx_cpl(); paging->cpu_mode = vmx_cpu_mode(); paging->paging_mode = vmx_paging_mode(); } static void vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) { struct vm_guest_paging *paging; uint32_t csar; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->inst_length = 0; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); switch (paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } vie_init(&vmexit->u.inst_emul.vie, NULL, 0); } static int ept_fault_type(uint64_t ept_qual) { int fault_type; if (ept_qual & EPT_VIOLATION_DATA_WRITE) fault_type = VM_PROT_WRITE; else if (ept_qual & EPT_VIOLATION_INST_FETCH) fault_type = VM_PROT_EXECUTE; else fault_type= VM_PROT_READ; return (fault_type); } static bool ept_emulation_fault(uint64_t ept_qual) { int read, write; /* EPT fault on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) return (false); /* EPT fault must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; if ((read | write) == 0) return (false); /* * The EPT violation must have been caused by accessing a * guest-physical address that is a translation of a guest-linear * address. */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { return (false); } return (true); } static __inline int apic_access_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); } static __inline int x2apic_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 
1 : 0); } static int vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, uint64_t qual) { int error, handled, offset; uint32_t *apic_regs, vector; bool retu; handled = HANDLED; offset = APIC_WRITE_OFFSET(qual); if (!apic_access_virtualization(vmx, vcpuid)) { /* * In general there should not be any APIC write VM-exits * unless APIC-access virtualization is enabled. * * However self-IPI virtualization can legitimately trigger * an APIC-write VM-exit so treat it specially. */ if (x2apic_virtualization(vmx, vcpuid) && offset == APIC_OFFSET_SELF_IPI) { apic_regs = (uint32_t *)(vlapic->apic_page); vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; vlapic_self_ipi_handler(vlapic, vector); return (HANDLED); } else return (UNHANDLED); } switch (offset) { case APIC_OFFSET_ID: vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_LDR: vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: retu = false; error = vlapic_icrlo_write_handler(vlapic, &retu); if (error != 0 || retu) handled = UNHANDLED; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: vlapic_dcr_write_handler(vlapic); break; default: handled = UNHANDLED; break; } return (handled); } static bool apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) { if (apic_access_virtualization(vmx, vcpuid) && (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) return (true); else return (false); } static int vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint64_t qual; int access_type, offset, allowed; if (!apic_access_virtualization(vmx, vcpuid)) return (UNHANDLED); qual = vmexit->u.vmx.exit_qualification; access_type = APIC_ACCESS_TYPE(qual); offset = APIC_ACCESS_OFFSET(qual); allowed = 0; if (access_type == 0) { /* * Read data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } else if (access_type == 1) { /* * Write data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } if (allowed) { vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, VIE_INVALID_GLA); } /* * Regardless of whether the APIC-access is allowed this handler * always returns UNHANDLED: * - if the access is allowed then it is handled by emulating the * instruction that caused the VM-exit (outside the critical section) * - if the access is not allowed then it will be converted to an * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
*/ return (UNHANDLED); } static enum task_switch_reason vmx_task_switch_reason(uint64_t qual) { int reason; reason = (qual >> 30) & 0x3; switch (reason) { case 0: return (TSR_CALL); case 1: return (TSR_IRET); case 2: return (TSR_JMP); case 3: return (TSR_IDT_GATE); default: panic("%s: invalid reason %d", __func__, reason); } } static int emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); else error = vmx_wrmsr(vmx, vcpuid, num, val, retu); return (error); } static int emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) { struct vmxctx *vmxctx; uint64_t result; uint32_t eax, edx; int error; if (lapic_msr(num)) error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); else error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); if (error == 0) { eax = result; vmxctx = &vmx->ctx[vcpuid]; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); edx = result >> 32; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); } return (error); } static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; struct vlapic *vlapic; struct vm_inout_str *vis; struct vm_task_switch *ts; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; uint32_t intr_type, intr_vec, reason; uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; vmxctx = &vmx->ctx[vcpu]; qual = vmexit->u.vmx.exit_qualification; reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); /* * VM-entry failures during or after loading guest state. * * These VM-exits are uncommon but must be handled specially * as most VM-exit fields are not populated as usual. */ if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); __asm __volatile("int $18"); return (1); } /* * VM exits that can be triggered during event delivery need to * be handled specially by re-injecting the event if the IDT * vectoring information field's valid bit is set. * * See "Information for VM Exits During Event Delivery" in Intel SDM * for details. */ idtvec_info = vmcs_idt_vectoring_info(); if (idtvec_info & VMCS_IDT_VEC_VALID) { idtvec_info &= ~(1 << 12); /* clear undefined bit */ exitintinfo = idtvec_info; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { idtvec_err = vmcs_idt_vectoring_err(); exitintinfo |= (uint64_t)idtvec_err << 32; } error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); KASSERT(error == 0, ("%s: vm_set_intinfo error %d", __func__, error)); /* * If 'virtual NMIs' are being used and the VM-exit * happened while injecting an NMI during the previous * VM-entry, then clear "blocking by NMI" in the * Guest Interruptibility-State so the NMI can be * reinjected on the subsequent VM-entry. * * However, if the NMI was being delivered through a task * gate, then the new task must start execution with NMIs * blocked so don't clear NMI blocking in this case. 
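 *
 * In outline (an illustrative restatement, not new behavior), the test
 * below reduces to:
 *
 *   IDT-vectoring type == NMI, exit reason != TASK_SWITCH  -> clear blocking
 *   IDT-vectoring type == NMI, exit reason == TASK_SWITCH  -> keep blocking
 *
 * with the latter case asserted rather than silently assumed.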
*/ intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type == VMCS_INTR_T_NMI) { if (reason != EXIT_REASON_TASK_SWITCH) vmx_clear_nmi_blocking(vmx, vcpu); else vmx_assert_nmi_blocking(vmx, vcpu); } /* * Update VM-entry instruction length if the event being * delivered was a software interrupt or software exception. */ if (intr_type == VMCS_INTR_T_SWINTR || intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || intr_type == VMCS_INTR_T_SWEXCEPTION) { vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); } } switch (reason) { case EXIT_REASON_TASK_SWITCH: ts = &vmexit->u.task_switch; ts->tsssel = qual & 0xffff; ts->reason = vmx_task_switch_reason(qual); ts->ext = 0; ts->errcode_valid = 0; vmx_paging_info(&ts->paging); /* * If the task switch was due to a CALL, JMP, IRET, software * interrupt (INT n) or software exception (INT3, INTO), * then the saved %rip references the instruction that caused * the task switch. The instruction length field in the VMCS * is valid in this case. * * In all other cases (e.g., NMI, hardware exception) the * saved %rip is one that would have been saved in the old TSS * had the task switch completed normally so the instruction * length field is not needed in this case and is explicitly * set to 0. */ if (ts->reason == TSR_IDT_GATE) { KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, ("invalid idtvec_info %#x for IDT task switch", idtvec_info)); intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type != VMCS_INTR_T_SWINTR && intr_type != VMCS_INTR_T_SWEXCEPTION && intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { /* Task switch triggered by external event */ ts->ext = 1; vmexit->inst_length = 0; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { ts->errcode_valid = 1; ts->errcode = vmcs_idt_vectoring_err(); } } } vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " "%s errcode 0x%016lx", ts->reason, ts->tsssel, ts->ext ? 
"external" : "internal", ((uint64_t)ts->errcode << 32) | ts->errcode_valid); break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); break; case 4: handled = vmx_emulate_cr4_access(vmx, vcpu, qual); break; case 8: handled = vmx_emulate_cr8_access(vmx, vcpu, qual); break; } break; case EXIT_REASON_RDMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); error = emulate_rdmsr(vmx, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } break; case EXIT_REASON_WRMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); retu = false; eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); if (virtual_interrupt_delivery) vmexit->u.hlt.intr_status = vmcs_read(VMCS_GUEST_INTR_STATUS); else vmexit->u.hlt.intr_status = 0; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MTRAP; vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: /* * External interrupts serve only to cause VM exits and allow * the host interrupt handler to run. * * If this external interrupt triggers a virtual interrupt * to a VM, then that state will be recorded by the * host interrupt handler in the VM's softc. We will inject * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); SDT_PROBE4(vmm, vmx, exit, interrupt, vmx, vcpu, vmexit, intr_info); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. * This appears to be a bug in VMware Fusion? */ if (!(intr_info & VMCS_INTR_VALID)) return (1); KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); vmx_trigger_hostintr(intr_info & 0xff); /* * This is special. 
We want to treat this as an 'handled' * VM-exit but not increment the instruction pointer. */ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); vmx_clear_nmi_window_exiting(vmx, vcpu); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); return (1); case EXIT_REASON_INOUT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.bytes = (qual & 0x7) + 1; vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; vmexit->u.inout.port = (uint16_t)(qual >> 16); vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); if (vmexit->u.inout.string) { inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; vmx_paging_info(&vis->paging); vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); vis->cr0 = vmcs_read(VMCS_GUEST_CR0); vis->index = inout_str_index(vmx, vcpu, in); vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); intr_vec = intr_info & 0xff; intr_type = intr_info & VMCS_INTR_T_MASK; /* * If Virtual NMIs control is 1 and the VM-exit is due to a * fault encountered during the execution of IRET then we must * restore the state of "virtual-NMI blocking" before resuming * the guest. * * See "Resuming Guest Software after Handling an Exception". * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (intr_vec != IDT_DF) && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ if (intr_type == VMCS_INTR_T_NMI) return (1); /* * Call the machine check handler by hand. Also don't reflect * the machine check back into the guest. */ if (intr_vec == IDT_MC) { VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); return (1); } + /* + * If the hypervisor has requested user exits for + * debug exceptions, bounce them out to userland. + */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP && + (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) { + vmexit->exitcode = VM_EXITCODE_BPT; + vmexit->u.bpt.inst_length = vmexit->inst_length; + vmexit->inst_length = 0; + break; + } + if (intr_vec == IDT_PF) { error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", __func__, error)); } /* * Software exceptions exhibit trap-like behavior. This in * turn requires populating the VM-entry instruction length * so that the %rip in the trap frame is past the INT3/INTO * instruction. 
*/ if (intr_type == VMCS_INTR_T_SWEXCEPTION) vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); /* Reflect all other exceptions back into the guest */ errcode_valid = errcode = 0; if (intr_info & VMCS_INTR_DEL_ERRCODE) { errcode_valid = 1; errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); } VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " "the guest", intr_vec, errcode); SDT_PROBE5(vmm, vmx, exit, exception, vmx, vcpu, vmexit, intr_vec, errcode); error = vm_inject_exception(vmx->vm, vcpu, intr_vec, errcode_valid, errcode, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); return (1); case EXIT_REASON_EPT_FAULT: /* * If 'gpa' lies within the address space allocated to * memory then this must be a nested page fault otherwise * this must be an instruction that accesses MMIO space. */ gpa = vmcs_gpa(); if (vm_mem_allocated(vmx->vm, vcpu, gpa) || apic_access_fault(vmx, vcpu, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->inst_length = 0; vmexit->u.paging.gpa = gpa; vmexit->u.paging.fault_type = ept_fault_type(qual); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); SDT_PROBE5(vmm, vmx, exit, nestedfault, vmx, vcpu, vmexit, gpa, qual); } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); SDT_PROBE4(vmm, vmx, exit, mmiofault, vmx, vcpu, vmexit, gpa); } /* * If Virtual NMIs control is 1 and the VM-exit is due to an * EPT fault during the execution of IRET then we must restore * the state of "virtual-NMI blocking" before resuming. * * See description of "NMI unblocking due to IRET" in * "Exit Qualification for EPT Violations". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (qual & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); break; case EXIT_REASON_VIRTUALIZED_EOI: vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; vmexit->u.ioapic_eoi.vector = qual & 0xFF; SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); vmexit->inst_length = 0; /* trap-like */ break; case EXIT_REASON_APIC_ACCESS: SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); handled = vmx_handle_apic_access(vmx, vcpu, vmexit); break; case EXIT_REASON_APIC_WRITE: /* * APIC-write VM exit is trap-like so the %rip is already * pointing to the next instruction. */ vmexit->inst_length = 0; vlapic = vm_lapic(vmx->vm, vcpu); SDT_PROBE4(vmm, vmx, exit, apicwrite, vmx, vcpu, vmexit, vlapic); handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); break; case EXIT_REASON_XSETBV: SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); break; case EXIT_REASON_MONITOR: SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MONITOR; break; case EXIT_REASON_MWAIT: SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MWAIT; break; case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_VMINSN; break; default: SDT_PROBE4(vmm, vmx, exit, unknown, vmx, vcpu, vmexit, reason); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } if (handled) { /* * It is possible that control is returned to userland * even though we were able to handle the VM exit in the * kernel. 
* * In such a case we want to make sure that the userland * restarts guest execution at the instruction *after* * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. */ vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic VMX exit. */ vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = VM_SUCCESS; vmexit->u.vmx.inst_type = 0; vmexit->u.vmx.inst_error = 0; } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } SDT_PROBE4(vmm, vmx, exit, return, vmx, vcpu, vmexit, handled); return (handled); } static __inline void vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) { KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, ("vmx_exit_inst_error: invalid inst_fail_status %d", vmxctx->inst_fail_status)); vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = vmxctx->inst_fail_status; vmexit->u.vmx.inst_error = vmcs_instruction_error(); vmexit->u.vmx.exit_reason = ~0; vmexit->u.vmx.exit_qualification = ~0; switch (rc) { case VMX_VMRESUME_ERROR: case VMX_VMLAUNCH_ERROR: case VMX_INVEPT_ERROR: vmexit->u.vmx.inst_type = rc; break; default: panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); } } /* * If the NMI-exiting VM execution control is set to '1' then an NMI in * non-root operation causes a VM-exit. NMI blocking is in effect so it is * sufficient to simply vector to the NMI handler via a software interrupt. * However, this must be done before maskable interrupts are enabled * otherwise the "iret" issued by an interrupt handler will incorrectly * clear NMI blocking. */ static __inline void vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint32_t intr_info; KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) return; intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " "to NMI has invalid vector: %#x", intr_info)); VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); __asm __volatile("int $2"); } } static __inline void vmx_dr_enter_guest(struct vmxctx *vmxctx) { register_t rflags; /* Save host control debug registers. */ vmxctx->host_dr7 = rdr7(); vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); /* * Disable debugging in DR7 and DEBUGCTL to avoid triggering * exceptions in the host based on the guest DRx values. The * guest DR7 and DEBUGCTL are saved/restored in the VMCS. */ load_dr7(0); wrmsr(MSR_DEBUGCTLMSR, 0); /* * Disable single stepping the kernel to avoid corrupting the * guest DR6. A debugger might still be able to corrupt the * guest DR6 by setting a breakpoint after this point and then * single stepping. */ rflags = read_rflags(); vmxctx->host_tf = rflags & PSL_T; write_rflags(rflags & ~PSL_T); /* Save host debug registers. */ vmxctx->host_dr0 = rdr0(); vmxctx->host_dr1 = rdr1(); vmxctx->host_dr2 = rdr2(); vmxctx->host_dr3 = rdr3(); vmxctx->host_dr6 = rdr6(); /* Restore guest debug registers. 
*/ load_dr0(vmxctx->guest_dr0); load_dr1(vmxctx->guest_dr1); load_dr2(vmxctx->guest_dr2); load_dr3(vmxctx->guest_dr3); load_dr6(vmxctx->guest_dr6); } static __inline void vmx_dr_leave_guest(struct vmxctx *vmxctx) { /* Save guest debug registers. */ vmxctx->guest_dr0 = rdr0(); vmxctx->guest_dr1 = rdr1(); vmxctx->guest_dr2 = rdr2(); vmxctx->guest_dr3 = rdr3(); vmxctx->guest_dr6 = rdr6(); /* * Restore host debug registers. Restore DR7, DEBUGCTL, and * PSL_T last. */ load_dr0(vmxctx->host_dr0); load_dr1(vmxctx->host_dr1); load_dr2(vmxctx->host_dr2); load_dr3(vmxctx->host_dr3); load_dr6(vmxctx->host_dr6); wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); load_dr7(vmxctx->host_dr7); write_rflags(read_rflags() | vmxctx->host_tf); } static int vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; uint32_t exit_reason; struct region_descriptor gdtr, idtr; uint16_t ldt_sel; vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); vmexit = vm_exitinfo(vm, vcpu); launched = 0; KASSERT(vmxctx->pmap == pmap, ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); vmx_msr_guest_enter(vmx, vcpu); VMPTRLD(vmcs); /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); handled = UNHANDLED; /* * Interrupts are disabled from this point on until the * guest starts executing. This is done for the following * reasons: * * If an AST is asserted on this thread after the check below, * then the IPI_AST notification will not be lost, because it * will cause a VM exit due to external interrupt as soon as * the guest state is loaded. * * A posted interrupt after 'vmx_inject_interrupts()' will * not be "lost" because it will be held pending in the host * APIC because interrupts are disabled. The pending interrupt * will be recognized as soon as the guest state is loaded. * * The same reasoning applies to the IPI generated by * pmap_invalidate_ept(). */ disable_intr(); vmx_inject_interrupts(vmx, vcpu, vlapic, rip); /* * Check for vcpu suspension after injecting events because * vmx_inject_interrupts() can suspend the vcpu due to a * triple fault. */ if (vcpu_suspended(evinfo)) { enable_intr(); vm_exit_suspended(vmx->vm, vcpu, rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_intr(); vm_exit_rendezvous(vmx->vm, vcpu, rip); break; } if (vcpu_reqidle(evinfo)) { enable_intr(); vm_exit_reqidle(vmx->vm, vcpu, rip); break; } if (vcpu_should_yield(vm, vcpu)) { enable_intr(); vm_exit_astpending(vmx->vm, vcpu, rip); vmx_astpending_trace(vmx, vcpu, rip); handled = HANDLED; break; } if (vcpu_debugged(vm, vcpu)) { enable_intr(); vm_exit_debug(vmx->vm, vcpu, rip); break; } /* * VM exits restore the base address but not the * limits of GDTR and IDTR. The VMCS only stores the * base address, so VM exits set the limits to 0xffff. * Save and restore the full GDTR and IDTR to restore * the limits. 
* * The VMCS does not save the LDTR at all, and VM * exits clear LDTR as if a NULL selector were loaded. * The userspace hypervisor probably doesn't use a * LDT, but save and restore it to be safe. */ sgdt(&gdtr); sidt(&idtr); ldt_sel = sldt(); vmx_run_trace(vmx, vcpu); vmx_dr_enter_guest(vmxctx); rc = vmx_enter_guest(vmxctx, vmx, launched); vmx_dr_leave_guest(vmxctx); bare_lgdt(&gdtr); lidt(&idtr); lldt(ldt_sel); /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; if (rc == VMX_GUEST_VMEXIT) { vmx_exit_handle_nmi(vmx, vcpu, vmexit); enable_intr(); handled = vmx_exit_process(vmx, vcpu, vmexit); } else { enable_intr(); vmx_exit_inst_error(vmxctx, rc, vmexit); } launched = 1; vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); rip = vmexit->rip; } while (handled); /* * If a VM exit has been handled then the exitcode must be BOGUS * If a VM exit is not handled then the exitcode must not be BOGUS */ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { panic("Mismatch between handled (%d) and exitcode (%d)", handled, vmexit->exitcode); } if (!handled) vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); vmx_msr_guest_exit(vmx, vcpu); return (0); } static void vmx_vmcleanup(void *arg) { int i; struct vmx *vmx = arg; uint16_t maxcpus; if (apic_access_virtualization(vmx, 0)) vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); maxcpus = vm_get_maxcpus(vmx->vm); for (i = 0; i < maxcpus; i++) vpid_free(vmx->state[i].vpid); free(vmx, M_VMX); return; } static register_t * vmxctx_regptr(struct vmxctx *vmxctx, int reg) { switch (reg) { case VM_REG_GUEST_RAX: return (&vmxctx->guest_rax); case VM_REG_GUEST_RBX: return (&vmxctx->guest_rbx); case VM_REG_GUEST_RCX: return (&vmxctx->guest_rcx); case VM_REG_GUEST_RDX: return (&vmxctx->guest_rdx); case VM_REG_GUEST_RSI: return (&vmxctx->guest_rsi); case VM_REG_GUEST_RDI: return (&vmxctx->guest_rdi); case VM_REG_GUEST_RBP: return (&vmxctx->guest_rbp); case VM_REG_GUEST_R8: return (&vmxctx->guest_r8); case VM_REG_GUEST_R9: return (&vmxctx->guest_r9); case VM_REG_GUEST_R10: return (&vmxctx->guest_r10); case VM_REG_GUEST_R11: return (&vmxctx->guest_r11); case VM_REG_GUEST_R12: return (&vmxctx->guest_r12); case VM_REG_GUEST_R13: return (&vmxctx->guest_r13); case VM_REG_GUEST_R14: return (&vmxctx->guest_r14); case VM_REG_GUEST_R15: return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); case VM_REG_GUEST_DR0: return (&vmxctx->guest_dr0); case VM_REG_GUEST_DR1: return (&vmxctx->guest_dr1); case VM_REG_GUEST_DR2: return (&vmxctx->guest_dr2); case VM_REG_GUEST_DR3: return (&vmxctx->guest_dr3); case VM_REG_GUEST_DR6: return (&vmxctx->guest_dr6); default: break; } return (NULL); } static int vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *retval = *regp; return (0); } else return (EINVAL); } static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *regp = val; return (0); } else return (EINVAL); } static int vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, 
uint64_t *retval) { uint64_t gi; int error; error = vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; return (error); } static int vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) { struct vmcs *vmcs; uint64_t gi; int error, ident; /* * Forcing the vcpu into an interrupt shadow is not supported. */ if (val) { error = EINVAL; goto done; } vmcs = &vmx->vmcs[vcpu]; ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); error = vmcs_getreg(vmcs, running, ident, &gi); if (error == 0) { gi &= ~HWINTR_BLOCKING; error = vmcs_setreg(vmcs, running, ident, gi); } done: VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, error ? "failed" : "succeeded"); return (error); } static int vmx_shadow_reg(int reg) { int shreg; shreg = -1; switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; break; case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: break; } return (shreg); } static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) { int running, hostcpu; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); } static int vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); if (error == 0) { /* * If the "load EFER" VM-entry control is 1 then the * value of EFER.LMA must be identical to "IA-32e mode guest" * bit in the VM-entry control. */ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && (reg == VM_REG_GUEST_EFER)) { vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); if (val & EFER_LMA) ctls |= VM_ENTRY_GUEST_LMA; else ctls &= ~VM_ENTRY_GUEST_LMA; vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); } shadow = vmx_shadow_reg(reg); if (shadow > 0) { /* * Store the unmodified value in the shadow */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } if (reg == VM_REG_GUEST_CR3) { /* * Invalidate the guest vcpu's TLB mappings to emulate * the behavior of updating %cr3. * * XXX the processor retains global mappings when %cr3 * is updated but vmx_invvpid() does not. 
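 *
 * (A more faithful emulation would use the SDM's "single-context,
 * retaining globals" INVVPID type so that global translations survive,
 * at the cost of an extra capability check; flushing everything here
 * is simply the conservative choice.)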
*/ pmap = vmx->ctx[vcpu].pmap; vmx_invvpid(vmx, vcpu, pmap, running); } } return (error); } static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { struct vmx *vmx = arg; int vcap; int ret; ret = ENOENT; vcap = vmx->cap[vcpu].set; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) ret = 0; break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) ret = 0; break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) ret = 0; break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) ret = 0; break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) ret = 0; break; + case VM_CAP_BPT_EXIT: + ret = 0; + break; default: break; } if (ret == 0) *retval = (vcap & (1 << type)) ? 1 : 0; return (ret); } static int vmx_setcap(void *arg, int vcpu, int type, int val) { struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; uint32_t baseval; uint32_t *pptr; int error; int flag; int reg; int retval; retval = ENOENT; pptr = NULL; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_HLT_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_MTF; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_PAUSE_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_ENABLE_INVPCID; reg = VMCS_SEC_PROC_BASED_CTLS; } break; + case VM_CAP_BPT_EXIT: + retval = 0; + + /* Don't change the bitmap if we are tracing all exceptions. 
*/ + if (vmx->cap[vcpu].exc_bitmap != 0xffffffff) { + pptr = &vmx->cap[vcpu].exc_bitmap; + baseval = *pptr; + flag = (1 << IDT_BP); + reg = VMCS_EXCEPTION_BITMAP; + } + break; default: break; } - if (retval == 0) { + if (retval) + return (retval); + + if (pptr != NULL) { if (val) { baseval |= flag; } else { baseval &= ~flag; } VMPTRLD(vmcs); error = vmwrite(reg, baseval); VMCLEAR(vmcs); - if (error) { - retval = error; - } else { - /* - * Update optional stored flags, and record - * setting - */ - if (pptr != NULL) { - *pptr = baseval; - } + if (error) + return (error); - if (val) { - vmx->cap[vcpu].set |= (1 << type); - } else { - vmx->cap[vcpu].set &= ~(1 << type); - } - } + /* + * Update optional stored flags, and record + * setting + */ + *pptr = baseval; } - return (retval); + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + + return (0); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; u_int pending_prio; }; #define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ level ? "level" : "edge", vector); \ VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ } while (0) /* * vlapic->ops handlers that utilize the APICv hardware assist described in * Chapter 29 of the Intel SDM. */ static int vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; int idx, notify = 0; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; /* * Keep track of interrupt requests in the PIR descriptor. This is * because the virtual APIC page pointed to by the VMCS cannot be * modified if the vcpu is running. */ idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); /* * A notification is required whenever the 'pending' bit makes a * transition from 0->1. * * Even if the 'pending' bit is already asserted, notification about * the incoming interrupt may still be necessary. For example, if a * vCPU is HLTed with a high PPR, a low priority interrupt would cause * the 0->1 'pending' transition with a notification, but the vCPU * would ignore the interrupt for the time being. The same vCPU would * need to then be notified if a high-priority interrupt arrived which * satisfied the PPR. * * The priorities of interrupts injected while 'pending' is asserted * are tracked in a custom bitfield 'pending_prio'. Should the * to-be-injected interrupt exceed the priorities already present, the * notification is sent. The priorities recorded in 'pending_prio' are * cleared whenever the 'pending' bit makes another 0->1 transition. 
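vmx_set_intr_ready() above only re-notifies a vCPU whose 'pending' bit is already set when the newly posted vector's priority class exceeds every class recorded in 'pending_prio'. A small user-space model of that decision; VPR_PRIO_BIT and the 0xf0 priority-class mask follow the definitions visible in this file, while need_renotify() and the sample vectors are illustrative only.

#include <stdbool.h>
#include <stdio.h>

#define APIC_TPR_INT      0xf0                    /* priority-class bits of a vector/PPR */
#define VPR_PRIO_BIT(vpr) (1U << ((vpr) >> 4))    /* same shape as the macro above */

/*
 * Decide whether a vCPU that already has 'pending' set needs another
 * notification for a newly posted vector (illustrative only).
 */
static bool
need_renotify(unsigned int *pending_prio, int vector)
{
        const unsigned int old_prio = *pending_prio;
        const unsigned int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT);

        if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) {
                *pending_prio |= prio_bit;
                return (true);
        }
        return (false);
}

int
main(void)
{
        unsigned int prio = 0;

        printf("vector 0x21: %d\n", need_renotify(&prio, 0x21)); /* 1: first class seen */
        printf("vector 0x2f: %d\n", need_renotify(&prio, 0x2f)); /* 0: same class */
        printf("vector 0xd1: %d\n", need_renotify(&prio, 0xd1)); /* 1: higher class */
        return (0);
}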
*/ if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { notify = 1; vlapic_vtx->pending_prio = 0; } else { const u_int old_prio = vlapic_vtx->pending_prio; const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); notify = 1; } } VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); return (notify); } static int vmx_pending_intr(struct vlapic *vlapic, int *vecptr) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t pending, pirval; uint32_t ppr, vpr; int i; /* * This function is only expected to be called from the 'HLT' exit * handler which does not care about the vector that is pending. */ KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); if (!pending) { /* * While a virtual interrupt may have already been * processed the actual delivery maybe pending the * interruptibility of the guest. Recognize a pending * interrupt by reevaluating virtual interrupts * following Section 29.2.1 in the Intel SDM Volume 3. */ struct vm_exit *vmexit; uint8_t rvi, ppr; vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, ("vmx_pending_intr: exitcode not 'HLT'")); rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (rvi > ppr) { return (1); } return (0); } /* * If there is an interrupt pending then it will be recognized only * if its priority is greater than the processor priority. * * Special case: if the processor priority is zero then any pending * interrupt will be recognized. */ lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); vpr = 0; for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; break; } } /* * If the highest-priority pending interrupt falls short of the * processor priority of this vCPU, ensure that 'pending_prio' does not * have any stale bits which would preclude a higher-priority interrupt * from incurring a notification later. 
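The body of vmx_pending_intr() above locates the highest vector posted in the 256-bit PIR and compares its priority class against the processor priority. A compile-ready user-space model of that scan; flsl() is the FreeBSD "find last set" routine from <strings.h>, and the PIR contents and PPR value below are made up for the example, with the PPR assumed to be pre-masked to its class bits.

#include <stdint.h>
#include <stdio.h>
#include <strings.h>    /* flsl() on FreeBSD */

#define APIC_TPR_INT    0xf0    /* priority-class bits */

/*
 * Return the highest vector posted in a 256-bit PIR, or -1 if none
 * (illustrative only).
 */
static int
highest_posted_vector(const uint64_t pir[4])
{
        int i;

        for (i = 3; i >= 0; i--) {
                if (pir[i] != 0)
                        return (i * 64 + flsl(pir[i]) - 1);
        }
        return (-1);
}

int
main(void)
{
        uint64_t pir[4] = { 0, 1UL << 5, 0, 0 };        /* vector 0x45 posted */
        unsigned int ppr = 0x30;
        int vec = highest_posted_vector(pir);

        printf("highest posted vector: %d\n", vec);
        printf("recognized with PPR %#x: %s\n", ppr,
            ((vec & APIC_TPR_INT) > (int)ppr) ? "yes" : "no");
        return (0);
}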
*/ if (vpr <= ppr) { const u_int prio_bit = VPR_PRIO_BIT(vpr); const u_int old = vlapic_vtx->pending_prio; if (old > prio_bit && (old & prio_bit) == 0) { vlapic_vtx->pending_prio = prio_bit; } return (0); } return (1); } static void vmx_intr_accepted(struct vlapic *vlapic, int vector) { panic("vmx_intr_accepted: not expected to be called"); } static void vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; uint64_t mask, val; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), ("vmx_set_tmr: vcpu cannot be running")); vlapic_vtx = (struct vlapic_vtx *)vlapic; vmx = vlapic_vtx->vmx; vmcs = &vmx->vmcs[vlapic->vcpuid]; mask = 1UL << (vector % 64); VMPTRLD(vmcs); val = vmcs_read(VMCS_EOI_EXIT(vector)); if (level) val |= mask; else val &= ~mask; vmcs_write(VMCS_EOI_EXIT(vector), val); VMCLEAR(vmcs); } static void vmx_enable_x2apic_mode(struct vlapic *vlapic) { struct vmx *vmx; struct vmcs *vmcs; uint32_t proc_ctls2; int vcpuid, error; vcpuid = vlapic->vcpuid; vmx = ((struct vlapic_vtx *)vlapic)->vmx; vmcs = &vmx->vmcs[vcpuid]; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); if (vlapic->vcpuid == 0) { /* * The nested page table mappings are shared by all vcpus * so unmap the APIC access page just once. */ error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", __func__, error)); /* * The MSR bitmap is shared by all vcpus so modify it only * once in the context of vcpu 0. */ error = vmx_allow_x2apic_msrs(vmx); KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", __func__, error)); } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { ipi_cpu(hostcpu, pirvec); } /* * Transfer the pending interrupts in the PIR descriptor to the IRR * in the virtual APIC page. */ static void vmx_inject_pir(struct vlapic *vlapic) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t val, pirval; int rvi, pirbase = -1; uint16_t intr_status_old, intr_status_new; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "no posted interrupt pending"); return; } pirval = 0; pirbase = -1; lapic = vlapic->apic_page; val = atomic_readandclear_long(&pir_desc->pir[0]); if (val != 0) { lapic->irr0 |= val; lapic->irr1 |= val >> 32; pirbase = 0; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[1]); if (val != 0) { lapic->irr2 |= val; lapic->irr3 |= val >> 32; pirbase = 64; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[2]); if (val != 0) { lapic->irr4 |= val; lapic->irr5 |= val >> 32; pirbase = 128; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[3]); if (val != 0) { lapic->irr6 |= val; lapic->irr7 |= val >> 32; pirbase = 192; pirval = val; } VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); /* * Update RVI so the processor can evaluate pending virtual * interrupts on VM-entry. * * It is possible for pirval to be 0 here, even though the * pending bit has been set. 
The scenario is: * CPU-Y is sending a posted interrupt to CPU-X, which * is running a guest and processing posted interrupts in h/w. * CPU-X will eventually exit and the state seen in s/w is * the pending bit set, but no PIR bits set. * * CPU-X CPU-Y * (vm running) (host running) * rx posted interrupt * CLEAR pending bit * SET PIR bit * READ/CLEAR PIR bits * SET pending bit * (vm exit) * pending bit set, PIR 0 */ if (pirval != 0) { rvi = pirbase + flsl(pirval) - 1; intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); intr_status_new = (intr_status_old & 0xFF00) | rvi; if (intr_status_new > intr_status_old) { vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "guest_intr_status changed from 0x%04x to 0x%04x", intr_status_old, intr_status_new); } } } static struct vlapic * vmx_vlapic_init(void *arg, int vcpuid) { struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vmx->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; vlapic_vtx = (struct vlapic_vtx *)vlapic; vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; vlapic_vtx->vmx = vmx; if (virtual_interrupt_delivery) { vlapic->ops.set_intr_ready = vmx_set_intr_ready; vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; } if (posted_interrupts) vlapic->ops.post_intr = vmx_post_intr; vlapic_init(vlapic); return (vlapic); } static void vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_VLAPIC); } struct vmm_ops vmm_ops_intel = { .init = vmx_init, .cleanup = vmx_cleanup, .resume = vmx_restore, .vminit = vmx_vminit, .vmrun = vmx_run, .vmcleanup = vmx_vmcleanup, .vmgetreg = vmx_getreg, .vmsetreg = vmx_setreg, .vmgetdesc = vmx_getdesc, .vmsetdesc = vmx_setdesc, .vmgetcap = vmx_getcap, .vmsetcap = vmx_setcap, .vmspace_alloc = ept_vmspace_alloc, .vmspace_free = ept_vmspace_free, .vlapic_init = vmx_vlapic_init, .vlapic_cleanup = vmx_vlapic_cleanup, }; Index: head/sys/amd64/vmm/intel/vmx.h =================================================================== --- head/sys/amd64/vmm/intel/vmx.h (revision 355723) +++ head/sys/amd64/vmm/intel/vmx.h (revision 355724) @@ -1,155 +1,156 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMX_H_ #define _VMX_H_ #include "vmcs.h" struct pmap; struct vmxctx { register_t guest_rdi; /* Guest state */ register_t guest_rsi; register_t guest_rdx; register_t guest_rcx; register_t guest_r8; register_t guest_r9; register_t guest_rax; register_t guest_rbx; register_t guest_rbp; register_t guest_r10; register_t guest_r11; register_t guest_r12; register_t guest_r13; register_t guest_r14; register_t guest_r15; register_t guest_cr2; register_t guest_dr0; register_t guest_dr1; register_t guest_dr2; register_t guest_dr3; register_t guest_dr6; register_t host_r15; /* Host state */ register_t host_r14; register_t host_r13; register_t host_r12; register_t host_rbp; register_t host_rsp; register_t host_rbx; register_t host_dr0; register_t host_dr1; register_t host_dr2; register_t host_dr3; register_t host_dr6; register_t host_dr7; uint64_t host_debugctl; int host_tf; int inst_fail_status; /* * The pmap needs to be deactivated in vmx_enter_guest() * so keep a copy of the 'pmap' in each vmxctx. */ struct pmap *pmap; }; struct vmxcap { int set; uint32_t proc_ctls; uint32_t proc_ctls2; + uint32_t exc_bitmap; }; struct vmxstate { uint64_t nextrip; /* next instruction to be executed by guest */ int lastcpu; /* host cpu that this 'vcpu' last ran on */ uint16_t vpid; }; struct apic_page { uint32_t reg[PAGE_SIZE / 4]; }; CTASSERT(sizeof(struct apic_page) == PAGE_SIZE); /* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ struct pir_desc { uint64_t pir[4]; uint64_t pending; uint64_t unused[3]; } __aligned(64); CTASSERT(sizeof(struct pir_desc) == 64); /* Index into the 'guest_msrs[]' array */ enum { IDX_MSR_LSTAR, IDX_MSR_CSTAR, IDX_MSR_STAR, IDX_MSR_SF_MASK, IDX_MSR_KGSBASE, IDX_MSR_PAT, GUEST_MSR_NUM /* must be the last enumeration */ }; /* virtual machine softc */ struct vmx { struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ char msr_bitmap[PAGE_SIZE]; struct pir_desc pir_desc[VM_MAXCPU]; uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; struct vmxctx ctx[VM_MAXCPU]; struct vmxcap cap[VM_MAXCPU]; struct vmxstate state[VM_MAXCPU]; uint64_t eptp; struct vm *vm; long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ }; CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); #define VMX_GUEST_VMEXIT 0 #define VMX_VMRESUME_ERROR 1 #define VMX_VMLAUNCH_ERROR 2 #define VMX_INVEPT_ERROR 3 int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched); void vmx_call_isr(uintptr_t entry); u_long vmx_fix_cr0(u_long cr0); u_long vmx_fix_cr4(u_long cr4); int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset); extern char vmx_exit_guest[]; extern char vmx_exit_guest_flush_rsb[]; #endif Index: head/usr.sbin/bhyve/bhyve.8 =================================================================== --- head/usr.sbin/bhyve/bhyve.8 (revision 355723) +++ 
head/usr.sbin/bhyve/bhyve.8 (revision 355724) @@ -1,649 +1,652 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd December 12, 2019 +.Dd December 13, 2019 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl abehuwxACHPSWY .Oo .Fl c\~ Ns .Oo .Op Ar cpus= Ns .Ar numcpus Ns .Oc Ns .Op Ar ,sockets=n Ns .Op Ar ,cores=n Ns .Op Ar ,threads=n .Oc .Op Fl g Ar gdbport .Op Fl l Ar help|lpcdev Ns Op , Ns Ar conf .Op Fl m Ar memsize Ns Op Ar K|k|M|m|G|g|T|t .Op Fl p Ar vcpu:hostcpu .Op Fl s Ar help|slot,emulation Ns Op , Ns Ar conf .Op Fl G Ar port .Op Fl U Ar uuid .Ar vmname .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise, it is enough to run .Nm with a boot ROM of choice. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl a The guest's local APIC is configured in xAPIC mode. The xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl A Generate ACPI tables. Required for .Fx Ns /amd64 guests. .It Fl b Enable a low-level console device supported by .Fx kernels compiled with .Cd "device bvmconsole" . This option will be deprecated in a future version. .It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. The current maximum number of guest virtual CPUs is 16. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .It Fl C Include guest memory in core file. 
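The consistency requirement stated for -c above (numcpus must equal the product of sockets, cores and threads, each value between 1 and 65535) is enforced by topology_parse() in bhyverun.c later in this change. A small stand-alone model of just that check, not bhyve's actual parser:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Model of the -c consistency rule: every field in [1, UINT16_MAX] and
 * numcpus == sockets * cores * threads (illustrative only).
 */
static bool
topology_ok(uint64_t numcpus, uint64_t sockets, uint64_t cores, uint64_t threads)
{
        if (numcpus < 1 || sockets < 1 || cores < 1 || threads < 1)
                return (false);
        if (numcpus > UINT16_MAX || sockets > UINT16_MAX ||
            cores > UINT16_MAX || threads > UINT16_MAX)
                return (false);
        return (numcpus == sockets * cores * threads);
}

int
main(void)
{
        printf("-c 4,sockets=1,cores=2,threads=2 -> %d\n", topology_ok(4, 1, 2, 2));
        printf("-c 4,sockets=1,cores=2,threads=1 -> %d\n", topology_ok(4, 1, 2, 1));
        return (0);
}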
.It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. .It Fl g Ar gdbport For .Fx kernels compiled with .Cd "device bvmdebug" , allow a remote kernel kgdb to be relayed to the guest kernel gdb stub via a local IPv4 address and this port. This option will be deprecated in a future version. .It Fl G Ar port Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If .Ar port begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl h Print help message and exit. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. .It Fl l Op Ar help|lpcdev Ns Op , Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Ar com1 and .Ar com2 and the boot ROM device .Ar bootrom . .Pp .Ar help print a list of supported LPC devices. .It Fl m Ar memsize Ns Op Ar K|k|M|m|G|g|T|t Guest physical memory size in bytes. This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of K, M, G or T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. .Pp .Ar memsize defaults to 256M. .It Fl p Ar vcpu:hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. .It Fl s Op Ar help|slot,emulation Ns Op , Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Bl -tag -width 10n .It Ar help print a list of supported PCI devices. .It Ar slot .Ar pcislot[:function] .Ar bus:pcislot:function .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. .It Ar emulation .Bl -tag -width 10n .It Li hostbridge | Li amd_hostbridge .Pp Provide a simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. The .Li amd_hostbridge emulation is identical but uses a PCI vendor ID of .Li AMD . .It Li passthru PCI pass-through device. .It Li virtio-net Virtio network interface. .It Li virtio-blk Virtio block storage interface. .It Li virtio-scsi Virtio SCSI interface. .It Li virtio-rnd Virtio RNG interface. .It Li virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Li ahci AHCI controller attached to arbitrary devices. .It Li ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Li ahci-hd AHCI controller attached to a SATA hard-drive. .It Li e1000 Intel e82545 network interface. .It Li uart PCI 16550 serial device. .It Li lpc LPC PCI-ISA bridge with COM1 and COM2 16550 serial ports and a boot ROM. The LPC bridge emulation can only be configured on bus 0. .It Li fbuf Raw framebuffer device attached to VNC server. 
.It Li xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Li nvme NVM Express (NVMe) controller. .It Li hda High Definition Audio Controller. .El .It Op Ar conf This optional parameter describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Pp Network devices: .Bl -tag -width 10n .It Ar tapN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx .It Ar vmnetN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx .Pp If .Ar mac is not specified, the MAC address is derived from a fixed OUI and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .El .Pp Block storage devices: .Bl -tag -width 10n .It Pa /filename Ns Oo , Ns Ar block-device-options Oc .It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc .El .Pp The .Ar block-device-options are: .Bl -tag -width 8n .It Li nocache Open the file with .Dv O_DIRECT . .It Li direct Open the file using .Dv O_SYNC . .It Li ro Force the file to be opened read-only. .It Li sectorsize= Ns Ar logical Ns Oo / Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .El .Pp SCSI devices: .Bl -tag -width 10n .It Pa /dev/cam/ctl Ns Oo Ar pp . Ns Ar vp Oc Ns Oo , Ns Ar scsi-device-options Oc .El .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Li iid= Ns Ar IID Initiator ID to use when sending requests to specified CTL port. The default value is 0. .El .Pp TTY devices: .Bl -tag -width 10n .It Li stdio Connect the serial port to the standard input and output of the .Nm process. .It Pa /dev/xxx Use the host TTY device for serial port I/O. .El .Pp Boot ROM device: .Bl -tag -width 10n .It Pa romfile Map .Ar romfile in the guest address space reserved for boot firmware. .El .Pp Pass-through devices: .Bl -tag -width 10n .It Ns Ar slot Ns / Ns Ar bus Ns / Ns Ar function Connect to a PCI device on the host at the selector described by .Ar slot , .Ar bus , and .Ar function numbers. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . .Pp Virtio console devices: .Bl -tag -width 10n .It Li port1= Ns Pa /path/to/port1.sock Ns ,anotherport= Ns Pa ... A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet -offset 2n .It Due to lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is no way to use the "console port" feature, nor the console port resize at present. .It Emergency write is advertised, but no-op at present. .El .El .Pp Framebuffer devices: .Bl -tag -width 10n .It Xo .Oo rfb= Ns Oo Ar IP\&: Oc Ns Ar port Oc Ns Oo ,w= Ns Ar width Oc Ns Oo ,h= Ns .Ar height Oc Ns Oo ,vga= Ns Ar vgaconf Oc Ns Oo Ns ,wait Oc Ns Oo ,password= Ns .Ar password Oc .Xc .Bl -tag -width 8n .It Ar IPv4:port No or Ar [IPv6%zone]:port An .Ar IP address and a .Ar port VNC should listen on. The default is to listen on localhost IPv4 address and default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. 
.It Ar width No and Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. Minimal supported resolution is 640x480 pixels, and maximum is 1920x1200 pixels. .It Ar vgaconf Possible values for this option are .Dq io (default), .Dq on , and .Dq off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Dq io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Dq on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Dq off option should be used for the UEFI guests that assume that VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .El .Pp xHCI USB devices: .Bl -tag -width 10n .It Li tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Pp NVMe devices: .Bl -tag -width 10n .It Li devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Ar ram=size_in_MiB . .It Li maxq Max number of queues. .It Li qsz Max elements in each queue. .It Li ioslots Max number of concurrent I/O requests. .It Li sectsz Sector size (defaults to blockif sector size). .It Li ser Serial number with maximum 20 characters. .El .Pp HD Audio devices: .Bl -tag -width 10n .It Li play Playback device, typically .Ar /dev/dsp0 . .It Li rec Recording device, typically .Ar /dev/dsp0 . .El .El .It Fl S Wire guest memory. .It Fl u RTC keeps UTC time. .It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. 
Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp -Breakpoints are not supported. +Breakpoints are supported on Intel CPUs that support single stepping. +Note that continuing from a breakpoint while interrupts are enabled in the +guest may not work as expected due to timer interrupts firing while single +stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width indent -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. Otherwise, the boot loader is not needed. .Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -A -H -P -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -A -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -A -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at: 0.0.0.0:5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer’s Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . 
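The breakpoint support described earlier in the DEBUG SERVER section is backed by the VM_CAP_BPT_EXIT capability that this revision adds to the Intel vmm code. A minimal sketch of how a userspace tool could probe and enable it per vCPU through the libvmmapi capability calls, assuming a vmctx obtained from vm_open(); the surrounding debug-server plumbing in gdb.c is omitted.

#include <err.h>
#include <machine/vmm.h>
#include <vmmapi.h>

/*
 * Sketch: turn on breakpoint exits for one vCPU.  Error handling and the
 * debug-server state machine are left out.
 */
static void
enable_bpt_exit(struct vmctx *ctx, int vcpu)
{
        int avail;

        if (vm_get_capability(ctx, vcpu, VM_CAP_BPT_EXIT, &avail) != 0)
                errx(1, "VM_CAP_BPT_EXIT not supported on this CPU");
        if (vm_set_capability(ctx, vcpu, VM_CAP_BPT_EXIT, 1) != 0)
                errx(1, "unable to enable breakpoint exits for vcpu %d", vcpu);
}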
.Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org Index: head/usr.sbin/bhyve/bhyverun.c =================================================================== --- head/usr.sbin/bhyve/bhyverun.c (revision 355723) +++ head/usr.sbin/bhyve/bhyverun.c (revision 355724) @@ -1,1230 +1,1243 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "inout.h" #include "dbgport.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = 
"VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); char *vmname; int guest_ncpus; uint16_t cores, maxcpus, sockets, threads; char *guest_uuid_str; static int gdb_port = 0; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; static int virtio_msix = 1; static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; static int acpi; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; static void usage(int code) { fprintf(stderr, "Usage: %s [-abehuwxACHPSWY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-g ] [-l ]\n" " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" " -e: exit on unhandled I/O access\n" " -g: gdb port\n" " -h: help\n" " -H: vmexit from the guest on hlt\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" " -S: guest memory cannot be swapped\n" " -u: RTC keeps UTC time\n" " -U: uuid\n" " -w: ignore unimplemented MSRs\n" " -W: force virtio to use single-vector MSI\n" " -x: local apic is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* 
* XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",,". * 2. It accepts whitespace after = and before value. * 3. Values out of range of INT are silently wrapped. * 4. It doesn't check non-final values. * 5. The apparently bogus limits of UINT16_MAX are for future expansion. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. */ static int topology_parse(const char *opt) { uint64_t ncpus; int c, chk, n, s, t, tmp; char *cp, *str; bool ns, scts; c = 1, n = 1, s = 1, t = 1; ns = false, scts = false; str = strdup(opt); if (str == NULL) goto out; while ((cp = strsep(&str, ",")) != NULL) { if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { n = tmp; ns = true; } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { n = tmp; ns = true; } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { s = tmp; scts = true; } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { c = tmp; scts = true; } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { t = tmp; scts = true; #ifdef notyet /* Do not expose this until vmm.ko implements it */ } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { m = tmp; #endif /* Skip the empty argument case from -c "" */ } else if (cp[0] == '\0') continue; else goto out; /* Any trailing garbage causes an error */ if (cp[chk] != '\0') goto out; } free(str); str = NULL; /* * Range check 1 <= n <= UINT16_MAX all values */ if (n < 1 || s < 1 || c < 1 || t < 1 || n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || t > UINT16_MAX) return (-1); /* If only the cpus was specified, use that as sockets */ if (!scts) s = n; /* * Compute sockets * cores * threads avoiding overflow * The range check above insures these are 16 bit values * If n was specified check it against computed ncpus */ ncpus = (uint64_t)s * c * t; if (ncpus > UINT16_MAX || (ns && n != ncpus)) return (-1); guest_ncpus = ncpus; sockets = s; cores = c; threads = t; return(0); out: free(str); return (-1); } static int pincpu_parse(const char *opt) { int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0 || vcpu >= VM_MAXCPU) { fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", vcpu, VM_MAXCPU - 1); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } if (vcpumap[vcpu] == NULL) { if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { perror("malloc"); return (-1); } CPU_ZERO(vcpumap[vcpu]); } CPU_SET(pcpu, vcpumap[vcpu]); return (0); } void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } int fbsdrun_vmexit_on_pause(void) { return (guest_vmexit_on_pause); } int fbsdrun_vmexit_on_hlt(void) { return (guest_vmexit_on_hlt); } int fbsdrun_virtio_msix(void) { return (virtio_msix); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); if (gdb_port != 0) 
gdb_cpu_add(vcpu); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) { int error; assert(fromcpu == BSP); /* * The 'newcpu' must be activated in the context of 'fromcpu'. If * vm_activate_cpu() is delayed until newcpu's pthread starts running * then vmm.ko is out-of-sync with bhyve and this can create a race * with vm_suspend(). */ error = vm_activate_cpu(ctx, newcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); /* * Set up the vmexit struct to allow execution to start * at the given RIP */ vmexit[newcpu].rip = rip; vmexit[newcpu].inst_length = 0; mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme, strictio); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port, vmexit->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { (void)spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason, vmexit_vmx_desc(vmexit->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_bogus++; return 
(VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_mtrap++; if (gdb_port == 0) { fprintf(stderr, "vm_loop: unexpected VMEXIT_MTRAP\n"); exit(4); } gdb_cpu_mtrap(*pvcpu); return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { int err, i; struct vie *vie; stats.vmexit_inst_emul++; vie = &vmexit->u.inst_emul.vie; err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, vie, &vmexit->u.inst_emul.paging); if (err) { if (err == ESRCH) { fprintf(stderr, "Unhandled memory access to 0x%lx\n", vmexit->u.inst_emul.gpa); } fprintf(stderr, "Failed to emulate instruction ["); for (i = 0; i < vie->num_valid; i++) { fprintf(stderr, "0x%02x%s", vie->inst[i], i != (vie->num_valid - 1) ? " " : ""); } fprintf(stderr, "] at 0x%lx\n", vmexit->rip); return (VMEXIT_ABORT); } return (VMEXIT_CONTINUE); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { enum vm_suspend_how how; how = vmexit->u.suspended.how; fbsdrun_deletecpu(ctx, *pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { if (gdb_port == 0) { fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); exit(4); } gdb_cpu_suspend(*pvcpu); return (VMEXIT_CONTINUE); } +static int +vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + if (gdb_port == 0) { + fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); + exit(4); + } + gdb_cpu_breakpoint(*pvcpu, vmexit); + return (VMEXIT_CONTINUE); +} + static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, + [VM_EXITCODE_BPT] = vmexit_breakpoint, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t 
startrip) { int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); while (1) { error = vm_run(ctx, vcpu, &vmexit[vcpu]); if (error != 0) break; exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ if (error == 0) return (VM_MAXCPU); else return (1); } void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (fbsdrun_vmexit_on_hlt()) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (fbsdrun_vmexit_on_pause()) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (x2apic_mode) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; const cap_ioctl_t *cmds; size_t ncmds; #endif reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. 
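vm_loop() above funnels every VM exit through the handler[] table keyed by exit code and treats unknown codes as fatal; the new VM_EXITCODE_BPT entry slots into the same scheme. A stripped-down, user-space model of that dispatch shape follows; the enum values and handler bodies are stand-ins, not bhyve's.

#include <stdio.h>
#include <stdlib.h>

#define VMEXIT_CONTINUE  0
#define VMEXIT_ABORT    -1

enum exitcode { EC_HLT, EC_BPT, EC_MAX };       /* stand-ins for VM_EXITCODE_* */

typedef int (*exit_handler_t)(int vcpu);

static int
handle_hlt(int vcpu)
{
        printf("vcpu %d: hlt, yield to host scheduler\n", vcpu);
        return (VMEXIT_CONTINUE);
}

static int
handle_bpt(int vcpu)
{
        printf("vcpu %d: breakpoint, hand off to debug server\n", vcpu);
        return (VMEXIT_CONTINUE);
}

static exit_handler_t handler[EC_MAX] = {
        [EC_HLT] = handle_hlt,
        [EC_BPT] = handle_bpt,
};

/* Stripped-down shape of vm_loop()'s dispatch (illustrative only). */
static void
dispatch(int vcpu, enum exitcode ec)
{
        if (ec >= EC_MAX || handler[ec] == NULL) {
                fprintf(stderr, "unexpected exitcode %d\n", ec);
                exit(4);
        }
        if (handler[ec](vcpu) == VMEXIT_ABORT)
                abort();
}

int
main(void)
{
        dispatch(0, EC_HLT);
        dispatch(0, EC_BPT);
        return (0);
}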
*/ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); vm_get_ioctls(&ncmds); cmds = vm_get_ioctls(NULL); if (cmds == NULL) errx(EX_OSERR, "out of memory"); if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); free((cap_ioctl_t *)cmds); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } int main(int argc, char *argv[]) { int c, error, dbg_port, err, bvmcons; int max_vcpus, mptgen, memflags; int rtc_localtime; bool gdb_stop; struct vmctx *ctx; uint64_t rip; size_t memsize; char *optstr; bvmcons = 0; progname = basename(argv[0]); dbg_port = 0; gdb_stop = false; guest_ncpus = 1; sockets = cores = threads = 1; maxcpus = 0; memsize = 256 * MB; mptgen = 1; rtc_localtime = 1; memflags = 0; optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:"; while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; case 'A': acpi = 1; break; case 'b': bvmcons = 1; break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': memflags |= VM_MEM_F_INCORE; break; case 'g': dbg_port = atoi(optarg); break; case 'G': if (optarg[0] == 'w') { gdb_stop = true; optarg++; } gdb_port = atoi(optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': memflags |= VM_MEM_F_WIRED; break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", optarg); break; case 'H': guest_vmexit_on_hlt = 1; break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. 
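The -m handling in main() above defers to vm_parse_memsize() from libvmmapi; the rule it implements is the one in the manual page (a K/M/G/T suffix scales the value, a bare number is taken as megabytes). A simplified stand-alone model of that rule, not the library routine, with no checking of trailing characters after the suffix:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MB (1024UL * 1024)

/* Model of the -m suffix rule (illustrative only). */
static int
parse_memsize(const char *opt, uint64_t *ret)
{
        char *cp;
        uint64_t val;

        val = strtoul(opt, &cp, 0);
        if (cp == opt)
                return (-1);
        switch (*cp) {
        case '\0':
                val *= MB;                              /* no suffix: megabytes */
                break;
        case 'K': case 'k': val <<= 10; break;
        case 'M': case 'm': val <<= 20; break;
        case 'G': case 'g': val <<= 30; break;
        case 'T': case 't': val <<= 40; break;
        default:
                return (-1);
        }
        *ret = val;
        return (0);
}

int
main(void)
{
        uint64_t sz;

        if (parse_memsize("1G", &sz) == 0)
                printf("1G  -> %lu bytes\n", (unsigned long)sz);
        if (parse_memsize("256", &sz) == 0)
                printf("256 -> %lu bytes\n", (unsigned long)sz);
        return (0);
}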
*/ break; case 'P': guest_vmexit_on_pause = 1; break; case 'e': strictio = 1; break; case 'u': rtc_localtime = 0; break; case 'U': guest_uuid_str = optarg; break; case 'w': strictmsr = 0; break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; case 'Y': mptgen = 0; break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc != 1) usage(1); vmname = argv[0]; ctx = do_open(vmname); max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(ctx, BSP); vm_set_memflags(ctx, memflags); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(); init_inout(); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx, rtc_localtime); sci_init(ctx); /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } if (dbg_port != 0) init_dbgport(dbg_port); if (gdb_port != 0) init_gdb(ctx, gdb_port, gdb_stop); if (bvmcons) init_bvmcons(); if (lpc_bootrom()) { if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(ctx, BSP); assert(error == 0); } error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (mptgen) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); assert(error == 0); if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom()) fwctl_init(); /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, BSP, rip); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } Index: head/usr.sbin/bhyve/gdb.c =================================================================== --- head/usr.sbin/bhyve/gdb.c (revision 355723) +++ head/usr.sbin/bhyve/gdb.c (revision 355724) @@ -1,1496 +1,1861 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017-2018 John H. Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include +#include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "gdb.h" #include "mem.h" #include "mevent.h" /* * GDB_SIGNAL_* numbers are part of the GDB remote protocol. Most stops * use SIGTRAP. */ #define GDB_SIGNAL_TRAP 5 static void gdb_resume_vcpus(void); static void check_command(int fd); static struct mevent *read_event, *write_event; static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting; static pthread_mutex_t gdb_lock; static pthread_cond_t idle_vcpus; -static bool stop_pending, first_stop; -static int stepping_vcpu, stopped_vcpu; +static bool first_stop, report_next_stop, swbreak_enabled; /* * An I/O buffer contains 'capacity' bytes of room at 'data'. For a * read buffer, 'start' is unused and 'len' contains the number of * valid bytes in the buffer. For a write buffer, 'start' is set to * the index of the next byte in 'data' to send, and 'len' contains * the remaining number of valid bytes to send. */ struct io_buffer { uint8_t *data; size_t capacity; size_t start; size_t len; }; +struct breakpoint { + uint64_t gpa; + uint8_t shadow_inst; + TAILQ_ENTRY(breakpoint) link; +}; + +/* + * When a vCPU stops to due to an event that should be reported to the + * debugger, information about the event is stored in this structure. + * The vCPU thread then sets 'stopped_vcpu' if it is not already set + * and stops other vCPUs so the event can be reported. The + * report_stop() function reports the event for the 'stopped_vcpu' + * vCPU. When the debugger resumes execution via continue or step, + * the event for 'stopped_vcpu' is cleared. vCPUs will loop in their + * event handlers until the associated event is reported or disabled. + * + * An idle vCPU will have all of the boolean fields set to false. + * + * When a vCPU is stepped, 'stepping' is set to true when the vCPU is + * released to execute the stepped instruction. When the vCPU reports + * the stepping trap, 'stepped' is set. + * + * When a vCPU hits a breakpoint set by the debug server, + * 'hit_swbreak' is set to true. 
+ */ +struct vcpu_state { + bool stepping; + bool stepped; + bool hit_swbreak; +}; + static struct io_buffer cur_comm, cur_resp; static uint8_t cur_csum; -static int cur_vcpu; static struct vmctx *ctx; static int cur_fd = -1; +static TAILQ_HEAD(, breakpoint) breakpoints; +static struct vcpu_state *vcpu_state; +static int cur_vcpu, stopped_vcpu; const int gdb_regset[] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_RSP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_ES, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; const int gdb_regsize[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4 }; #ifdef GDB_LOG #include #include static void __printflike(1, 2) debug(const char *fmt, ...) { static FILE *logfile; va_list ap; if (logfile == NULL) { logfile = fopen("/tmp/bhyve_gdb.log", "w"); if (logfile == NULL) return; #ifndef WITHOUT_CAPSICUM if (caph_limit_stream(fileno(logfile), CAPH_WRITE) == -1) { fclose(logfile); logfile = NULL; return; } #endif setlinebuf(logfile); } va_start(ap, fmt); vfprintf(logfile, fmt, ap); va_end(ap); } #else #define debug(...) #endif +static void remove_all_sw_breakpoints(void); + static int guest_paging_info(int vcpu, struct vm_guest_paging *paging) { uint64_t regs[4]; const int regset[4] = { VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_EFER }; if (vm_get_register_set(ctx, vcpu, nitems(regset), regset, regs) == -1) return (-1); /* * For the debugger, always pretend to be the kernel (CPL 0), * and if long-mode is enabled, always parse addresses as if * in 64-bit mode. */ paging->cr3 = regs[1]; paging->cpl = 0; if (regs[3] & EFER_LMA) paging->cpu_mode = CPU_MODE_64BIT; else if (regs[0] & CR0_PE) paging->cpu_mode = CPU_MODE_PROTECTED; else paging->cpu_mode = CPU_MODE_REAL; if (!(regs[0] & CR0_PG)) paging->paging_mode = PAGING_MODE_FLAT; else if (!(regs[2] & CR4_PAE)) paging->paging_mode = PAGING_MODE_32; else if (regs[3] & EFER_LME) paging->paging_mode = PAGING_MODE_64; else paging->paging_mode = PAGING_MODE_PAE; return (0); } /* * Map a guest virtual address to a physical address (for a given vcpu). * If a guest virtual address is valid, return 1. If the address is * not valid, return 0. If an error occurs obtaining the mapping, * return -1. */ static int guest_vaddr2paddr(int vcpu, uint64_t vaddr, uint64_t *paddr) { struct vm_guest_paging paging; int fault; if (guest_paging_info(vcpu, &paging) == -1) return (-1); /* * Always use PROT_READ. We really care if the VA is * accessible, not if the current vCPU can write. */ if (vm_gla2gpa_nofault(ctx, vcpu, &paging, vaddr, PROT_READ, paddr, &fault) == -1) return (-1); if (fault) return (0); return (1); } static void io_buffer_reset(struct io_buffer *io) { io->start = 0; io->len = 0; } /* Available room for adding data. 
*/ static size_t io_buffer_avail(struct io_buffer *io) { return (io->capacity - (io->start + io->len)); } static uint8_t * io_buffer_head(struct io_buffer *io) { return (io->data + io->start); } static uint8_t * io_buffer_tail(struct io_buffer *io) { return (io->data + io->start + io->len); } static void io_buffer_advance(struct io_buffer *io, size_t amount) { assert(amount <= io->len); io->start += amount; io->len -= amount; } static void io_buffer_consume(struct io_buffer *io, size_t amount) { io_buffer_advance(io, amount); if (io->len == 0) { io->start = 0; return; } /* * XXX: Consider making this move optional and compacting on a * future read() before realloc(). */ memmove(io->data, io_buffer_head(io), io->len); io->start = 0; } static void io_buffer_grow(struct io_buffer *io, size_t newsize) { uint8_t *new_data; size_t avail, new_cap; avail = io_buffer_avail(io); if (newsize <= avail) return; new_cap = io->capacity + (newsize - avail); new_data = realloc(io->data, new_cap); if (new_data == NULL) err(1, "Failed to grow GDB I/O buffer"); io->data = new_data; io->capacity = new_cap; } static bool response_pending(void) { if (cur_resp.start == 0 && cur_resp.len == 0) return (false); if (cur_resp.start + cur_resp.len == 1 && cur_resp.data[0] == '+') return (false); return (true); } static void close_connection(void) { /* * XXX: This triggers a warning because mevent does the close * before the EV_DELETE. */ pthread_mutex_lock(&gdb_lock); mevent_delete(write_event); mevent_delete_close(read_event); write_event = NULL; read_event = NULL; io_buffer_reset(&cur_comm); io_buffer_reset(&cur_resp); cur_fd = -1; + remove_all_sw_breakpoints(); + + /* Clear any pending events. */ + memset(vcpu_state, 0, guest_ncpus * sizeof(*vcpu_state)); + /* Resume any stopped vCPUs. */ gdb_resume_vcpus(); pthread_mutex_unlock(&gdb_lock); } static uint8_t hex_digit(uint8_t nibble) { if (nibble <= 9) return (nibble + '0'); else return (nibble + 'a' - 10); } static uint8_t parse_digit(uint8_t v) { if (v >= '0' && v <= '9') return (v - '0'); if (v >= 'a' && v <= 'f') return (v - 'a' + 10); if (v >= 'A' && v <= 'F') return (v - 'A' + 10); return (0xF); } /* Parses big-endian hexadecimal. */ static uintmax_t parse_integer(const uint8_t *p, size_t len) { uintmax_t v; v = 0; while (len > 0) { v <<= 4; v |= parse_digit(*p); p++; len--; } return (v); } static uint8_t parse_byte(const uint8_t *p) { return (parse_digit(p[0]) << 4 | parse_digit(p[1])); } static void send_pending_data(int fd) { ssize_t nwritten; if (cur_resp.len == 0) { mevent_disable(write_event); return; } nwritten = write(fd, io_buffer_head(&cur_resp), cur_resp.len); if (nwritten == -1) { warn("Write to GDB socket failed"); close_connection(); } else { io_buffer_advance(&cur_resp, nwritten); if (cur_resp.len == 0) mevent_disable(write_event); else mevent_enable(write_event); } } /* Append a single character to the output buffer. */ static void send_char(uint8_t data) { io_buffer_grow(&cur_resp, 1); *io_buffer_tail(&cur_resp) = data; cur_resp.len++; } /* Append an array of bytes to the output buffer. */ static void send_data(const uint8_t *data, size_t len) { io_buffer_grow(&cur_resp, len); memcpy(io_buffer_tail(&cur_resp), data, len); cur_resp.len += len; } static void format_byte(uint8_t v, uint8_t *buf) { buf[0] = hex_digit(v >> 4); buf[1] = hex_digit(v & 0xf); } /* * Append a single byte (formatted as two hex characters) to the * output buffer. 
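
For reference (illustrative aside): start_packet()/finish_packet() below frame every reply as '$<payload>#<checksum>', where the checksum is the low 8 bits of the sum of the payload bytes, printed as two hex digits. A self-contained sketch of that framing, independent of the io_buffer helpers in this file:

#include <stdint.h>
#include <stdio.h>

/* Frame 'payload' as a GDB remote packet: $<payload>#<checksum>. */
static void
gdb_frame(const char *payload, char *buf, size_t buflen)
{
	uint8_t sum = 0;
	size_t i;

	for (i = 0; payload[i] != '\0'; i++)
		sum += (uint8_t)payload[i];
	snprintf(buf, buflen, "$%s#%02x", payload, sum);
}

int
main(void)
{
	char pkt[16];

	gdb_frame("OK", pkt, sizeof(pkt));
	printf("%s\n", pkt);	/* prints "$OK#9a" */
	return (0);
}
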
*/ static void send_byte(uint8_t v) { uint8_t buf[2]; format_byte(v, buf); send_data(buf, sizeof(buf)); } static void start_packet(void) { send_char('$'); cur_csum = 0; } static void finish_packet(void) { send_char('#'); send_byte(cur_csum); debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); } /* * Append a single character (for the packet payload) and update the * checksum. */ static void append_char(uint8_t v) { send_char(v); cur_csum += v; } /* * Append an array of bytes (for the packet payload) and update the * checksum. */ static void append_packet_data(const uint8_t *data, size_t len) { send_data(data, len); while (len > 0) { cur_csum += *data; data++; len--; } } static void append_string(const char *str) { append_packet_data(str, strlen(str)); } static void append_byte(uint8_t v) { uint8_t buf[2]; format_byte(v, buf); append_packet_data(buf, sizeof(buf)); } static void append_unsigned_native(uintmax_t value, size_t len) { size_t i; for (i = 0; i < len; i++) { append_byte(value); value >>= 8; } } static void append_unsigned_be(uintmax_t value, size_t len) { char buf[len * 2]; size_t i; for (i = 0; i < len; i++) { format_byte(value, buf + (len - i - 1) * 2); value >>= 8; } append_packet_data(buf, sizeof(buf)); } static void append_integer(unsigned int value) { if (value == 0) append_char('0'); else - append_unsigned_be(value, fls(value) + 7 / 8); + append_unsigned_be(value, (fls(value) + 7) / 8); } static void append_asciihex(const char *str) { while (*str != '\0') { append_byte(*str); str++; } } static void send_empty_response(void) { start_packet(); finish_packet(); } static void send_error(int error) { start_packet(); append_char('E'); append_byte(error); finish_packet(); } static void send_ok(void) { start_packet(); append_string("OK"); finish_packet(); } static int parse_threadid(const uint8_t *data, size_t len) { if (len == 1 && *data == '0') return (0); if (len == 2 && memcmp(data, "-1", 2) == 0) return (-1); if (len == 0) return (-2); return (parse_integer(data, len)); } +/* + * Report the current stop event to the debugger. If the stop is due + * to an event triggered on a specific vCPU such as a breakpoint or + * stepping trap, stopped_vcpu will be set to the vCPU triggering the + * stop. If 'set_cur_vcpu' is true, then cur_vcpu will be updated to + * the reporting vCPU for vCPU events. + */ static void -report_stop(void) +report_stop(bool set_cur_vcpu) { + struct vcpu_state *vs; start_packet(); - if (stopped_vcpu == -1) + if (stopped_vcpu == -1) { append_char('S'); - else + append_byte(GDB_SIGNAL_TRAP); + } else { + vs = &vcpu_state[stopped_vcpu]; + if (set_cur_vcpu) + cur_vcpu = stopped_vcpu; append_char('T'); - append_byte(GDB_SIGNAL_TRAP); - if (stopped_vcpu != -1) { + append_byte(GDB_SIGNAL_TRAP); append_string("thread:"); append_integer(stopped_vcpu + 1); append_char(';'); + if (vs->hit_swbreak) { + debug("$vCPU %d reporting swbreak\n", stopped_vcpu); + if (swbreak_enabled) + append_string("swbreak:;"); + } else if (vs->stepped) + debug("$vCPU %d reporting step\n", stopped_vcpu); + else + debug("$vCPU %d reporting ???\n", stopped_vcpu); } - stopped_vcpu = -1; finish_packet(); + report_next_stop = false; } +/* + * If this stop is due to a vCPU event, clear that event to mark it as + * acknowledged. 
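
Concretely (illustrative aside), the stop replies built by report_stop() above look like the following payloads on the wire, with the '$...#cs' framing omitted; the signal is always GDB_SIGNAL_TRAP (5) and thread ids are vcpu + 1. The vCPU numbers below are arbitrary examples.

const char *stop_generic = "S05";                    /* trap, no specific vCPU */
const char *stop_step    = "T05thread:1;";           /* vCPU 0 completed a single step */
const char *stop_swbreak = "T05thread:2;swbreak:;";  /* vCPU 1 hit a debug-server breakpoint;
                                                        "swbreak:;" only if the feature was negotiated */
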
+ */ static void +discard_stop(void) +{ + struct vcpu_state *vs; + + if (stopped_vcpu != -1) { + vs = &vcpu_state[stopped_vcpu]; + vs->hit_swbreak = false; + vs->stepped = false; + stopped_vcpu = -1; + } + report_next_stop = true; +} + +static void gdb_finish_suspend_vcpus(void) { if (first_stop) { first_stop = false; stopped_vcpu = -1; - } else if (response_pending()) - stop_pending = true; - else { - report_stop(); + } else if (report_next_stop) { + assert(!response_pending()); + report_stop(true); send_pending_data(cur_fd); } } +/* + * vCPU threads invoke this function whenever the vCPU enters the + * debug server to pause or report an event. vCPU threads wait here + * as long as the debug server keeps them suspended. + */ static void _gdb_cpu_suspend(int vcpu, bool report_stop) { debug("$vCPU %d suspending\n", vcpu); CPU_SET(vcpu, &vcpus_waiting); if (report_stop && CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) gdb_finish_suspend_vcpus(); - while (CPU_ISSET(vcpu, &vcpus_suspended) && vcpu != stepping_vcpu) + while (CPU_ISSET(vcpu, &vcpus_suspended)) pthread_cond_wait(&idle_vcpus, &gdb_lock); CPU_CLR(vcpu, &vcpus_waiting); debug("$vCPU %d resuming\n", vcpu); } +/* + * Invoked at the start of a vCPU thread's execution to inform the + * debug server about the new thread. + */ void gdb_cpu_add(int vcpu) { debug("$vCPU %d starting\n", vcpu); pthread_mutex_lock(&gdb_lock); + assert(vcpu < guest_ncpus); CPU_SET(vcpu, &vcpus_active); + if (!TAILQ_EMPTY(&breakpoints)) { + vm_set_capability(ctx, vcpu, VM_CAP_BPT_EXIT, 1); + debug("$vCPU %d enabled breakpoint exits\n", vcpu); + } /* * If a vcpu is added while vcpus are stopped, suspend the new * vcpu so that it will pop back out with a debug exit before * executing the first instruction. */ if (!CPU_EMPTY(&vcpus_suspended)) { CPU_SET(vcpu, &vcpus_suspended); _gdb_cpu_suspend(vcpu, false); } pthread_mutex_unlock(&gdb_lock); } +/* + * Invoked by vCPU before resuming execution. This enables stepping + * if the vCPU is marked as stepping. + */ +static void +gdb_cpu_resume(int vcpu) +{ + struct vcpu_state *vs; + int error; + + vs = &vcpu_state[vcpu]; + + /* + * Any pending event should already be reported before + * resuming. + */ + assert(vs->hit_swbreak == false); + assert(vs->stepped == false); + if (vs->stepping) { + error = vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 1); + assert(error == 0); + } +} + +/* + * Handler for VM_EXITCODE_DEBUG used to suspend a vCPU when the guest + * has been suspended due to an event on different vCPU or in response + * to a guest-wide suspend such as Ctrl-C or the stop on attach. + */ void gdb_cpu_suspend(int vcpu) { pthread_mutex_lock(&gdb_lock); _gdb_cpu_suspend(vcpu, true); + gdb_cpu_resume(vcpu); pthread_mutex_unlock(&gdb_lock); } +static void +gdb_suspend_vcpus(void) +{ + + assert(pthread_mutex_isowned_np(&gdb_lock)); + debug("suspending all CPUs\n"); + vcpus_suspended = vcpus_active; + vm_suspend_cpu(ctx, -1); + if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) + gdb_finish_suspend_vcpus(); +} + +/* + * Handler for VM_EXITCODE_MTRAP reported when a vCPU single-steps via + * the VT-x-specific MTRAP exit. 
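
Illustrative aside: the per-vCPU loops in this file (gdb_suspend_vcpus() above, set_breakpoint_caps() and the qfThreadInfo reply below) all rely on the same FreeBSD cpuset idiom: copy the set, then repeatedly take the lowest set bit with CPU_FFS() (1-based) and clear it. A standalone sketch, assuming FreeBSD's <sys/cpuset.h>:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <stdio.h>

int
main(void)
{
	cpuset_t mask;
	int cpu;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	CPU_SET(3, &mask);

	while (!CPU_EMPTY(&mask)) {
		cpu = CPU_FFS(&mask) - 1;	/* lowest set bit, 1-based */
		CPU_CLR(cpu, &mask);
		printf("vCPU %d\n", cpu);	/* prints 0, then 3 */
	}
	return (0);
}
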
+ */ void gdb_cpu_mtrap(int vcpu) { + struct vcpu_state *vs; debug("$vCPU %d MTRAP\n", vcpu); pthread_mutex_lock(&gdb_lock); - if (vcpu == stepping_vcpu) { - stepping_vcpu = -1; + vs = &vcpu_state[vcpu]; + if (vs->stepping) { + vs->stepping = false; + vs->stepped = true; vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 0); - vm_suspend_cpu(ctx, vcpu); - assert(stopped_vcpu == -1); - stopped_vcpu = vcpu; - _gdb_cpu_suspend(vcpu, true); + while (vs->stepped) { + if (stopped_vcpu == -1) { + debug("$vCPU %d reporting step\n", vcpu); + stopped_vcpu = vcpu; + gdb_suspend_vcpus(); + } + _gdb_cpu_suspend(vcpu, true); + } + gdb_cpu_resume(vcpu); } pthread_mutex_unlock(&gdb_lock); } -static void -gdb_suspend_vcpus(void) +static struct breakpoint * +find_breakpoint(uint64_t gpa) { + struct breakpoint *bp; - assert(pthread_mutex_isowned_np(&gdb_lock)); - debug("suspending all CPUs\n"); - vcpus_suspended = vcpus_active; - vm_suspend_cpu(ctx, -1); - if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) - gdb_finish_suspend_vcpus(); + TAILQ_FOREACH(bp, &breakpoints, link) { + if (bp->gpa == gpa) + return (bp); + } + return (NULL); } +void +gdb_cpu_breakpoint(int vcpu, struct vm_exit *vmexit) +{ + struct breakpoint *bp; + struct vcpu_state *vs; + uint64_t gpa; + int error; + + pthread_mutex_lock(&gdb_lock); + error = guest_vaddr2paddr(vcpu, vmexit->rip, &gpa); + assert(error == 1); + bp = find_breakpoint(gpa); + if (bp != NULL) { + vs = &vcpu_state[vcpu]; + assert(vs->stepping == false); + assert(vs->stepped == false); + assert(vs->hit_swbreak == false); + vs->hit_swbreak = true; + vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, vmexit->rip); + for (;;) { + if (stopped_vcpu == -1) { + debug("$vCPU %d reporting breakpoint at rip %#lx\n", vcpu, + vmexit->rip); + stopped_vcpu = vcpu; + gdb_suspend_vcpus(); + } + _gdb_cpu_suspend(vcpu, true); + if (!vs->hit_swbreak) { + /* Breakpoint reported. */ + break; + } + bp = find_breakpoint(gpa); + if (bp == NULL) { + /* Breakpoint was removed. 
*/ + vs->hit_swbreak = false; + break; + } + } + gdb_cpu_resume(vcpu); + } else { + debug("$vCPU %d injecting breakpoint at rip %#lx\n", vcpu, + vmexit->rip); + error = vm_set_register(ctx, vcpu, + VM_REG_GUEST_ENTRY_INST_LENGTH, vmexit->u.bpt.inst_length); + assert(error == 0); + error = vm_inject_exception(ctx, vcpu, IDT_BP, 0, 0, 0); + assert(error == 0); + } + pthread_mutex_unlock(&gdb_lock); +} + static bool gdb_step_vcpu(int vcpu) { int error, val; debug("$vCPU %d step\n", vcpu); error = vm_get_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, &val); if (error < 0) return (false); - error = vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 1); + + discard_stop(); + vcpu_state[vcpu].stepping = true; vm_resume_cpu(ctx, vcpu); - stepping_vcpu = vcpu; + CPU_CLR(vcpu, &vcpus_suspended); pthread_cond_broadcast(&idle_vcpus); return (true); } static void gdb_resume_vcpus(void) { assert(pthread_mutex_isowned_np(&gdb_lock)); vm_resume_cpu(ctx, -1); debug("resuming all CPUs\n"); CPU_ZERO(&vcpus_suspended); pthread_cond_broadcast(&idle_vcpus); } static void gdb_read_regs(void) { uint64_t regvals[nitems(gdb_regset)]; int i; if (vm_get_register_set(ctx, cur_vcpu, nitems(gdb_regset), gdb_regset, regvals) == -1) { send_error(errno); return; } start_packet(); for (i = 0; i < nitems(regvals); i++) append_unsigned_native(regvals[i], gdb_regsize[i]); finish_packet(); } static void gdb_read_mem(const uint8_t *data, size_t len) { uint64_t gpa, gva, val; uint8_t *cp; size_t resid, todo, bytes; bool started; int error; /* Skip 'm' */ data += 1; len -= 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse length. */ resid = parse_integer(data, len); started = false; while (resid > 0) { error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); if (error == -1) { if (started) finish_packet(); else send_error(errno); return; } if (error == 0) { if (started) finish_packet(); else send_error(EFAULT); return; } /* Read bytes from current page. */ todo = getpagesize() - gpa % getpagesize(); if (todo > resid) todo = resid; cp = paddr_guest2host(ctx, gpa, todo); if (cp != NULL) { /* * If this page is guest RAM, read it a byte * at a time. */ if (!started) { start_packet(); started = true; } while (todo > 0) { append_byte(*cp); cp++; gpa++; gva++; resid--; todo--; } } else { /* * If this page isn't guest RAM, try to handle * it via MMIO. For MMIO requests, use * aligned reads of words when possible. */ while (todo > 0) { if (gpa & 1 || todo == 1) bytes = 1; else if (gpa & 2 || todo == 2) bytes = 2; else bytes = 4; error = read_mem(ctx, cur_vcpu, gpa, &val, bytes); if (error == 0) { if (!started) { start_packet(); started = true; } gpa += bytes; gva += bytes; resid -= bytes; todo -= bytes; while (bytes > 0) { append_byte(val); val >>= 8; bytes--; } } else { if (started) finish_packet(); else send_error(EFAULT); return; } } } assert(resid == 0 || gpa % getpagesize() == 0); } if (!started) start_packet(); finish_packet(); } static void gdb_write_mem(const uint8_t *data, size_t len) { uint64_t gpa, gva, val; uint8_t *cp; size_t resid, todo, bytes; int error; /* Skip 'M' */ data += 1; len -= 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume length. 
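
For reference (illustrative aside), the memory packets served by gdb_read_mem() and gdb_write_mem() use guest virtual addresses and big-endian hex fields: 'm<addr>,<len>' reads <len> bytes, and 'M<addr>,<len>:<data>' writes the hex-encoded <data>. A hypothetical exchange, payloads only, with a made-up kernel address:

/* Illustration only; the address and bytes are invented examples. */
const char *read_req    = "mffffffff80200000,4";     /* read 4 bytes at this GVA */
const char *read_reply  = "554889e5";                /* bytes 0x55 0x48 0x89 0xe5, lowest address first */
const char *write_req   = "Mffffffff80200000,1:cc";  /* write a single 0xcc byte */
const char *write_reply = "OK";
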
*/ cp = memchr(data, ':', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } resid = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Verify the available bytes match the length. */ if (len != resid * 2) { send_error(EINVAL); return; } while (resid > 0) { error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); if (error == -1) { send_error(errno); return; } if (error == 0) { send_error(EFAULT); return; } /* Write bytes to current page. */ todo = getpagesize() - gpa % getpagesize(); if (todo > resid) todo = resid; cp = paddr_guest2host(ctx, gpa, todo); if (cp != NULL) { /* * If this page is guest RAM, write it a byte * at a time. */ while (todo > 0) { assert(len >= 2); *cp = parse_byte(data); data += 2; len -= 2; cp++; gpa++; gva++; resid--; todo--; } } else { /* * If this page isn't guest RAM, try to handle * it via MMIO. For MMIO requests, use * aligned writes of words when possible. */ while (todo > 0) { if (gpa & 1 || todo == 1) { bytes = 1; val = parse_byte(data); } else if (gpa & 2 || todo == 2) { bytes = 2; val = be16toh(parse_integer(data, 4)); } else { bytes = 4; val = be32toh(parse_integer(data, 8)); } error = write_mem(ctx, cur_vcpu, gpa, val, bytes); if (error == 0) { gpa += bytes; gva += bytes; resid -= bytes; todo -= bytes; data += 2 * bytes; len -= 2 * bytes; } else { send_error(EFAULT); return; } } } assert(resid == 0 || gpa % getpagesize() == 0); } assert(len == 0); send_ok(); } static bool +set_breakpoint_caps(bool enable) +{ + cpuset_t mask; + int vcpu; + + mask = vcpus_active; + while (!CPU_EMPTY(&mask)) { + vcpu = CPU_FFS(&mask) - 1; + CPU_CLR(vcpu, &mask); + if (vm_set_capability(ctx, vcpu, VM_CAP_BPT_EXIT, + enable ? 1 : 0) < 0) + return (false); + debug("$vCPU %d %sabled breakpoint exits\n", vcpu, + enable ? "en" : "dis"); + } + return (true); +} + +static void +remove_all_sw_breakpoints(void) +{ + struct breakpoint *bp, *nbp; + uint8_t *cp; + + if (TAILQ_EMPTY(&breakpoints)) + return; + + TAILQ_FOREACH_SAFE(bp, &breakpoints, link, nbp) { + debug("remove breakpoint at %#lx\n", bp->gpa); + cp = paddr_guest2host(ctx, bp->gpa, 1); + *cp = bp->shadow_inst; + TAILQ_REMOVE(&breakpoints, bp, link); + free(bp); + } + TAILQ_INIT(&breakpoints); + set_breakpoint_caps(false); +} + +static void +update_sw_breakpoint(uint64_t gva, int kind, bool insert) +{ + struct breakpoint *bp; + uint64_t gpa; + uint8_t *cp; + int error; + + if (kind != 1) { + send_error(EINVAL); + return; + } + + error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + if (error == -1) { + send_error(errno); + return; + } + if (error == 0) { + send_error(EFAULT); + return; + } + + cp = paddr_guest2host(ctx, gpa, 1); + + /* Only permit breakpoints in guest RAM. */ + if (cp == NULL) { + send_error(EFAULT); + return; + } + + /* Find any existing breakpoint. */ + bp = find_breakpoint(gpa); + + /* + * Silently ignore duplicate commands since the protocol + * requires these packets to be idempotent. 
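
Illustrative aside: software breakpoints are driven by the GDB 'Z0'/'z0' packets (see parse_breakpoint() and update_sw_breakpoint()); type 0 selects a software breakpoint and the only kind accepted is 1, i.e. a one-byte INT3 on x86. Example payloads with a made-up guest address:

/* Illustration only; the address is an invented example. */
const char *bp_insert = "Z0,ffffffff80200000,1";   /* patch 0xcc into guest RAM, keep the shadow byte */
const char *bp_remove = "z0,ffffffff80200000,1";   /* restore the shadow byte */
const char *bp_reply  = "OK";                      /* or Exx on failure, empty if unsupported */
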
+ */ + if (insert) { + if (bp == NULL) { + if (TAILQ_EMPTY(&breakpoints) && + !set_breakpoint_caps(true)) { + send_empty_response(); + return; + } + bp = malloc(sizeof(*bp)); + bp->gpa = gpa; + bp->shadow_inst = *cp; + *cp = 0xcc; /* INT 3 */ + TAILQ_INSERT_TAIL(&breakpoints, bp, link); + debug("new breakpoint at %#lx\n", gpa); + } + } else { + if (bp != NULL) { + debug("remove breakpoint at %#lx\n", gpa); + *cp = bp->shadow_inst; + TAILQ_REMOVE(&breakpoints, bp, link); + free(bp); + if (TAILQ_EMPTY(&breakpoints)) + set_breakpoint_caps(false); + } + } + send_ok(); +} + +static void +parse_breakpoint(const uint8_t *data, size_t len) +{ + uint64_t gva; + uint8_t *cp; + bool insert; + int kind, type; + + insert = data[0] == 'Z'; + + /* Skip 'Z/z' */ + data += 1; + len -= 1; + + /* Parse and consume type. */ + cp = memchr(data, ',', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + type = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Parse and consume address. */ + cp = memchr(data, ',', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + gva = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Parse and consume kind. */ + cp = memchr(data, ';', len); + if (cp == data) { + send_error(EINVAL); + return; + } + if (cp != NULL) { + /* + * We do not advertise support for either the + * ConditionalBreakpoints or BreakpointCommands + * features, so we should not be getting conditions or + * commands from the remote end. + */ + send_empty_response(); + return; + } + kind = parse_integer(data, len); + data += len; + len = 0; + + switch (type) { + case 0: + update_sw_breakpoint(gva, kind, insert); + break; + default: + send_empty_response(); + break; + } +} + +static bool command_equals(const uint8_t *data, size_t len, const char *cmd) { if (strlen(cmd) > len) return (false); return (memcmp(data, cmd, strlen(cmd)) == 0); } static void check_features(const uint8_t *data, size_t len) { char *feature, *next_feature, *str, *value; bool supported; str = malloc(len + 1); memcpy(str, data, len); str[len] = '\0'; next_feature = str; while ((feature = strsep(&next_feature, ";")) != NULL) { /* * Null features shouldn't exist, but skip if they * do. */ if (strcmp(feature, "") == 0) continue; /* * Look for the value or supported / not supported * flag. */ value = strchr(feature, '='); if (value != NULL) { *value = '\0'; value++; supported = true; } else { value = feature + strlen(feature) - 1; switch (*value) { case '+': supported = true; break; case '-': supported = false; break; default: /* * This is really a protocol error, * but we just ignore malformed * features for ease of * implementation. */ continue; } value = NULL; } - /* No currently supported features. */ + if (strcmp(feature, "swbreak") == 0) + swbreak_enabled = supported; } free(str); start_packet(); /* This is an arbitrary limit. 
*/ append_string("PacketSize=4096"); + append_string(";swbreak+"); finish_packet(); } static void gdb_query(const uint8_t *data, size_t len) { /* * TODO: * - qSearch */ if (command_equals(data, len, "qAttached")) { start_packet(); append_char('1'); finish_packet(); } else if (command_equals(data, len, "qC")) { start_packet(); append_string("QC"); append_integer(cur_vcpu + 1); finish_packet(); } else if (command_equals(data, len, "qfThreadInfo")) { cpuset_t mask; bool first; int vcpu; if (CPU_EMPTY(&vcpus_active)) { send_error(EINVAL); return; } mask = vcpus_active; start_packet(); append_char('m'); first = true; while (!CPU_EMPTY(&mask)) { vcpu = CPU_FFS(&mask) - 1; CPU_CLR(vcpu, &mask); if (first) first = false; else append_char(','); append_integer(vcpu + 1); } finish_packet(); } else if (command_equals(data, len, "qsThreadInfo")) { start_packet(); append_char('l'); finish_packet(); } else if (command_equals(data, len, "qSupported")) { data += strlen("qSupported"); len -= strlen("qSupported"); check_features(data, len); } else if (command_equals(data, len, "qThreadExtraInfo")) { char buf[16]; int tid; data += strlen("qThreadExtraInfo"); len -= strlen("qThreadExtraInfo"); if (*data != ',') { send_error(EINVAL); return; } tid = parse_threadid(data + 1, len - 1); if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { send_error(EINVAL); return; } snprintf(buf, sizeof(buf), "vCPU %d", tid - 1); start_packet(); append_asciihex(buf); finish_packet(); } else send_empty_response(); } static void handle_command(const uint8_t *data, size_t len) { /* Reject packets with a sequence-id. */ if (len >= 3 && data[0] >= '0' && data[0] <= '9' && data[0] >= '0' && data[0] <= '9' && data[2] == ':') { send_empty_response(); return; } switch (*data) { case 'c': if (len != 1) { send_error(EINVAL); break; } - /* Don't send a reply until a stop occurs. */ + discard_stop(); gdb_resume_vcpus(); break; case 'D': send_ok(); /* TODO: Resume any stopped CPUs. */ break; case 'g': { gdb_read_regs(); break; } case 'H': { int tid; if (data[1] != 'g' && data[1] != 'c') { send_error(EINVAL); break; } tid = parse_threadid(data + 2, len - 2); if (tid == -2) { send_error(EINVAL); break; } if (CPU_EMPTY(&vcpus_active)) { send_error(EINVAL); break; } if (tid == -1 || tid == 0) cur_vcpu = CPU_FFS(&vcpus_active) - 1; else if (CPU_ISSET(tid - 1, &vcpus_active)) cur_vcpu = tid - 1; else { send_error(EINVAL); break; } send_ok(); break; } case 'm': gdb_read_mem(data, len); break; case 'M': gdb_write_mem(data, len); break; case 'T': { int tid; tid = parse_threadid(data + 1, len - 1); if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { send_error(EINVAL); return; } send_ok(); break; } case 'q': gdb_query(data, len); break; case 's': if (len != 1) { send_error(EINVAL); break; } /* Don't send a reply until a stop occurs. */ if (!gdb_step_vcpu(cur_vcpu)) { send_error(EOPNOTSUPP); break; } break; + case 'z': + case 'Z': + parse_breakpoint(data, len); + break; case '?': - /* XXX: Only if stopped? */ - /* For now, just report that we are always stopped. */ - start_packet(); - append_char('S'); - append_byte(GDB_SIGNAL_TRAP); - finish_packet(); + report_stop(false); break; case 'G': /* TODO */ case 'v': /* Handle 'vCont' */ /* 'vCtrlC' */ case 'p': /* TODO */ case 'P': /* TODO */ case 'Q': /* TODO */ case 't': /* TODO */ case 'X': /* TODO */ - case 'z': /* TODO */ - case 'Z': /* TODO */ default: send_empty_response(); } } /* Check for a valid packet in the command buffer. 
*/ static void check_command(int fd) { uint8_t *head, *hash, *p, sum; size_t avail, plen; for (;;) { avail = cur_comm.len; if (avail == 0) return; head = io_buffer_head(&cur_comm); switch (*head) { case 0x03: debug("<- Ctrl-C\n"); io_buffer_consume(&cur_comm, 1); gdb_suspend_vcpus(); break; case '+': /* ACK of previous response. */ debug("<- +\n"); if (response_pending()) io_buffer_reset(&cur_resp); io_buffer_consume(&cur_comm, 1); - if (stop_pending) { - stop_pending = false; - report_stop(); + if (stopped_vcpu != -1 && report_next_stop) { + report_stop(true); send_pending_data(fd); } break; case '-': /* NACK of previous response. */ debug("<- -\n"); if (response_pending()) { cur_resp.len += cur_resp.start; cur_resp.start = 0; if (cur_resp.data[0] == '+') io_buffer_advance(&cur_resp, 1); debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); } io_buffer_consume(&cur_comm, 1); send_pending_data(fd); break; case '$': /* Packet. */ if (response_pending()) { warnx("New GDB command while response in " "progress"); io_buffer_reset(&cur_resp); } /* Is packet complete? */ hash = memchr(head, '#', avail); if (hash == NULL) return; plen = (hash - head + 1) + 2; if (avail < plen) return; debug("<- %.*s\n", (int)plen, head); /* Verify checksum. */ for (sum = 0, p = head + 1; p < hash; p++) sum += *p; if (sum != parse_byte(hash + 1)) { io_buffer_consume(&cur_comm, plen); debug("-> -\n"); send_char('-'); send_pending_data(fd); break; } send_char('+'); handle_command(head + 1, hash - (head + 1)); io_buffer_consume(&cur_comm, plen); if (!response_pending()) debug("-> +\n"); send_pending_data(fd); break; default: /* XXX: Possibly drop connection instead. */ debug("-> %02x\n", *head); io_buffer_consume(&cur_comm, 1); break; } } } static void gdb_readable(int fd, enum ev_type event, void *arg) { ssize_t nread; int pending; if (ioctl(fd, FIONREAD, &pending) == -1) { warn("FIONREAD on GDB socket"); return; } /* * 'pending' might be zero due to EOF. We need to call read * with a non-zero length to detect EOF. */ if (pending == 0) pending = 1; /* Ensure there is room in the command buffer. */ io_buffer_grow(&cur_comm, pending); assert(io_buffer_avail(&cur_comm) >= pending); nread = read(fd, io_buffer_tail(&cur_comm), io_buffer_avail(&cur_comm)); if (nread == 0) { close_connection(); } else if (nread == -1) { if (errno == EAGAIN) return; warn("Read from GDB socket"); close_connection(); } else { cur_comm.len += nread; pthread_mutex_lock(&gdb_lock); check_command(fd); pthread_mutex_unlock(&gdb_lock); } } static void gdb_writable(int fd, enum ev_type event, void *arg) { send_pending_data(fd); } static void new_connection(int fd, enum ev_type event, void *arg) { int optval, s; s = accept4(fd, NULL, NULL, SOCK_NONBLOCK); if (s == -1) { if (arg != NULL) err(1, "Failed accepting initial GDB connection"); /* Silently ignore errors post-startup. 
*/ return; } optval = 1; if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &optval, sizeof(optval)) == -1) { warn("Failed to disable SIGPIPE for GDB connection"); close(s); return; } pthread_mutex_lock(&gdb_lock); if (cur_fd != -1) { close(s); warnx("Ignoring additional GDB connection."); } read_event = mevent_add(s, EVF_READ, gdb_readable, NULL); if (read_event == NULL) { if (arg != NULL) err(1, "Failed to setup initial GDB connection"); pthread_mutex_unlock(&gdb_lock); return; } write_event = mevent_add(s, EVF_WRITE, gdb_writable, NULL); if (write_event == NULL) { if (arg != NULL) err(1, "Failed to setup initial GDB connection"); mevent_delete_close(read_event); read_event = NULL; } cur_fd = s; cur_vcpu = 0; - stepping_vcpu = -1; stopped_vcpu = -1; - stop_pending = false; /* Break on attach. */ first_stop = true; + report_next_stop = false; gdb_suspend_vcpus(); pthread_mutex_unlock(&gdb_lock); } #ifndef WITHOUT_CAPSICUM void limit_gdb_socket(int s) { cap_rights_t rights; unsigned long ioctls[] = { FIONREAD }; cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE, CAP_SETSOCKOPT, CAP_IOCTL); if (caph_rights_limit(s, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(s, ioctls, nitems(ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); } #endif void init_gdb(struct vmctx *_ctx, int sport, bool wait) { struct sockaddr_in sin; int error, flags, s; debug("==> starting on %d, %swaiting\n", sport, wait ? "" : "not "); error = pthread_mutex_init(&gdb_lock, NULL); if (error != 0) errc(1, error, "gdb mutex init"); error = pthread_cond_init(&idle_vcpus, NULL); if (error != 0) errc(1, error, "gdb cv init"); ctx = _ctx; s = socket(PF_INET, SOCK_STREAM, 0); if (s < 0) err(1, "gdb socket create"); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_ANY); sin.sin_port = htons(sport); if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) err(1, "gdb socket bind"); if (listen(s, 1) < 0) err(1, "gdb socket listen"); + stopped_vcpu = -1; + TAILQ_INIT(&breakpoints); + vcpu_state = calloc(guest_ncpus, sizeof(*vcpu_state)); if (wait) { /* * Set vcpu 0 in vcpus_suspended. This will trigger the * logic in gdb_cpu_add() to suspend the first vcpu before * it starts execution. The vcpu will remain suspended * until a debugger connects. */ - stepping_vcpu = -1; - stopped_vcpu = -1; CPU_SET(0, &vcpus_suspended); + stopped_vcpu = 0; } flags = fcntl(s, F_GETFL); if (fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1) err(1, "Failed to mark gdb socket non-blocking"); #ifndef WITHOUT_CAPSICUM limit_gdb_socket(s); #endif mevent_add(s, EVF_READ, new_connection, NULL); } Index: head/usr.sbin/bhyve/gdb.h =================================================================== --- head/usr.sbin/bhyve/gdb.h (revision 355723) +++ head/usr.sbin/bhyve/gdb.h (revision 355724) @@ -1,38 +1,39 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 John H. Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __GDB_H__ #define __GDB_H__ void gdb_cpu_add(int vcpu); +void gdb_cpu_breakpoint(int vcpu, struct vm_exit *vmexit); void gdb_cpu_mtrap(int vcpu); void gdb_cpu_suspend(int vcpu); void init_gdb(struct vmctx *ctx, int sport, bool wait); #endif /* !__GDB_H__ */
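
Illustrative aside: the new gdb_cpu_breakpoint() entry point is intended to be called from bhyve's VM-exit dispatch when the kernel reports a breakpoint exit; the corresponding bhyverun.c hunk is not shown in this excerpt. A sketch of the expected wiring, assuming a VM_EXITCODE_BPT exit code and the handler conventions used by vm_loop() (the handler name here is hypothetical):

static int
vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
	(void)ctx;
	/*
	 * Hand the breakpoint exit to the debug server; it holds the
	 * vCPU until the debugger resumes it or the trap is re-injected
	 * into the guest.
	 */
	gdb_cpu_breakpoint(*pvcpu, vme);
	return (VMEXIT_CONTINUE);
}

/* handler[VM_EXITCODE_BPT] = vmexit_breakpoint; */
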