Index: head/sys/amd64/include/vmm.h =================================================================== --- head/sys/amd64/include/vmm.h (revision 362599) +++ head/sys/amd64/include/vmm.h (revision 362600) @@ -1,770 +1,773 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include #include struct vm_snapshot_meta; #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. * A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. */ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MIN_NAMELEN 6 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vm; struct vm_exception; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; enum snapshot_req; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); typedef int (*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta); typedef int (*vmi_snapshot_vmcx_t)(void *vmi, struct vm_snapshot_meta *meta, int vcpu); typedef int (*vmi_restore_tsc_t)(void *vmi, int vcpuid, uint64_t now); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; /* checkpoint operations */ vmi_snapshot_t vmsnapshot; vmi_snapshot_vmcx_t vmcx_snapshot; vmi_restore_tsc_t vm_restore_tsc; }; extern struct vmm_ops vmm_ops_intel; extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); /* * APIs that modify the guest memory map require all vcpus to be frozen. */ int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); /* * APIs that inspect the guest memory map require only a *single* vcpu to * be frozen. This acts like a read lock on the guest memory map since any * modification requires *all* vcpus to be frozen. */ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object **objptr); vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); int vm_inject_extint(struct vm *vm, int vcpu); int vm_extint_pending(struct vm *vm, int vcpuid); void vm_extint_clear(struct vm *vm, int vcpuid); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); int vm_suspend_cpu(struct vm *vm, int vcpu); int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); int vm_restore_time(struct vm *vm); #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * If the rendezvous is being initiated from a vcpu context then the * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); int vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vm *vm, int vcpuid); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. */ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vm *vm, int vcpu) { if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) return (1); else if (curthread->td_owepreempt) return (1); else return (0); } #endif void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. */ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); /* * Function used to keep track of the guest's TSC offset. The * offset is used by the virutalization extensions to provide a consistent * value for the Time Stamp Counter to the guest. * * Return value is 0 on success and non-zero on failure. */ int vm_set_tsc_offset(struct vm *vm, int vcpu_id, uint64_t offset); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vm *vm, int vcpuid); #endif /* KERNEL */ #define VM_MAXCPU 16 /* maximum virtual cpus */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. MOV) */ uint16_t op_flags; }; _Static_assert(sizeof(struct vie_op) == 4, "ABI"); _Static_assert(_Alignof(struct vie_op) == 2, "ABI"); #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ + +/* The following fields are all zeroed upon restart. */ +#define vie_startzero num_processed uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ vex_present:1, /* VEX prefixed */ vex_l:1, /* L bit */ index:4, /* SIB byte */ base:4; /* SIB byte */ uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; uint8_t vex_reg:4, /* vvvv: first source register specifier */ vex_pp:2, /* pp */ _sparebits:2; uint8_t _sparebytes[2]; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ uint8_t _sparebyte; struct vie_op op; /* opcode description */ }; _Static_assert(sizeof(struct vie) == 64, "ABI"); _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI"); _Static_assert(__offsetof(struct vie, scale) == 24, "ABI"); _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI"); enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. */ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { int inst_length; } bpt; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); } static __inline void vm_inject_gp(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); } static __inline void vm_inject_ac(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); } void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); int vm_restart_instruction(void *vm, int vcpuid); #endif /* _VMM_H_ */ Index: head/sys/amd64/include/vmm_instruction_emul.h =================================================================== --- head/sys/amd64/include/vmm_instruction_emul.h (revision 362599) +++ head/sys/amd64/include/vmm_instruction_emul.h (revision 362600) @@ -1,134 +1,135 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_INSTRUCTION_EMUL_H_ #define _VMM_INSTRUCTION_EMUL_H_ #include /* * Callback functions to read and write memory regions. */ typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, uint64_t *rval, int rsize, void *arg); typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, uint64_t wval, int wsize, void *arg); /* * Emulate the decoded 'vie' instruction. * * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the * callback functions. * * 'void *vm' should be 'struct vm *' when called from kernel context and * 'struct vmctx *' when called from user context. * s */ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t mrr, mem_region_write_t mrw, void *mrarg); int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size); /* * Returns 1 if an alignment check exception should be injected and 0 otherwise. */ int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, uint64_t rflags, uint64_t gla); /* Returns 1 if the 'gla' is not canonical and 0 otherwise. */ int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); uint64_t vie_size2mask(int size); int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, uint64_t *gla); #ifdef _KERNEL /* * APIs to fetch and decode the instruction from nested page fault handler. * * 'vie' must be initialized before calling 'vmm_fetch_instruction()' */ int vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *guest_paging, uint64_t rip, int inst_length, struct vie *vie, int *is_fault); /* * Translate the guest linear address 'gla' to a guest physical address. * * retval is_fault Interpretation * 0 0 'gpa' contains result of the translation * 0 1 An exception was injected into the guest * EFAULT N/A An unrecoverable hypervisor error occurred */ int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *is_fault); /* * Like vm_gla2gpa, but no exceptions are injected into the guest and * PTEs are not changed. */ int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *is_fault); #endif /* _KERNEL */ +void vie_restart(struct vie *vie); void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); /* * Decode the instruction fetched into 'vie' so it can be emulated. * * 'gla' is the guest linear address provided by the hardware assist * that caused the nested page table fault. It is used to verify that * the software instruction decoding is in agreement with the hardware. * * Some hardware assists do not provide the 'gla' to the hypervisor. * To skip the 'gla' verification for this or any other reason pass * in VIE_INVALID_GLA instead. */ #ifdef _KERNEL #define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); #else /* !_KERNEL */ /* * Permit instruction decoding logic to be compiled outside of the kernel for * rapid iteration and validation. No GLA validation is performed, obviously. */ int vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); #endif /* _KERNEL */ #endif /* _VMM_INSTRUCTION_EMUL_H_ */ Index: head/sys/amd64/vmm/vmm_instruction_emul.c =================================================================== --- head/sys/amd64/vmm/vmm_instruction_emul.c (revision 362599) +++ head/sys/amd64/vmm/vmm_instruction_emul.c (revision 362600) @@ -1,2922 +1,2938 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Sandvine, Inc. * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include #include #include #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #include #include +#include #include +#include #include #include #define KASSERT(exp,msg) assert((exp)) #define panic(...) errx(4, __VA_ARGS__) #endif /* _KERNEL */ #include #include #include /* struct vie_op.op_type */ enum { VIE_OP_TYPE_NONE = 0, VIE_OP_TYPE_MOV, VIE_OP_TYPE_MOVSX, VIE_OP_TYPE_MOVZX, VIE_OP_TYPE_AND, VIE_OP_TYPE_OR, VIE_OP_TYPE_SUB, VIE_OP_TYPE_TWO_BYTE, VIE_OP_TYPE_PUSH, VIE_OP_TYPE_CMP, VIE_OP_TYPE_POP, VIE_OP_TYPE_MOVS, VIE_OP_TYPE_GROUP1, VIE_OP_TYPE_STOS, VIE_OP_TYPE_BITTEST, VIE_OP_TYPE_TWOB_GRP15, VIE_OP_TYPE_ADD, VIE_OP_TYPE_TEST, VIE_OP_TYPE_BEXTR, VIE_OP_TYPE_LAST }; /* struct vie_op.op_flags */ #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ #define VIE_OP_F_NO_MODRM (1 << 3) #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) static const struct vie_op three_byte_opcodes_0f38[256] = { [0xF7] = { .op_byte = 0xF7, .op_type = VIE_OP_TYPE_BEXTR, }, }; static const struct vie_op two_byte_opcodes[256] = { [0xAE] = { .op_byte = 0xAE, .op_type = VIE_OP_TYPE_TWOB_GRP15, }, [0xB6] = { .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, }, [0xB7] = { .op_byte = 0xB7, .op_type = VIE_OP_TYPE_MOVZX, }, [0xBA] = { .op_byte = 0xBA, .op_type = VIE_OP_TYPE_BITTEST, .op_flags = VIE_OP_F_IMM8, }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, }, }; static const struct vie_op one_byte_opcodes[256] = { [0x03] = { .op_byte = 0x03, .op_type = VIE_OP_TYPE_ADD, }, [0x0F] = { .op_byte = 0x0F, .op_type = VIE_OP_TYPE_TWO_BYTE }, [0x0B] = { .op_byte = 0x0B, .op_type = VIE_OP_TYPE_OR, }, [0x2B] = { .op_byte = 0x2B, .op_type = VIE_OP_TYPE_SUB, }, [0x39] = { .op_byte = 0x39, .op_type = VIE_OP_TYPE_CMP, }, [0x3B] = { .op_byte = 0x3B, .op_type = VIE_OP_TYPE_CMP, }, [0x88] = { .op_byte = 0x88, .op_type = VIE_OP_TYPE_MOV, }, [0x89] = { .op_byte = 0x89, .op_type = VIE_OP_TYPE_MOV, }, [0x8A] = { .op_byte = 0x8A, .op_type = VIE_OP_TYPE_MOV, }, [0x8B] = { .op_byte = 0x8B, .op_type = VIE_OP_TYPE_MOV, }, [0xA1] = { .op_byte = 0xA1, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xA3] = { .op_byte = 0xA3, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xA4] = { .op_byte = 0xA4, .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xA5] = { .op_byte = 0xA5, .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xAA] = { .op_byte = 0xAA, .op_type = VIE_OP_TYPE_STOS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xAB] = { .op_byte = 0xAB, .op_type = VIE_OP_TYPE_STOS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xC6] = { /* XXX Group 11 extended opcode - not just MOV */ .op_byte = 0xC6, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM8, }, [0xC7] = { .op_byte = 0xC7, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM, }, [0x23] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, }, [0x80] = { /* Group 1 extended opcode */ .op_byte = 0x80, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, }, [0x81] = { /* Group 1 extended opcode */ .op_byte = 0x81, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM, }, [0x83] = { /* Group 1 extended opcode */ .op_byte = 0x83, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, }, [0x8F] = { /* XXX Group 1A extended opcode - not just POP */ .op_byte = 0x8F, .op_type = VIE_OP_TYPE_POP, }, [0xF7] = { /* XXX Group 3 extended opcode - not just TEST */ .op_byte = 0xF7, .op_type = VIE_OP_TYPE_TEST, .op_flags = VIE_OP_F_IMM, }, [0xFF] = { /* XXX Group 5 extended opcode - not just PUSH */ .op_byte = 0xFF, .op_type = VIE_OP_TYPE_PUSH, } }; /* struct vie.mod */ #define VIE_MOD_INDIRECT 0 #define VIE_MOD_INDIRECT_DISP8 1 #define VIE_MOD_INDIRECT_DISP32 2 #define VIE_MOD_DIRECT 3 /* struct vie.rm */ #define VIE_RM_SIB 4 #define VIE_RM_DISP32 5 #define GB (1024 * 1024 * 1024) static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RBX, VM_REG_GUEST_RSP, VM_REG_GUEST_RBP, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15 }; static uint64_t size2mask[] = { [1] = 0xff, [2] = 0xffff, [4] = 0xffffffff, [8] = 0xffffffffffffffff, }; static int vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) { int error; error = vm_get_register(vm, vcpuid, reg, rval); return (error); } static void vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) { *lhbr = 0; *reg = gpr_map[vie->reg]; /* * 64-bit mode imposes limitations on accessing legacy high byte * registers (lhbr). * * The legacy high-byte registers cannot be addressed if the REX * prefix is present. In this case the values 4, 5, 6 and 7 of the * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. * * If the REX prefix is not present then the values 4, 5, 6 and 7 * of the 'ModRM:reg' field address the legacy high-byte registers, * %ah, %ch, %dh and %bh respectively. */ if (!vie->rex_present) { if (vie->reg & 0x4) { *lhbr = 1; *reg = gpr_map[vie->reg & 0x3]; } } } static int vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) { uint64_t val; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); error = vm_get_register(vm, vcpuid, reg, &val); /* * To obtain the value of a legacy high byte register shift the * base register right by 8 bits (%ah = %rax >> 8). */ if (lhbr) *rval = val >> 8; else *rval = val; return (error); } static int vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) { uint64_t origval, val, mask; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); error = vm_get_register(vm, vcpuid, reg, &origval); if (error == 0) { val = byte; mask = 0xff; if (lhbr) { /* * Shift left by 8 to store 'byte' in a legacy high * byte register. */ val <<= 8; mask <<= 8; } val |= origval & ~mask; error = vm_set_register(vm, vcpuid, reg, val); } return (error); } int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size) { int error; uint64_t origval; switch (size) { case 1: case 2: error = vie_read_register(vm, vcpuid, reg, &origval); if (error) return (error); val &= size2mask[size]; val |= origval & ~size2mask[size]; break; case 4: val &= 0xffffffffUL; break; case 8: break; default: return (EINVAL); } error = vm_set_register(vm, vcpuid, reg, val); return (error); } #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) /* * Return the status flags that would result from doing (x - y). */ #define GETCC(sz) \ static u_long \ getcc##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("sub %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETCC(8); GETCC(16); GETCC(32); GETCC(64); static u_long getcc(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getcc: invalid operand size %d", opsize)); if (opsize == 1) return (getcc8(x, y)); else if (opsize == 2) return (getcc16(x, y)); else if (opsize == 4) return (getcc32(x, y)); else return (getcc64(x, y)); } /* * Macro creation of functions getaddflags{8,16,32,64} */ #define GETADDFLAGS(sz) \ static u_long \ getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("add %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETADDFLAGS(8); GETADDFLAGS(16); GETADDFLAGS(32); GETADDFLAGS(64); static u_long getaddflags(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getaddflags: invalid operand size %d", opsize)); if (opsize == 1) return (getaddflags8(x, y)); else if (opsize == 2) return (getaddflags16(x, y)); else if (opsize == 4) return (getaddflags32(x, y)); else return (getaddflags64(x, y)); } /* * Return the status flags that would result from doing (x & y). */ #define GETANDFLAGS(sz) \ static u_long \ getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("and %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETANDFLAGS(8); GETANDFLAGS(16); GETANDFLAGS(32); GETANDFLAGS(64); static u_long getandflags(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getandflags: invalid operand size %d", opsize)); if (opsize == 1) return (getandflags8(x, y)); else if (opsize == 2) return (getandflags16(x, y)); else if (opsize == 4) return (getandflags32(x, y)); else return (getandflags64(x, y)); } static int emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint8_t byte; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x88: /* * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) * 88/r: mov r/m8, r8 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) */ size = 1; /* override for byte operation */ error = vie_read_bytereg(vm, vcpuid, vie, &byte); if (error == 0) error = memwrite(vm, vcpuid, gpa, byte, size, arg); break; case 0x89: /* * MOV from reg (ModRM:reg) to mem (ModRM:r/m) * 89/r: mov r/m16, r16 * 89/r: mov r/m32, r32 * REX.W + 89/r mov r/m64, r64 */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val); if (error == 0) { val &= size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); } break; case 0x8A: /* * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) * 8A/r: mov r8, r/m8 * REX + 8A/r: mov r8, r/m8 */ size = 1; /* override for byte operation */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) error = vie_write_bytereg(vm, vcpuid, vie, val); break; case 0x8B: /* * MOV from mem (ModRM:r/m) to reg (ModRM:reg) * 8B/r: mov r16, r/m16 * 8B/r: mov r32, r/m32 * REX.W 8B/r: mov r64, r/m64 */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) { reg = gpr_map[vie->reg]; error = vie_update_register(vm, vcpuid, reg, val, size); } break; case 0xA1: /* * MOV from seg:moffset to AX/EAX/RAX * A1: mov AX, moffs16 * A1: mov EAX, moffs32 * REX.W + A1: mov RAX, moffs64 */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) { reg = VM_REG_GUEST_RAX; error = vie_update_register(vm, vcpuid, reg, val, size); } break; case 0xA3: /* * MOV from AX/EAX/RAX to seg:moffset * A3: mov moffs16, AX * A3: mov moffs32, EAX * REX.W + A3: mov moffs64, RAX */ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); if (error == 0) { val &= size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); } break; case 0xC6: /* * MOV from imm8 to mem (ModRM:r/m) * C6/0 mov r/m8, imm8 * REX + C6/0 mov r/m8, imm8 */ size = 1; /* override for byte operation */ error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); break; case 0xC7: /* * MOV from imm16/imm32 to mem (ModRM:r/m) * C7/0 mov r/m16, imm16 * C7/0 mov r/m32, imm32 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) */ val = vie->immediate & size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); break; default: break; } return (error); } static int emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0xB6: /* * MOV and zero extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F B6/r movzx r16, r/m8 * 0F B6/r movzx r32, r/m8 * REX.W + 0F B6/r movzx r64, r/m8 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* zero-extend byte */ val = (uint8_t)val; /* write the result */ error = vie_update_register(vm, vcpuid, reg, val, size); break; case 0xB7: /* * MOV and zero extend word from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F B7/r movzx r32, r/m16 * REX.W + 0F B7/r movzx r64, r/m16 */ error = memread(vm, vcpuid, gpa, &val, 2, arg); if (error) return (error); reg = gpr_map[vie->reg]; /* zero-extend word */ val = (uint16_t)val; error = vie_update_register(vm, vcpuid, reg, val, size); break; case 0xBE: /* * MOV and sign extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F BE/r movsx r16, r/m8 * 0F BE/r movsx r32, r/m8 * REX.W + 0F BE/r movsx r64, r/m8 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* sign extend byte */ val = (int8_t)val; /* write the result */ error = vie_update_register(vm, vcpuid, reg, val, size); break; default: break; } return (error); } /* * Helper function to calculate and validate a linear address. */ static int get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, int opsize, int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault) { struct seg_desc desc; uint64_t cr0, val, rflags; int error; error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = vm_get_seg_desc(vm, vcpuid, seg, &desc); KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", __func__, error, seg)); error = vie_read_register(vm, vcpuid, gpr, &val); KASSERT(error == 0, ("%s: error %d getting register %d", __func__, error, gpr)); if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, addrsize, prot, gla)) { if (seg == VM_REG_GUEST_SS) vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); goto guest_fault; } if (vie_canonical_check(paging->cpu_mode, *gla)) { if (seg == VM_REG_GUEST_SS) vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); goto guest_fault; } if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { vm_inject_ac(vm, vcpuid, 0); goto guest_fault; } *fault = 0; return (0); guest_fault: *fault = 1; return (0); } static int emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { #ifdef _KERNEL struct vm_copyinfo copyinfo[2]; #else struct iovec copyinfo[2]; #endif uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; uint64_t rcx, rdi, rsi, rflags; int error, fault, opsize, seg, repeat; opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; val = 0; error = 0; /* * XXX although the MOVS instruction is only supposed to be used with * the "rep" prefix some guests like FreeBSD will use "repnz" instead. * * Empirically the "repnz" prefix has identical behavior to "rep" * and the zero flag does not make a difference. */ repeat = vie->repz_present | vie->repnz_present; if (repeat) { error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); /* * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. */ if ((rcx & vie_size2mask(vie->addrsize)) == 0) { error = 0; goto done; } } /* * Source Destination Comments * -------------------------------------------- * (1) memory memory n/a * (2) memory mmio emulated * (3) mmio memory emulated * (4) mmio mmio emulated * * At this point we don't have sufficient information to distinguish * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this * out because it will succeed only when operating on regular memory. * * XXX the emulation doesn't properly handle the case where 'gpa' * is straddling the boundary between the normal memory and MMIO. */ seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, copyinfo, nitems(copyinfo), &fault); if (error == 0) { if (fault) goto done; /* Resume guest to handle fault */ /* * case (2): read from system memory and write to mmio. */ vm_copyin(vm, vcpuid, copyinfo, &val, opsize); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); error = memwrite(vm, vcpuid, gpa, val, opsize, arg); if (error) goto done; } else { /* * 'vm_copy_setup()' is expected to fail for cases (3) and (4) * if 'srcaddr' is in the mmio space. */ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, &fault); if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, PROT_WRITE, copyinfo, nitems(copyinfo), &fault); if (error == 0) { if (fault) goto done; /* Resume guest to handle fault */ /* * case (3): read from MMIO and write to system memory. * * A MMIO read can have side-effects so we * commit to it only after vm_copy_setup() is * successful. If a page-fault needs to be * injected into the guest then it will happen * before the MMIO read is attempted. */ error = memread(vm, vcpuid, gpa, &val, opsize, arg); if (error) goto done; vm_copyout(vm, vcpuid, &val, copyinfo, opsize); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); } else { /* * Case (4): read from and write to mmio. * * Commit to the MMIO read/write (with potential * side-effects) only after we are sure that the * instruction is not going to be restarted due * to address translation faults. */ error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, PROT_READ, &srcgpa, &fault); if (error || fault) goto done; error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, PROT_WRITE, &dstgpa, &fault); if (error || fault) goto done; error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); if (error) goto done; error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); if (error) goto done; } } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); if (rflags & PSL_D) { rsi -= opsize; rdi -= opsize; } else { rsi += opsize; rdi += opsize; } error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); if (repeat) { rcx = rcx - 1; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, rcx, vie->addrsize); KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); /* * Repeat the instruction if the count register is not zero. */ if ((rcx & vie_size2mask(vie->addrsize)) != 0) vm_restart_instruction(vm, vcpuid); } done: KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", __func__, error)); return (error); } static int emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, opsize, repeat; uint64_t val; uint64_t rcx, rdi, rflags; opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; repeat = vie->repz_present | vie->repnz_present; if (repeat) { error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); /* * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. */ if ((rcx & vie_size2mask(vie->addrsize)) == 0) return (0); } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); KASSERT(!error, ("%s: error %d getting rax", __func__, error)); error = memwrite(vm, vcpuid, gpa, val, opsize, arg); if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); if (rflags & PSL_D) rdi -= opsize; else rdi += opsize; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); if (repeat) { rcx = rcx - 1; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, rcx, vie->addrsize); KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); /* * Repeat the instruction if the count register is not zero. */ if ((rcx & vie_size2mask(vie->addrsize)) != 0) vm_restart_instruction(vm, vcpuid); } return (0); } static int emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x23: /* * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the * result in reg. * * 23/r and r16, r/m16 * 23/r and r32, r/m32 * REX.W + 23/r and r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ result = val1 & val2; error = vie_update_register(vm, vcpuid, reg, result, size); break; case 0x81: case 0x83: /* * AND mem (ModRM:r/m) with immediate and store the * result in mem. * * 81 /4 and r/m16, imm16 * 81 /4 and r/m32, imm32 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 * * 83 /4 and r/m16, imm8 sign-extended to 16 * 83 /4 and r/m32, imm8 sign-extended to 32 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) break; /* * perform the operation with the pre-fetched immediate * operand and write the result */ result = val1 & vie->immediate; error = memwrite(vm, vcpuid, gpa, result, size, arg); break; default: break; } if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. * * The updated status flags are obtained by subtracting 0 from 'result'. */ rflags2 = getcc(size, result, 0); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x0B: /* * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the * result in reg. * * 0b/r or r16, r/m16 * 0b/r or r32, r/m32 * REX.W + 0b/r or r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ result = val1 | val2; error = vie_update_register(vm, vcpuid, reg, result, size); break; case 0x81: case 0x83: /* * OR mem (ModRM:r/m) with immediate and store the * result in mem. * * 81 /1 or r/m16, imm16 * 81 /1 or r/m32, imm32 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 * * 83 /1 or r/m16, imm8 sign-extended to 16 * 83 /1 or r/m32, imm8 sign-extended to 32 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) break; /* * perform the operation with the pre-fetched immediate * operand and write the result */ result = val1 | vie->immediate; error = memwrite(vm, vcpuid, gpa, result, size, arg); break; default: break; } if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. * * The updated status flags are obtained by subtracting 0 from 'result'. */ rflags2 = getcc(size, result, 0); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; uint64_t regop, memop, op1, op2, rflags, rflags2; enum vm_reg_name reg; size = vie->opsize; switch (vie->op.op_byte) { case 0x39: case 0x3B: /* * 39/r CMP r/m16, r16 * 39/r CMP r/m32, r32 * REX.W 39/r CMP r/m64, r64 * * 3B/r CMP r16, r/m16 * 3B/r CMP r32, r/m32 * REX.W + 3B/r CMP r64, r/m64 * * Compare the first operand with the second operand and * set status flags in EFLAGS register. The comparison is * performed by subtracting the second operand from the first * operand and then setting the status flags. */ /* Get the register operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, ®op); if (error) return (error); /* Get the memory operand */ error = memread(vm, vcpuid, gpa, &memop, size, arg); if (error) return (error); if (vie->op.op_byte == 0x3B) { op1 = regop; op2 = memop; } else { op1 = memop; op2 = regop; } rflags2 = getcc(size, op1, op2); break; case 0x80: case 0x81: case 0x83: /* * 80 /7 cmp r/m8, imm8 * REX + 80 /7 cmp r/m8, imm8 * * 81 /7 cmp r/m16, imm16 * 81 /7 cmp r/m32, imm32 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 * * 83 /7 cmp r/m16, imm8 sign-extended to 16 * 83 /7 cmp r/m32, imm8 sign-extended to 32 * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 * * Compare mem (ModRM:r/m) with immediate and set * status flags according to the results. The * comparison is performed by subtracting the * immediate from the first operand and then setting * the status flags. * */ if (vie->op.op_byte == 0x80) size = 1; /* get the first operand */ error = memread(vm, vcpuid, gpa, &op1, size, arg); if (error) return (error); rflags2 = getcc(size, op1, vie->immediate); break; default: return (EINVAL); } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; uint64_t op1, rflags, rflags2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0xF7: /* * F7 /0 test r/m16, imm16 * F7 /0 test r/m32, imm32 * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64 * * Test mem (ModRM:r/m) with immediate and set status * flags according to the results. The comparison is * performed by anding the immediate from the first * operand and then setting the status flags. */ if ((vie->reg & 7) != 0) return (EINVAL); error = memread(vm, vcpuid, gpa, &op1, size, arg); if (error) return (error); rflags2 = getandflags(size, op1, vie->immediate); break; default: return (EINVAL); } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. */ rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_bextr(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { uint64_t src1, src2, dst, rflags; unsigned start, len; int error, size; size = vie->opsize; error = EINVAL; /* * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b * * Destination operand is ModRM:reg. Source operands are ModRM:r/m and * Vex.vvvv. * * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). */ if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) size = 4; /* * Extracts contiguous bits from the first /source/ operand (second * operand) using an index and length specified in the second /source/ * operand (third operand). */ error = memread(vm, vcpuid, gpa, &src1, size, arg); if (error) return (error); error = vie_read_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2); if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); start = (src2 & 0xff); len = (src2 & 0xff00) >> 8; /* If no bits are extracted, the destination register is cleared. */ dst = 0; /* If START exceeds the operand size, no bits are extracted. */ if (start > size * 8) goto done; /* Length is bounded by both the destination size and start offset. */ if (start + len > size * 8) len = (size * 8) - start; if (len == 0) goto done; if (start > 0) src1 = (src1 >> start); if (len < 64) src1 = src1 & ((1ull << len) - 1); dst = src1; done: error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size); if (error) return (error); /* * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. */ rflags &= ~RFLAGS_STATUS_BITS; if (dst == 0) rflags |= PSL_Z; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; uint64_t nval, rflags, rflags2, val1, val2; enum vm_reg_name reg; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x03: /* * ADD r/m to r and store the result in r * * 03/r ADD r16, r/m16 * 03/r ADD r32, r/m32 * REX.W + 03/r ADD r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ nval = val1 + val2; error = vie_update_register(vm, vcpuid, reg, nval, size); break; default: break; } if (!error) { rflags2 = getaddflags(size, val1, val2); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); } return (error); } static int emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; uint64_t nval, rflags, rflags2, val1, val2; enum vm_reg_name reg; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x2B: /* * SUB r/m from r and store the result in r * * 2B/r SUB r16, r/m16 * 2B/r SUB r32, r/m32 * REX.W + 2B/r SUB r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ nval = val1 - val2; error = vie_update_register(vm, vcpuid, reg, nval, size); break; default: break; } if (!error) { rflags2 = getcc(size, val1, val2); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); } return (error); } static int emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { #ifdef _KERNEL struct vm_copyinfo copyinfo[2]; #else struct iovec copyinfo[2]; #endif struct seg_desc ss_desc; uint64_t cr0, rflags, rsp, stack_gla, val; int error, fault, size, stackaddrsize, pushop; val = 0; size = vie->opsize; pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; /* * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 */ if (paging->cpu_mode == CPU_MODE_REAL) { stackaddrsize = 2; } else if (paging->cpu_mode == CPU_MODE_64BIT) { /* * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 * - Stack pointer size is always 64-bits. * - PUSH/POP of 32-bit values is not possible in 64-bit mode. * - 16-bit PUSH/POP is supported by using the operand size * override prefix (66H). */ stackaddrsize = 8; size = vie->opsize_override ? 2 : 8; } else { /* * In protected or compatibility mode the 'B' flag in the * stack-segment descriptor determines the size of the * stack pointer. */ error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); KASSERT(error == 0, ("%s: error %d getting SS descriptor", __func__, error)); if (SEG_DESC_DEF32(ss_desc.access)) stackaddrsize = 4; else stackaddrsize = 2; } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); if (pushop) { rsp -= size; } if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, &stack_gla)) { vm_inject_ss(vm, vcpuid, 0); return (0); } if (vie_canonical_check(paging->cpu_mode, stack_gla)) { vm_inject_ss(vm, vcpuid, 0); return (0); } if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { vm_inject_ac(vm, vcpuid, 0); return (0); } error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), &fault); if (error || fault) return (error); if (pushop) { error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); if (error == 0) vm_copyout(vm, vcpuid, &val, copyinfo, size); } else { vm_copyin(vm, vcpuid, copyinfo, &val, size); error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); rsp += size; } vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); if (error == 0) { error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, stackaddrsize); KASSERT(error == 0, ("error %d updating rsp", error)); } return (error); } static int emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error; /* * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. * * PUSH is part of the group 5 extended opcodes and is identified * by ModRM:reg = b110. */ if ((vie->reg & 7) != 6) return (EINVAL); error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, memwrite, arg); return (error); } static int emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error; /* * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. * * POP is part of the group 1A extended opcodes and is identified * by ModRM:reg = b000. */ if ((vie->reg & 7) != 0) return (EINVAL); error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, memwrite, arg); return (error); } static int emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { int error; switch (vie->reg & 7) { case 0x1: /* OR */ error = emulate_or(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case 0x4: /* AND */ error = emulate_and(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case 0x7: /* CMP */ error = emulate_cmp(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; default: error = EINVAL; break; } return (error); } static int emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { uint64_t val, rflags; int error, bitmask, bitoff; /* * 0F BA is a Group 8 extended opcode. * * Currently we only emulate the 'Bit Test' instruction which is * identified by a ModR/M:reg encoding of 100b. */ if ((vie->reg & 7) != 4) return (EINVAL); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); if (error) return (error); /* * Intel SDM, Vol 2, Table 3-2: * "Range of Bit Positions Specified by Bit Offset Operands" */ bitmask = vie->opsize * 8 - 1; bitoff = vie->immediate & bitmask; /* Copy the bit into the Carry flag in %rflags */ if (val & (1UL << bitoff)) rflags |= PSL_C; else rflags &= ~PSL_C; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); return (0); } static int emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { int error; uint64_t buf; switch (vie->reg & 7) { case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ if (vie->mod == 0x3) { /* * SFENCE. Ignore it, VM exit provides enough * barriers on its own. */ error = 0; } else { /* * CLFLUSH, CLFLUSHOPT. Only check for access * rights. */ error = memread(vm, vcpuid, gpa, &buf, 1, memarg); } break; default: error = EINVAL; break; } return (error); } int vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { int error; if (!vie->decoded) return (EINVAL); switch (vie->op.op_type) { case VIE_OP_TYPE_GROUP1: error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_POP: error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_PUSH: error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_CMP: error = emulate_cmp(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOV: error = emulate_mov(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOVSX: case VIE_OP_TYPE_MOVZX: error = emulate_movx(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOVS: error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_STOS: error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_AND: error = emulate_and(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_OR: error = emulate_or(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_SUB: error = emulate_sub(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_BITTEST: error = emulate_bittest(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_TWOB_GRP15: error = emulate_twob_group15(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_ADD: error = emulate_add(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_TEST: error = emulate_test(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_BEXTR: error = emulate_bextr(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; default: error = EINVAL; break; } return (error); } int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("%s: invalid size %d", __func__, size)); KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) return (0); return ((gla & (size - 1)) ? 1 : 0); } int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) { uint64_t mask; if (cpu_mode != CPU_MODE_64BIT) return (0); /* * The value of the bit 47 in the 'gla' should be replicated in the * most significant 16 bits. */ mask = ~((1UL << 48) - 1); if (gla & (1UL << 47)) return ((gla & mask) != mask); else return ((gla & mask) != 0); } uint64_t vie_size2mask(int size) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("vie_size2mask: invalid size %d", size)); return (size2mask[size]); } int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, struct seg_desc *desc, uint64_t offset, int length, int addrsize, int prot, uint64_t *gla) { uint64_t firstoff, low_limit, high_limit, segbase; int glasize, type; KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, ("%s: invalid segment %d", __func__, seg)); KASSERT(length == 1 || length == 2 || length == 4 || length == 8, ("%s: invalid operand size %d", __func__, length)); KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, ("%s: invalid prot %#x", __func__, prot)); firstoff = offset; if (cpu_mode == CPU_MODE_64BIT) { KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); glasize = 8; } else { KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); glasize = 4; /* * If the segment selector is loaded with a NULL selector * then the descriptor is unusable and attempting to use * it results in a #GP(0). */ if (SEG_DESC_UNUSABLE(desc->access)) return (-1); /* * The processor generates a #NP exception when a segment * register is loaded with a selector that points to a * descriptor that is not present. If this was the case then * it would have been checked before the VM-exit. */ KASSERT(SEG_DESC_PRESENT(desc->access), ("segment %d not present: %#x", seg, desc->access)); /* * The descriptor type must indicate a code/data segment. */ type = SEG_DESC_TYPE(desc->access); KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " "descriptor type %#x", seg, type)); if (prot & PROT_READ) { /* #GP on a read access to a exec-only code segment */ if ((type & 0xA) == 0x8) return (-1); } if (prot & PROT_WRITE) { /* * #GP on a write access to a code segment or a * read-only data segment. */ if (type & 0x8) /* code segment */ return (-1); if ((type & 0xA) == 0) /* read-only data seg */ return (-1); } /* * 'desc->limit' is fully expanded taking granularity into * account. */ if ((type & 0xC) == 0x4) { /* expand-down data segment */ low_limit = desc->limit + 1; high_limit = SEG_DESC_DEF32(desc->access) ? 0xffffffff : 0xffff; } else { /* code segment or expand-up data segment */ low_limit = 0; high_limit = desc->limit; } while (length > 0) { offset &= vie_size2mask(addrsize); if (offset < low_limit || offset > high_limit) return (-1); offset++; length--; } } /* * In 64-bit mode all segments except %fs and %gs have a segment * base address of 0. */ if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { segbase = 0; } else { segbase = desc->base; } /* * Truncate 'firstoff' to the effective address size before adding * it to the segment base. */ firstoff &= vie_size2mask(addrsize); *gla = (segbase + firstoff) & vie_size2mask(glasize); return (0); } +/* + * Prepare a partially decoded vie for a 2nd attempt. + */ void -vie_init(struct vie *vie, const char *inst_bytes, int inst_length) +vie_restart(struct vie *vie) { - KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, - ("%s: invalid instruction length (%d)", __func__, inst_length)); + _Static_assert( + offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) && + offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero), + "restart should not erase instruction length or contents"); - bzero(vie, sizeof(struct vie)); + memset((char *)vie + offsetof(struct vie, vie_startzero), 0, + sizeof(*vie) - offsetof(struct vie, vie_startzero)); vie->base_register = VM_REG_LAST; vie->index_register = VM_REG_LAST; vie->segment_register = VM_REG_LAST; +} - if (inst_length) { - bcopy(inst_bytes, vie->inst, inst_length); - vie->num_valid = inst_length; - } +void +vie_init(struct vie *vie, const char *inst_bytes, int inst_length) +{ + KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, + ("%s: invalid instruction length (%d)", __func__, inst_length)); + + vie_restart(vie); + memset(vie->inst, 0, sizeof(vie->inst)); + if (inst_length != 0) + memcpy(vie->inst, inst_bytes, inst_length); + vie->num_valid = inst_length; } #ifdef _KERNEL static int pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) { int error_code = 0; if (pte & PG_V) error_code |= PGEX_P; if (prot & VM_PROT_WRITE) error_code |= PGEX_W; if (usermode) error_code |= PGEX_U; if (rsvd) error_code |= PGEX_RSV; if (prot & VM_PROT_EXECUTE) error_code |= PGEX_I; return (error_code); } static void ptp_release(void **cookie) { if (*cookie != NULL) { vm_gpa_release(*cookie); *cookie = NULL; } } static void * ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } static int _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) { int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; u_int retries; uint64_t *ptpbase, ptpphys, pte, pgsize; uint32_t *ptpbase32, pte32; void *cookie; *guest_fault = 0; usermode = (paging->cpl == 3 ? 1 : 0); writable = prot & VM_PROT_WRITE; cookie = NULL; retval = 0; retries = 0; restart: ptpphys = paging->cr3; /* root of the page tables */ ptp_release(&cookie); if (retries++ > 0) maybe_yield(); if (vie_canonical_check(paging->cpu_mode, gla)) { /* * XXX assuming a non-stack reference otherwise a stack fault * should be generated. */ if (!check_only) vm_inject_gp(vm, vcpuid); goto fault; } if (paging->paging_mode == PAGING_MODE_FLAT) { *gpa = gla; goto done; } if (paging->paging_mode == PAGING_MODE_32) { nlevels = 2; while (--nlevels >= 0) { /* Zero out the lower 12 bits. */ ptpphys &= ~0xfff; ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); if (ptpbase32 == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 10; ptpindex = (gla >> ptpshift) & 0x3FF; pgsize = 1UL << ptpshift; pte32 = ptpbase32[ptpindex]; if ((pte32 & PG_V) == 0 || (usermode && (pte32 & PG_U) == 0) || (writable && (pte32 & PG_RW) == 0)) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 0, pte32); vm_inject_pf(vm, vcpuid, pfcode, gla); } goto fault; } /* * Emulate the x86 MMU's management of the accessed * and dirty flags. While the accessed flag is set * at every level of the page table, the dirty flag * is only set at the last level providing the guest * physical address. */ if (!check_only && (pte32 & PG_A) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_A) == 0) { goto restart; } } /* XXX must be ignored if CR4.PSE=0 */ if (nlevels > 0 && (pte32 & PG_PS) != 0) break; ptpphys = pte32; } /* Set the dirty bit in the page table entry if necessary */ if (!check_only && writable && (pte32 & PG_M) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_M) == 0) { goto restart; } } /* Zero out the lower 'ptpshift' bits */ pte32 >>= ptpshift; pte32 <<= ptpshift; *gpa = pte32 | (gla & (pgsize - 1)); goto done; } if (paging->paging_mode == PAGING_MODE_PAE) { /* Zero out the lower 5 bits and the upper 32 bits */ ptpphys &= 0xffffffe0UL; ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4, &cookie); if (ptpbase == NULL) goto error; ptpindex = (gla >> 30) & 0x3; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); } goto fault; } ptpphys = pte; nlevels = 2; } else nlevels = 4; while (--nlevels >= 0) { /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 9; ptpindex = (gla >> ptpshift) & 0x1FF; pgsize = 1UL << ptpshift; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0 || (usermode && (pte & PG_U) == 0) || (writable && (pte & PG_RW) == 0)) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); } goto fault; } /* Set the accessed bit in the page table entry */ if (!check_only && (pte & PG_A) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_A) == 0) { goto restart; } } if (nlevels > 0 && (pte & PG_PS) != 0) { if (pgsize > 1 * GB) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 1, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); } goto fault; } break; } ptpphys = pte; } /* Set the dirty bit in the page table entry if necessary */ if (!check_only && writable && (pte & PG_M) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) goto restart; } /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; *gpa = pte | (gla & (pgsize - 1)); done: ptp_release(&cookie); KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", __func__, retval)); return (retval); error: retval = EFAULT; goto done; fault: *guest_fault = 1; goto done; } int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) { return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, false)); } int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) { return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, true)); } int vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t rip, int inst_length, struct vie *vie, int *faultptr) { struct vm_copyinfo copyinfo[2]; int error, prot; if (inst_length > VIE_INST_SIZE) panic("vmm_fetch_instruction: invalid length %d", inst_length); prot = PROT_READ | PROT_EXEC; error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, copyinfo, nitems(copyinfo), faultptr); if (error || *faultptr) return (error); vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); vie->num_valid = inst_length; return (0); } #endif /* _KERNEL */ static int vie_peek(struct vie *vie, uint8_t *x) { if (vie->num_processed < vie->num_valid) { *x = vie->inst[vie->num_processed]; return (0); } else return (-1); } static void vie_advance(struct vie *vie) { vie->num_processed++; } static bool segment_override(uint8_t x, int *seg) { switch (x) { case 0x2E: *seg = VM_REG_GUEST_CS; break; case 0x36: *seg = VM_REG_GUEST_SS; break; case 0x3E: *seg = VM_REG_GUEST_DS; break; case 0x26: *seg = VM_REG_GUEST_ES; break; case 0x64: *seg = VM_REG_GUEST_FS; break; case 0x65: *seg = VM_REG_GUEST_GS; break; default: return (false); } return (true); } static int decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) { uint8_t x; while (1) { if (vie_peek(vie, &x)) return (-1); if (x == 0x66) vie->opsize_override = 1; else if (x == 0x67) vie->addrsize_override = 1; else if (x == 0xF3) vie->repz_present = 1; else if (x == 0xF2) vie->repnz_present = 1; else if (segment_override(x, &vie->segment_register)) vie->segment_override = 1; else break; vie_advance(vie); } /* * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: * - Only one REX prefix is allowed per instruction. * - The REX prefix must immediately precede the opcode byte or the * escape opcode byte. * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) * the mandatory prefix must come before the REX prefix. */ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { vie->rex_present = 1; vie->rex_w = x & 0x8 ? 1 : 0; vie->rex_r = x & 0x4 ? 1 : 0; vie->rex_x = x & 0x2 ? 1 : 0; vie->rex_b = x & 0x1 ? 1 : 0; vie_advance(vie); } /* * ยง 2.3.5, "The VEX Prefix", SDM Vol 2. */ if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) { const struct vie_op *optab; /* 3-byte VEX prefix. */ vie->vex_present = 1; vie_advance(vie); if (vie_peek(vie, &x)) return (-1); /* * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted * relative to REX encoding. */ vie->rex_r = x & 0x80 ? 0 : 1; vie->rex_x = x & 0x40 ? 0 : 1; vie->rex_b = x & 0x20 ? 0 : 1; switch (x & 0x1F) { case 0x2: /* 0F 38. */ optab = three_byte_opcodes_0f38; break; case 0x1: /* 0F class - nothing handled here yet. */ /* FALLTHROUGH */ case 0x3: /* 0F 3A class - nothing handled here yet. */ /* FALLTHROUGH */ default: /* Reserved (#UD). */ return (-1); } vie_advance(vie); if (vie_peek(vie, &x)) return (-1); /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ vie->rex_w = x & 0x80 ? 1 : 0; vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); vie->vex_l = !!(x & 0x4); vie->vex_pp = (x & 0x3); /* PP: 1=66 2=F3 3=F2 prefixes. */ switch (vie->vex_pp) { case 0x1: vie->opsize_override = 1; break; case 0x2: vie->repz_present = 1; break; case 0x3: vie->repnz_present = 1; break; } vie_advance(vie); /* Opcode, sans literal prefix prefix. */ if (vie_peek(vie, &x)) return (-1); vie->op = optab[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); } /* * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 */ if (cpu_mode == CPU_MODE_64BIT) { /* * Default address size is 64-bits and default operand size * is 32-bits. */ vie->addrsize = vie->addrsize_override ? 4 : 8; if (vie->rex_w) vie->opsize = 8; else if (vie->opsize_override) vie->opsize = 2; else vie->opsize = 4; } else if (cs_d) { /* Default address and operand sizes are 32-bits */ vie->addrsize = vie->addrsize_override ? 2 : 4; vie->opsize = vie->opsize_override ? 2 : 4; } else { /* Default address and operand sizes are 16-bits */ vie->addrsize = vie->addrsize_override ? 4 : 2; vie->opsize = vie->opsize_override ? 4 : 2; } return (0); } static int decode_two_byte_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); vie->op = two_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); return (0); } static int decode_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); /* Already did this via VEX prefix. */ if (vie->op.op_type != VIE_OP_TYPE_NONE) return (0); vie->op = one_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) return (decode_two_byte_opcode(vie)); return (0); } static int decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) { uint8_t x; if (vie->op.op_flags & VIE_OP_F_NO_MODRM) return (0); if (cpu_mode == CPU_MODE_REAL) return (-1); if (vie_peek(vie, &x)) return (-1); vie->mod = (x >> 6) & 0x3; vie->rm = (x >> 0) & 0x7; vie->reg = (x >> 3) & 0x7; /* * A direct addressing mode makes no sense in the context of an EPT * fault. There has to be a memory access involved to cause the * EPT fault. */ if (vie->mod == VIE_MOD_DIRECT) return (-1); if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { /* * Table 2-5: Special Cases of REX Encodings * * mod=0, r/m=5 is used in the compatibility mode to * indicate a disp32 without a base register. * * mod!=3, r/m=4 is used in the compatibility mode to * indicate that the SIB byte is present. * * The 'b' bit in the REX prefix is don't care in * this case. */ } else { vie->rm |= (vie->rex_b << 3); } vie->reg |= (vie->rex_r << 3); /* SIB */ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) goto done; vie->base_register = gpr_map[vie->rm]; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; case VIE_MOD_INDIRECT: if (vie->rm == VIE_RM_DISP32) { vie->disp_bytes = 4; /* * Table 2-7. RIP-Relative Addressing * * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 * whereas in compatibility mode it just implies disp32. */ if (cpu_mode == CPU_MODE_64BIT) vie->base_register = VM_REG_GUEST_RIP; else vie->base_register = VM_REG_LAST; } break; } done: vie_advance(vie); return (0); } static int decode_sib(struct vie *vie) { uint8_t x; /* Proceed only if SIB byte is present */ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) return (0); if (vie_peek(vie, &x)) return (-1); /* De-construct the SIB byte */ vie->ss = (x >> 6) & 0x3; vie->index = (x >> 3) & 0x7; vie->base = (x >> 0) & 0x7; /* Apply the REX prefix modifiers */ vie->index |= vie->rex_x << 3; vie->base |= vie->rex_b << 3; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; } if (vie->mod == VIE_MOD_INDIRECT && (vie->base == 5 || vie->base == 13)) { /* * Special case when base register is unused if mod = 0 * and base = %rbp or %r13. * * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ vie->disp_bytes = 4; } else { vie->base_register = gpr_map[vie->base]; } /* * All encodings of 'index' are valid except for %rsp (4). * * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ if (vie->index != 4) vie->index_register = gpr_map[vie->index]; /* 'scale' makes sense only in the context of an index register */ if (vie->index_register < VM_REG_LAST) vie->scale = 1 << vie->ss; vie_advance(vie); return (0); } static int decode_displacement(struct vie *vie) { int n, i; uint8_t x; union { char buf[4]; int8_t signed8; int32_t signed32; } u; if ((n = vie->disp_bytes) == 0) return (0); if (n != 1 && n != 4) panic("decode_displacement: invalid disp_bytes %d", n); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } if (n == 1) vie->displacement = u.signed8; /* sign-extended */ else vie->displacement = u.signed32; /* sign-extended */ return (0); } static int decode_immediate(struct vie *vie) { int i, n; uint8_t x; union { char buf[4]; int8_t signed8; int16_t signed16; int32_t signed32; } u; /* Figure out immediate operand size (if any) */ if (vie->op.op_flags & VIE_OP_F_IMM) { /* * Section 2.2.1.5 "Immediates", Intel SDM: * In 64-bit mode the typical size of immediate operands * remains 32-bits. When the operand size if 64-bits, the * processor sign-extends all immediates to 64-bits prior * to their use. */ if (vie->opsize == 4 || vie->opsize == 8) vie->imm_bytes = 4; else vie->imm_bytes = 2; } else if (vie->op.op_flags & VIE_OP_F_IMM8) { vie->imm_bytes = 1; } if ((n = vie->imm_bytes) == 0) return (0); KASSERT(n == 1 || n == 2 || n == 4, ("%s: invalid number of immediate bytes: %d", __func__, n)); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } /* sign-extend the immediate value before use */ if (n == 1) vie->immediate = u.signed8; else if (n == 2) vie->immediate = u.signed16; else vie->immediate = u.signed32; return (0); } static int decode_moffset(struct vie *vie) { int i, n; uint8_t x; union { char buf[8]; uint64_t u64; } u; if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) return (0); /* * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: * The memory offset size follows the address-size of the instruction. */ n = vie->addrsize; KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); u.u64 = 0; for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } vie->displacement = u.u64; return (0); } #ifdef _KERNEL /* * Verify that the 'guest linear address' provided as collateral of the nested * page table fault matches with our instruction decoding. */ static int verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie, enum vm_cpu_mode cpu_mode) { int error; uint64_t base, segbase, idx, gla2; enum vm_reg_name seg; struct seg_desc desc; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) return (0); base = 0; if (vie->base_register != VM_REG_LAST) { error = vm_get_register(vm, cpuid, vie->base_register, &base); if (error) { printf("verify_gla: error %d getting base reg %d\n", error, vie->base_register); return (-1); } /* * RIP-relative addressing starts from the following * instruction */ if (vie->base_register == VM_REG_GUEST_RIP) base += vie->num_processed; } idx = 0; if (vie->index_register != VM_REG_LAST) { error = vm_get_register(vm, cpuid, vie->index_register, &idx); if (error) { printf("verify_gla: error %d getting index reg %d\n", error, vie->index_register); return (-1); } } /* * From "Specifying a Segment Selector", Intel SDM, Vol 1 * * In 64-bit mode, segmentation is generally (but not * completely) disabled. The exceptions are the FS and GS * segments. * * In legacy IA-32 mode, when the ESP or EBP register is used * as the base, the SS segment is the default segment. For * other data references, except when relative to stack or * string destination the DS segment is the default. These * can be overridden to allow other segments to be accessed. */ if (vie->segment_override) seg = vie->segment_register; else if (vie->base_register == VM_REG_GUEST_RSP || vie->base_register == VM_REG_GUEST_RBP) seg = VM_REG_GUEST_SS; else seg = VM_REG_GUEST_DS; if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { segbase = 0; } else { error = vm_get_seg_desc(vm, cpuid, seg, &desc); if (error) { printf("verify_gla: error %d getting segment" " descriptor %d", error, vie->segment_register); return (-1); } segbase = desc.base; } gla2 = segbase + base + vie->scale * idx + vie->displacement; gla2 &= size2mask[vie->addrsize]; if (gla != gla2) { printf("verify_gla mismatch: segbase(0x%0lx)" "base(0x%0lx), scale(%d), index(0x%0lx), " "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", segbase, base, vie->scale, idx, vie->displacement, gla, gla2); return (-1); } return (0); } #endif /* _KERNEL */ int #ifdef _KERNEL vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) #else vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) #endif { if (decode_prefixes(vie, cpu_mode, cs_d)) return (-1); if (decode_opcode(vie)) return (-1); if (decode_modrm(vie, cpu_mode)) return (-1); if (decode_sib(vie)) return (-1); if (decode_displacement(vie)) return (-1); if (decode_immediate(vie)) return (-1); if (decode_moffset(vie)) return (-1); #ifdef _KERNEL if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { if (verify_gla(vm, cpuid, gla, vie, cpu_mode)) return (-1); } #endif vie->decoded = 1; /* success */ return (0); } Index: head/usr.sbin/bhyve/bhyverun.c =================================================================== --- head/usr.sbin/bhyve/bhyverun.c (revision 362599) +++ head/usr.sbin/bhyve/bhyverun.c (revision 362600) @@ -1,1430 +1,1445 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif +#include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" #include "inout.h" #include "dbgport.h" #include "debug.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); const char *vmname; int guest_ncpus; uint16_t cores, maxcpus, sockets, threads; char *guest_uuid_str; int raw_stdio = 0; static int gdb_port = 0; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; static int virtio_msix = 1; static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; static int acpi; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; static void usage(int code) { fprintf(stderr, "Usage: %s [-abehuwxACHPSWY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-g ] [-l ]\n" " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" " -e: exit on unhandled I/O access\n" " -g: gdb port\n" " -h: help\n" " -H: vmexit from the guest on hlt\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" " -S: guest memory cannot be swapped\n" " -u: RTC keeps UTC time\n" " -U: uuid\n" " -w: ignore unimplemented MSRs\n" " -W: force virtio to use single-vector MSI\n" " -x: local apic is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",,". * 2. It accepts whitespace after = and before value. * 3. Values out of range of INT are silently wrapped. * 4. It doesn't check non-final values. * 5. The apparently bogus limits of UINT16_MAX are for future expansion. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. */ static int topology_parse(const char *opt) { uint64_t ncpus; int c, chk, n, s, t, tmp; char *cp, *str; bool ns, scts; c = 1, n = 1, s = 1, t = 1; ns = false, scts = false; str = strdup(opt); if (str == NULL) goto out; while ((cp = strsep(&str, ",")) != NULL) { if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { n = tmp; ns = true; } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { n = tmp; ns = true; } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { s = tmp; scts = true; } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { c = tmp; scts = true; } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { t = tmp; scts = true; #ifdef notyet /* Do not expose this until vmm.ko implements it */ } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { m = tmp; #endif /* Skip the empty argument case from -c "" */ } else if (cp[0] == '\0') continue; else goto out; /* Any trailing garbage causes an error */ if (cp[chk] != '\0') goto out; } free(str); str = NULL; /* * Range check 1 <= n <= UINT16_MAX all values */ if (n < 1 || s < 1 || c < 1 || t < 1 || n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || t > UINT16_MAX) return (-1); /* If only the cpus was specified, use that as sockets */ if (!scts) s = n; /* * Compute sockets * cores * threads avoiding overflow * The range check above insures these are 16 bit values * If n was specified check it against computed ncpus */ ncpus = (uint64_t)s * c * t; if (ncpus > UINT16_MAX || (ns && n != ncpus)) return (-1); guest_ncpus = ncpus; sockets = s; cores = c; threads = t; return(0); out: free(str); return (-1); } static int pincpu_parse(const char *opt) { int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0 || vcpu >= VM_MAXCPU) { fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", vcpu, VM_MAXCPU - 1); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } if (vcpumap[vcpu] == NULL) { if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { perror("malloc"); return (-1); } CPU_ZERO(vcpumap[vcpu]); } CPU_SET(pcpu, vcpumap[vcpu]); return (0); } void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_vmexit_on_pause(void) { return (guest_vmexit_on_pause); } int fbsdrun_vmexit_on_hlt(void) { return (guest_vmexit_on_hlt); } int fbsdrun_virtio_msix(void) { return (virtio_msix); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vcpu); #endif if (gdb_port != 0) gdb_cpu_add(vcpu); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) { int error; assert(fromcpu == BSP); /* * The 'newcpu' must be activated in the context of 'fromcpu'. If * vm_activate_cpu() is delayed until newcpu's pthread starts running * then vmm.ko is out-of-sync with bhyve and this can create a race * with vm_suspend(). */ error = vm_activate_cpu(ctx, newcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); /* * Set up the vmexit struct to allow execution to start * at the given RIP */ vmexit[newcpu].rip = rip; vmexit[newcpu].inst_length = 0; mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme, strictio); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vmexit->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { (void)spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason, vmexit_vmx_desc(vmexit->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_mtrap++; #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(*pvcpu); #endif if (gdb_port != 0) gdb_cpu_mtrap(*pvcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(*pvcpu); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { - int err, i; + int err, i, cs_d; struct vie *vie; + enum vm_cpu_mode mode; stats.vmexit_inst_emul++; vie = &vmexit->u.inst_emul.vie; + if (!vie->decoded) { + /* + * Attempt to decode in userspace as a fallback. This allows + * updating instruction decode in bhyve without rebooting the + * kernel (rapid prototyping), albeit with much slower + * emulation. + */ + vie_restart(vie); + mode = vmexit->u.inst_emul.paging.cpu_mode; + cs_d = vmexit->u.inst_emul.cs_d; + (void)vmm_decode_instruction(mode, cs_d, vie); + } + err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, vie, &vmexit->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vmexit->u.inst_emul.gpa); } fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vmexit->rip); return (VMEXIT_ABORT); } return (VMEXIT_CONTINUE); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { enum vm_suspend_how how; how = vmexit->u.suspended.how; fbsdrun_deletecpu(ctx, *pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(*pvcpu); #endif if (gdb_port != 0) gdb_cpu_suspend(*pvcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(*pvcpu); #endif return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { if (gdb_port == 0) { fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); exit(4); } gdb_cpu_breakpoint(*pvcpu, vmexit); return (VMEXIT_CONTINUE); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) { int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); while (1) { error = vm_run(ctx, vcpu, &vmexit[vcpu]); if (error != 0) break; exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ if (error == 0) return (VM_MAXCPU); else return (1); } void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (fbsdrun_vmexit_on_hlt()) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (fbsdrun_vmexit_on_pause()) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (x2apic_mode) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; const cap_ioctl_t *cmds; size_t ncmds; #endif reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); vm_get_ioctls(&ncmds); cmds = vm_get_ioctls(NULL); if (cmds == NULL) errx(EX_OSERR, "out of memory"); if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); free((cap_ioctl_t *)cmds); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } void spinup_vcpu(struct vmctx *ctx, int vcpu) { int error; uint64_t rip; error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); assert(error == 0); fbsdrun_set_capabilities(ctx, vcpu); error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); fbsdrun_addcpu(ctx, BSP, vcpu, rip); } int main(int argc, char *argv[]) { int c, error, dbg_port, err, bvmcons; int max_vcpus, mptgen, memflags; int rtc_localtime; bool gdb_stop; struct vmctx *ctx; uint64_t rip; size_t memsize; char *optstr; #ifdef BHYVE_SNAPSHOT char *restore_file; struct restore_state rstate; int vcpu; restore_file = NULL; #endif bvmcons = 0; progname = basename(argv[0]); dbg_port = 0; gdb_stop = false; guest_ncpus = 1; sockets = cores = threads = 1; maxcpus = 0; memsize = 256 * MB; mptgen = 1; rtc_localtime = 1; memflags = 0; #ifdef BHYVE_SNAPSHOT optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:r:"; #else optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; case 'A': acpi = 1; break; case 'b': bvmcons = 1; break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': memflags |= VM_MEM_F_INCORE; break; case 'g': dbg_port = atoi(optarg); break; case 'G': if (optarg[0] == 'w') { gdb_stop = true; optarg++; } gdb_port = atoi(optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': memflags |= VM_MEM_F_WIRED; break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", optarg); break; case 'H': guest_vmexit_on_hlt = 1; break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': guest_vmexit_on_pause = 1; break; case 'e': strictio = 1; break; case 'u': rtc_localtime = 0; break; case 'U': guest_uuid_str = optarg; break; case 'w': strictmsr = 0; break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; case 'Y': mptgen = 0; break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; #ifdef BHYVE_SNAPSHOT if (argc > 1 || (argc == 0 && restore_file == NULL)) usage(1); if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } } if (argc == 1) { vmname = argv[0]; } else { vmname = lookup_vmname(&rstate); if (vmname == NULL) { fprintf(stderr, "Cannot find VM name in restore file. " "Please specify one.\n"); exit(1); } } #else if (argc != 1) usage(1); vmname = argv[0]; #endif ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(ctx, BSP); vm_set_memflags(ctx, memflags); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(); init_inout(); kernemu_dev_init(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx, rtc_localtime); sci_init(ctx); /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (acpi) vmgenc_init(ctx); if (dbg_port != 0) init_dbgport(dbg_port); if (gdb_port != 0) init_gdb(ctx, gdb_port, gdb_stop); if (bvmcons) init_bvmcons(); if (lpc_bootrom()) { if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(ctx, BSP); assert(error == 0); } #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { fprintf(stdout, "Pausing pci devs...\r\n"); if (vm_pause_user_devs(ctx) != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring vm mem...\r\n"); if (restore_vm_mem(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore VM memory.\n"); exit(1); } fprintf(stdout, "Restoring pci devs...\r\n"); if (vm_restore_user_devs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring kernel structs...\r\n"); if (vm_restore_kern_structs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore kernel structs.\n"); exit(1); } fprintf(stdout, "Resuming pci devs...\r\n"); if (vm_resume_user_devs(ctx) != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } #endif error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (mptgen) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); assert(error == 0); if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom()) fwctl_init(); /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) destroy_restore_state(&rstate); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) < 0) printf("Failed to start checkpoint thread!\r\n"); if (restore_file != NULL) vm_restore_time(ctx); #endif /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, BSP, rip); #ifdef BHYVE_SNAPSHOT /* * If we restore a VM, start all vCPUs now (including APs), otherwise, * let the guest OS to spin them up later via vmexits. */ if (restore_file != NULL) { for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { if (vcpu == BSP) continue; fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu); spinup_vcpu(ctx, vcpu); } } #endif /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); }