diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index e35119af8572..7c01259ed8cb 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -1,789 +1,797 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include struct vcpu; struct vm_snapshot_meta; #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_DESTROY, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_GUEST_FS_BASE, VM_REG_GUEST_GS_BASE, VM_REG_GUEST_KGS_BASE, VM_REG_GUEST_TPR, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. 
 * A prefix and a suffix are added to the name specified by the user.
 * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters
 * longer for future use.
 * The suffix is a string that identifies a bootrom image or some similar
 * image that is attached to the VM. A separator character gets added to
 * the suffix automatically when generating the full path, so it must be
 * accounted for, reducing the effective length by 1.
 * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37
 * bytes for FreeBSD 12. A minimum length is set for safety and supports
 * a SPECNAMELEN as small as 32 on old systems.
 */
#define VM_MAX_PREFIXLEN 10
#define VM_MAX_SUFFIXLEN 15
#define VM_MIN_NAMELEN	6
#define VM_MAX_NAMELEN \
	(SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1)

#ifdef _KERNEL
#include
CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN);

struct vm;
struct vm_exception;
struct vm_mem;
struct seg_desc;
struct vm_exit;
struct vm_run;
struct vhpet;
struct vioapic;
struct vlapic;
struct vmspace;
struct vm_object;
struct vm_guest_paging;
struct pmap;
enum snapshot_req;

struct vm_eventinfo {
	cpuset_t *rptr;		/* rendezvous cookie */
	int	*sptr;		/* suspend cookie */
	int	*iptr;		/* reqidle cookie */
};

-typedef int	(*vmm_init_func_t)(int ipinum);
-typedef int	(*vmm_cleanup_func_t)(void);
-typedef void	(*vmm_suspend_func_t)(void);
-typedef void	(*vmm_resume_func_t)(void);
-typedef void *	(*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
-typedef int	(*vmi_run_func_t)(void *vcpui, register_t rip,
-    struct pmap *pmap, struct vm_eventinfo *info);
-typedef void	(*vmi_cleanup_func_t)(void *vmi);
-typedef void *	(*vmi_vcpu_init_func_t)(void *vmi, struct vcpu *vcpu,
-    int vcpu_id);
-typedef void	(*vmi_vcpu_cleanup_func_t)(void *vcpui);
-typedef int	(*vmi_get_register_t)(void *vcpui, int num, uint64_t *retval);
-typedef int	(*vmi_set_register_t)(void *vcpui, int num, uint64_t val);
-typedef int	(*vmi_get_desc_t)(void *vcpui, int num, struct seg_desc *desc);
-typedef int	(*vmi_set_desc_t)(void *vcpui, int num, struct seg_desc *desc);
-typedef int	(*vmi_get_cap_t)(void *vcpui, int num, int *retval);
-typedef int	(*vmi_set_cap_t)(void *vcpui, int num, int val);
-typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
-typedef void	(*vmi_vmspace_free)(struct vmspace *vmspace);
-typedef struct vlapic * (*vmi_vlapic_init)(void *vcpui);
-typedef void	(*vmi_vlapic_cleanup)(struct vlapic *vlapic);
-typedef int	(*vmi_snapshot_vcpu_t)(void *vcpui, struct vm_snapshot_meta *meta);
-typedef int	(*vmi_restore_tsc_t)(void *vcpui, uint64_t now);
+#define	DECLARE_VMMOPS_FUNC(ret_type, opname, args)			\
+	typedef ret_type (*vmmops_##opname##_t) args;			\
+	ret_type vmmops_##opname args
+
+DECLARE_VMMOPS_FUNC(int, modinit, (int ipinum));
+DECLARE_VMMOPS_FUNC(int, modcleanup, (void));
+DECLARE_VMMOPS_FUNC(void, modresume, (void));
+DECLARE_VMMOPS_FUNC(void, modsuspend, (void));
+DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap));
+DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc,
+    struct pmap *pmap, struct vm_eventinfo *info));
+DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi));
+DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu,
+    int vcpu_id));
+DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui));
+DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval));
+DECLARE_VMMOPS_FUNC(int, setreg, (void *vcpui, int num, uint64_t val));
+DECLARE_VMMOPS_FUNC(int, getdesc, (void *vcpui, int num,
+    struct seg_desc *desc));
+DECLARE_VMMOPS_FUNC(int, setdesc, (void *vcpui, int num,
+    struct seg_desc *desc));
+DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval));
+DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val));
+DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc,
+    (vm_offset_t min, vm_offset_t max));
+DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace));
+DECLARE_VMMOPS_FUNC(struct vlapic *, vlapic_init, (void *vcpui));
+DECLARE_VMMOPS_FUNC(void, vlapic_cleanup, (struct vlapic *vlapic));
+DECLARE_VMMOPS_FUNC(int, vcpu_snapshot, (void *vcpui,
+    struct vm_snapshot_meta *meta));
+DECLARE_VMMOPS_FUNC(int, restore_tsc, (void *vcpui, uint64_t now));

struct vmm_ops {
-	vmm_init_func_t		modinit;	/* module wide initialization */
-	vmm_cleanup_func_t	modcleanup;
-	vmm_resume_func_t	modsuspend;
-	vmm_resume_func_t	modresume;
-
-	vmi_init_func_t		init;		/* vm-specific initialization */
-	vmi_run_func_t		run;
-	vmi_cleanup_func_t	cleanup;
-	vmi_vcpu_init_func_t	vcpu_init;
-	vmi_vcpu_cleanup_func_t	vcpu_cleanup;
-	vmi_get_register_t	getreg;
-	vmi_set_register_t	setreg;
-	vmi_get_desc_t		getdesc;
-	vmi_set_desc_t		setdesc;
-	vmi_get_cap_t		getcap;
-	vmi_set_cap_t		setcap;
-	vmi_vmspace_alloc	vmspace_alloc;
-	vmi_vmspace_free	vmspace_free;
-	vmi_vlapic_init		vlapic_init;
-	vmi_vlapic_cleanup	vlapic_cleanup;
+	vmmops_modinit_t	modinit;	/* module wide initialization */
+	vmmops_modcleanup_t	modcleanup;
+	vmmops_modresume_t	modsuspend;
+	vmmops_modresume_t	modresume;
+
+	vmmops_init_t		init;		/* vm-specific initialization */
+	vmmops_run_t		run;
+	vmmops_cleanup_t	cleanup;
+	vmmops_vcpu_init_t	vcpu_init;
+	vmmops_vcpu_cleanup_t	vcpu_cleanup;
+	vmmops_getreg_t		getreg;
+	vmmops_setreg_t		setreg;
+	vmmops_getdesc_t	getdesc;
+	vmmops_setdesc_t	setdesc;
+	vmmops_getcap_t		getcap;
+	vmmops_setcap_t		setcap;
+	vmmops_vmspace_alloc_t	vmspace_alloc;
+	vmmops_vmspace_free_t	vmspace_free;
+	vmmops_vlapic_init_t	vlapic_init;
+	vmmops_vlapic_cleanup_t	vlapic_cleanup;

	/* checkpoint operations */
-	vmi_snapshot_vcpu_t	vcpu_snapshot;
-	vmi_restore_tsc_t	restore_tsc;
+	vmmops_vcpu_snapshot_t	vcpu_snapshot;
+	vmmops_restore_tsc_t	restore_tsc;
};

extern const struct vmm_ops vmm_ops_intel;
extern const struct vmm_ops vmm_ops_amd;

extern u_int vm_maxcpu;			/* maximum virtual cpus */

int vm_create(const char *name, struct vm **retvm);
struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid);
void vm_disable_vcpu_creation(struct vm *vm);
void vm_slock_vcpus(struct vm *vm);
void vm_unlock_vcpus(struct vm *vm);
void vm_destroy(struct vm *vm);
int vm_reinit(struct vm *vm);
const char *vm_name(struct vm *vm);
uint16_t vm_get_maxcpus(struct vm *vm);
void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus);
int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *ret_desc);
int vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc);
int vm_run(struct vcpu *vcpu);
int vm_suspend(struct vm *vm, enum vm_suspend_how how);
int vm_inject_nmi(struct vcpu *vcpu);
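/*
 * Editor's sketch, not part of the original patch: what one
 * DECLARE_VMMOPS_FUNC() invocation from the hunk above expands to, given
 * the macro body shown there.  Each invocation emits the function-pointer
 * typedef consumed by struct vmm_ops plus a prototype for the like-named
 * global ifunc that vmm.c generates with DEFINE_VMMOPS_IFUNC() (made
 * non-static later in this diff), so calls such as
 * vmmops_getreg(vcpu->cookie, reg, retval) elsewhere in vmm.c resolve to
 * the Intel or AMD backend selected by the ifunc resolver.  The two
 * declarations below are illustrative only.
 */
typedef int (*vmmops_getreg_t)(void *vcpui, int num, uint64_t *retval);	/* typedef used by struct vmm_ops */
int vmmops_getreg(void *vcpui, int num, uint64_t *retval);		/* prototype of the global ifunc */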
int vm_nmi_pending(struct vcpu *vcpu); void vm_nmi_clear(struct vcpu *vcpu); int vm_inject_extint(struct vcpu *vcpu); int vm_extint_pending(struct vcpu *vcpu); void vm_extint_clear(struct vcpu *vcpu); int vcpu_vcpuid(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vcpu *vcpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vcpu *vcpu, int type, int *val); int vm_set_capability(struct vcpu *vcpu, int type, int val); int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); int vm_restart_instruction(struct vcpu *vcpu); struct vm_exit *vm_exitinfo(struct vcpu *vcpu); cpuset_t *vm_exitinfo_cpuset(struct vcpu *vcpu); void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip); void vm_exit_debug(struct vcpu *vcpu, uint64_t rip); void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip); void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip); void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip); int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); int vm_restore_time(struct vm *vm); #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vcpu *vcpu, void *arg); int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); cpuset_t vm_start_cpus(struct vm *vm, const cpuset_t *tostart); void vm_await_start(struct vm *vm, const cpuset_t *waiting); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vcpu *vcpu, struct vm_eventinfo *info) { /* * This check isn't done with atomic operations or under a lock because * there's no need to. If the vcpuid bit is set, the vcpu is part of a * rendezvous and the bit won't be cleared until the vcpu enters the * rendezvous. On rendezvous exit, the cpuset is cleared and the vcpu * will see an empty cpuset. So, the races are harmless. */ return (CPU_ISSET(vcpu_vcpuid(vcpu), info->rptr)); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vcpu *vcpu); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. 
*/ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vcpu *vcpu, int *hostcpu) { return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vcpu *vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vcpu *vcpu, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vcpu *vcpu, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. */ int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *info); int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2); /* * Function used to keep track of the guest's TSC offset. The * offset is used by the virtualization extensions to provide a consistent * value for the Time Stamp Counter to the guest. */ void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. 
*/ int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vcpu *vcpu); int vcpu_trap_wbinvd(struct vcpu *vcpu); #endif /* KERNEL */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_RDPID, VM_CAP_RDTSCP, VM_CAP_IPI_EXIT, VM_CAP_MASK_HWINTR, VM_CAP_RFLAGS_TF, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, PAGING_MODE_64_LA57, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. MOV) */ uint16_t op_flags; }; _Static_assert(sizeof(struct vie_op) == 4, "ABI"); _Static_assert(_Alignof(struct vie_op) == 2, "ABI"); #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ /* The following fields are all zeroed upon restart. 
*/ #define vie_startzero num_processed uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ vex_present:1, /* VEX prefixed */ vex_l:1, /* L bit */ index:4, /* SIB byte */ base:4; /* SIB byte */ uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; uint8_t vex_reg:4, /* vvvv: first source register specifier */ vex_pp:2, /* pp */ _sparebits:2; uint8_t _sparebytes[2]; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ uint8_t _sparebyte; struct vie_op op; /* opcode description */ }; _Static_assert(sizeof(struct vie) == 64, "ABI"); _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI"); _Static_assert(__offsetof(struct vie, scale) == 24, "ABI"); _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI"); enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, VM_EXITCODE_IPI, VM_EXITCODE_DB, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; int cs_d; uint64_t cs_base; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. 
*/ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { int inst_length; } bpt; struct { int trace_trap; int pushf_intercept; int tf_shadow_val; struct vm_guest_paging paging; } dbg; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct { /* * The destination vCPU mask is saved in vcpu->cpuset * and is copied out to userspace separately to avoid * ABI concerns. */ uint32_t mode; uint8_t vector; } ipi; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_UD, 0, 0); } static __inline void vm_inject_gp(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_GP, 1, 0); } static __inline void vm_inject_ac(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_SS, 1, errcode); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2); #endif /* _VMM_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index c42da02d0bf6..d1eb2383ffed 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1,2699 +1,2699 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ioport.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int vcpuid; /* (o) */ int hostcpu; /* (o) vcpu's host cpu */ int reqidle; /* (i) request vcpu to idle */ struct vm *vm; /* (o) */ void *cookie; /* (i) cpu-specific data */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ cpuset_t exitinfo_cpuset; /* (x) storage for vmexit handlers */ uint64_t nextrip; /* (x) next instruction to execute */ uint64_t tsc_offset; /* (o) TSC offsetting */ }; #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use * * Locking: * [m] mem_segs_lock * [r] rendezvous_mtx * [v] reads require one frozen vcpu, writes require freezing all vcpus */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ cpuset_t startup_cpus; /* (i) [r] waiting for startup */ int suspend; /* (i) stop VM execution */ bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) [r] rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) [r] rendezvous finished */ void *rendezvous_arg; /* (x) [r] rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx 
	rendezvous_mtx;				/* (o) rendezvous lock */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	struct vm_mem	mem;			/* (i) [m+v] guest memory */
	char		name[VM_MAX_NAMELEN+1];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (o) guest vcpus */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	struct sx	vcpus_init_lock;	/* (o) */
};

#define	VMM_CTR0(vcpu, format)						\
	VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format)
#define	VMM_CTR1(vcpu, format, p1)					\
	VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1)
#define	VMM_CTR2(vcpu, format, p1, p2)					\
	VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2)
#define	VMM_CTR3(vcpu, format, p1, p2, p3)				\
	VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3)
#define	VMM_CTR4(vcpu, format, p1, p2, p3, p4)				\
	VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4)

static int vmm_initialized;

static void	vmmops_panic(void);

static void
vmmops_panic(void)
{
	panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()");
}

#define	DEFINE_VMMOPS_IFUNC(ret_type, opname, args)			\
-	DEFINE_IFUNC(static, ret_type, vmmops_##opname, args)		\
+	DEFINE_IFUNC(, ret_type, vmmops_##opname, args)			\
	{								\
		if (vmm_is_intel())					\
			return (vmm_ops_intel.opname);			\
		else if (vmm_is_svm())					\
			return (vmm_ops_amd.opname);			\
		else							\
			return ((ret_type (*)args)vmmops_panic);	\
	}

DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum))
DEFINE_VMMOPS_IFUNC(int, modcleanup, (void))
DEFINE_VMMOPS_IFUNC(void, modsuspend, (void))
DEFINE_VMMOPS_IFUNC(void, modresume, (void))
DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap))
DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip,
    struct pmap *pmap, struct vm_eventinfo *info))
DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi))
DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu,
    int vcpu_id))
DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui))
DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval))
DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val))
DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc))
DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc))
DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval))
DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val))
DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min,
    vm_offset_t max))
DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace))
DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui))
DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic))
#ifdef BHYVE_SNAPSHOT
DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui,
    struct vm_snapshot_meta *meta))
DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now))
#endif

SDT_PROVIDER_DEFINE(vmm);

static MALLOC_DEFINE(M_VM, "vm", "vm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
*/ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); static int trap_wbinvd; SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0, "WBINVD triggers a VM-exit"); u_int vm_maxcpu; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_maxcpu, 0, "Maximum number of vCPUs"); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); /* global statistics */ VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); /* * Upper limit on vm_maxcpu. Limited by use of uint16_t types for CPU * counts as well as range of vpid values for VT-x and by the capacity * of cpuset_t masks. The call to new_unrhdr() in vpid_init() in * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below. 
*/ #define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) { switch (state) { case VCPU_IDLE: return ("idle"); case VCPU_FROZEN: return ("frozen"); case VCPU_RUNNING: return ("running"); case VCPU_SLEEPING: return ("sleeping"); default: return ("unknown"); } } #endif static void vcpu_cleanup(struct vcpu *vcpu, bool destroy) { vmmops_vlapic_cleanup(vcpu->vlapic); vmmops_vcpu_cleanup(vcpu->cookie); vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); free(vcpu, M_VM); } } static struct vcpu * vcpu_alloc(struct vm *vm, int vcpu_id) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = malloc(sizeof(*vcpu), M_VM, M_WAITOK | M_ZERO); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vm = vm; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); vcpu->tsc_offset = 0; return (vcpu); } static void vcpu_init(struct vcpu *vcpu) { vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie); vm_set_x2apic_state(vcpu, X2APIC_DISABLED); vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vcpu *vcpu) { return (trace_guest_exceptions); } int vcpu_trap_wbinvd(struct vcpu *vcpu) { return (trap_wbinvd); } struct vm_exit * vm_exitinfo(struct vcpu *vcpu) { return (&vcpu->exitinfo); } cpuset_t * vm_exitinfo_cpuset(struct vcpu *vcpu) { return (&vcpu->exitinfo_cpuset); } static int vmm_init(void) { if (!vmm_is_hw_supported()) return (ENXIO); vm_maxcpu = mp_ncpus; TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); if (vm_maxcpu > VM_MAXCPU) { printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); vm_maxcpu = VM_MAXCPU; } if (vm_maxcpu == 0) vm_maxcpu = 1; vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; vmm_suspend_p = vmmops_modsuspend; vmm_resume_p = vmmops_modresume; return (vmmops_modinit(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: if (vmm_is_hw_supported()) { error = vmmdev_init(); if (error != 0) break; error = vmm_init(); if (error == 0) vmm_initialized = 1; else (void)vmmdev_cleanup(); } else { error = ENXIO; } break; case MOD_UNLOAD: if (vmm_is_hw_supported()) { error = vmmdev_cleanup(); if (error == 0) { vmm_suspend_p = NULL; vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) lapic_ipi_free(vmm_ipinum); error = vmmops_modcleanup(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } } else { error = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). * - vmm device initialization requires an initialized devfs. 
*/ DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); CPU_ZERO(&vm->startup_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); if (!create) { for (int i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_init(vm->vcpu[i]); } } } void vm_disable_vcpu_creation(struct vm *vm) { sx_xlock(&vm->vcpus_init_lock); vm->dying = true; sx_xunlock(&vm->vcpus_init_lock); } struct vcpu * vm_alloc_vcpu(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) return (NULL); vcpu = (struct vcpu *) atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]); if (__predict_true(vcpu != NULL)) return (vcpu); sx_xlock(&vm->vcpus_init_lock); vcpu = vm->vcpu[vcpuid]; if (vcpu == NULL && !vm->dying) { vcpu = vcpu_alloc(vm, vcpuid); vcpu_init(vcpu); /* * Ensure vCPU is fully created before updating pointer * to permit unlocked reads above. */ atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid], (uintptr_t)vcpu); } sx_xunlock(&vm->vcpus_init_lock); return (vcpu); } void vm_slock_vcpus(struct vm *vm) { sx_slock(&vm->vcpus_init_lock); } void vm_unlock_vcpus(struct vm *vm) { sx_unlock(&vm->vcpus_init_lock); } /* * The default CPU topology is a single thread per package. */ u_int cores_per_package = 1; u_int threads_per_core = 1; int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == VM_MAX_NAMELEN + 1) return (EINVAL); vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; vm_mem_init(&vm->mem); mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK | M_ZERO); vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ vm->threads = threads_per_core; /* XXX backwards compatibility */ vm->maxcpus = vm_maxcpu; vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus __unused) { /* Ignore maxcpus. 
*/ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); vm->sockets = sockets; vm->cores = cores; vm->threads = threads; return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { if (destroy) vm_xlock_memsegs(vm); else vm_assert_memseg_xlocked(vm); ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (int i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_cleanup(vm->vcpu[i], destroy); } vmmops_cleanup(vm->cookie); vm_mem_cleanup(vm); if (destroy) { vm_mem_destroy(vm); vmmops_vmspace_free(vm->vmspace); vm->vmspace = NULL; free(vm->vcpu, M_VM); sx_destroy(&vm->vcpus_init_lock); mtx_destroy(&vm->rendezvous_mtx); } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } static int vm_iommu_map(struct vm *vm) { vm_paddr_t gpa, hpa; struct vm_mem_map *mm; int error, i; sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); for (i = 0; i < VM_MAX_MEMMAPS; i++) { if (!vm_memseg_sysmem(vm, i)) continue; mm = &vm->mem.mem_maps[i]; KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, ("iommu map found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) continue; mm->flags |= VM_MEMMAP_F_IOMMU; for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { hpa = pmap_extract(vmspace_pmap(vm->vmspace), gpa); /* * All mappings in the vmm vmspace must be * present since they are managed by vmm in this way. * Because we are in pass-through mode, the * mappings must also be wired. This implies * that all pages must be mapped and wired, * allowing to use pmap_extract() and avoiding the * need to use vm_gpa_hold_global(). * * This could change if/when we start * supporting page faults on IOMMU maps. 
*/ KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(hpa)), ("vm_iommu_map: vm %p gpa %jx hpa %jx not wired", vm, (uintmax_t)gpa, (uintmax_t)hpa)); iommu_create_mapping(vm->iommu, gpa, hpa, PAGE_SIZE); } } error = iommu_invalidate_tlb(iommu_host_domain()); return (error); } static int vm_iommu_unmap(struct vm *vm) { vm_paddr_t gpa; struct vm_mem_map *mm; int error, i; sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); for (i = 0; i < VM_MAX_MEMMAPS; i++) { if (!vm_memseg_sysmem(vm, i)) continue; mm = &vm->mem.mem_maps[i]; if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) continue; mm->flags &= ~VM_MEMMAP_F_IOMMU; KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, ("iommu unmap found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(pmap_extract( vmspace_pmap(vm->vmspace), gpa))), ("vm_iommu_unmap: vm %p gpa %jx not wired", vm, (uintmax_t)gpa)); iommu_remove_mapping(vm->iommu, gpa, PAGE_SIZE); } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ error = iommu_invalidate_tlb(vm->iommu); return (error); } int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) error = vm_iommu_unmap(vm); return (error); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; bool map = false; /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vmm_sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); map = true; } error = ppt_assign_device(vm, bus, slot, func); if (error == 0 && map) error = vm_iommu_map(vm); return (error); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) { if (reg >= VM_REG_LAST) return (EINVAL); return (vmmops_getreg(vcpu->cookie, reg, retval)); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; if (reg >= VM_REG_LAST) return (EINVAL); error = vmmops_setreg(vcpu->cookie, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VMM_CTR1(vcpu, "Setting nextrip to %#lx", val); vcpu->nextrip = val; return (0); } static bool is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (true); default: return (false); } } static bool is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (true); default: return (false); } } int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_getdesc(vcpu->cookie, reg, desc)); } int vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_setdesc(vcpu->cookie, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_enable(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, 
vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so disable * the FPU to trap any access by the host. */ fpu_disable(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_enable(); fpusave(vcpu->guestfpu); fpu_disable(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; vcpu_notify_event_locked(vcpu, false); VMM_CTR1(vcpu, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); VMM_CTR2(vcpu, "vcpu state changed from %s to %s", vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } static int vm_handle_rendezvous(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct thread *td; int error, vcpuid; error = 0; vcpuid = vcpu->vcpuid; td = curthread; mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VMM_CTR0(vcpu, "Calling rendezvous func"); (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VMM_CTR0(vcpu, "Rendezvous completed"); CPU_ZERO(&vm->rendezvous_req_cpus); vm->rendezvous_func = NULL; wakeup(&vm->rendezvous_func); break; } VMM_CTR0(vcpu, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if (td_ast_pending(td, 
TDA_SUSPEND)) { mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) return (error); mtx_lock(&vm->rendezvous_mtx); } } mtx_unlock(&vm->rendezvous_mtx); return (0); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vcpu *vcpu, bool intr_disabled, bool *retu) { struct vm *vm = vcpu->vm; const char *wmesg; struct thread *td; int error, t, vcpuid, vcpu_halted, vm_halted; vcpuid = vcpu->vcpuid; vcpu_halted = 0; vm_halted = 0; error = 0; td = curthread; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from vmmops_run() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vcpu)) break; if (!intr_disabled) { if (vm_extint_pending(vcpu) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vcpu)) break; if (vcpu_debugged(vcpu)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VMM_CTR0(vcpu, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); if (error != 0) { if (vcpu_halted) { CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); } return (error); } vcpu_lock(vcpu); } } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int rv, ftype; struct vm_map *map; struct vm_exit *vme; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? 
"accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) { struct vie *vie; struct vm_exit *vme; uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, fault; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; VMM_CTR1(vcpu, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { error = vmm_fetch_instruction(vcpu, paging, vme->rip + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ error = fault = 0; } if (error || fault) return (error); if (vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, vie) != 0) { VMM_CTR1(vcpu, "Error decoding instruction at %#lx", vme->rip + cs_base); *retu = true; /* dump instruction bytes in userspace */ return (0); } /* * Update 'nextrip' based on the length of the emulated instruction. */ vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; VMM_CTR1(vcpu, "nextrip updated to %#lx after instruction decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vcpu, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int error, i; struct thread *td; error = 0; td = curthread; CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VMM_CTR0(vcpu, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VMM_CTR0(vcpu, "Sleeping during suspend"); vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } else { VMM_CTR0(vcpu, "Rendezvous during suspend"); vcpu_unlock(vcpu); error = vm_handle_rendezvous(vcpu); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. 
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm_vcpu(vm, i), false); } } *retu = true; return (error); } static int vm_handle_reqidle(struct vcpu *vcpu, bool *retu) { vcpu_lock(vcpu); KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); vcpu->reqidle = 0; vcpu_unlock(vcpu); *retu = true; return (0); } static int vm_handle_db(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { int error, fault; uint64_t rsp; uint64_t rflags; struct vm_copyinfo copyinfo[2]; *retu = true; if (!vme->u.dbg.pushf_intercept || vme->u.dbg.tf_shadow_val != 0) { return (0); } vm_get_register(vcpu, VM_REG_GUEST_RSP, &rsp); error = vm_copy_setup(vcpu, &vme->u.dbg.paging, rsp, sizeof(uint64_t), VM_PROT_RW, copyinfo, nitems(copyinfo), &fault); if (error != 0 || fault != 0) { *retu = false; return (EINVAL); } /* Read pushed rflags value from top of stack. */ vm_copyin(copyinfo, &rflags, sizeof(uint64_t)); /* Clear TF bit. */ rflags &= ~(PSL_T); /* Write updated value back to memory. */ vm_copyout(&rflags, copyinfo, sizeof(uint64_t)); vm_copy_teardown(copyinfo, nitems(copyinfo)); return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i), false); } return (0); } void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip) { struct vm *vm = vcpu->vm; struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_DEBUG; } void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vcpu, VMEXIT_RENDEZVOUS, 1); } void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_REQIDLE; vmm_stat_incr(vcpu, VMEXIT_REQIDLE, 1); } void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vcpu, VMEXIT_ASTPENDING, 1); } int vm_run(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct vm_eventinfo evinfo; int error, vcpuid; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; vcpuid = vcpu->vcpuid; if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_req_cpus; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, 
&pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); vcpu_require_state(vcpu, VCPU_RUNNING); error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vcpu, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vcpu, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vcpu, &retu); break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vcpu, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: error = vm_handle_rendezvous(vcpu); break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vcpu, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vcpu, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vcpu, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vcpu, vme, &retu); break; case VM_EXITCODE_DB: error = vm_handle_db(vcpu, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: case VM_EXITCODE_VMINSN: vm_inject_ud(vcpu); break; default: retu = true; /* handled in userland */ break; } } /* * VM_EXITCODE_INST_EMUL could access the apic which could transform the * exit code into VM_EXITCODE_IPI. */ if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) error = vm_handle_ipi(vcpu, vme, &retu); if (error == 0 && retu == false) goto restart; vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1); VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode); return (error); } int vm_restart_instruction(struct vcpu *vcpu) { enum vcpu_state state; uint64_t rip; int error __diagused; state = vcpu_get_state(vcpu, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VMM_CTR1(vcpu, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around vmmops_run() and 'nextrip' points to the next * instruction. Thus instruction restart is achieved by setting * 'nextrip' to the vcpu's %rip. 
*/ error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VMM_CTR2(vcpu, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vcpu *vcpu, uint64_t info) { int type, vector; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VMM_CTR2(vcpu, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. */ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vcpu *vcpu, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VMM_CTR2(vcpu, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vcpu->vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. 
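 * For example, a contributory fault such as #GP raised while the CPU is
 * delivering a #PF matches this test and is converted into a #DF below.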
*/ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *retinfo) { uint64_t info1, info2; int valid; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VMM_CTR2(vcpu, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vcpu, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VMM_CTR4(vcpu, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) { *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { uint64_t regval; int error __diagused; if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. */ if (vector == IDT_DF) return (EINVAL); if (vcpu->exception_pending) { VMM_CTR2(vcpu, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } if (errcode_valid) { /* * Exceptions don't deliver an error code in real mode. */ error = vm_get_register(vcpu, VM_REG_GUEST_CR0, &regval); KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); if (!(regval & CR0_PE)) errcode_valid = 0; } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception.
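 * The interrupt shadow is therefore cleared below, before the exception is
 * recorded as pending.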
*/ error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vcpu); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VMM_CTR1(vcpu, "Exception %d pending", vector); return (0); } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error __diagused, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2) { int error __diagused; VMM_CTR2(vcpu, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vcpu, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vcpu *vcpu) { vcpu->nmi_pending = 1; vcpu_notify_event(vcpu, false); return (0); } int vm_nmi_pending(struct vcpu *vcpu) { return (vcpu->nmi_pending); } void vm_nmi_clear(struct vcpu *vcpu) { if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vcpu *vcpu) { vcpu->extint_pending = 1; vcpu_notify_event(vcpu, false); return (0); } int vm_extint_pending(struct vcpu *vcpu) { return (vcpu->extint_pending); } void vm_extint_clear(struct vcpu *vcpu) { if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vcpu *vcpu, int type, int *retval) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_getcap(vcpu->cookie, type, retval)); } int vm_set_capability(struct vcpu *vcpu, int type, int val) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_setcap(vcpu->cookie, type, val)); } struct vm * vcpu_vm(struct vcpu *vcpu) { return (vcpu->vm); } int vcpu_vcpuid(struct vcpu *vcpu) { return (vcpu->vcpuid); } struct vcpu * vm_vcpu(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid]); } struct vlapic * vm_lapic(struct vcpu *vcpu) { return (vcpu->vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } bool vmm_is_pptdev(int bus, int slot, int func) { int b, f, i, n, s; char *val, *cp, *cp2; bool found; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! 
*/ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = false; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = true; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu) { enum vcpu_state state; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EBUSY); VMM_CTR0(vcpu, "activated"); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { vm->debug_cpus = vm->active_cpus; for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i), false); } } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); vcpu_notify_event(vcpu, false); } return (0); } int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vcpu *vcpu) { return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } /* * Returns the subset of vCPUs in tostart that are awaiting startup. * These vCPUs are also marked as no longer awaiting startup. */ cpuset_t vm_start_cpus(struct vm *vm, const cpuset_t *tostart) { cpuset_t set; mtx_lock(&vm->rendezvous_mtx); CPU_AND(&set, &vm->startup_cpus, tostart); CPU_ANDNOT(&vm->startup_cpus, &vm->startup_cpus, &set); mtx_unlock(&vm->rendezvous_mtx); return (set); } void vm_await_start(struct vm *vm, const cpuset_t *waiting) { mtx_lock(&vm->rendezvous_mtx); CPU_OR(&vm->startup_cpus, &vm->startup_cpus, waiting); mtx_unlock(&vm->rendezvous_mtx); } void * vcpu_stats(struct vcpu *vcpu) { return (vcpu->stats); } int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) { *state = vcpu->x2apic_state; return (0); } int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { if (state >= X2APIC_STATE_LAST) return (EINVAL); vcpu->x2apic_state = state; vlapic_set_x2apic_state(vcpu, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. 
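 * - Otherwise (the vcpu is idle or frozen) nothing needs to be done; the
 *   pending event is noticed before the vcpu next enters the guest.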
*/ static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) { vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } struct vmspace * vm_vmspace(struct vm *vm) { return (vm->vmspace); } struct vm_mem * vm_mem(struct vm *vm) { return (&vm->mem); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { struct vm *vm = vcpu->vm; int error, i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpu' is one * of the targets of the rendezvous. */ VMM_CTR0(vcpu, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); error = vm_handle_rendezvous(vcpu); if (error != 0) return (error); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); VMM_CTR0(vcpu, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm->rendezvous_func = func; mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. 
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm_vcpu(vm, i), false); } return (vm_handle_rendezvous(vcpu)); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { if (nused >= num_copyinfo) return (EFAULT); error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vcpu, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(copyinfo, num_copyinfo); return (EFAULT); } else { *fault = 0; return (0); } } void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. 
Since * these are global stats, only return the values for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vcpu->vm->vmspace)); } } static void vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); #ifdef BHYVE_SNAPSHOT static int vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) { uint64_t tsc, now; int ret; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm->vcpu[i]; if (vcpu == NULL) continue; SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); /* * Save the absolute TSC value by adding now to tsc_offset. * * It will be turned back into an actual offset when the * TSC restore function is called. */ tsc = now + vcpu->tsc_offset; SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) vcpu->tsc_offset = tsc; } done: return (ret); } static int vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) { int ret; ret = vm_snapshot_vcpus(vm, meta); if (ret != 0) goto done; SNAPSHOT_VAR_OR_LEAVE(vm->startup_cpus, meta, ret, done); done: return (ret); } static int vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta) { int error; struct vcpu *vcpu; uint16_t i, maxcpus; error = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm->vcpu[i]; if (vcpu == NULL) continue; error = vmmops_vcpu_snapshot(vcpu->cookie, meta); if (error != 0) { printf("%s: failed to snapshot vmcs/vmcb data for " "vCPU: %d; error: %d\n", __func__, i, error); goto done; } } done: return (error); } /* * Save kernel-side structures to user-space for snapshotting.
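 * The structure to serialize is selected by 'meta->dev_req'.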
*/ int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) { int ret = 0; switch (meta->dev_req) { case STRUCT_VMCX: ret = vm_snapshot_vcpu(vm, meta); break; case STRUCT_VM: ret = vm_snapshot_vm(vm, meta); break; case STRUCT_VIOAPIC: ret = vioapic_snapshot(vm_ioapic(vm), meta); break; case STRUCT_VLAPIC: ret = vlapic_snapshot(vm, meta); break; case STRUCT_VHPET: ret = vhpet_snapshot(vm_hpet(vm), meta); break; case STRUCT_VATPIC: ret = vatpic_snapshot(vm_atpic(vm), meta); break; case STRUCT_VATPIT: ret = vatpit_snapshot(vm_atpit(vm), meta); break; case STRUCT_VPMTMR: ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); break; case STRUCT_VRTC: ret = vrtc_snapshot(vm_rtc(vm), meta); break; default: printf("%s: failed to find the requested type %#x\n", __func__, meta->dev_req); ret = (EINVAL); } return (ret); } void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset) { vcpu->tsc_offset = offset; } int vm_restore_time(struct vm *vm) { int error; uint64_t now; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); error = vhpet_restore_time(vm_hpet(vm)); if (error) return (error); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm->vcpu[i]; if (vcpu == NULL) continue; error = vmmops_restore_tsc(vcpu->cookie, vcpu->tsc_offset - now); if (error) return (error); } return (0); } #endif diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h index e839b5dd92c9..6f5726f0bb1e 100644 --- a/sys/arm64/include/vmm.h +++ b/sys/arm64/include/vmm.h @@ -1,348 +1,379 @@ /* * Copyright (C) 2015 Mihai Carabas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include #include #include "pte.h" #include "pmap.h" struct vcpu; enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_DESTROY, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. 
*/ enum vm_reg_name { VM_REG_GUEST_X0 = 0, VM_REG_GUEST_X1, VM_REG_GUEST_X2, VM_REG_GUEST_X3, VM_REG_GUEST_X4, VM_REG_GUEST_X5, VM_REG_GUEST_X6, VM_REG_GUEST_X7, VM_REG_GUEST_X8, VM_REG_GUEST_X9, VM_REG_GUEST_X10, VM_REG_GUEST_X11, VM_REG_GUEST_X12, VM_REG_GUEST_X13, VM_REG_GUEST_X14, VM_REG_GUEST_X15, VM_REG_GUEST_X16, VM_REG_GUEST_X17, VM_REG_GUEST_X18, VM_REG_GUEST_X19, VM_REG_GUEST_X20, VM_REG_GUEST_X21, VM_REG_GUEST_X22, VM_REG_GUEST_X23, VM_REG_GUEST_X24, VM_REG_GUEST_X25, VM_REG_GUEST_X26, VM_REG_GUEST_X27, VM_REG_GUEST_X28, VM_REG_GUEST_X29, VM_REG_GUEST_LR, VM_REG_GUEST_SP, VM_REG_GUEST_PC, VM_REG_GUEST_CPSR, VM_REG_GUEST_SCTLR_EL1, VM_REG_GUEST_TTBR0_EL1, VM_REG_GUEST_TTBR1_EL1, VM_REG_GUEST_TCR_EL1, VM_REG_GUEST_TCR2_EL1, VM_REG_GUEST_MPIDR_EL1, VM_REG_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) #define VM_GUEST_BASE_IPA 0x80000000UL /* Guest kernel start ipa */ /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. * A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. 
*/ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL struct vm; struct vm_exception; struct vm_exit; struct vm_run; struct vm_object; struct vm_guest_paging; struct vm_vgic_descr; struct pmap; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; +#define DECLARE_VMMOPS_FUNC(ret_type, opname, args) \ + ret_type vmmops_##opname args + +DECLARE_VMMOPS_FUNC(int, modinit, (int ipinum)); +DECLARE_VMMOPS_FUNC(int, modcleanup, (void)); +DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap)); +DECLARE_VMMOPS_FUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault)); +DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, + struct vm_eventinfo *info)); +DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi)); +DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)); +DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui)); +DECLARE_VMMOPS_FUNC(int, exception, (void *vcpui, uint64_t esr, uint64_t far)); +DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)); +DECLARE_VMMOPS_FUNC(int, setreg, (void *vcpui, int num, uint64_t val)); +DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval)); +DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val)); +DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, + vm_offset_t max)); +DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace)); +#ifdef notyet +#ifdef BHYVE_SNAPSHOT +DECLARE_VMMOPS_FUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)); +DECLARE_VMMOPS_FUNC(int, vcpu_snapshot, (void *vcpui, + struct vm_snapshot_meta *meta)); +DECLARE_VMMOPS_FUNC(int, restore_tsc, (void *vcpui, uint64_t now)); +#endif +#endif + int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); void vm_slock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_run(struct vcpu *vcpu); int vm_suspend(struct vm *vm, enum vm_suspend_how how); void* vm_get_cookie(struct vm *vm); int vcpu_vcpuid(struct vcpu *vcpu); void *vcpu_get_cookie(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); int vm_get_capability(struct vcpu *vcpu, int type, int *val); int vm_set_capability(struct vcpu *vcpu, int type, int val); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); int vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far); int vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr); int vm_assert_irq(struct vm *vm, uint32_t irq); int vm_deassert_irq(struct vm *vm, uint32_t irq); int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, int func); struct 
vm_exit *vm_exitinfo(struct vcpu *vcpu); void vm_exit_suspended(struct vcpu *vcpu, uint64_t pc); void vm_exit_debug(struct vcpu *vcpu, uint64_t pc); void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t pc); void vm_exit_astpending(struct vcpu *vcpu, uint64_t pc); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } int vcpu_debugged(struct vcpu *vcpu); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vcpu *vcpu, int *hostcpu) { return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vcpu *vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu); struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; #endif /* _KERNEL */ #define VM_DIR_READ 0 #define VM_DIR_WRITE 1 #define VM_GP_M_MASK 0x1f #define VM_GP_MMU_ENABLED (1 << 5) struct vm_guest_paging { uint64_t ttbr0_addr; uint64_t ttbr1_addr; uint64_t tcr_el1; uint64_t tcr2_el1; int flags; int padding; }; struct vie { uint8_t access_size:4, sign_extend:1, dir:1, unused:2; enum vm_reg_name reg; }; struct vre { uint32_t inst_syndrome; uint8_t dir:1, unused:7; enum vm_reg_name reg; }; /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_BRK_EXIT, VM_CAP_SS_EXIT, VM_CAP_MASK_HWINTR, VM_CAP_MAX }; enum vm_exitcode { VM_EXITCODE_BOGUS, VM_EXITCODE_INST_EMUL, VM_EXITCODE_REG_EMUL, VM_EXITCODE_HVC, VM_EXITCODE_SUSPENDED, VM_EXITCODE_HYP, VM_EXITCODE_WFI, VM_EXITCODE_PAGING, VM_EXITCODE_SMCCC, VM_EXITCODE_DEBUG, VM_EXITCODE_BRK, VM_EXITCODE_SS, VM_EXITCODE_MAX }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; uint64_t pc; union { /* * ARM specific payload. */ struct { uint32_t exception_nr; uint32_t pad; uint64_t esr_el2; /* Exception Syndrome Register */ uint64_t far_el2; /* Fault Address Register */ uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ } hyp; struct { struct vre vre; } reg_emul; struct { uint64_t gpa; uint64_t esr; } paging; struct { uint64_t gpa; struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * A SMCCC call, e.g. starting a core via PSCI. * Further arguments can be read by asking the kernel for * all register values. */ struct { uint64_t func_id; uint64_t args[7]; } smccc_call; struct { enum vm_suspend_how how; } suspended; } u; }; #endif /* _VMM_H_ */ diff --git a/sys/arm64/vmm/arm64.h b/sys/arm64/vmm/arm64.h index 10261c6a6499..20cb3c312df5 100644 --- a/sys/arm64/vmm/arm64.h +++ b/sys/arm64/vmm/arm64.h @@ -1,178 +1,147 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2015 Mihai Carabas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _VMM_ARM64_H_ #define _VMM_ARM64_H_ #include #include #include #include "mmu.h" #include "io/vgic_v3.h" #include "io/vtimer.h" struct vgic_v3; struct vgic_v3_cpu; /* * Per-vCPU hypervisor state. */ struct hypctx { struct trapframe tf; /* * EL1 control registers. */ uint64_t elr_el1; /* Exception Link Register */ uint64_t sp_el0; /* Stack pointer */ uint64_t tpidr_el0; /* EL0 Software ID Register */ uint64_t tpidrro_el0; /* Read-only Thread ID Register */ uint64_t tpidr_el1; /* EL1 Software ID Register */ uint64_t vbar_el1; /* Vector Base Address Register */ uint64_t actlr_el1; /* Auxiliary Control Register */ uint64_t afsr0_el1; /* Auxiliary Fault Status Register 0 */ uint64_t afsr1_el1; /* Auxiliary Fault Status Register 1 */ uint64_t amair_el1; /* Auxiliary Memory Attribute Indirection Register */ uint64_t contextidr_el1; /* Current Process Identifier */ uint64_t cpacr_el1; /* Architectural Feature Access Control Register */ uint64_t csselr_el1; /* Cache Size Selection Register */ uint64_t esr_el1; /* Exception Syndrome Register */ uint64_t far_el1; /* Fault Address Register */ uint64_t mair_el1; /* Memory Attribute Indirection Register */ uint64_t mdccint_el1; /* Monitor DCC Interrupt Enable Register */ uint64_t mdscr_el1; /* Monitor Debug System Control Register */ uint64_t par_el1; /* Physical Address Register */ uint64_t sctlr_el1; /* System Control Register */ uint64_t tcr_el1; /* Translation Control Register */ uint64_t tcr2_el1; /* Translation Control Register 2 */ uint64_t ttbr0_el1; /* Translation Table Base Register 0 */ uint64_t ttbr1_el1; /* Translation Table Base Register 1 */ uint64_t spsr_el1; /* Saved Program Status Register */ uint64_t pmcr_el0; /* Performance Monitors Control Register */ uint64_t pmccntr_el0; uint64_t pmccfiltr_el0; uint64_t pmuserenr_el0; uint64_t pmselr_el0; uint64_t pmxevcntr_el0; uint64_t pmcntenset_el0; uint64_t pmintenset_el1; uint64_t pmovsset_el0; uint64_t pmevcntr_el0[31]; uint64_t pmevtyper_el0[31]; uint64_t dbgclaimset_el1; uint64_t dbgbcr_el1[16]; /* Debug Breakpoint Control Registers */ uint64_t dbgbvr_el1[16]; /* Debug Breakpoint Value Registers */ uint64_t dbgwcr_el1[16]; /* Debug Watchpoint Control Registers */ uint64_t dbgwvr_el1[16]; /* Debug Watchpoint Value Registers */ /* EL2 control registers */ uint64_t cptr_el2; /* Architectural Feature Trap Register */ uint64_t hcr_el2; /* Hypervisor Configuration Register */ uint64_t hcrx_el2; /* Extended 
Hypervisor Configuration Register */ uint64_t mdcr_el2; /* Monitor Debug Configuration Register */ uint64_t vpidr_el2; /* Virtualization Processor ID Register */ uint64_t vmpidr_el2; /* Virtualization Multiprocessor ID Register */ uint64_t el2_addr; /* The address of this in el2 space */ struct hyp *hyp; struct vcpu *vcpu; struct { uint64_t far_el2; /* Fault Address Register */ uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ } exit_info; struct vtimer_cpu vtimer_cpu; uint64_t setcaps; /* Currently enabled capabilities. */ /* vCPU state used to handle guest debugging. */ uint64_t debug_spsr; /* Saved guest SPSR */ uint64_t debug_mdscr; /* Saved guest MDSCR */ struct vgic_v3_regs vgic_v3_regs; struct vgic_v3_cpu *vgic_cpu; bool has_exception; bool dbg_oslock; }; struct hyp { struct vm *vm; struct vtimer vtimer; uint64_t vmid_generation; uint64_t vttbr_el2; uint64_t el2_addr; /* The address of this in el2 space */ bool vgic_attached; struct vgic_v3 *vgic; struct hypctx *ctx[]; }; -#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ - ret_type vmmops_##opname args; - -DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) -DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) -DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) -DEFINE_VMMOPS_IFUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa, int *is_fault)) -DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, - struct vm_eventinfo *info)) -DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) -DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, - int vcpu_id)) -DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) -DEFINE_VMMOPS_IFUNC(int, exception, (void *vcpui, uint64_t esr, uint64_t far)) -DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) -DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) -DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) -DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) -DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, - vm_offset_t max)) -DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) -#ifdef notyet -#ifdef BHYVE_SNAPSHOT -DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)) -DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, - struct vm_snapshot_meta *meta)) -DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) -#endif -#endif - uint64_t vmm_call_hyp(uint64_t, ...); #if 0 #define eprintf(fmt, ...) printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__) #else #define eprintf(fmt, ...) do {} while(0) #endif struct hypctx *arm64_get_active_vcpu(void); void raise_data_insn_abort(struct hypctx *, uint64_t, bool, int); #endif /* !_VMM_ARM64_H_ */ diff --git a/sys/riscv/include/vmm.h b/sys/riscv/include/vmm.h index de7119dd534a..43d3dbea968e 100644 --- a/sys/riscv/include/vmm.h +++ b/sys/riscv/include/vmm.h @@ -1,299 +1,322 @@ /* * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015 Mihai Carabas * Copyright (c) 2024 Ruslan Bukin * * This software was developed by the University of Cambridge Computer * Laboratory (Department of Computer Science and Technology) under Innovate * UK project 105694, "Digital Security by Design (DSbD) Technology Platform * Prototype". * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include #include #include "pte.h" #include "pmap.h" struct vcpu; enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_DESTROY, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_ZERO = 0, VM_REG_GUEST_RA, VM_REG_GUEST_SP, VM_REG_GUEST_GP, VM_REG_GUEST_TP, VM_REG_GUEST_T0, VM_REG_GUEST_T1, VM_REG_GUEST_T2, VM_REG_GUEST_S0, VM_REG_GUEST_S1, VM_REG_GUEST_A0, VM_REG_GUEST_A1, VM_REG_GUEST_A2, VM_REG_GUEST_A3, VM_REG_GUEST_A4, VM_REG_GUEST_A5, VM_REG_GUEST_A6, VM_REG_GUEST_A7, VM_REG_GUEST_S2, VM_REG_GUEST_S3, VM_REG_GUEST_S4, VM_REG_GUEST_S5, VM_REG_GUEST_S6, VM_REG_GUEST_S7, VM_REG_GUEST_S8, VM_REG_GUEST_S9, VM_REG_GUEST_S10, VM_REG_GUEST_S11, VM_REG_GUEST_T3, VM_REG_GUEST_T4, VM_REG_GUEST_T5, VM_REG_GUEST_T6, VM_REG_GUEST_SEPC, VM_REG_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) #define VM_MAX_NAMELEN 32 #define VM_MAX_SUFFIXLEN 15 #ifdef _KERNEL struct vm; struct vm_exception; struct vm_exit; struct vm_run; struct vm_object; struct vm_guest_paging; struct vm_aplic_descr; struct pmap; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; +#define DECLARE_VMMOPS_FUNC(ret_type, opname, args) \ + ret_type vmmops_##opname args + +DECLARE_VMMOPS_FUNC(int, modinit, (void)); +DECLARE_VMMOPS_FUNC(int, modcleanup, (void)); +DECLARE_VMMOPS_FUNC(void *, init, (struct vm *vm, struct pmap *pmap)); +DECLARE_VMMOPS_FUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault)); +DECLARE_VMMOPS_FUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, + struct vm_eventinfo *info)); +DECLARE_VMMOPS_FUNC(void, cleanup, (void *vmi)); +DECLARE_VMMOPS_FUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)); +DECLARE_VMMOPS_FUNC(void, vcpu_cleanup, (void *vcpui)); +DECLARE_VMMOPS_FUNC(int, exception, (void *vcpui, uint64_t scause)); +DECLARE_VMMOPS_FUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)); +DECLARE_VMMOPS_FUNC(int, setreg, (void 
*vcpui, int num, uint64_t val)); +DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval)); +DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val)); +DECLARE_VMMOPS_FUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, + vm_offset_t max)); +DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace)); + int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); void vm_slock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_run(struct vcpu *vcpu); int vm_suspend(struct vm *vm, enum vm_suspend_how how); void* vm_get_cookie(struct vm *vm); int vcpu_vcpuid(struct vcpu *vcpu); void *vcpu_get_cookie(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); int vm_get_capability(struct vcpu *vcpu, int type, int *val); int vm_set_capability(struct vcpu *vcpu, int type, int val); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); int vm_inject_exception(struct vcpu *vcpu, uint64_t scause); int vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr); int vm_assert_irq(struct vm *vm, uint32_t irq); int vm_deassert_irq(struct vm *vm, uint32_t irq); int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, int func); struct vm_exit *vm_exitinfo(struct vcpu *vcpu); void vm_exit_suspended(struct vcpu *vcpu, uint64_t pc); void vm_exit_debug(struct vcpu *vcpu, uint64_t pc); void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t pc); void vm_exit_astpending(struct vcpu *vcpu, uint64_t pc); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } int vcpu_debugged(struct vcpu *vcpu); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vcpu *vcpu, int *hostcpu) { return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vcpu *vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu); struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); enum vm_reg_name vm_segment_name(int seg_encoding); #endif /* _KERNEL */ #define VM_DIR_READ 0 #define VM_DIR_WRITE 1 #define VM_GP_M_MASK 0x1f #define VM_GP_MMU_ENABLED (1 << 5) struct vm_guest_paging { int flags; int padding; }; struct vie { uint8_t access_size:4, sign_extend:1, dir:1, unused:2; enum vm_reg_name 
reg; }; struct vre { uint32_t inst_syndrome; uint8_t dir:1, unused:7; enum vm_reg_name reg; }; /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_UNRESTRICTED_GUEST, VM_CAP_SSTC, VM_CAP_MAX }; enum vm_exitcode { VM_EXITCODE_BOGUS, VM_EXITCODE_ECALL, VM_EXITCODE_HYP, VM_EXITCODE_PAGING, VM_EXITCODE_SUSPENDED, VM_EXITCODE_DEBUG, VM_EXITCODE_INST_EMUL, VM_EXITCODE_WFI, VM_EXITCODE_MAX }; struct vm_exit { uint64_t scause; uint64_t sepc; uint64_t stval; uint64_t htval; uint64_t htinst; enum vm_exitcode exitcode; int inst_length; uint64_t pc; union { struct { uint64_t gpa; } paging; struct { uint64_t gpa; struct vm_guest_paging paging; struct vie vie; } inst_emul; struct { uint64_t args[8]; } ecall; struct { enum vm_suspend_how how; } suspended; struct { uint64_t scause; } hyp; } u; }; #endif /* _VMM_H_ */ diff --git a/sys/riscv/vmm/riscv.h b/sys/riscv/vmm/riscv.h index 870d0d6c5cd1..917a333520ed 100644 --- a/sys/riscv/vmm/riscv.h +++ b/sys/riscv/vmm/riscv.h @@ -1,159 +1,136 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015 Mihai Carabas * Copyright (c) 2024 Ruslan Bukin * * This software was developed by the University of Cambridge Computer * Laboratory (Department of Computer Science and Technology) under Innovate * UK project 105694, "Digital Security by Design (DSbD) Technology Platform * Prototype". * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifndef _VMM_RISCV_H_ #define _VMM_RISCV_H_ #include #include #include #include struct hypregs { uint64_t hyp_ra; uint64_t hyp_sp; uint64_t hyp_gp; uint64_t hyp_tp; uint64_t hyp_t[7]; uint64_t hyp_s[12]; uint64_t hyp_a[8]; uint64_t hyp_sepc; uint64_t hyp_sstatus; uint64_t hyp_hstatus; }; struct hypcsr { uint64_t hvip; uint64_t vsstatus; uint64_t vsie; uint64_t vstvec; uint64_t vsscratch; uint64_t vsepc; uint64_t vscause; uint64_t vstval; uint64_t vsatp; uint64_t scounteren; uint64_t senvcfg; }; enum vmm_fence_type { VMM_RISCV_FENCE_INVALID = 0, VMM_RISCV_FENCE_I, VMM_RISCV_FENCE_VMA, VMM_RISCV_FENCE_VMA_ASID, }; struct vmm_fence { enum vmm_fence_type type; size_t start; size_t size; uint64_t asid; }; struct hypctx { struct hypregs host_regs; struct hypregs guest_regs; struct hypcsr guest_csrs; uint64_t host_sscratch; uint64_t host_stvec; uint64_t host_scounteren; uint64_t guest_scounteren; struct hyp *hyp; struct vcpu *vcpu; bool has_exception; int cpu_id; int ipi_pending; int interrupts_pending; struct vtimer vtimer; struct vmm_fence *fence_queue; struct mtx fence_queue_mtx; int fence_queue_head; int fence_queue_tail; #define FENCE_REQ_I (1 << 0) #define FENCE_REQ_VMA (1 << 1) int fence_req; }; struct hyp { struct vm *vm; uint64_t vmid_generation; bool aplic_attached; struct aplic *aplic; struct hypctx *ctx[]; }; struct hyptrap { uint64_t sepc; uint64_t scause; uint64_t stval; uint64_t htval; uint64_t htinst; }; -#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ - ret_type vmmops_##opname args; - -DEFINE_VMMOPS_IFUNC(int, modinit, (void)) -DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) -DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) -DEFINE_VMMOPS_IFUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa, int *is_fault)) -DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, - struct vm_eventinfo *info)) -DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) -DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, - int vcpu_id)) -DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) -DEFINE_VMMOPS_IFUNC(int, exception, (void *vcpui, uint64_t scause)) -DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) -DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) -DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) -DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) -DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, - vm_offset_t max)) -DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) - #define dprintf(fmt, ...) struct hypctx *riscv_get_active_vcpu(void); void vmm_switch(struct hypctx *); void vmm_unpriv_trap(struct hyptrap *, uint64_t tmp); bool vmm_sbi_ecall(struct vcpu *); void riscv_send_ipi(struct hyp *hyp, cpuset_t *cpus); int riscv_check_ipi(struct hypctx *hypctx, bool clear); bool riscv_check_interrupts_pending(struct hypctx *hypctx); #endif /* !_VMM_RISCV_H_ */
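Note on the vmmops interface: on arm64 and riscv, DECLARE_VMMOPS_FUNC expands to a plain function prototype (ret_type vmmops_<name> args), so the platform backend is expected to define functions with exactly these signatures, which the machine-independent code then calls directly (see vm_get_capability() and vm_set_capability() above). Below is a minimal sketch of a getcap/setcap pair under that contract; the type 'struct example_vcpu' and its 'caps' bitmask are hypothetical and only illustrate the calling convention (arm64's real per-vCPU state keeps a comparable 'setcaps' field in struct hypctx). It is not part of this change.

/*
 * Illustrative sketch only -- not part of the patch.  Assumes a
 * hypothetical backend whose per-vcpu cookie is a 'struct example_vcpu'
 * that tracks enabled capabilities in a bitmask.
 */
#include <sys/types.h>
#include <sys/errno.h>

#include <machine/vmm.h>

struct example_vcpu {
	struct vcpu	*vcpu;		/* back pointer to the generic vcpu */
	int		vcpu_id;
	uint32_t	caps;		/* bitmask indexed by VM_CAP_* */
};

/* Matches DECLARE_VMMOPS_FUNC(int, getcap, (void *vcpui, int num, int *retval)). */
int
vmmops_getcap(void *vcpui, int num, int *retval)
{
	struct example_vcpu *evc = vcpui;

	if (num < 0 || num >= VM_CAP_MAX)
		return (EINVAL);
	*retval = (evc->caps & (1u << num)) != 0;
	return (0);
}

/* Matches DECLARE_VMMOPS_FUNC(int, setcap, (void *vcpui, int num, int val)). */
int
vmmops_setcap(void *vcpui, int num, int val)
{
	struct example_vcpu *evc = vcpui;

	if (num < 0 || num >= VM_CAP_MAX)
		return (EINVAL);
	if (val != 0)
		evc->caps |= (1u << num);
	else
		evc->caps &= ~(1u << num);
	return (0);
}

The other entries in the table follow the same pattern: the opaque 'vcpui' cookie produced by the backend's vcpu_init hook is handed back to the backend on every per-vcpu call (as vm_get_capability() does with vcpu->cookie), so the generic vmm code never needs to know the backend's per-vcpu layout.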