diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 1c68181f5ff4..23ab8427cb74 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -1,817 +1,816 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include struct vm_snapshot_meta; #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. * A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. */ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MIN_NAMELEN 6 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vcpu; struct vm; struct vm_exception; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; enum snapshot_req; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef void * (*vmi_vcpu_init_func_t)(void *vmi, struct vcpu *vcpu, int vcpu_id); typedef void (*vmi_vcpu_cleanup_func_t)(void *vcpui); typedef int (*vmi_get_register_t)(void *vcpui, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vcpui, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vcpui, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vcpui, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vcpui); typedef void (*vmi_vlapic_cleanup)(struct vlapic *vlapic); typedef int (*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta); typedef int (*vmi_snapshot_vcpu_t)(void *vcpui, struct vm_snapshot_meta *meta); typedef int (*vmi_restore_tsc_t)(void *vcpui, uint64_t now); struct vmm_ops { vmm_init_func_t modinit; /* module wide initialization */ vmm_cleanup_func_t modcleanup; vmm_resume_func_t modresume; vmi_init_func_t init; /* vm-specific initialization */ vmi_run_func_t run; vmi_cleanup_func_t cleanup; vmi_vcpu_init_func_t vcpu_init; vmi_vcpu_cleanup_func_t vcpu_cleanup; vmi_get_register_t getreg; vmi_set_register_t setreg; vmi_get_desc_t getdesc; vmi_set_desc_t setdesc; vmi_get_cap_t getcap; vmi_set_cap_t setcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; /* checkpoint operations */ vmi_snapshot_t snapshot; vmi_snapshot_vcpu_t vcpu_snapshot; vmi_restore_tsc_t restore_tsc; }; extern const struct vmm_ops vmm_ops_intel; extern const struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); /* * APIs that modify the guest memory map require all vcpus to be frozen. */ int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); /* * APIs that inspect the guest memory map require only a *single* vcpu to * be frozen. This acts like a read lock on the guest memory map since any * modification requires *all* vcpus to be frozen. */ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object **objptr); vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); void *vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int prot, void **cookie); void *vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int prot, void **cookie); void *vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); bool vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *ret_desc); -int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, +int vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc); -int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_run(struct vcpu *vcpu, struct vm_exit *vme_user); int vm_suspend(struct vm *vm, enum vm_suspend_how how); -int vm_inject_nmi(struct vm *vm, int vcpu); +int vm_inject_nmi(struct vcpu *vcpu); int vm_nmi_pending(struct vcpu *vcpu); void vm_nmi_clear(struct vcpu *vcpu); -int vm_inject_extint(struct vm *vm, int vcpu); +int vm_inject_extint(struct vcpu *vcpu); int vm_extint_pending(struct vcpu *vcpu); void vm_extint_clear(struct vcpu *vcpu); int vcpu_vcpuid(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vcpu *vcpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); -int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); -int vm_set_capability(struct vm *vm, int vcpu, int type, int val); -int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); -int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); +int vm_get_capability(struct vcpu *vcpu, int type, int *val); +int vm_set_capability(struct vcpu *vcpu, int type, int val); +int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); -int vm_activate_cpu(struct vm *vm, int vcpu); -int vm_suspend_cpu(struct vm *vm, int vcpu); -int vm_resume_cpu(struct vm *vm, int vcpu); +int vm_activate_cpu(struct vcpu *vcpu); +int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); +int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); int vm_restart_instruction(struct vcpu *vcpu); struct vm_exit *vm_exitinfo(struct vcpu *vcpu); void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip); void vm_exit_debug(struct vcpu *vcpu, uint64_t rip); void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip); void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip); void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip); int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); int vm_restore_time(struct vm *vm); #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * If the rendezvous is being initiated from a vcpu context then the * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vcpu *vcpu, void *arg); int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vcpu *vcpu); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. */ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; -int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, - bool from_idle); +int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vcpu *vcpu, int *hostcpu) { return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vcpu *vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); -void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vcpu *vcpu, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vcpu *vcpu, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. */ int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *info); -int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); +int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2); /* * Function used to keep track of the guest's TSC offset. The * offset is used by the virutalization extensions to provide a consistent * value for the Time Stamp Counter to the guest. */ void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vcpu *vcpu); int vcpu_trap_wbinvd(struct vcpu *vcpu); #endif /* KERNEL */ #ifdef _KERNEL #define VM_MAXCPU 16 /* maximum virtual cpus */ #endif /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_RDPID, VM_CAP_RDTSCP, VM_CAP_IPI_EXIT, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, PAGING_MODE_64_LA57, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. MOV) */ uint16_t op_flags; }; _Static_assert(sizeof(struct vie_op) == 4, "ABI"); _Static_assert(_Alignof(struct vie_op) == 2, "ABI"); #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ /* The following fields are all zeroed upon restart. */ #define vie_startzero num_processed uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ vex_present:1, /* VEX prefixed */ vex_l:1, /* L bit */ index:4, /* SIB byte */ base:4; /* SIB byte */ uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; uint8_t vex_reg:4, /* vvvv: first source register specifier */ vex_pp:2, /* pp */ _sparebits:2; uint8_t _sparebytes[2]; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ uint8_t _sparebyte; struct vie_op op; /* opcode description */ }; _Static_assert(sizeof(struct vie) == 64, "ABI"); _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI"); _Static_assert(__offsetof(struct vie, scale) == 24, "ABI"); _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI"); enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, VM_EXITCODE_IPI, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. */ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { int inst_length; } bpt; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct { uint32_t mode; uint8_t vector; cpuset_t dmask; } ipi; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ #ifdef _KERNEL void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_UD, 0, 0); } static __inline void vm_inject_gp(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_GP, 1, 0); } static __inline void vm_inject_ac(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_SS, 1, errcode); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2); #else void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_UD, 0, 0); } static __inline void vm_inject_gp(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_GP, 1, 0); } static __inline void vm_inject_ac(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_SS, 1, errcode); } void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); #endif #endif /* _VMM_H_ */ diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c index 1d4ccfb9d0ca..5ece5d433244 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -1,852 +1,852 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vioapic.h" #include "vatpic.h" static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); #define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) #define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) #define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) enum irqstate { IRQSTATE_ASSERT, IRQSTATE_DEASSERT, IRQSTATE_PULSE }; struct atpic { bool ready; int icw_num; int rd_cmd_reg; bool aeoi; bool poll; bool rotate; bool sfn; /* special fully-nested mode */ int irq_base; uint8_t request; /* Interrupt Request Register (IIR) */ uint8_t service; /* Interrupt Service (ISR) */ uint8_t mask; /* Interrupt Mask Register (IMR) */ uint8_t smm; /* special mask mode */ int acnt[8]; /* sum of pin asserts and deasserts */ int lowprio; /* lowest priority irq */ bool intr_raised; }; struct vatpic { struct vm *vm; struct mtx mtx; struct atpic atpic[2]; uint8_t elc[2]; }; #define VATPIC_CTR0(vatpic, fmt) \ VM_CTR0((vatpic)->vm, fmt) #define VATPIC_CTR1(vatpic, fmt, a1) \ VM_CTR1((vatpic)->vm, fmt, a1) #define VATPIC_CTR2(vatpic, fmt, a1, a2) \ VM_CTR2((vatpic)->vm, fmt, a1, a2) #define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) #define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) /* * Loop over all the pins in priority order from highest to lowest. */ #define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ tmpvar < 8; \ tmpvar++, pinvar = (pinvar + 1) & 0x7) static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); static __inline bool master_atpic(struct vatpic *vatpic, struct atpic *atpic) { if (atpic == &vatpic->atpic[0]) return (true); else return (false); } static __inline int vatpic_get_highest_isrpin(struct atpic *atpic) { int bit, pin; int i; ATPIC_PIN_FOREACH(pin, atpic, i) { bit = (1 << pin); if (atpic->service & bit) { /* * An IS bit that is masked by an IMR bit will not be * cleared by a non-specific EOI in Special Mask Mode. */ if (atpic->smm && (atpic->mask & bit) != 0) continue; else return (pin); } } return (-1); } static __inline int vatpic_get_highest_irrpin(struct atpic *atpic) { int serviced; int bit, pin, tmp; /* * In 'Special Fully-Nested Mode' when an interrupt request from * a slave is in service, the slave is not locked out from the * master's priority logic. */ serviced = atpic->service; if (atpic->sfn) serviced &= ~(1 << 2); /* * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits * further interrupts at that level and enables interrupts from all * other levels that are not masked. In other words the ISR has no * bearing on the levels that can generate interrupts. */ if (atpic->smm) serviced = 0; ATPIC_PIN_FOREACH(pin, atpic, tmp) { bit = 1 << pin; /* * If there is already an interrupt in service at the same * or higher priority then bail. */ if ((serviced & bit) != 0) break; /* * If an interrupt is asserted and not masked then return * the corresponding 'pin' to the caller. */ if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) return (pin); } return (-1); } static void vatpic_notify_intr(struct vatpic *vatpic) { struct atpic *atpic; int pin; KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); /* * First check the slave. */ atpic = &vatpic->atpic[1]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * Cascade the request from the slave to the master. */ atpic->intr_raised = true; vatpic_set_pinstate(vatpic, 2, true); vatpic_set_pinstate(vatpic, 2, false); } else { VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } /* * Then check the master. */ atpic = &vatpic->atpic[0]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic master notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * From Section 3.6.2, "Interrupt Modes", in the * MPtable Specification, Version 1.4 * * PIC interrupts are routed to both the Local APIC * and the I/O APIC to support operation in 1 of 3 * modes. * * 1. Legacy PIC Mode: the PIC effectively bypasses * all APIC components. In this mode the local APIC is * disabled and LINT0 is reconfigured as INTR to * deliver the PIC interrupt directly to the CPU. * * 2. Virtual Wire Mode: the APIC is treated as a * virtual wire which delivers interrupts from the PIC * to the CPU. In this mode LINT0 is programmed as * ExtINT to indicate that the PIC is the source of * the interrupt. * * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are * fielded by the I/O APIC and delivered to the appropriate * CPU. In this mode the I/O APIC input 0 is programmed * as ExtINT to indicate that the PIC is the source of the * interrupt. */ atpic->intr_raised = true; - lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); + lapic_set_local_intr(vatpic->vm, NULL, APIC_LVT_LINT0); vioapic_pulse_irq(vatpic->vm, 0); } else { VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } } static int vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); atpic->ready = false; atpic->icw_num = 1; atpic->request = 0; atpic->mask = 0; atpic->lowprio = 7; atpic->rd_cmd_reg = 0; atpic->poll = 0; atpic->smm = 0; if ((val & ICW1_SNGL) != 0) { VATPIC_CTR0(vatpic, "vatpic cascade mode required"); return (-1); } if ((val & ICW1_IC4) == 0) { VATPIC_CTR0(vatpic, "vatpic icw4 required"); return (-1); } atpic->icw_num++; return (0); } static int vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); atpic->irq_base = val & 0xf8; atpic->icw_num++; return (0); } static int vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); atpic->icw_num++; return (0); } static int vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); if ((val & ICW4_8086) == 0) { VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); return (-1); } if ((val & ICW4_AEOI) != 0) atpic->aeoi = true; if ((val & ICW4_SFNM) != 0) { if (master_atpic(vatpic, atpic)) { atpic->sfn = true; } else { VATPIC_CTR1(vatpic, "Ignoring special fully nested " "mode on slave atpic: %#x", val); } } atpic->icw_num = 0; atpic->ready = true; return (0); } static int vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); atpic->mask = val & 0xff; return (0); } static int vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); atpic->rotate = ((val & OCW2_R) != 0); if ((val & OCW2_EOI) != 0) { int isr_bit; if ((val & OCW2_SL) != 0) { /* specific EOI */ isr_bit = val & 0x7; } else { /* non-specific EOI */ isr_bit = vatpic_get_highest_isrpin(atpic); } if (isr_bit != -1) { atpic->service &= ~(1 << isr_bit); if (atpic->rotate) atpic->lowprio = isr_bit; } } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { /* specific priority */ atpic->lowprio = val & 0x7; } return (0); } static int vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); if (val & OCW3_ESMM) { atpic->smm = val & OCW3_SMM ? 1 : 0; VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", master_atpic(vatpic, atpic) ? "master" : "slave", atpic->smm ? "enabled" : "disabled"); } if (val & OCW3_RR) { /* read register command */ atpic->rd_cmd_reg = val & OCW3_RIS; /* Polling mode */ atpic->poll = ((val & OCW3_P) != 0); } return (0); } static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) { struct atpic *atpic; int oldcnt, newcnt; bool level; KASSERT(pin >= 0 && pin < 16, ("vatpic_set_pinstate: invalid pin number %d", pin)); KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_set_pinstate: vatpic is not locked")); atpic = &vatpic->atpic[pin >> 3]; oldcnt = atpic->acnt[pin & 0x7]; if (newstate) atpic->acnt[pin & 0x7]++; else atpic->acnt[pin & 0x7]--; newcnt = atpic->acnt[pin & 0x7]; if (newcnt < 0) { VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); } level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { /* rising edge or level */ VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); atpic->request |= (1 << (pin & 0x7)); } else if (oldcnt == 1 && newcnt == 0) { /* falling edge */ VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); if (level) atpic->request &= ~(1 << (pin & 0x7)); } else { VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", pin, newstate ? "asserted" : "deasserted", newcnt); } vatpic_notify_intr(vatpic); } static int vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) { struct vatpic *vatpic; struct atpic *atpic; if (irq < 0 || irq > 15) return (EINVAL); vatpic = vm_atpic(vm); atpic = &vatpic->atpic[irq >> 3]; if (atpic->ready == false) return (0); VATPIC_LOCK(vatpic); switch (irqstate) { case IRQSTATE_ASSERT: vatpic_set_pinstate(vatpic, irq, true); break; case IRQSTATE_DEASSERT: vatpic_set_pinstate(vatpic, irq, false); break; case IRQSTATE_PULSE: vatpic_set_pinstate(vatpic, irq, true); vatpic_set_pinstate(vatpic, irq, false); break; default: panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); } VATPIC_UNLOCK(vatpic); return (0); } int vatpic_assert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); } int vatpic_deassert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); } int vatpic_pulse_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) { struct vatpic *vatpic; if (irq < 0 || irq > 15) return (EINVAL); /* * See comment in vatpic_elc_handler. These IRQs must be * edge triggered. */ if (trigger == LEVEL_TRIGGER) { switch (irq) { case 0: case 1: case 2: case 8: case 13: return (EINVAL); } } vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); if (trigger == LEVEL_TRIGGER) vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); else vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); VATPIC_UNLOCK(vatpic); return (0); } void vatpic_pending_intr(struct vm *vm, int *vecptr) { struct vatpic *vatpic; struct atpic *atpic; int pin; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; VATPIC_LOCK(vatpic); pin = vatpic_get_highest_irrpin(atpic); if (pin == 2) { atpic = &vatpic->atpic[1]; pin = vatpic_get_highest_irrpin(atpic); } /* * If there are no pins active at this moment then return the spurious * interrupt vector instead. */ if (pin == -1) pin = 7; KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); *vecptr = atpic->irq_base + pin; VATPIC_UNLOCK(vatpic); } static void vatpic_pin_accepted(struct atpic *atpic, int pin) { atpic->intr_raised = false; if (atpic->acnt[pin] == 0) atpic->request &= ~(1 << pin); if (atpic->aeoi == true) { if (atpic->rotate == true) atpic->lowprio = pin; } else { atpic->service |= (1 << pin); } } void vatpic_intr_accepted(struct vm *vm, int vector) { struct vatpic *vatpic; int pin; vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); pin = vector & 0x7; if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { vatpic_pin_accepted(&vatpic->atpic[1], pin); /* * If this vector originated from the slave, * accept the cascaded interrupt too. */ vatpic_pin_accepted(&vatpic->atpic[0], 2); } else { vatpic_pin_accepted(&vatpic->atpic[0], pin); } vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); } static int vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int pin; VATPIC_LOCK(vatpic); if (atpic->poll) { atpic->poll = 0; pin = vatpic_get_highest_irrpin(atpic); if (pin >= 0) { vatpic_pin_accepted(atpic, pin); *eax = 0x80 | pin; } else { *eax = 0; } } else { if (port & ICU_IMR_OFFSET) { /* read interrrupt mask register */ *eax = atpic->mask; } else { if (atpic->rd_cmd_reg == OCW3_RIS) { /* read interrupt service register */ *eax = atpic->service; } else { /* read interrupt request register */ *eax = atpic->request; } } } VATPIC_UNLOCK(vatpic); return (0); } static int vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int error; uint8_t val; error = 0; val = *eax; VATPIC_LOCK(vatpic); if (port & ICU_IMR_OFFSET) { switch (atpic->icw_num) { case 2: error = vatpic_icw2(vatpic, atpic, val); break; case 3: error = vatpic_icw3(vatpic, atpic, val); break; case 4: error = vatpic_icw4(vatpic, atpic, val); break; default: error = vatpic_ocw1(vatpic, atpic, val); break; } } else { if (val & (1 << 4)) error = vatpic_icw1(vatpic, atpic, val); if (atpic->ready) { if (val & (1 << 3)) error = vatpic_ocw3(vatpic, atpic, val); else error = vatpic_ocw2(vatpic, atpic, val); } } if (atpic->ready) vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); return (error); } int vatpic_master_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_slave_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[1]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_elc_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; bool is_master; vatpic = vm_atpic(vm); is_master = (port == IO_ELCR1); if (bytes != 1) return (-1); VATPIC_LOCK(vatpic); if (in) { if (is_master) *eax = vatpic->elc[0]; else *eax = vatpic->elc[1]; } else { /* * For the master PIC the cascade channel (IRQ2), the * heart beat timer (IRQ0), and the keyboard * controller (IRQ1) cannot be programmed for level * mode. * * For the slave PIC the real time clock (IRQ8) and * the floating point error interrupt (IRQ13) cannot * be programmed for level mode. */ if (is_master) vatpic->elc[0] = (*eax & 0xf8); else vatpic->elc[1] = (*eax & 0xde); } VATPIC_UNLOCK(vatpic); return (0); } struct vatpic * vatpic_init(struct vm *vm) { struct vatpic *vatpic; vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); vatpic->vm = vm; mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); return (vatpic); } void vatpic_cleanup(struct vatpic *vatpic) { free(vatpic, M_VATPIC); } #ifdef BHYVE_SNAPSHOT int vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta) { int ret; int i; struct atpic *atpic; for (i = 0; i < nitems(vatpic->atpic); i++) { atpic = &vatpic->atpic[i]; SNAPSHOT_VAR_OR_LEAVE(atpic->ready, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->icw_num, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->rd_cmd_reg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->aeoi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->poll, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->rotate, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->sfn, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->irq_base, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->request, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->service, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->mask, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->smm, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(atpic->acnt, sizeof(atpic->acnt), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->lowprio, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(atpic->intr_raised, meta, ret, done); } SNAPSHOT_BUF_OR_LEAVE(vatpic->elc, sizeof(vatpic->elc), meta, ret, done); done: return (ret); } #endif diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 664852260943..bcfe6b498721 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -1,1920 +1,1928 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vlapic.h" #include "vlapic_priv.h" #include "vioapic.h" #define PRIO(x) ((x) >> 4) #define VLAPIC_VERSION (0x14) #define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) /* * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the * vlapic_callout_handler() and vcpu accesses to: * - timer_freq_bt, timer_period_bt, timer_fire_bt * - timer LVT register */ #define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) /* * APIC timer frequency: * - arbitrary but chosen to be in the ballpark of contemporary hardware. * - power-of-two to avoid loss of precision when converted to a bintime. */ #define VLAPIC_BUS_FREQ (128 * 1024 * 1024) static void vlapic_set_error(struct vlapic *, uint32_t, bool); static void vlapic_callout_handler(void *arg); static void vlapic_reset(struct vlapic *vlapic); static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) { if (x2apic(vlapic)) return (vlapic->vcpuid); else return (vlapic->vcpuid << 24); } static uint32_t x2apic_ldr(struct vlapic *vlapic) { int apicid; uint32_t ldr; apicid = vlapic_get_id(vlapic); ldr = 1 << (apicid & 0xf); ldr |= (apicid & 0xffff0) << 12; return (ldr); } void vlapic_dfr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; if (x2apic(vlapic)) { VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", lapic->dfr); lapic->dfr = 0; return; } lapic->dfr &= APIC_DFR_MODEL_MASK; lapic->dfr |= APIC_DFR_RESERVED; if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); else VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); } void vlapic_ldr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; /* LDR is read-only in x2apic mode */ if (x2apic(vlapic)) { VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", lapic->ldr); lapic->ldr = x2apic_ldr(vlapic); } else { lapic->ldr &= ~APIC_LDR_RESERVED; VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); } } void vlapic_id_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; /* * We don't allow the ID register to be modified so reset it back to * its default value. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); } static int vlapic_timer_divisor(uint32_t dcr) { switch (dcr & 0xB) { case APIC_TDCR_1: return (1); case APIC_TDCR_2: return (2); case APIC_TDCR_4: return (4); case APIC_TDCR_8: return (8); case APIC_TDCR_16: return (16); case APIC_TDCR_32: return (32); case APIC_TDCR_64: return (64); case APIC_TDCR_128: return (128); default: panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); } } #if 0 static inline void vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) { printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, *lvt & APIC_LVTT_M); } #endif static uint32_t vlapic_get_ccr(struct vlapic *vlapic) { struct bintime bt_now, bt_rem; struct LAPIC *lapic __diagused; uint32_t ccr; ccr = 0; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); if (callout_active(&vlapic->callout)) { /* * If the timer is scheduled to expire in the future then * compute the value of 'ccr' based on the remaining time. */ binuptime(&bt_now); if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { bt_rem = vlapic->timer_fire_bt; bintime_sub(&bt_rem, &bt_now); ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; } } KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " "icr_timer is %#x", ccr, lapic->icr_timer)); VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", ccr, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); return (ccr); } void vlapic_dcr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; int divisor; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); divisor = vlapic_timer_divisor(lapic->dcr_timer); VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", lapic->dcr_timer, divisor); /* * Update the timer frequency and the timer period. * * XXX changes to the frequency divider will not take effect until * the timer is reloaded. */ FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_esr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->esr = vlapic->esr_pending; vlapic->esr_pending = 0; } int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *irrptr, *tmrptr, mask; int idx; KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); lapic = vlapic->apic_page; if (!(lapic->svr & APIC_SVR_ENABLE)) { VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " "interrupt %d", vector); return (0); } if (vector < 16) { vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); } if (vlapic->ops.set_intr_ready) return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); idx = (vector / 32) * 4; mask = 1 << (vector % 32); irrptr = &lapic->irr0; atomic_set_int(&irrptr[idx], mask); /* * Verify that the trigger-mode of the interrupt matches with * the vlapic TMR registers. */ tmrptr = &lapic->tmr0; if ((tmrptr[idx] & mask) != (level ? mask : 0)) { VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " "interrupt is %s-triggered", idx / 4, tmrptr[idx], level ? "level" : "edge"); } VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); return (1); } static __inline uint32_t * vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { struct LAPIC *lapic = vlapic->apic_page; int i; switch (offset) { case APIC_OFFSET_CMCI_LVT: return (&lapic->lvt_cmci); case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; return ((&lapic->lvt_timer) + i); default: panic("vlapic_get_lvt: invalid LVT\n"); } } static __inline int lvt_off_to_idx(uint32_t offset) { int index; switch (offset) { case APIC_OFFSET_CMCI_LVT: index = APIC_LVT_CMCI; break; case APIC_OFFSET_TIMER_LVT: index = APIC_LVT_TIMER; break; case APIC_OFFSET_THERM_LVT: index = APIC_LVT_THERMAL; break; case APIC_OFFSET_PERF_LVT: index = APIC_LVT_PMC; break; case APIC_OFFSET_LINT0_LVT: index = APIC_LVT_LINT0; break; case APIC_OFFSET_LINT1_LVT: index = APIC_LVT_LINT1; break; case APIC_OFFSET_ERROR_LVT: index = APIC_LVT_ERROR; break; default: index = -1; break; } KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " "invalid lvt index %d for offset %#x", index, offset)); return (index); } static __inline uint32_t vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) { int idx; uint32_t val; idx = lvt_off_to_idx(offset); val = atomic_load_acq_32(&vlapic->lvt_last[idx]); return (val); } void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) { uint32_t *lvtptr, mask, val; struct LAPIC *lapic; int idx; lapic = vlapic->apic_page; lvtptr = vlapic_get_lvtptr(vlapic, offset); val = *lvtptr; idx = lvt_off_to_idx(offset); if (!(lapic->svr & APIC_SVR_ENABLE)) val |= APIC_LVT_M; mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; switch (offset) { case APIC_OFFSET_TIMER_LVT: mask |= APIC_LVTT_TM; break; case APIC_OFFSET_ERROR_LVT: break; case APIC_OFFSET_LINT0_LVT: case APIC_OFFSET_LINT1_LVT: mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; /* FALLTHROUGH */ default: mask |= APIC_LVT_DM; break; } val &= mask; *lvtptr = val; atomic_store_rel_32(&vlapic->lvt_last[idx], val); } static void vlapic_mask_lvts(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; lapic->lvt_cmci |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); lapic->lvt_timer |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); lapic->lvt_thermal |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); lapic->lvt_pcint |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); lapic->lvt_lint0 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); lapic->lvt_lint1 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); lapic->lvt_error |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); } static int vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) { uint32_t mode, reg, vec; reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); if (reg & APIC_LVT_M) return (0); vec = reg & APIC_LVT_VECTOR; mode = reg & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, lvt == APIC_LVT_ERROR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) - vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true); + vcpu_notify_event(vlapic->vcpu, true); break; case APIC_LVT_DM_NMI: - vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + vm_inject_nmi(vlapic->vcpu); break; case APIC_LVT_DM_EXTINT: - vm_inject_extint(vlapic->vm, vlapic->vcpuid); + vm_inject_extint(vlapic->vcpu); break; default: // Other modes ignored return (0); } return (1); } #if 1 static void dump_isrvec_stk(struct vlapic *vlapic) { int i; uint32_t *isrptr; isrptr = &vlapic->apic_page->isr0; for (i = 0; i < 8; i++) printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); for (i = 0; i <= vlapic->isrvec_stk_top; i++) printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); } #endif /* * Algorithm adopted from section "Interrupt, Task and Processor Priority" * in Intel Architecture Manual Vol 3a. */ static void vlapic_update_ppr(struct vlapic *vlapic) { int isrvec, tpr, ppr; /* * Note that the value on the stack at index 0 is always 0. * * This is a placeholder for the value of ISRV when none of the * bits is set in the ISRx registers. */ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; tpr = vlapic->apic_page->tpr; #if 1 { int i, lastprio, curprio, vector, idx; uint32_t *isrptr; if (vlapic->isrvec_stk_top == 0 && isrvec != 0) panic("isrvec_stk is corrupted: %d", isrvec); /* * Make sure that the priority of the nested interrupts is * always increasing. */ lastprio = -1; for (i = 1; i <= vlapic->isrvec_stk_top; i++) { curprio = PRIO(vlapic->isrvec_stk[i]); if (curprio <= lastprio) { dump_isrvec_stk(vlapic); panic("isrvec_stk does not satisfy invariant"); } lastprio = curprio; } /* * Make sure that each bit set in the ISRx registers has a * corresponding entry on the isrvec stack. */ i = 1; isrptr = &vlapic->apic_page->isr0; for (vector = 0; vector < 256; vector++) { idx = (vector / 32) * 4; if (isrptr[idx] & (1 << (vector % 32))) { if (i > vlapic->isrvec_stk_top || vlapic->isrvec_stk[i] != vector) { dump_isrvec_stk(vlapic); panic("ISR and isrvec_stk out of sync"); } i++; } } } #endif if (PRIO(tpr) >= PRIO(isrvec)) ppr = tpr; else ppr = isrvec & 0xf0; vlapic->apic_page->ppr = ppr; VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } void vlapic_sync_tpr(struct vlapic *vlapic) { vlapic_update_ppr(vlapic); } static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); static void vlapic_process_eoi(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *isrptr, *tmrptr; int i, idx, bitpos, vector; isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { if (vlapic->isrvec_stk_top <= 0) { panic("invalid vlapic isrvec_stk_top %d", vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); vector = i * 32 + bitpos; VLAPIC_CTR1(vlapic, "EOI vector %d", vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { vioapic_process_eoi(vlapic->vm, vector); } return; } } VLAPIC_CTR0(vlapic, "Gratuitous EOI"); vmm_stat_incr(vlapic->vcpu, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) { return (lvt & mask); } static __inline int vlapic_periodic_timer(struct vlapic *vlapic) { uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); } static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); static void vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) { vlapic->esr_pending |= mask; /* * Avoid infinite recursion if the error LVT itself is configured with * an illegal vector. */ if (lvt_error) return; if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_ERROR, 1); } } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { VLAPIC_CTR0(vlapic, "vlapic timer fired"); vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_TIMER, 1); } } static VMM_STAT(VLAPIC_INTR_CMC, "corrected machine check interrupts generated by vlapic"); void vlapic_fire_cmci(struct vlapic *vlapic) { if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_CMC, 1); } } static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, "lvts triggered"); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { if (vlapic_enabled(vlapic) == false) { /* * When the local APIC is global/hardware disabled, * LINT[1:0] pins are configured as INTR and NMI pins, * respectively. */ switch (vector) { case APIC_LVT_LINT0: - vm_inject_extint(vlapic->vm, vlapic->vcpuid); + vm_inject_extint(vlapic->vcpu); break; case APIC_LVT_LINT1: - vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + vm_inject_nmi(vlapic->vcpu); break; default: break; } return (0); } switch (vector) { case APIC_LVT_LINT0: case APIC_LVT_LINT1: case APIC_LVT_TIMER: case APIC_LVT_ERROR: case APIC_LVT_PMC: case APIC_LVT_THERMAL: case APIC_LVT_CMCI: if (vlapic_fire_lvt(vlapic, vector)) { vmm_stat_array_incr(vlapic->vcpu, LVTS_TRIGGERRED, vector, 1); } break; default: return (EINVAL); } return (0); } static void vlapic_callout_reset(struct vlapic *vlapic, sbintime_t t) { callout_reset_sbt_curcpu(&vlapic->callout, t, 0, vlapic_callout_handler, vlapic, 0); } static void vlapic_callout_handler(void *arg) { struct vlapic *vlapic; struct bintime bt, btnow; sbintime_t rem_sbt; vlapic = arg; VLAPIC_TIMER_LOCK(vlapic); if (callout_pending(&vlapic->callout)) /* callout was reset */ goto done; if (!callout_active(&vlapic->callout)) /* callout was stopped */ goto done; callout_deactivate(&vlapic->callout); vlapic_fire_timer(vlapic); if (vlapic_periodic_timer(vlapic)) { binuptime(&btnow); KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, vlapic->timer_fire_bt.frac)); /* * Compute the delta between when the timer was supposed to * fire and the present time. */ bt = btnow; bintime_sub(&bt, &vlapic->timer_fire_bt); rem_sbt = bttosbt(vlapic->timer_period_bt); if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { /* * Adjust the time until the next countdown downward * to account for the lost time. */ rem_sbt -= bttosbt(bt); } else { /* * If the delta is greater than the timer period then * just reset our time base instead of trying to catch * up. */ vlapic->timer_fire_bt = btnow; VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " "usecs, period is %lu usecs - resetting time base", bttosbt(bt) / SBT_1US, bttosbt(vlapic->timer_period_bt) / SBT_1US); } bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); vlapic_callout_reset(vlapic, rem_sbt); } done: VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_icrtmr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; sbintime_t sbt; uint32_t icr_timer; VLAPIC_TIMER_LOCK(vlapic); lapic = vlapic->apic_page; icr_timer = lapic->icr_timer; vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, icr_timer); if (icr_timer != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); vlapic_callout_reset(vlapic, sbt); } else callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); } /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ static void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { struct vlapic *vlapic; uint32_t dfr, ldr, ldest, cluster; uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; cpuset_t amask; int vcpuid; if ((x2apic_dest && dest == 0xffffffff) || (!x2apic_dest && dest == 0xff)) { /* * Broadcast in both logical and physical modes. */ *dmask = vm_active_cpus(vm); return; } if (phys) { /* * Physical mode: destination is APIC ID. */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); amask = vm_active_cpus(vm); if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask)) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide * bitmask. This model is only available in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; /* * In the "Cluster Model" the MDA is used to identify a * specific cluster and a set of APICs in that cluster. */ if (x2apic_dest) { mda_cluster_id = dest >> 16; mda_cluster_ldest = dest & 0xffff; } else { mda_cluster_id = (dest >> 4) & 0xf; mda_cluster_ldest = dest & 0xf; } /* * Logical mode: match each APIC that has a bit set * in its LDR that matches a bit in the ldest. */ CPU_ZERO(dmask); amask = vm_active_cpus(vm); CPU_FOREACH_ISSET(vcpuid, &amask) { vlapic = vm_lapic(vm_vcpu(vm, vcpuid)); dfr = vlapic->apic_page->dfr; ldr = vlapic->apic_page->ldr; if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) { ldest = ldr >> 24; mda_ldest = mda_flat_ldest; } else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) { if (x2apic(vlapic)) { cluster = ldr >> 16; ldest = ldr & 0xffff; } else { cluster = ldr >> 28; ldest = (ldr >> 24) & 0xf; } if (cluster != mda_cluster_id) continue; mda_ldest = mda_cluster_ldest; } else { /* * Guest has configured a bad logical * model for this vcpu - skip it. */ VLAPIC_CTR1(vlapic, "vlapic has bad logical " "model %x - cannot deliver interrupt", dfr); continue; } if ((mda_ldest & ldest) != 0) { CPU_SET(vcpuid, dmask); if (lowprio) break; } } } } static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); static void vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) { struct LAPIC *lapic = vlapic->apic_page; if (lapic->tpr != val) { VLAPIC_CTR2(vlapic, "vlapic TPR changed from %#x to %#x", lapic->tpr, val); lapic->tpr = val; vlapic_update_ppr(vlapic); } } static uint8_t vlapic_get_tpr(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; return (lapic->tpr); } void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) { uint8_t tpr; if (val & ~0xf) { vm_inject_gp(vlapic->vcpu); return; } tpr = val << 4; vlapic_set_tpr(vlapic, tpr); } uint64_t vlapic_get_cr8(struct vlapic *vlapic) { uint8_t tpr; tpr = vlapic_get_tpr(vlapic); return (tpr >> 4); } static bool vlapic_is_icr_valid(uint64_t icrval) { uint32_t mode = icrval & APIC_DELMODE_MASK; uint32_t level = icrval & APIC_LEVEL_MASK; uint32_t trigger = icrval & APIC_TRIGMOD_MASK; uint32_t shorthand = icrval & APIC_DEST_MASK; switch (mode) { case APIC_DELMODE_FIXED: if (trigger == APIC_TRIGMOD_EDGE) return (true); /* * AMD allows a level assert IPI and Intel converts a level * assert IPI into an edge IPI. */ if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT) return (true); break; case APIC_DELMODE_LOWPRIO: case APIC_DELMODE_SMI: case APIC_DELMODE_NMI: case APIC_DELMODE_INIT: if (trigger == APIC_TRIGMOD_EDGE && (shorthand == APIC_DEST_DESTFLD || shorthand == APIC_DEST_ALLESELF)) return (true); /* * AMD allows a level assert IPI and Intel converts a level * assert IPI into an edge IPI. */ if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT && (shorthand == APIC_DEST_DESTFLD || shorthand == APIC_DEST_ALLESELF)) return (true); /* * An level triggered deassert INIT is defined in the Intel * Multiprocessor Specification and the Intel Software Developer * Manual. Due to the MPS it's required to send a level assert * INIT to a cpu and then a level deassert INIT. Some operating * systems e.g. FreeBSD or Linux use that algorithm. According * to the SDM a level deassert INIT is only supported by Pentium * and P6 processors. It's always send to all cpus regardless of * the destination or shorthand field. It resets the arbitration * id register. This register is not software accessible and * only required for the APIC bus arbitration. So, the level * deassert INIT doesn't need any emulation and we should ignore * it. The SDM also defines that newer processors don't support * the level deassert INIT and it's not valid any more. As it's * defined for older systems, it can't be invalid per se. * Otherwise, backward compatibility would be broken. However, * when returning false here, it'll be ignored which is the * desired behaviour. */ if (mode == APIC_DELMODE_INIT && trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_DEASSERT) return (false); break; case APIC_DELMODE_STARTUP: if (shorthand == APIC_DEST_DESTFLD || shorthand == APIC_DEST_ALLESELF) return (true); break; case APIC_DELMODE_RR: /* Only available on AMD! */ if (trigger == APIC_TRIGMOD_EDGE && shorthand == APIC_DEST_DESTFLD) return (true); break; case APIC_DELMODE_RESV: return (false); default: __assert_unreachable(); } return (false); } int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) { int i; bool phys; cpuset_t dmask, ipimask; uint64_t icrval; uint32_t dest, vec, mode, shorthand; struct vlapic *vlapic2; + struct vcpu *vcpu; struct vm_exit *vmexit; struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; if (x2apic(vlapic)) dest = icrval >> 32; else dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; phys = (icrval & APIC_DESTMODE_LOG) == 0; shorthand = icrval & APIC_DEST_MASK; VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); switch (shorthand) { case APIC_DEST_DESTFLD: vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic)); break; case APIC_DEST_SELF: CPU_SETOF(vlapic->vcpuid, &dmask); break; case APIC_DEST_ALLISELF: dmask = vm_active_cpus(vlapic->vm); break; case APIC_DEST_ALLESELF: dmask = vm_active_cpus(vlapic->vm); CPU_CLR(vlapic->vcpuid, &dmask); break; default: __assert_unreachable(); } /* * Ignore invalid combinations of the icr. */ if (!vlapic_is_icr_valid(icrval)) { VLAPIC_CTR1(vlapic, "Ignoring invalid ICR %016lx", icrval); return (0); } /* * ipimask is a set of vCPUs needing userland handling of the current * IPI. */ CPU_ZERO(&ipimask); switch (mode) { case APIC_DELMODE_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } CPU_FOREACH_ISSET(i, &dmask) { - lapic_intr_edge(vlapic->vm, i, vec); + vcpu = vm_vcpu(vlapic->vm, i); + lapic_intr_edge(vcpu, vec); vmm_stat_array_incr(vlapic->vcpu, IPIS_SENT, i, 1); VLAPIC_CTR2(vlapic, "vlapic sending ipi %d to vcpuid %d", vec, i); } break; case APIC_DELMODE_NMI: CPU_FOREACH_ISSET(i, &dmask) { - vm_inject_nmi(vlapic->vm, i); + vcpu = vm_vcpu(vlapic->vm, i); + vm_inject_nmi(vcpu); VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi to vcpuid %d", i); } break; case APIC_DELMODE_INIT: if (!vlapic->ipi_exit) { if (!phys) break; i = vm_apicid2vcpuid(vlapic->vm, dest); if (i >= vm_get_maxcpus(vlapic->vm) || i == vlapic->vcpuid) break; /* * Userland which doesn't support the IPI exit * requires that the boot state is set to SIPI * here. */ - vlapic2 = vm_lapic(vm_vcpu(vlapic->vm, i)); + vcpu = vm_vcpu(vlapic->vm, i); + vlapic2 = vm_lapic(vcpu); vlapic2->boot_state = BS_SIPI; break; } CPU_COPY(&dmask, &ipimask); break; case APIC_DELMODE_STARTUP: if (!vlapic->ipi_exit) { if (!phys) break; /* * Old bhyve versions don't support the IPI * exit. Translate it into the old style. */ i = vm_apicid2vcpuid(vlapic->vm, dest); if (i >= vm_get_maxcpus(vlapic->vm) || i == vlapic->vcpuid) break; /* * Ignore SIPIs in any state other than wait-for-SIPI */ - vlapic2 = vm_lapic(vm_vcpu(vlapic->vm, i)); + vcpu = vm_vcpu(vlapic->vm, i); + vlapic2 = vm_lapic(vcpu); if (vlapic2->boot_state != BS_SIPI) break; vlapic2->boot_state = BS_RUNNING; vmexit = vm_exitinfo(vlapic->vcpu); vmexit->exitcode = VM_EXITCODE_SPINUP_AP; vmexit->u.spinup_ap.vcpu = i; vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; *retu = true; break; } CPU_FOREACH_ISSET(i, &dmask) { - vlapic2 = vm_lapic(vm_vcpu(vlapic->vm, i)); + vcpu = vm_vcpu(vlapic->vm, i); + vlapic2 = vm_lapic(vcpu); /* * Ignore SIPIs in any state other than wait-for-SIPI */ if (vlapic2->boot_state != BS_SIPI) continue; vlapic2->boot_state = BS_RUNNING; CPU_SET(i, &ipimask); } break; default: return (1); } if (!CPU_EMPTY(&ipimask)) { vmexit = vm_exitinfo(vlapic->vcpu); vmexit->exitcode = VM_EXITCODE_IPI; vmexit->u.ipi.mode = mode; vmexit->u.ipi.vector = vec; vmexit->u.ipi.dmask = dmask; *retu = true; } return (0); } static void vlapic_handle_init(struct vcpu *vcpu, void *arg) { struct vlapic *vlapic = vm_lapic(vcpu); vlapic_reset(vlapic); /* vlapic_reset modifies the boot state. */ vlapic->boot_state = BS_SIPI; } int vm_handle_ipi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { *retu = true; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: vm_smp_rendezvous(vcpu, vme->u.ipi.dmask, vlapic_handle_init, NULL); break; case APIC_DELMODE_STARTUP: break; default: return (1); } return (0); } void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) { int vec; KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); vec = val & 0xff; - lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec); + lapic_intr_edge(vlapic->vcpu, vec); vmm_stat_array_incr(vlapic->vcpu, IPIS_SENT, vlapic->vcpuid, 1); VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); } int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) { struct LAPIC *lapic = vlapic->apic_page; int idx, i, bitpos, vector; uint32_t *irrptr, val; vlapic_update_ppr(vlapic); if (vlapic->ops.pending_intr) return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); irrptr = &lapic->irr0; for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); if (bitpos != 0) { vector = i * 32 + (bitpos - 1); if (PRIO(vector) > PRIO(lapic->ppr)) { VLAPIC_CTR1(vlapic, "pending intr %d", vector); if (vecptr != NULL) *vecptr = vector; return (1); } else break; } } return (0); } void vlapic_intr_accepted(struct vlapic *vlapic, int vector) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *irrptr, *isrptr; int idx, stk_top; if (vlapic->ops.intr_accepted) return ((*vlapic->ops.intr_accepted)(vlapic, vector)); /* * clear the ready bit for vector being accepted in irr * and set the vector as in service in isr. */ idx = (vector / 32) * 4; irrptr = &lapic->irr0; atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); isrptr = &lapic->isr0; isrptr[idx] |= 1 << (vector % 32); VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); /* * Update the PPR */ vlapic->isrvec_stk_top++; stk_top = vlapic->isrvec_stk_top; if (stk_top >= ISRVEC_STK_SIZE) panic("isrvec_stk_top overflow %d", stk_top); vlapic->isrvec_stk[stk_top] = vector; } void vlapic_svr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; uint32_t old, new, changed; lapic = vlapic->apic_page; new = lapic->svr; old = vlapic->svr_last; vlapic->svr_last = new; changed = old ^ new; if ((changed & APIC_SVR_ENABLE) != 0) { if ((new & APIC_SVR_ENABLE) == 0) { /* * The apic is now disabled so stop the apic timer * and mask all the LVT entries. */ VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); VLAPIC_TIMER_LOCK(vlapic); callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); vlapic_mask_lvts(vlapic); } else { /* * The apic is now enabled so restart the apic timer * if it is configured in periodic mode. */ VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); if (vlapic_periodic_timer(vlapic)) vlapic_icrtmr_write_handler(vlapic); } } } int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t *data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *reg; int i; /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", offset); *data = 0; goto done; } if (!x2apic(vlapic) && !mmio_access) { /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " "xAPIC mode", offset); *data = 0; goto done; } if (offset > sizeof(*lapic)) { *data = 0; goto done; } offset &= ~3; switch(offset) { case APIC_OFFSET_ID: *data = lapic->id; break; case APIC_OFFSET_VER: *data = lapic->version; break; case APIC_OFFSET_TPR: *data = vlapic_get_tpr(vlapic); break; case APIC_OFFSET_APR: *data = lapic->apr; break; case APIC_OFFSET_PPR: *data = lapic->ppr; break; case APIC_OFFSET_EOI: *data = lapic->eoi; break; case APIC_OFFSET_LDR: *data = lapic->ldr; break; case APIC_OFFSET_DFR: *data = lapic->dfr; break; case APIC_OFFSET_SVR: *data = lapic->svr; break; case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: i = (offset - APIC_OFFSET_ISR0) >> 2; reg = &lapic->isr0; *data = *(reg + i); break; case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: i = (offset - APIC_OFFSET_TMR0) >> 2; reg = &lapic->tmr0; *data = *(reg + i); break; case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: i = (offset - APIC_OFFSET_IRR0) >> 2; reg = &lapic->irr0; *data = atomic_load_acq_int(reg + i); break; case APIC_OFFSET_ESR: *data = lapic->esr; break; case APIC_OFFSET_ICR_LOW: *data = lapic->icr_lo; if (x2apic(vlapic)) *data |= (uint64_t)lapic->icr_hi << 32; break; case APIC_OFFSET_ICR_HI: *data = lapic->icr_hi; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: *data = vlapic_get_lvt(vlapic, offset); #ifdef INVARIANTS reg = vlapic_get_lvtptr(vlapic, offset); KASSERT(*data == *reg, ("inconsistent lvt value at " "offset %#lx: %#lx/%#x", offset, *data, *reg)); #endif break; case APIC_OFFSET_TIMER_ICR: *data = lapic->icr_timer; break; case APIC_OFFSET_TIMER_CCR: *data = vlapic_get_ccr(vlapic); break; case APIC_OFFSET_TIMER_DCR: *data = lapic->dcr_timer; break; case APIC_OFFSET_SELF_IPI: /* * XXX generate a GP fault if vlapic is in x2apic mode */ *data = 0; break; case APIC_OFFSET_RRR: default: *data = 0; break; } done: VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); return 0; } int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *regptr; int retval; KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, ("vlapic_write: invalid offset %#lx", offset)); VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", offset, data); if (offset > sizeof(*lapic)) return (0); /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " "in x2APIC mode", data, offset); return (0); } /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ if (!x2apic(vlapic) && !mmio_access) { VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " "in xAPIC mode", data, offset); return (0); } retval = 0; switch(offset) { case APIC_OFFSET_ID: lapic->id = data; vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_TPR: vlapic_set_tpr(vlapic, data & 0xff); break; case APIC_OFFSET_EOI: vlapic_process_eoi(vlapic); break; case APIC_OFFSET_LDR: lapic->ldr = data; vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: lapic->dfr = data; vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: lapic->svr = data; vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: lapic->icr_lo = data; if (x2apic(vlapic)) lapic->icr_hi = data >> 32; retval = vlapic_icrlo_write_handler(vlapic, retu); break; case APIC_OFFSET_ICR_HI: lapic->icr_hi = data; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: regptr = vlapic_get_lvtptr(vlapic, offset); *regptr = data; vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: lapic->icr_timer = data; vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: lapic->dcr_timer = data; vlapic_dcr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_SELF_IPI: if (x2apic(vlapic)) vlapic_self_ipi_handler(vlapic, data); break; case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_TIMER_CCR: default: // Read only. break; } return (retval); } static void vlapic_reset(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; bzero(lapic, sizeof(struct LAPIC)); lapic->id = vlapic_get_id(vlapic); lapic->version = VLAPIC_VERSION; lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); vlapic_reset_tmr(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ else vlapic->boot_state = BS_INIT; /* AP */ vlapic->svr_last = lapic->svr; } void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); /* * If the vlapic is configured in x2apic mode then it will be * accessed in the critical section via the MSR emulation code. * * Therefore the timer mutex must be a spinlock because blockable * mutexes cannot be acquired in a critical section. */ mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); callout_init(&vlapic->callout, 1); vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vlapic->vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; vlapic->ipi_exit = false; vlapic_reset(vlapic); } void vlapic_cleanup(struct vlapic *vlapic) { callout_drain(&vlapic->callout); } uint64_t vlapic_get_apicbase(struct vlapic *vlapic) { return (vlapic->msr_apicbase); } int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) { if (vlapic->msr_apicbase != new) { VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " "not supported", vlapic->msr_apicbase, new); return (-1); } return (0); } void vlapic_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { struct vlapic *vlapic; struct LAPIC *lapic; vlapic = vm_lapic(vcpu); if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; else vlapic->msr_apicbase |= APICBASE_X2APIC; /* * Reset the local APIC registers whose values are mode-dependent. * * XXX this works because the APIC mode can be changed only at vcpu * initialization time. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); if (x2apic(vlapic)) { lapic->ldr = x2apic_ldr(vlapic); lapic->dfr = 0; } else { lapic->ldr = 0; lapic->dfr = 0xffffffff; } if (state == X2APIC_ENABLED) { if (vlapic->ops.enable_x2apic_mode) (*vlapic->ops.enable_x2apic_mode)(vlapic); } } void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec) { + struct vcpu *vcpu; bool lowprio; int vcpuid; cpuset_t dmask; if (delmode != IOART_DELFIXED && delmode != IOART_DELLOPRI && delmode != IOART_DELEXINT) { VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); return; } lowprio = (delmode == IOART_DELLOPRI); /* * We don't provide any virtual interrupt redirection hardware so * all interrupts originating from the ioapic or MSI specify the * 'dest' in the legacy xAPIC format. */ vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); CPU_FOREACH_ISSET(vcpuid, &dmask) { + vcpu = vm_vcpu(vm, vcpuid); if (delmode == IOART_DELEXINT) { - vm_inject_extint(vm, vcpuid); + vm_inject_extint(vcpu); } else { - lapic_set_intr(vm, vcpuid, vec, level); + lapic_set_intr(vcpu, vec, level); } } } void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) { /* * Post an interrupt to the vcpu currently running on 'hostcpu'. * * This is done by leveraging features like Posted Interrupts (Intel) * Doorbell MSR (AMD AVIC) that avoid a VM exit. * * If neither of these features are available then fallback to * sending an IPI to 'hostcpu'. */ if (vlapic->ops.post_intr) (*vlapic->ops.post_intr)(vlapic, hostcpu); else ipi_cpu(hostcpu, ipinum); } bool vlapic_enabled(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && (lapic->svr & APIC_SVR_ENABLE) != 0) return (true); else return (false); } static void vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *tmrptr, mask; int idx; lapic = vlapic->apic_page; tmrptr = &lapic->tmr0; idx = (vector / 32) * 4; mask = 1 << (vector % 32); if (level) tmrptr[idx] |= mask; else tmrptr[idx] &= ~mask; if (vlapic->ops.set_tmr != NULL) (*vlapic->ops.set_tmr)(vlapic, vector, level); } void vlapic_reset_tmr(struct vlapic *vlapic) { int vector; VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); for (vector = 0; vector <= 255; vector++) vlapic_set_tmr(vlapic, vector, false); } void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector) { cpuset_t dmask; bool lowprio; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); /* * A level trigger is valid only for fixed and lowprio delivery modes. */ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " "delivery-mode %d", delmode); return; } lowprio = (delmode == APIC_DELMODE_LOWPRIO); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); if (!CPU_ISSET(vlapic->vcpuid, &dmask)) return; VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); vlapic_set_tmr(vlapic, vector, true); } #ifdef BHYVE_SNAPSHOT static void vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr) { /* The implementation is similar to the one in the * `vlapic_icrtmr_write_handler` function */ sbintime_t sbt; struct bintime bt; VLAPIC_TIMER_LOCK(vlapic); bt = vlapic->timer_freq_bt; bintime_mul(&bt, ccr); if (ccr != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &bt); sbt = bttosbt(bt); vlapic_callout_reset(vlapic, sbt); } else { /* even if the CCR was 0, periodic timers should be reset */ if (vlapic_periodic_timer(vlapic)) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); callout_stop(&vlapic->callout); vlapic_callout_reset(vlapic, sbt); } } VLAPIC_TIMER_UNLOCK(vlapic); } int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta) { int ret; struct vlapic *vlapic; struct LAPIC *lapic; uint32_t ccr; uint16_t i, maxcpus; KASSERT(vm != NULL, ("%s: arg was NULL", __func__)); ret = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vlapic = vm_lapic(vm_vcpu(vm, i)); /* snapshot the page first; timer period depends on icr_timer */ lapic = vlapic->apic_page; SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac, meta, ret, done); /* * Timer period is equal to 'icr_timer' ticks at a frequency of * 'timer_freq_bt'. */ if (meta->op == VM_SNAPSHOT_RESTORE) { vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); } SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk, sizeof(vlapic->isrvec_stk), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->boot_state, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last, sizeof(vlapic->lvt_last), meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) ccr = vlapic_get_ccr(vlapic); SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE && vlapic_enabled(vlapic) && lapic->icr_timer != 0) { /* Reset the value of the 'timer_fire_bt' and the vlapic * callout based on the value of the current count * register saved when the VM snapshot was created. * If initial count register is 0, timer is not used. * Look at "10.5.4 APIC Timer" in Software Developer Manual. */ vlapic_reset_callout(vlapic, ccr); } } done: return (ret); } #endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 2ee57895d852..fa6420aab1c2 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1,2930 +1,2857 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int vcpuid; /* (o) */ int hostcpu; /* (o) vcpu's host cpu */ int reqidle; /* (i) request vcpu to idle */ struct vm *vm; /* (o) */ void *cookie; /* (i) cpu-specific data */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ uint64_t tsc_offset; /* (o) TSC offsetting */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { size_t len; bool sysmem; struct vm_object *object; }; #define VM_MAX_MEMSEGS 4 struct mem_map { vm_paddr_t gpa; size_t len; vm_ooffset_t segoff; int segid; int prot; int flags; }; #define VM_MAX_MEMMAPS 8 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ void *rendezvous_arg; /* (x) rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */ struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ }; #define VMM_CTR0(vcpu, format) \ VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format) #define VMM_CTR1(vcpu, format, p1) \ VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1) #define VMM_CTR2(vcpu, format, p1, p2) \ VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2) #define VMM_CTR3(vcpu, format, p1, p2, p3) \ VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3) #define VMM_CTR4(vcpu, format, p1, p2, p3, p4) \ VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) static int vmm_initialized; static void vmmops_panic(void); static void vmmops_panic(void) { panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()"); } #define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ DEFINE_IFUNC(static, ret_type, vmmops_##opname, args) \ { \ if (vmm_is_intel()) \ return (vmm_ops_intel.opname); \ else if (vmm_is_svm()) \ return (vmm_ops_amd.opname); \ else \ return ((ret_type (*)args)vmmops_panic); \ } DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) DEFINE_VMMOPS_IFUNC(void, modresume, (void)) DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info)) DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, int vcpu_id)) DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, vm_offset_t max)) DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui)) DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic)) #ifdef BHYVE_SNAPSHOT DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)) DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, struct vm_snapshot_meta *meta)) DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) #endif #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() SDT_PROVIDER_DEFINE(vmm); static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. */ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); static int trap_wbinvd; SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0, "WBINVD triggers a VM-exit"); static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) { switch (state) { case VCPU_IDLE: return ("idle"); case VCPU_FROZEN: return ("frozen"); case VCPU_RUNNING: return ("running"); case VCPU_SLEEPING: return ("sleeping"); default: return ("unknown"); } } #endif -static __inline void * -vcpu_cookie(struct vm *vm, int i) -{ - return (vm->vcpu[i].cookie); -} - static void vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; vmmops_vlapic_cleanup(vcpu->vlapic); vmmops_vcpu_cleanup(vcpu->cookie); vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } } static void vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = &vm->vcpu[vcpu_id]; if (create) { KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " "initialized", vcpu_id)); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vm = vm; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); vcpu->tsc_offset = 0; } vcpu->cookie = vmmops_vcpu_init(vm->cookie, vcpu, vcpu_id); vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie); - vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vm_set_x2apic_state(vcpu, X2APIC_DISABLED); vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vcpu *vcpu) { return (trace_guest_exceptions); } int vcpu_trap_wbinvd(struct vcpu *vcpu) { return (trap_wbinvd); } struct vm_exit * vm_exitinfo(struct vcpu *vcpu) { return (&vcpu->exitinfo); } static int vmm_init(void) { int error; if (!vmm_is_hw_supported()) return (ENXIO); vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); vmm_resume_p = vmmops_modresume; return (vmmops_modinit(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: if (vmm_is_hw_supported()) { vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; } else { error = ENXIO; } break; case MOD_UNLOAD: if (vmm_is_hw_supported()) { error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) lapic_ipi_free(vmm_ipinum); error = vmmops_modcleanup(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } } else { error = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). */ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); for (i = 0; i < vm->maxcpus; i++) vcpu_init(vm, i, create); } /* * The default CPU topology is a single thread per package. */ u_int cores_per_package = 1; u_int threads_per_core = 1; int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == VM_MAX_NAMELEN + 1) return (EINVAL); vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ vm->threads = threads_per_core; /* XXX backwards compatibility */ vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { if (maxcpus != 0) return (EINVAL); /* XXX remove when supported */ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); /* XXX need to check sockets * cores * threads == vCPU, how? */ vm->sockets = sockets; vm->cores = cores; vm->threads = threads; vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { struct mem_map *mm; int i; ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (i = 0; i < vm->maxcpus; i++) vcpu_cleanup(vm, i, destroy); vmmops_cleanup(vm->cookie); /* * System memory is removed from the guest address space only when * the VM is destroyed. This is because the mapping remains the same * across VM reset. * * Device memory can be relocated by the guest (e.g. using PCI BARs) * so those mappings are removed on a VM reset. */ for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (destroy || !sysmem_mapping(vm, mm)) vm_free_memmap(vm, i); } if (destroy) { for (i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); vmmops_vmspace_free(vm->vmspace); vm->vmspace = NULL; } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } /* * Return 'true' if 'gpa' is allocated in the guest address space. * * This function is called in the context of a running vcpu which acts as * an implicit lock on 'vm->mem_maps[]'. */ bool vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa) { struct vm *vm = vcpu->vm; struct mem_map *mm; int i; #ifdef INVARIANTS int hostcpu, state; state = vcpu_get_state(vcpu, &hostcpu); KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); #endif for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) return (true); /* 'gpa' is sysmem or devmem */ } if (ppt_is_mmio(vm, gpa)) return (true); /* 'gpa' is pci passthru mmio */ return (false); } int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { struct mem_seg *seg; vm_object_t obj; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); if (len == 0 || (len & PAGE_MASK)) return (EINVAL); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { if (seg->len == len && seg->sysmem == sysmem) return (EEXIST); else return (EINVAL); } obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT); if (obj == NULL) return (ENOMEM); seg->len = len; seg->object = obj; seg->sysmem = sysmem; return (0); } int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, vm_object_t *objptr) { struct mem_seg *seg; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[ident]; if (len) *len = seg->len; if (sysmem) *sysmem = seg->sysmem; if (objptr) *objptr = seg->object; return (0); } void vm_free_memseg(struct vm *vm, int ident) { struct mem_seg *seg; KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, ("%s: invalid memseg ident %d", __func__, ident)); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { vm_object_deallocate(seg->object); bzero(seg, sizeof(struct mem_seg)); } } int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, size_t len, int prot, int flags) { struct mem_seg *seg; struct mem_map *m, *map; vm_ooffset_t last; int i, error; if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) return (EINVAL); if (flags & ~VM_MEMMAP_F_WIRED) return (EINVAL); if (segid < 0 || segid >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[segid]; if (seg->object == NULL) return (EINVAL); last = first + len; if (first < 0 || first >= last || last > seg->len) return (EINVAL); if ((gpa | first | last) & PAGE_MASK) return (EINVAL); map = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->len == 0) { map = m; break; } } if (map == NULL) return (ENOSPC); error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, len, 0, VMFS_NO_SPACE, prot, prot, 0); if (error != KERN_SUCCESS) return (EFAULT); vm_object_reference(seg->object); if (flags & VM_MEMMAP_F_WIRED) { error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : EFAULT); } } map->gpa = gpa; map->len = len; map->segoff = first; map->segid = segid; map->prot = prot; map->flags = flags; return (0); } int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len) { struct mem_map *m; int i; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->gpa == gpa && m->len == len && (m->flags & VM_MEMMAP_F_IOMMU) == 0) { vm_free_memmap(vm, i); return (0); } } return (EINVAL); } int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct mem_map *mm, *mmnext; int i; mmnext = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len == 0 || mm->gpa < *gpa) continue; if (mmnext == NULL || mm->gpa < mmnext->gpa) mmnext = mm; } if (mmnext != NULL) { *gpa = mmnext->gpa; if (segid) *segid = mmnext->segid; if (segoff) *segoff = mmnext->segoff; if (len) *len = mmnext->len; if (prot) *prot = mmnext->prot; if (flags) *flags = mmnext->flags; return (0); } else { return (ENOENT); } } static void vm_free_memmap(struct vm *vm, int ident) { struct mem_map *mm; int error __diagused; mm = &vm->mem_maps[ident]; if (mm->len) { error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, mm->gpa + mm->len); KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", __func__, error)); bzero(mm, sizeof(struct mem_map)); } } static __inline bool sysmem_mapping(struct vm *vm, struct mem_map *mm) { if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) return (true); else return (false); } vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm) { struct mem_map *mm; vm_paddr_t maxaddr; int i; maxaddr = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm)) { if (maxaddr < mm->gpa + mm->len) maxaddr = mm->gpa + mm->len; } } return (maxaddr); } static void vm_iommu_modify(struct vm *vm, bool map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_map *mm; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (!sysmem_mapping(vm, mm)) continue; if (map) { KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, ("iommu map found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) continue; mm->flags |= VM_MEMMAP_F_IOMMU; } else { if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) continue; mm->flags &= ~VM_MEMMAP_F_IOMMU; KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, ("iommu unmap found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); } gpa = mm->gpa; while (gpa < mm->gpa + mm->len) { vp = vm_gpa_hold_global(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false) #define vm_iommu_map(vm) vm_iommu_modify((vm), true) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) vm_iommu_unmap(vm); return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vmm_sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } static void * _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int i, count, pageoff; struct mem_map *mm; vm_page_t m; pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) { count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); break; } } if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void * vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { #ifdef INVARIANTS /* * The current vcpu should be frozen to ensure 'vm_memmap[]' * stability. */ int state = vcpu_get_state(vcpu, NULL); KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", __func__, state)); #endif return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie)); } void * vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { #ifdef INVARIANTS /* * All vcpus are frozen by ioctls that modify the memory map * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is * guaranteed if at least one vcpu is in the VCPU_FROZEN state. */ int state; for (int i = 0; i < vm->maxcpus; i++) { state = vcpu_get_state(vm_vcpu(vm, i), NULL); KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", __func__, state)); } #endif return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie)); } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_unwire(m, PQ_ACTIVE); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) { if (reg >= VM_REG_LAST) return (EINVAL); return (vmmops_getreg(vcpu->cookie, reg, retval)); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; if (reg >= VM_REG_LAST) return (EINVAL); error = vmmops_setreg(vcpu->cookie, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VMM_CTR1(vcpu, "Setting nextrip to %#lx", val); vcpu->nextrip = val; return (0); } static bool is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (true); default: return (false); } } static bool is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (true); default: return (false); } } int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_getdesc(vcpu->cookie, reg, desc)); } int -vm_set_seg_desc(struct vm *vm, int vcpu, int reg, - struct seg_desc *desc) +vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { - if (vcpu < 0 || vcpu >= vm->maxcpus) - return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); - return (vmmops_setdesc(vcpu_cookie(vm, vcpu), reg, desc)); + return (vmmops_setdesc(vcpu->cookie, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int -vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { - struct vcpu *vcpu; int error; - vcpu = &vm->vcpu[vcpuid]; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; vcpu_notify_event_locked(vcpu, false); - VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " + VMM_CTR1(vcpu, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); - VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", + VMM_CTR2(vcpu, "vcpu state changed from %s to %s", vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void -vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) +vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { int error; - if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) + if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void -vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) +vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; - if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) + if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } static int vm_handle_rendezvous(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct thread *td; int error, vcpuid; error = 0; vcpuid = vcpu->vcpuid; td = curthread; mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VMM_CTR0(vcpu, "Calling rendezvous func"); (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VMM_CTR0(vcpu, "Rendezvous completed"); vm->rendezvous_func = NULL; wakeup(&vm->rendezvous_func); break; } VMM_CTR0(vcpu, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if (td_ast_pending(td, TDA_SUSPEND)) { mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) return (error); mtx_lock(&vm->rendezvous_mtx); } } mtx_unlock(&vm->rendezvous_mtx); return (0); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int -vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) +vm_handle_hlt(struct vcpu *vcpu, bool intr_disabled, bool *retu) { - struct vcpu *vcpu; + struct vm *vm = vcpu->vm; const char *wmesg; struct thread *td; - int error, t, vcpu_halted, vm_halted; + int error, t, vcpuid, vcpu_halted, vm_halted; - KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); - - vcpu = &vm->vcpu[vcpuid]; + vcpuid = vcpu->vcpuid; vcpu_halted = 0; vm_halted = 0; error = 0; td = curthread; + KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); + vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from vmmops_run() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vcpu)) break; if (!intr_disabled) { if (vm_extint_pending(vcpu) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vcpu)) break; if (vcpu_debugged(vcpu)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; - VCPU_CTR0(vm, vcpuid, "Halted"); + VMM_CTR0(vcpu, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; - vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); - vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); if (error != 0) { if (vcpu_halted) { CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); } return (error); } vcpu_lock(vcpu); } } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int -vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) +vm_handle_paging(struct vcpu *vcpu, bool *retu) { + struct vm *vm = vcpu->vm; int rv, ftype; struct vm_map *map; - struct vcpu *vcpu; struct vm_exit *vme; - vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { - VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", + VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? "accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); - VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " + VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int -vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) { struct vie *vie; - struct vcpu *vcpu; struct vm_exit *vme; uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, fault; - vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; - VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); + VMM_CTR1(vcpu, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { error = vmm_fetch_instruction(vcpu, paging, vme->rip + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ error = fault = 0; } if (error || fault) return (error); if (vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, vie) != 0) { - VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", + VMM_CTR1(vcpu, "Error decoding instruction at %#lx", vme->rip + cs_base); *retu = true; /* dump instruction bytes in userspace */ return (0); } /* * Update 'nextrip' based on the length of the emulated instruction. */ vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; - VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " - "decoding", vcpu->nextrip); + VMM_CTR1(vcpu, "nextrip updated to %#lx after instruction decoding", + vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vcpu, gpa, vie, paging, mread, mwrite, retu); return (error); } static int -vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) +vm_handle_suspend(struct vcpu *vcpu, bool *retu) { + struct vm *vm = vcpu->vm; int error, i; - struct vcpu *vcpu; struct thread *td; error = 0; - vcpu = &vm->vcpu[vcpuid]; td = curthread; - CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { - VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); + VMM_CTR0(vcpu, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { - VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); - vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); + VMM_CTR0(vcpu, "Sleeping during suspend"); + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); - vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } else { - VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); + VMM_CTR0(vcpu, "Rendezvous during suspend"); vcpu_unlock(vcpu); error = vm_handle_rendezvous(vcpu); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { - vcpu_notify_event(vm, i, false); + vcpu_notify_event(vm_vcpu(vm, i), false); } } *retu = true; return (error); } static int -vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) +vm_handle_reqidle(struct vcpu *vcpu, bool *retu) { - struct vcpu *vcpu = &vm->vcpu[vcpuid]; - vcpu_lock(vcpu); KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); vcpu->reqidle = 0; vcpu_unlock(vcpu); *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) - vcpu_notify_event(vm, i, false); + vcpu_notify_event(vm_vcpu(vm, i), false); } return (0); } void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip) { struct vm *vm = vcpu->vm; struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_DEBUG; } void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vcpu->vm->rendezvous_func != NULL, ("rendezvous not in progress")); vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vcpu, VMEXIT_RENDEZVOUS, 1); } void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_REQIDLE; vmm_stat_incr(vcpu, VMEXIT_REQIDLE, 1); } void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vcpu, VMEXIT_ASTPENDING, 1); } int -vm_run(struct vm *vm, struct vm_run *vmrun) +vm_run(struct vcpu *vcpu, struct vm_exit *vme_user) { + struct vm *vm = vcpu->vm; struct vm_eventinfo evinfo; int error, vcpuid; - struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; - vcpuid = vmrun->cpuid; - - if (vcpuid < 0 || vcpuid >= vm->maxcpus) - return (EINVAL); + vcpuid = vcpu->vcpuid; if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); - vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_func; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); - vcpu_require_state(vm, vcpuid, VCPU_RUNNING); + vcpu_require_state(vcpu, VCPU_RUNNING); error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo); - vcpu_require_state(vm, vcpuid, VCPU_FROZEN); + vcpu_require_state(vcpu, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vcpu, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_REQIDLE: - error = vm_handle_reqidle(vm, vcpuid, &retu); + error = vm_handle_reqidle(vcpu, &retu); break; case VM_EXITCODE_SUSPENDED: - error = vm_handle_suspend(vm, vcpuid, &retu); + error = vm_handle_suspend(vcpu, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: error = vm_handle_rendezvous(vcpu); break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); - error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); + error = vm_handle_hlt(vcpu, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: - error = vm_handle_paging(vm, vcpuid, &retu); + error = vm_handle_paging(vcpu, &retu); break; case VM_EXITCODE_INST_EMUL: - error = vm_handle_inst_emul(vm, vcpuid, &retu); + error = vm_handle_inst_emul(vcpu, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: - error = vm_handle_inout(vm, vcpuid, vme, &retu); + error = vm_handle_inout(vcpu, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: case VM_EXITCODE_VMINSN: vm_inject_ud(vcpu); break; default: retu = true; /* handled in userland */ break; } } /* * VM_EXITCODE_INST_EMUL could access the apic which could transform the * exit code into VM_EXITCODE_IPI. */ if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) { retu = false; error = vm_handle_ipi(vcpu, vme, &retu); } if (error == 0 && retu == false) goto restart; vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1); - VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); + VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode); /* copy the exit information */ - bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); + *vme_user = *vme; return (error); } int vm_restart_instruction(struct vcpu *vcpu) { enum vcpu_state state; uint64_t rip; int error __diagused; state = vcpu_get_state(vcpu, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VMM_CTR1(vcpu, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around vmmops_run() and 'nextrip' points to the next * instruction. Thus instruction restart is achieved by setting * 'nextrip' to the vcpu's %rip. */ error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VMM_CTR2(vcpu, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vcpu *vcpu, uint64_t info) { int type, vector; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VMM_CTR2(vcpu, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. */ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vcpu *vcpu, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VMM_CTR2(vcpu, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vcpu->vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. */ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *retinfo) { uint64_t info1, info2; int valid; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VMM_CTR2(vcpu, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vcpu, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VMM_CTR4(vcpu, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int -vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) +vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) { - struct vcpu *vcpu; - - if (vcpuid < 0 || vcpuid >= vm->maxcpus) - return (EINVAL); - - vcpu = &vm->vcpu[vcpuid]; *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { uint64_t regval; int error __diagused; if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. */ if (vector == IDT_DF) return (EINVAL); if (vcpu->exception_pending) { VMM_CTR2(vcpu, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } if (errcode_valid) { /* * Exceptions don't deliver an error code in real mode. */ error = vm_get_register(vcpu, VM_REG_GUEST_CR0, ®val); KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); if (!(regval & CR0_PE)) errcode_valid = 0; } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception. */ error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vcpu); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VMM_CTR1(vcpu, "Exception %d pending", vector); return (0); } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error __diagused, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2) { int error __diagused; VMM_CTR2(vcpu, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vcpu, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int -vm_inject_nmi(struct vm *vm, int vcpuid) +vm_inject_nmi(struct vcpu *vcpu) { - struct vcpu *vcpu; - - if (vcpuid < 0 || vcpuid >= vm->maxcpus) - return (EINVAL); - - vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; - vcpu_notify_event(vm, vcpuid, false); + vcpu_notify_event(vcpu, false); return (0); } int vm_nmi_pending(struct vcpu *vcpu) { return (vcpu->nmi_pending); } void vm_nmi_clear(struct vcpu *vcpu) { if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int -vm_inject_extint(struct vm *vm, int vcpuid) +vm_inject_extint(struct vcpu *vcpu) { - struct vcpu *vcpu; - - if (vcpuid < 0 || vcpuid >= vm->maxcpus) - return (EINVAL); - - vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; - vcpu_notify_event(vm, vcpuid, false); + vcpu_notify_event(vcpu, false); return (0); } int vm_extint_pending(struct vcpu *vcpu) { return (vcpu->extint_pending); } void vm_extint_clear(struct vcpu *vcpu) { if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1); } int -vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +vm_get_capability(struct vcpu *vcpu, int type, int *retval) { - if (vcpu < 0 || vcpu >= vm->maxcpus) - return (EINVAL); - if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); - return (vmmops_getcap(vcpu_cookie(vm, vcpu), type, retval)); + return (vmmops_getcap(vcpu->cookie, type, retval)); } int -vm_set_capability(struct vm *vm, int vcpu, int type, int val) +vm_set_capability(struct vcpu *vcpu, int type, int val) { - if (vcpu < 0 || vcpu >= vm->maxcpus) - return (EINVAL); - if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); - return (vmmops_setcap(vcpu_cookie(vm, vcpu), type, val)); + return (vmmops_setcap(vcpu->cookie, type, val)); } struct vm * vcpu_vm(struct vcpu *vcpu) { return (vcpu->vm); } int vcpu_vcpuid(struct vcpu *vcpu) { return (vcpu->vcpuid); } struct vcpu * vm_vcpu(struct vm *vm, int vcpuid) { return (&vm->vcpu[vcpuid]); } struct vlapic * vm_lapic(struct vcpu *vcpu) { return (vcpu->vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } bool vmm_is_pptdev(int bus, int slot, int func) { int b, f, i, n, s; char *val, *cp, *cp2; bool found; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = false; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = true; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int -vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, - bool from_idle) +vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; - struct vcpu *vcpu; - - if (vcpuid < 0 || vcpuid >= vm->maxcpus) - panic("vm_set_run_state: invalid vcpuid %d", vcpuid); - - vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); - error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu) { enum vcpu_state state; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int -vm_activate_cpu(struct vm *vm, int vcpuid) +vm_activate_cpu(struct vcpu *vcpu) { + struct vm *vm = vcpu->vm; - if (vcpuid < 0 || vcpuid >= vm->maxcpus) - return (EINVAL); - - if (CPU_ISSET(vcpuid, &vm->active_cpus)) + if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EBUSY); - VCPU_CTR0(vm, vcpuid, "activated"); - CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); + VMM_CTR0(vcpu, "activated"); + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); return (0); } int -vm_suspend_cpu(struct vm *vm, int vcpuid) +vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) { - int i; - - if (vcpuid < -1 || vcpuid >= vm->maxcpus) - return (EINVAL); - - if (vcpuid == -1) { + if (vcpu == NULL) { vm->debug_cpus = vm->active_cpus; - for (i = 0; i < vm->maxcpus; i++) { + for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) - vcpu_notify_event(vm, i, false); + vcpu_notify_event(vm_vcpu(vm, i), false); } } else { - if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); - CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); - vcpu_notify_event(vm, vcpuid, false); + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); + vcpu_notify_event(vcpu, false); } return (0); } int -vm_resume_cpu(struct vm *vm, int vcpuid) +vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) { - if (vcpuid < -1 || vcpuid >= vm->maxcpus) - return (EINVAL); - - if (vcpuid == -1) { + if (vcpu == NULL) { CPU_ZERO(&vm->debug_cpus); } else { - if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) + if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) return (EINVAL); - CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); + CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vcpu *vcpu) { return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } void * vcpu_stats(struct vcpu *vcpu) { return (vcpu->stats); } int -vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) +vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) { - if (vcpuid < 0 || vcpuid >= vm->maxcpus) - return (EINVAL); - - *state = vm->vcpu[vcpuid].x2apic_state; + *state = vcpu->x2apic_state; return (0); } int -vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { - struct vcpu *vcpu; - - if (vcpuid < 0 || vcpuid >= vm->maxcpus) - return (EINVAL); - if (state >= X2APIC_STATE_LAST) return (EINVAL); - vcpu = &vm->vcpu[vcpuid]; vcpu->x2apic_state = state; vlapic_set_x2apic_state(vcpu, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void -vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) { - struct vcpu *vcpu = &vm->vcpu[vcpuid]; - vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { struct vm *vm = vcpu->vm; int error, i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to - * call the rendezvous handler in case this 'vcpuid' is one + * call the rendezvous handler in case this 'vcpu' is one * of the targets of the rendezvous. */ VMM_CTR0(vcpu, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); error = vm_handle_rendezvous(vcpu); if (error != 0) return (error); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); VMM_CTR0(vcpu, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm->rendezvous_func = func; mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) - vcpu_notify_event(vm, i, false); + vcpu_notify_event(vm_vcpu(vm, i), false); } return (vm_handle_rendezvous(vcpu)); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vcpu, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(copyinfo, num_copyinfo); return (EFAULT); } else { *fault = 0; return (0); } } void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. Since * these are global stats, only return the values with for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void -vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { - if (vcpu == 0) { - vmm_stat_set(vm_vcpu(vm, vcpu), VMM_MEM_RESIDENT, - PAGE_SIZE * vmspace_resident_count(vm->vmspace)); + if (vcpu->vcpuid == 0) { + vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * + vmspace_resident_count(vcpu->vm->vmspace)); } } static void -vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { - if (vcpu == 0) { - vmm_stat_set(vm_vcpu(vm, vcpu), VMM_MEM_WIRED, - PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); + if (vcpu->vcpuid == 0) { + vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE * + pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); #ifdef BHYVE_SNAPSHOT static int vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) { uint64_t tsc, now; int ret; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = &vm->vcpu[i]; SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); /* * Save the absolute TSC value by adding now to tsc_offset. * * It will be turned turned back into an actual offset when the * TSC restore function is called */ tsc = now + vcpu->tsc_offset; SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done); } done: return (ret); } static int vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) { int ret; ret = vm_snapshot_vcpus(vm, meta); if (ret != 0) goto done; done: return (ret); } static int vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta) { int error; struct vcpu *vcpu; uint16_t i, maxcpus; error = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = &vm->vcpu[i]; error = vmmops_vcpu_snapshot(vcpu->cookie, meta); if (error != 0) { printf("%s: failed to snapshot vmcs/vmcb data for " "vCPU: %d; error: %d\n", __func__, i, error); goto done; } } done: return (error); } /* * Save kernel-side structures to user-space for snapshotting. */ int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) { int ret = 0; switch (meta->dev_req) { case STRUCT_VMX: ret = vmmops_snapshot(vm->cookie, meta); break; case STRUCT_VMCX: ret = vm_snapshot_vcpu(vm, meta); break; case STRUCT_VM: ret = vm_snapshot_vm(vm, meta); break; case STRUCT_VIOAPIC: ret = vioapic_snapshot(vm_ioapic(vm), meta); break; case STRUCT_VLAPIC: ret = vlapic_snapshot(vm, meta); break; case STRUCT_VHPET: ret = vhpet_snapshot(vm_hpet(vm), meta); break; case STRUCT_VATPIC: ret = vatpic_snapshot(vm_atpic(vm), meta); break; case STRUCT_VATPIT: ret = vatpit_snapshot(vm_atpit(vm), meta); break; case STRUCT_VPMTMR: ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); break; case STRUCT_VRTC: ret = vrtc_snapshot(vm_rtc(vm), meta); break; default: printf("%s: failed to find the requested type %#x\n", __func__, meta->dev_req); ret = (EINVAL); } return (ret); } void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset) { vcpu->tsc_offset = offset; } int vm_restore_time(struct vm *vm) { int error; uint64_t now; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); error = vhpet_restore_time(vm_hpet(vm)); if (error) return (error); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = &vm->vcpu[i]; error = vmmops_restore_tsc(vcpu->cookie, vcpu->tsc_offset - now); if (error) return (error); } return (0); } #endif diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 1d36e190c7a6..a007a25aa9b5 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -1,1288 +1,1330 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" #include "io/ppt.h" #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vhpet.h" #include "io/vrtc.h" #ifdef COMPAT_FREEBSD13 struct vm_stats_old { int cpuid; /* in */ int num_entries; /* out */ struct timeval tv; uint64_t statbuf[MAX_VM_STATS]; }; #define VM_STATS_OLD \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old) #endif struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; struct ucred *ucred; SLIST_ENTRY(vmmdev_softc) link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; #define VSC_LINKED 0x01 static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; static struct mtx vmmdev_mtx; static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static int vmm_priv_check(struct ucred *ucred); static int devmem_create_cdev(const char *vmname, int id, char *devmem); static void devmem_destroy(void *arg); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && !(ucred->cr_prison->pr_allow & pr_allow_flag)) return (EPERM); return (0); } static int vcpu_lock_one(struct vmmdev_softc *sc, int vcpu) { int error; if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm)) return (EINVAL); - error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + error = vcpu_set_state(vm_vcpu(sc->vm, vcpu), VCPU_FROZEN, true); return (error); } static void -vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu) +vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid) { + struct vcpu *vcpu; enum vcpu_state state; - state = vcpu_get_state(vm_vcpu(sc->vm, vcpu), NULL); + vcpu = vm_vcpu(sc->vm, vcpuid); + state = vcpu_get_state(vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm), - vcpu, state); + vcpuid, state); } - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + vcpu_set_state(vcpu, VCPU_IDLE, false); } static int vcpu_lock_all(struct vmmdev_softc *sc) { int error, vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) { error = vcpu_lock_one(sc, vcpu); if (error) break; } if (error) { while (--vcpu >= 0) vcpu_unlock_one(sc, vcpu); } return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { int vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) vcpu_unlock_one(sc, vcpu); } static struct vmmdev_softc * vmmdev_lookup(const char *name) { struct vmmdev_softc *sc; #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } if (sc == NULL) return (NULL); if (cr_cansee(curthread->td_ucred, sc->ucred)) return (NULL); return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; struct vcpu *vcpu; uint16_t lastcpu; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); vcpu = vm_vcpu(sc->vm, lastcpu); prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold(vcpu, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vcpu_unlock_one(sc, lastcpu); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, len, NULL); } else { bzero(mseg->name, len); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * by stripped off when devfs processes the full string. */ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(len, M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, len, NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } static int vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vcpu, regnum[i], ®val[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vcpu, regnum[i], regval[i]); if (error) break; } return (error); } +static struct vcpu * +lookup_vcpu(struct vm *vm, int vcpuid) +{ + if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) + return (NULL); + + return (vm_vcpu(vm, vcpuid)); +} + static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error, vcpuid, state_changed, size; cpuset_t *cpuset; struct vmmdev_softc *sc; struct vcpu *vcpu; struct vm_register *vmreg; struct vm_seg_desc *vmsegdesc; struct vm_register_set *vmregset; struct vm_run *vmrun; struct vm_exception *vmexc; struct vm_lapic_irq *vmirq; struct vm_lapic_msi *vmmsi; struct vm_ioapic_irq *ioapic_irq; struct vm_isa_irq *isa_irq; struct vm_isa_irq_trigger *isa_irq_trigger; struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; - struct vm_nmi *vmnmi; #ifdef COMPAT_FREEBSD13 struct vm_stats_old *vmstats_old; #endif struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; struct vm_gpa_pte *gpapte; struct vm_suspend *vmsuspend; struct vm_gla2gpa *gg; - struct vm_activate_cpu *vac; struct vm_cpuset *vm_cpuset; struct vm_intinfo *vmii; struct vm_rtc_time *rtctime; struct vm_rtc_data *rtcdata; struct vm_memmap *mm; struct vm_munmap *mu; struct vm_cpu_topology *topology; struct vm_readwrite_kernemu_device *kernemu; uint64_t *regvals; int *regnums; #ifdef BHYVE_SNAPSHOT struct vm_snapshot_meta *snapshot_meta; #endif error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); vcpuid = -1; vcpu = NULL; state_changed = 0; /* - * Some VMM ioctls can operate only on vcpus that are not running. + * For VMM ioctls that operate on a single vCPU, lookup the + * vcpu. For VMM ioctls which require one or more vCPUs to + * not be running, lock necessary vCPUs. + * + * XXX fragile, handle with care + * Most of these assume that the first field of the ioctl data + * is the vcpuid. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: case VM_GET_REGISTER_SET: case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: case VM_SET_INTINFO: case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: /* - * XXX fragile, handle with care - * Assumes that the first field of the ioctl data is the vcpu. + * ioctls that can operate only on vcpus that are not running. */ vcpuid = *(int *)data; error = vcpu_lock_one(sc, vcpuid); if (error) goto done; state_changed = 1; vcpu = vm_vcpu(sc->vm, vcpuid); break; case VM_MAP_PPTDEV_MMIO: case VM_UNMAP_PPTDEV_MMIO: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: #endif case VM_ALLOC_MEMSEG: case VM_MMAP_MEMSEG: case VM_MUNMAP_MEMSEG: case VM_REINIT: /* * ioctls that operate on the entire virtual machine must * prevent all vcpus from running. */ error = vcpu_lock_all(sc); if (error) goto done; state_changed = 2; break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: #endif case VM_GET_MEMSEG: case VM_MMAP_GETNEXT: /* * Lock a vcpu to make sure that the memory map cannot be * modified while it is being inspected. */ vcpuid = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, vcpuid); if (error) goto done; state_changed = 1; break; +#ifdef COMPAT_FREEBSD13 + case VM_STATS_OLD: +#endif + case VM_STATS: + case VM_INJECT_NMI: + case VM_LAPIC_IRQ: + case VM_GET_X2APIC_STATE: + /* + * These do not need the vCPU locked but do operate on + * a specific vCPU. + */ + vcpuid = *(int *)data; + vcpu = lookup_vcpu(sc->vm, vcpuid); + if (vcpu == NULL) { + error = EINVAL; + goto done; + } + break; + + case VM_LAPIC_LOCAL_IRQ: + case VM_SUSPEND_CPU: + case VM_RESUME_CPU: + /* + * These can either operate on all CPUs via a vcpuid of + * -1 or on a specific vCPU. + */ + vcpuid = *(int *)data; + if (vcpuid == -1) + break; + vcpu = lookup_vcpu(sc->vm, vcpuid); + if (vcpu == NULL) { + error = EINVAL; + goto done; + } + break; + default: break; } switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; - error = vm_run(sc->vm, vmrun); + error = vm_run(vcpu, &vmrun->vm_exit); break; case VM_SUSPEND: vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } #ifdef COMPAT_FREEBSD13 case VM_STATS_OLD: vmstats_old = (struct vm_stats_old *)data; getmicrotime(&vmstats_old->tv); - error = vmm_stat_copy(sc->vm, vmstats_old->cpuid, 0, + error = vmm_stat_copy(vcpu, 0, nitems(vmstats_old->statbuf), &vmstats_old->num_entries, vmstats_old->statbuf); break; #endif case VM_STATS: { vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); - error = vmm_stat_copy(sc->vm, vmstats->cpuid, vmstats->index, + error = vmm_stat_copy(vcpu, vmstats->index, nitems(vmstats->statbuf), &vmstats->num_entries, vmstats->statbuf); break; } case VM_PPTDEV_MSI: pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->bus, pptmsi->slot, pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; case VM_PPTDEV_MSIX: pptmsix = (struct vm_pptdev_msix *)data; error = ppt_setup_msix(sc->vm, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, pptmsix->vector_control); break; case VM_PPTDEV_DISABLE_MSIX: pptdev = (struct vm_pptdev *)data; error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; case VM_UNMAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len); break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_INJECT_EXCEPTION: vmexc = (struct vm_exception *)data; error = vm_inject_exception(vcpu, vmexc->vector, vmexc->error_code_valid, vmexc->error_code, vmexc->restart_instruction); break; case VM_INJECT_NMI: - vmnmi = (struct vm_nmi *)data; - error = vm_inject_nmi(sc->vm, vmnmi->cpuid); + error = vm_inject_nmi(vcpu); break; case VM_LAPIC_IRQ: vmirq = (struct vm_lapic_irq *)data; - error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); + error = lapic_intr_edge(vcpu, vmirq->vector); break; case VM_LAPIC_LOCAL_IRQ: vmirq = (struct vm_lapic_irq *)data; - error = lapic_set_local_intr(sc->vm, vmirq->cpuid, - vmirq->vector); + error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector); break; case VM_LAPIC_MSI: vmmsi = (struct vm_lapic_msi *)data; error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); break; case VM_IOAPIC_ASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_DEASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PULSE_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PINCOUNT: *(int *)data = vioapic_pincount(sc->vm); break; case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { mem_region_write_t mwrite; mem_region_read_t mread; bool arg; kernemu = (void *)data; if (kernemu->access_width > 0) size = (1u << kernemu->access_width); else size = 1; if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { error = EINVAL; break; } if (cmd == VM_SET_KERNEMU_DEV) error = mwrite(vcpu, kernemu->gpa, kernemu->value, size, &arg); else error = mread(vcpu, kernemu->gpa, &kernemu->value, size, &arg); break; } case VM_ISA_ASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_assert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_DEASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_deassert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_PULSE_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_SET_IRQ_TRIGGER: isa_irq_trigger = (struct vm_isa_irq_trigger *)data; error = vatpic_set_irq_trigger(sc->vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; case VM_MMAP_GETNEXT: mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; case VM_MMAP_MEMSEG: mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; case VM_MUNMAP_MEMSEG: mu = (struct vm_munmap *)data; error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); break; #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_ALLOC_MEMSEG: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval); break; case VM_SET_REGISTER: vmreg = (struct vm_register *)data; error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval); break; case VM_SET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; - error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, + error = vm_set_seg_desc(vcpu, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_get_seg_desc(vcpu, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(vcpu, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_SET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(vcpu, vmregset->count, regnums, regvals); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_GET_CAPABILITY: vmcap = (struct vm_capability *)data; - error = vm_get_capability(sc->vm, vmcap->cpuid, + error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval); break; case VM_SET_CAPABILITY: vmcap = (struct vm_capability *)data; - error = vm_set_capability(sc->vm, vmcap->cpuid, + error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval); break; case VM_SET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; - error = vm_set_x2apic_state(sc->vm, - x2apic->cpuid, x2apic->state); + error = vm_set_x2apic_state(vcpu, x2apic->state); break; case VM_GET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; - error = vm_get_x2apic_state(sc->vm, - x2apic->cpuid, &x2apic->state); + error = vm_get_x2apic_state(vcpu, &x2apic->state); break; case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), gpapte->gpa, gpapte->pte, &gpapte->ptenum); error = 0; break; case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; case VM_GLA2GPA: { CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa(vcpu, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; } case VM_GLA2GPA_NOFAULT: gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; case VM_ACTIVATE_CPU: - vac = (struct vm_activate_cpu *)data; - error = vm_activate_cpu(sc->vm, vac->vcpuid); + error = vm_activate_cpu(vcpu); break; case VM_GET_CPUS: error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; case VM_SUSPEND_CPU: - vac = (struct vm_activate_cpu *)data; - error = vm_suspend_cpu(sc->vm, vac->vcpuid); + error = vm_suspend_cpu(sc->vm, vcpu); break; case VM_RESUME_CPU: - vac = (struct vm_activate_cpu *)data; - error = vm_resume_cpu(sc->vm, vac->vcpuid); + error = vm_resume_cpu(sc->vm, vcpu); break; case VM_SET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_exit_intinfo(vcpu, vmii->info1); break; case VM_GET_INTINFO: vmii = (struct vm_intinfo *)data; - error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, - &vmii->info2); + error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2); break; case VM_RTC_WRITE: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_write(sc->vm, rtcdata->offset, rtcdata->value); break; case VM_RTC_READ: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_read(sc->vm, rtcdata->offset, &rtcdata->value); break; case VM_RTC_SETTIME: rtctime = (struct vm_rtc_time *)data; error = vrtc_set_time(sc->vm, rtctime->secs); break; case VM_RTC_GETTIME: error = 0; rtctime = (struct vm_rtc_time *)data; rtctime->secs = vrtc_get_time(sc->vm); break; case VM_RESTART_INSTRUCTION: error = vm_restart_instruction(vcpu); break; case VM_SET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, topology->maxcpus); break; case VM_GET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; #ifdef BHYVE_SNAPSHOT case VM_SNAPSHOT_REQ: snapshot_meta = (struct vm_snapshot_meta *)data; error = vm_snapshot_req(sc->vm, snapshot_meta); break; case VM_RESTORE_TIME: error = vm_restore_time(sc->vm); break; #endif default: error = ENOTTY; break; } if (state_changed == 1) vcpu_unlock_one(sc, vcpuid); else if (state_changed == 2) vcpu_unlock_all(sc); done: /* * Make sure that no handler returns a kernel-internal * error value to userspace. */ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; uint16_t lastcpu; bool sysmem; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vcpu_unlock_one(sc, lastcpu); return (error); } static void vmmdev_destroy(void *arg) { struct vmmdev_softc *sc = arg; struct devmem_softc *dsc; int error __diagused; error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } if (sc->cdev != NULL) destroy_dev(sc->cdev); if (sc->vm != NULL) vm_destroy(sc->vm); if (sc->ucred != NULL) crfree(sc->ucred); if ((sc->flags & VSC_LINKED) != 0) { mtx_lock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); mtx_unlock(&vmmdev_mtx); } free(sc, M_VMMDEV); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL || sc->cdev == NULL) { mtx_unlock(&vmmdev_mtx); error = EINVAL; goto out; } /* * Setting 'sc->cdev' to NULL is used to indicate that the VM * is scheduled for destruction. */ cdev = sc->cdev; sc->cdev = NULL; mtx_unlock(&vmmdev_mtx); /* * Destroy all cdevs: * * - any new operations on the 'cdev' will return an error (ENXIO). * * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); destroy_dev(dsc->cdev); devmem_destroy(dsc); } destroy_dev(cdev); vmmdev_destroy(sc); error = 0; out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", NULL); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { struct vm *vm; struct cdev *cdev; struct vmmdev_softc *sc, *sc2; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); mtx_unlock(&vmmdev_mtx); if (sc != NULL) { error = EEXIST; goto out; } error = vm_create(buf, &vm); if (error != 0) goto out; sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->ucred = crhold(curthread->td_ucred); sc->vm = vm; SLIST_INIT(&sc->devmem); /* * Lookup the name again just in case somebody sneaked in when we * dropped the lock. */ mtx_lock(&vmmdev_mtx); sc2 = vmmdev_lookup(buf); if (sc2 == NULL) { SLIST_INSERT_HEAD(&head, sc, link); sc->flags |= VSC_LINKED; } mtx_unlock(&vmmdev_mtx); if (sc2 != NULL) { vmmdev_destroy(sc); error = EEXIST; goto out; } error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); if (error != 0) { vmmdev_destroy(sc); goto out; } mtx_lock(&vmmdev_mtx); sc->cdev = cdev; sc->cdev->si_drv1 = sc; mtx_unlock(&vmmdev_mtx); out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", NULL); void vmmdev_init(void) { mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail."); } int vmmdev_cleanup(void) { int error; if (SLIST_EMPTY(&head)) error = 0; else error = EBUSY; return (error); } static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { struct devmem_softc *dsc; vm_ooffset_t first, last; size_t seglen; int error; uint16_t lastcpu; bool sysmem; dsc = cdev->si_drv1; if (dsc == NULL) { /* 'cdev' has been created but is not ready for use */ return (ENXIO); } first = *offset; last = *offset + len; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1; error = vcpu_lock_one(dsc->sc, lastcpu); if (error) return (error); error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); KASSERT(error == 0 && !sysmem && *objp != NULL, ("%s: invalid devmem segment %d", __func__, dsc->segid)); vcpu_unlock_one(dsc->sc, lastcpu); if (seglen >= last) { vm_object_reference(*objp); return (0); } else { return (EINVAL); } } static struct cdevsw devmemsw = { .d_name = "devmem", .d_version = D_VERSION, .d_mmap_single = devmem_mmap_single, }; static int devmem_create_cdev(const char *vmname, int segid, char *devname) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; int error; error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); if (error) return (error); dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(vmname); KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); if (sc->cdev == NULL) { /* virtual machine is being created or destroyed */ mtx_unlock(&vmmdev_mtx); free(dsc, M_VMMDEV); destroy_dev_sched_cb(cdev, NULL, 0); return (ENODEV); } dsc->segid = segid; dsc->name = devname; dsc->cdev = cdev; dsc->sc = sc; SLIST_INSERT_HEAD(&sc->devmem, dsc, link); mtx_unlock(&vmmdev_mtx); /* The 'cdev' is ready for use after 'si_drv1' is initialized */ cdev->si_drv1 = dsc; return (0); } static void devmem_destroy(void *arg) { struct devmem_softc *dsc = arg; KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); dsc->cdev = NULL; dsc->sc = NULL; } diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c index 50c02378a4c5..24c556a3c8ae 100644 --- a/sys/amd64/vmm/vmm_ioport.c +++ b/sys/amd64/vmm/vmm_ioport.c @@ -1,178 +1,177 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include "vatpic.h" #include "vatpit.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_ioport.h" #include "vmm_ktr.h" #define MAX_IOPORTS 1280 ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { [TIMER_MODE] = vatpit_handler, [TIMER_CNTR0] = vatpit_handler, [TIMER_CNTR1] = vatpit_handler, [TIMER_CNTR2] = vatpit_handler, [NMISC_PORT] = vatpit_nmisc_handler, [IO_ICU1] = vatpic_master_handler, [IO_ICU1 + ICU_IMR_OFFSET] = vatpic_master_handler, [IO_ICU2] = vatpic_slave_handler, [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, [IO_ELCR1] = vatpic_elc_handler, [IO_ELCR2] = vatpic_elc_handler, [IO_PMTMR] = vpmtmr_handler, [IO_RTC] = vrtc_addr_handler, [IO_RTC + 1] = vrtc_data_handler, }; #ifdef KTR static const char * inout_instruction(struct vm_exit *vmexit) { int index; static const char *iodesc[] = { "outb", "outw", "outl", "inb", "inw", "inl", "outsb", "outsw", "outsd", "insb", "insw", "insd", }; switch (vmexit->u.inout.bytes) { case 1: index = 0; break; case 2: index = 1; break; default: index = 2; break; } if (vmexit->u.inout.in) index += 3; if (vmexit->u.inout.string) index += 6; KASSERT(index < nitems(iodesc), ("%s: invalid index %d", __func__, index)); return (iodesc[index]); } #endif /* KTR */ static int -emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, - bool *retu) +emulate_inout_port(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu) { ioport_handler_func_t handler; uint32_t mask, val; int error; /* * If there is no handler for the I/O port then punt to userspace. */ if (vmexit->u.inout.port >= MAX_IOPORTS || (handler = ioport_handler[vmexit->u.inout.port]) == NULL) { *retu = true; return (0); } mask = vie_size2mask(vmexit->u.inout.bytes); if (!vmexit->u.inout.in) { val = vmexit->u.inout.eax & mask; } - error = (*handler)(vm, vmexit->u.inout.in, vmexit->u.inout.port, - vmexit->u.inout.bytes, &val); + error = (*handler)(vcpu_vm(vcpu), vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); if (error) { /* * The value returned by this function is also the return value * of vm_run(). This needs to be a positive number otherwise it * can be interpreted as a "pseudo-error" like ERESTART. * * Enforce this by mapping all errors to EIO. */ return (EIO); } if (vmexit->u.inout.in) { vmexit->u.inout.eax &= ~mask; vmexit->u.inout.eax |= val & mask; - error = vm_set_register(vm_vcpu(vm, vcpuid), VM_REG_GUEST_RAX, + error = vm_set_register(vcpu, VM_REG_GUEST_RAX, vmexit->u.inout.eax); KASSERT(error == 0, ("emulate_ioport: error %d setting guest " "rax register", error)); } *retu = false; return (0); } static int -emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +emulate_inout_str(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu) { *retu = true; return (0); /* Return to userspace to finish emulation */ } int -vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +vm_handle_inout(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu) { int bytes __diagused, error; bytes = vmexit->u.inout.bytes; KASSERT(bytes == 1 || bytes == 2 || bytes == 4, ("vm_handle_inout: invalid operand size %d", bytes)); if (vmexit->u.inout.string) - error = emulate_inout_str(vm, vcpuid, vmexit, retu); + error = emulate_inout_str(vcpu, vmexit, retu); else - error = emulate_inout_port(vm, vcpuid, vmexit, retu); + error = emulate_inout_port(vcpu, vmexit, retu); - VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s", + VCPU_CTR4(vcpu_vm(vcpu), vcpu_vcpuid(vcpu), "%s%s 0x%04x: %s", vmexit->u.inout.rep ? "rep " : "", inout_instruction(vmexit), vmexit->u.inout.port, error ? "error" : (*retu ? "userspace" : "handled")); return (error); } diff --git a/sys/amd64/vmm/vmm_ioport.h b/sys/amd64/vmm/vmm_ioport.h index fd20ecfd5e94..0b6f09f01e9a 100644 --- a/sys/amd64/vmm/vmm_ioport.h +++ b/sys/amd64/vmm/vmm_ioport.h @@ -1,39 +1,39 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_IOPORT_H_ #define _VMM_IOPORT_H_ typedef int (*ioport_handler_func_t)(struct vm *vm, bool in, int port, int bytes, uint32_t *val); -int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu); +int vm_handle_inout(struct vcpu *vcpu, struct vm_exit *vme, bool *retu); #endif /* _VMM_IOPORT_H_ */ diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index ae4205e11614..1b6c51c8283c 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -1,241 +1,237 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" /* * Some MSI message definitions */ #define MSI_X86_ADDR_MASK 0xfff00000 #define MSI_X86_ADDR_BASE 0xfee00000 #define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ #define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ int -lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) +lapic_set_intr(struct vcpu *vcpu, int vector, bool level) { struct vlapic *vlapic; - if (cpu < 0 || cpu >= vm_get_maxcpus(vm)) - return (EINVAL); - /* * According to section "Maskable Hardware Interrupts" in Intel SDM * vectors 16 through 255 can be delivered through the local APIC. */ if (vector < 16 || vector > 255) return (EINVAL); - vlapic = vm_lapic(vm_vcpu(vm, cpu)); + vlapic = vm_lapic(vcpu); if (vlapic_set_intr_ready(vlapic, vector, level)) - vcpu_notify_event(vm, cpu, true); + vcpu_notify_event(vcpu, true); return (0); } int -lapic_set_local_intr(struct vm *vm, int cpu, int vector) +lapic_set_local_intr(struct vm *vm, struct vcpu *vcpu, int vector) { struct vlapic *vlapic; cpuset_t dmask; - int error; + int cpu, error; - if (cpu < -1 || cpu >= vm_get_maxcpus(vm)) - return (EINVAL); - - if (cpu == -1) + if (vcpu == NULL) { + error = 0; dmask = vm_active_cpus(vm); - else - CPU_SETOF(cpu, &dmask); - error = 0; - CPU_FOREACH_ISSET(cpu, &dmask) { - vlapic = vm_lapic(vm_vcpu(vm, cpu)); + CPU_FOREACH_ISSET(cpu, &dmask) { + vlapic = vm_lapic(vm_vcpu(vm, cpu)); + error = vlapic_trigger_lvt(vlapic, vector); + if (error) + break; + } + } else { + vlapic = vm_lapic(vcpu); error = vlapic_trigger_lvt(vlapic, vector); - if (error) - break; } return (error); } int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) { int delmode, vec; uint32_t dest; bool phys; VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); return (-1); } /* * Extract the x86-specific fields from the MSI addr/msg * params according to the Intel Arch spec, Vol3 Ch 10. * * The PCI specification does not support level triggered * MSI/MSI-X so ignore trigger level in 'msg'. * * The 'dest' is interpreted as a logical APIC ID if both * the Redirection Hint and Destination Mode are '1' and * physical otherwise. */ dest = (addr >> 12) & 0xff; phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); delmode = msg & APIC_DELMODE_MASK; vec = msg & 0xff; VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", phys ? "physical" : "logical", dest, vec); vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); return (0); } static bool x2apic_msr(u_int msr) { return (msr >= 0x800 && msr <= 0xBFF); } static u_int x2apic_msr_to_regoff(u_int msr) { return ((msr - 0x800) << 4); } bool lapic_msr(u_int msr) { return (x2apic_msr(msr) || msr == MSR_APICBASE); } int lapic_rdmsr(struct vcpu *vcpu, u_int msr, uint64_t *rval, bool *retu) { int error; u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vcpu); if (msr == MSR_APICBASE) { *rval = vlapic_get_apicbase(vlapic); error = 0; } else { offset = x2apic_msr_to_regoff(msr); error = vlapic_read(vlapic, 0, offset, rval, retu); } return (error); } int lapic_wrmsr(struct vcpu *vcpu, u_int msr, uint64_t val, bool *retu) { int error; u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vcpu); if (msr == MSR_APICBASE) { error = vlapic_set_apicbase(vlapic, val); } else { offset = x2apic_msr_to_regoff(msr); error = vlapic_write(vlapic, 0, offset, val, retu); } return (error); } int lapic_mmio_write(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) { int error; uint64_t off; struct vlapic *vlapic; off = gpa - DEFAULT_APIC_BASE; /* * Memory mapped local apic accesses must be 4 bytes wide and * aligned on a 16-byte boundary. */ if (size != 4 || off & 0xf) return (EINVAL); vlapic = vm_lapic(vcpu); error = vlapic_write(vlapic, 1, off, wval, arg); return (error); } int lapic_mmio_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { int error; uint64_t off; struct vlapic *vlapic; off = gpa - DEFAULT_APIC_BASE; /* * Memory mapped local apic accesses should be aligned on a * 16-byte boundary. They are also suggested to be 4 bytes * wide, alas not all OSes follow suggestions. */ off &= ~3; if (off & 0xf) return (EINVAL); vlapic = vm_lapic(vcpu); error = vlapic_read(vlapic, 1, off, rval, arg); return (error); } diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index e917fd50a69d..a361cd09c332 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -1,76 +1,76 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_LAPIC_H_ #define _VMM_LAPIC_H_ struct vcpu; struct vm; bool lapic_msr(u_int num); int lapic_rdmsr(struct vcpu *vcpu, u_int msr, uint64_t *rval, bool *retu); int lapic_wrmsr(struct vcpu *vcpu, u_int msr, uint64_t wval, bool *retu); int lapic_mmio_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg); int lapic_mmio_write(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size, void *arg); /* * Signals to the LAPIC that an interrupt at 'vector' needs to be generated * to the 'cpu', the state is recorded in IRR. */ -int lapic_set_intr(struct vm *vm, int cpu, int vector, bool trig); +int lapic_set_intr(struct vcpu *vcpu, int vector, bool trig); #define LAPIC_TRIG_LEVEL true #define LAPIC_TRIG_EDGE false static __inline int -lapic_intr_level(struct vm *vm, int cpu, int vector) +lapic_intr_level(struct vcpu *vcpu, int vector) { - return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_LEVEL)); + return (lapic_set_intr(vcpu, vector, LAPIC_TRIG_LEVEL)); } static __inline int -lapic_intr_edge(struct vm *vm, int cpu, int vector) +lapic_intr_edge(struct vcpu *vcpu, int vector) { - return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE)); + return (lapic_set_intr(vcpu, vector, LAPIC_TRIG_EDGE)); } /* * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can * be set to -1 to trigger the interrupt on all CPUs. */ -int lapic_set_local_intr(struct vm *vm, int cpu, int vector); +int lapic_set_local_intr(struct vm *vm, struct vcpu *vcpu, int vector); int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg); #endif diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c index b317f5679bf3..2df4423bc60f 100644 --- a/sys/amd64/vmm/vmm_stat.c +++ b/sys/amd64/vmm/vmm_stat.c @@ -1,185 +1,182 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "vmm_util.h" #include "vmm_stat.h" /* * 'vst_num_elems' is the total number of addressable statistic elements * 'vst_num_types' is the number of unique statistic types * * It is always true that 'vst_num_elems' is greater than or equal to * 'vst_num_types'. This is because a stat type may represent more than * one element (for e.g. VMM_STAT_ARRAY). */ static int vst_num_elems, vst_num_types; static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); #define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) void vmm_stat_register(void *arg) { struct vmm_stat_type *vst = arg; /* We require all stats to identify themselves with a description */ if (vst->desc == NULL) return; if (vst->scope == VMM_STAT_SCOPE_INTEL && !vmm_is_intel()) return; if (vst->scope == VMM_STAT_SCOPE_AMD && !vmm_is_svm()) return; if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc); return; } vst->index = vst_num_elems; vst_num_elems += vst->nelems; vsttab[vst_num_types++] = vst; } int -vmm_stat_copy(struct vm *vm, int vcpu, int index, int count, int *num_stats, +vmm_stat_copy(struct vcpu *vcpu, int index, int count, int *num_stats, uint64_t *buf) { struct vmm_stat_type *vst; uint64_t *stats; int i, tocopy; - if (vcpu < 0 || vcpu >= vm_get_maxcpus(vm)) - return (EINVAL); - if (index < 0 || count < 0) return (EINVAL); if (index > vst_num_elems) return (ENOENT); if (index == vst_num_elems) { *num_stats = 0; return (0); } tocopy = min(vst_num_elems - index, count); /* Let stats functions update their counters */ for (i = 0; i < vst_num_types; i++) { vst = vsttab[i]; if (vst->func != NULL) - (*vst->func)(vm, vcpu, vst); + (*vst->func)(vcpu, vst); } /* Copy over the stats */ - stats = vcpu_stats(vm_vcpu(vm, vcpu)); + stats = vcpu_stats(vcpu); memcpy(buf, stats + index, tocopy * sizeof(stats[0])); *num_stats = tocopy; return (0); } void * vmm_stat_alloc(void) { return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); } void vmm_stat_init(void *vp) { bzero(vp, vst_size); } void vmm_stat_free(void *vp) { free(vp, M_VMM_STAT); } int vmm_stat_desc_copy(int index, char *buf, int bufsize) { int i; struct vmm_stat_type *vst; for (i = 0; i < vst_num_types; i++) { vst = vsttab[i]; if (index >= vst->index && index < vst->index + vst->nelems) { if (vst->nelems > 1) { snprintf(buf, bufsize, "%s[%d]", vst->desc, index - vst->index); } else { strlcpy(buf, vst->desc, bufsize); } return (0); /* found it */ } } return (EINVAL); } /* global statistics */ VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h index 69e7b9367686..e40a960ec82a 100644 --- a/sys/amd64/vmm/vmm_stat.h +++ b/sys/amd64/vmm/vmm_stat.h @@ -1,161 +1,161 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_STAT_H_ #define _VMM_STAT_H_ struct vm; #define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ enum vmm_stat_scope { VMM_STAT_SCOPE_ANY, VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */ VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ }; struct vmm_stat_type; -typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, +typedef void (*vmm_stat_func_t)(struct vcpu *vcpu, struct vmm_stat_type *stat); struct vmm_stat_type { int index; /* position in the stats buffer */ int nelems; /* standalone or array */ const char *desc; /* description of statistic */ vmm_stat_func_t func; enum vmm_stat_scope scope; }; void vmm_stat_register(void *arg); #define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ struct vmm_stat_type type[1] = { \ { -1, nelems, desc, func, scope } \ }; \ SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) #define VMM_STAT_DEFINE(type, nelems, desc, scope) \ VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) #define VMM_STAT_DECLARE(type) \ extern struct vmm_stat_type type[1] #define VMM_STAT(type, desc) \ VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) #define VMM_STAT_INTEL(type, desc) \ VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL) #define VMM_STAT_AMD(type, desc) \ VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) #define VMM_STAT_FUNC(type, desc, func) \ VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) #define VMM_STAT_ARRAY(type, nelems, desc) \ VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) void *vmm_stat_alloc(void); void vmm_stat_init(void *vp); void vmm_stat_free(void *vp); -int vmm_stat_copy(struct vm *vm, int vcpu, int index, int count, +int vmm_stat_copy(struct vcpu *vcpu, int index, int count, int *num_stats, uint64_t *buf); int vmm_stat_desc_copy(int index, char *buf, int buflen); static void __inline vmm_stat_array_incr(struct vcpu *vcpu, struct vmm_stat_type *vst, int statidx, uint64_t x) { #ifdef VMM_KEEP_STATS uint64_t *stats; stats = vcpu_stats(vcpu); if (vst->index >= 0 && statidx < vst->nelems) stats[vst->index + statidx] += x; #endif } static void __inline vmm_stat_array_set(struct vcpu *vcpu, struct vmm_stat_type *vst, int statidx, uint64_t val) { #ifdef VMM_KEEP_STATS uint64_t *stats; stats = vcpu_stats(vcpu); if (vst->index >= 0 && statidx < vst->nelems) stats[vst->index + statidx] = val; #endif } static void __inline vmm_stat_incr(struct vcpu *vcpu, struct vmm_stat_type *vst, uint64_t x) { #ifdef VMM_KEEP_STATS vmm_stat_array_incr(vcpu, vst, 0, x); #endif } static void __inline vmm_stat_set(struct vcpu *vcpu, struct vmm_stat_type *vst, uint64_t val) { #ifdef VMM_KEEP_STATS vmm_stat_array_set(vcpu, vst, 0, val); #endif } VMM_STAT_DECLARE(VCPU_MIGRATIONS); VMM_STAT_DECLARE(VMEXIT_COUNT); VMM_STAT_DECLARE(VMEXIT_EXTINT); VMM_STAT_DECLARE(VMEXIT_HLT); VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); VMM_STAT_DECLARE(VMEXIT_RDMSR); VMM_STAT_DECLARE(VMEXIT_WRMSR); VMM_STAT_DECLARE(VMEXIT_MTRAP); VMM_STAT_DECLARE(VMEXIT_PAUSE); VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); VMM_STAT_DECLARE(VMEXIT_INOUT); VMM_STAT_DECLARE(VMEXIT_CPUID); VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); VMM_STAT_DECLARE(VMEXIT_INST_EMUL); VMM_STAT_DECLARE(VMEXIT_UNKNOWN); VMM_STAT_DECLARE(VMEXIT_ASTPENDING); VMM_STAT_DECLARE(VMEXIT_USERSPACE); VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); VMM_STAT_DECLARE(VMEXIT_EXCEPTION); VMM_STAT_DECLARE(VMEXIT_REQIDLE); #endif diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c index 9a4bb049e4c9..d99fb391afba 100644 --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -1,752 +1,752 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "vmm_host.h" #include "vmm_ktr.h" #include "vmm_util.h" #include "x86.h" SYSCTL_DECL(_hw_vmm); static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, NULL); #define CPUID_VM_HIGH 0x40000000 static const char bhyve_id[12] = "bhyve bhyve "; static uint64_t bhyve_xcpuids; SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, "Number of times an unknown cpuid leaf was accessed"); #if __FreeBSD_version < 1200060 /* Remove after 11 EOL helps MFCing */ extern u_int threads_per_core; SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN, &threads_per_core, 0, NULL); extern u_int cores_per_package; SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN, &cores_per_package, 0, NULL); #endif static int cpuid_leaf_b = 1; SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, &cpuid_leaf_b, 0, NULL); /* * Round up to the next power of two, if necessary, and then take log2. * Returns -1 if argument is zero. */ static __inline int log2(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } int x86_emulate_cpuid(struct vcpu *vcpu, uint64_t *rax, uint64_t *rbx, uint64_t *rcx, uint64_t *rdx) { struct vm *vm = vcpu_vm(vcpu); int vcpu_id = vcpu_vcpuid(vcpu); const struct xsave_limits *limits; uint64_t cr4; int error, enable_invpcid, enable_rdpid, enable_rdtscp, level, width, x2apic_id; unsigned int func, regs[4], logical_cpus, param; enum x2apic_state x2apic_state; uint16_t cores, maxcpus, sockets, threads; /* * The function of CPUID is controlled through the provided value of * %eax (and secondarily %ecx, for certain leaf data). */ func = (uint32_t)*rax; param = (uint32_t)*rcx; VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", func, param); /* * Requests for invalid CPUID levels should map to the highest * available level instead. */ if (cpu_exthigh != 0 && func >= 0x80000000) { if (func > cpu_exthigh) func = cpu_exthigh; } else if (func >= 0x40000000) { if (func > CPUID_VM_HIGH) func = CPUID_VM_HIGH; } else if (func > cpu_high) { func = cpu_high; } /* * In general the approach used for CPU topology is to * advertise a flat topology where all CPUs are packages with * no multi-core or SMT. */ switch (func) { /* * Pass these through to the guest */ case CPUID_0000_0000: case CPUID_0000_0002: case CPUID_0000_0003: case CPUID_8000_0000: case CPUID_8000_0002: case CPUID_8000_0003: case CPUID_8000_0004: case CPUID_8000_0006: cpuid_count(func, param, regs); break; case CPUID_8000_0008: cpuid_count(func, param, regs); if (vmm_is_svm()) { /* * As on Intel (0000_0007:0, EDX), mask out * unsupported or unsafe AMD extended features * (8000_0008 EBX). */ regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | AMDFEID_XSAVEERPTR); vm_get_topology(vm, &sockets, &cores, &threads, &maxcpus); /* * Here, width is ApicIdCoreIdSize, present on * at least Family 15h and newer. It * represents the "number of bits in the * initial apicid that indicate thread id * within a package." * * Our topo_probe_amd() uses it for * pkg_id_shift and other OSes may rely on it. */ width = MIN(0xF, log2(threads * cores)); if (width < 0x4) width = 0; logical_cpus = MIN(0xFF, threads * cores - 1); regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus; } break; case CPUID_8000_0001: cpuid_count(func, param, regs); /* * Hide SVM from guest. */ regs[2] &= ~AMDID2_SVM; /* * Don't advertise extended performance counter MSRs * to the guest. */ regs[2] &= ~AMDID2_PCXC; regs[2] &= ~AMDID2_PNXC; regs[2] &= ~AMDID2_PTSCEL2I; /* * Don't advertise Instruction Based Sampling feature. */ regs[2] &= ~AMDID2_IBS; /* NodeID MSR not available */ regs[2] &= ~AMDID2_NODE_ID; /* Don't advertise the OS visible workaround feature */ regs[2] &= ~AMDID2_OSVW; /* Hide mwaitx/monitorx capability from the guest */ regs[2] &= ~AMDID2_MWAITX; /* Advertise RDTSCP if it is enabled. */ - error = vm_get_capability(vm, vcpu_id, + error = vm_get_capability(vcpu, VM_CAP_RDTSCP, &enable_rdtscp); if (error == 0 && enable_rdtscp) regs[3] |= AMDID_RDTSCP; else regs[3] &= ~AMDID_RDTSCP; break; case CPUID_8000_0007: /* * AMD uses this leaf to advertise the processor's * power monitoring and RAS capabilities. These * features are hardware-specific and exposing * them to a guest doesn't make a lot of sense. * * Intel uses this leaf only to advertise the * "Invariant TSC" feature with all other bits * being reserved (set to zero). */ regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; /* * "Invariant TSC" can be advertised to the guest if: * - host TSC frequency is invariant * - host TSCs are synchronized across physical cpus * * XXX This still falls short because the vcpu * can observe the TSC moving backwards as it * migrates across physical cpus. But at least * it should discourage the guest from using the * TSC to keep track of time. */ if (tsc_is_invariant && smp_tsc) regs[3] |= AMDPM_TSC_INVARIANT; break; case CPUID_8000_001D: /* AMD Cache topology, like 0000_0004 for Intel. */ if (!vmm_is_svm()) goto default_leaf; /* * Similar to Intel, generate a ficticious cache * topology for the guest with L3 shared by the * package, and L1 and L2 local to a core. */ vm_get_topology(vm, &sockets, &cores, &threads, &maxcpus); switch (param) { case 0: logical_cpus = threads; level = 1; func = 1; /* data cache */ break; case 1: logical_cpus = threads; level = 2; func = 3; /* unified cache */ break; case 2: logical_cpus = threads * cores; level = 3; func = 3; /* unified cache */ break; default: logical_cpus = 0; level = 0; func = 0; break; } logical_cpus = MIN(0xfff, logical_cpus - 1); regs[0] = (logical_cpus << 14) | (1 << 8) | (level << 5) | func; regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; regs[2] = 0; regs[3] = 0; break; case CPUID_8000_001E: /* * AMD Family 16h+ and Hygon Family 18h additional * identifiers. */ if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16) goto default_leaf; vm_get_topology(vm, &sockets, &cores, &threads, &maxcpus); regs[0] = vcpu_id; threads = MIN(0xFF, threads - 1); regs[1] = (threads << 8) | (vcpu_id >> log2(threads + 1)); /* * XXX Bhyve topology cannot yet represent >1 node per * processor. */ regs[2] = 0; regs[3] = 0; break; case CPUID_0000_0001: do_cpuid(1, regs); - error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + error = vm_get_x2apic_state(vcpu, &x2apic_state); if (error) { panic("x86_emulate_cpuid: error %d " "fetching x2apic state", error); } /* * Override the APIC ID only in ebx */ regs[1] &= ~(CPUID_LOCAL_APIC_ID); regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); /* * Don't expose VMX, SpeedStep, TME or SMX capability. * Advertise x2APIC capability and Hypervisor guest. */ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); regs[2] &= ~(CPUID2_SMX); regs[2] |= CPUID2_HV; if (x2apic_state != X2APIC_DISABLED) regs[2] |= CPUID2_X2APIC; else regs[2] &= ~CPUID2_X2APIC; /* * Only advertise CPUID2_XSAVE in the guest if * the host is using XSAVE. */ if (!(regs[2] & CPUID2_OSXSAVE)) regs[2] &= ~CPUID2_XSAVE; /* * If CPUID2_XSAVE is being advertised and the * guest has set CR4_XSAVE, set * CPUID2_OSXSAVE. */ regs[2] &= ~CPUID2_OSXSAVE; if (regs[2] & CPUID2_XSAVE) { error = vm_get_register(vcpu, VM_REG_GUEST_CR4, &cr4); if (error) panic("x86_emulate_cpuid: error %d " "fetching %%cr4", error); if (cr4 & CR4_XSAVE) regs[2] |= CPUID2_OSXSAVE; } /* * Hide monitor/mwait until we know how to deal with * these instructions. */ regs[2] &= ~CPUID2_MON; /* * Hide the performance and debug features. */ regs[2] &= ~CPUID2_PDCM; /* * No TSC deadline support in the APIC yet */ regs[2] &= ~CPUID2_TSCDLT; /* * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); /* * Hide the debug store capability. */ regs[3] &= ~CPUID_DS; /* * Advertise the Machine Check and MTRR capability. * * Some guest OSes (e.g. Windows) will not boot if * these features are absent. */ regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); vm_get_topology(vm, &sockets, &cores, &threads, &maxcpus); logical_cpus = threads * cores; regs[1] &= ~CPUID_HTT_CORES; regs[1] |= (logical_cpus & 0xff) << 16; regs[3] |= CPUID_HTT; break; case CPUID_0000_0004: cpuid_count(func, param, regs); if (regs[0] || regs[1] || regs[2] || regs[3]) { vm_get_topology(vm, &sockets, &cores, &threads, &maxcpus); regs[0] &= 0x3ff; regs[0] |= (cores - 1) << 26; /* * Cache topology: * - L1 and L2 are shared only by the logical * processors in a single core. * - L3 and above are shared by all logical * processors in the package. */ logical_cpus = threads; level = (regs[0] >> 5) & 0x7; if (level >= 3) logical_cpus *= cores; regs[0] |= (logical_cpus - 1) << 14; } break; case CPUID_0000_0007: regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; /* leaf 0 */ if (param == 0) { cpuid_count(func, param, regs); /* Only leaf 0 is supported */ regs[0] = 0; /* * Expose known-safe features. */ regs[1] &= (CPUID_STDEXT_FSGSBASE | CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP | CPUID_STDEXT_BMI2 | CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | CPUID_STDEXT_AVX512F | CPUID_STDEXT_RDSEED | CPUID_STDEXT_SMAP | CPUID_STDEXT_AVX512PF | CPUID_STDEXT_AVX512ER | CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA); regs[2] = 0; regs[3] &= CPUID_STDEXT3_MD_CLEAR; /* Advertise RDPID if it is enabled. */ - error = vm_get_capability(vm, vcpu_id, - VM_CAP_RDPID, &enable_rdpid); + error = vm_get_capability(vcpu, VM_CAP_RDPID, + &enable_rdpid); if (error == 0 && enable_rdpid) regs[2] |= CPUID_STDEXT2_RDPID; /* Advertise INVPCID if it is enabled. */ - error = vm_get_capability(vm, vcpu_id, + error = vm_get_capability(vcpu, VM_CAP_ENABLE_INVPCID, &enable_invpcid); if (error == 0 && enable_invpcid) regs[1] |= CPUID_STDEXT_INVPCID; } break; case CPUID_0000_0006: regs[0] = CPUTPM1_ARAT; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; case CPUID_0000_000A: /* * Handle the access, but report 0 for * all options */ regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; case CPUID_0000_000B: /* * Intel processor topology enumeration */ if (vmm_is_intel()) { vm_get_topology(vm, &sockets, &cores, &threads, &maxcpus); if (param == 0) { logical_cpus = threads; width = log2(logical_cpus); level = CPUID_TYPE_SMT; x2apic_id = vcpu_id; } if (param == 1) { logical_cpus = threads * cores; width = log2(logical_cpus); level = CPUID_TYPE_CORE; x2apic_id = vcpu_id; } if (!cpuid_leaf_b || param >= 2) { width = 0; logical_cpus = 0; level = 0; x2apic_id = 0; } regs[0] = width & 0x1f; regs[1] = logical_cpus & 0xffff; regs[2] = (level << 8) | (param & 0xff); regs[3] = x2apic_id; } else { regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; } break; case CPUID_0000_000D: limits = vmm_get_xsave_limits(); if (!limits->xsave_enabled) { regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; } cpuid_count(func, param, regs); switch (param) { case 0: /* * Only permit the guest to use bits * that are active in the host in * %xcr0. Also, claim that the * maximum save area size is * equivalent to the host's current * save area size. Since this runs * "inside" of vmrun(), it runs with * the guest's xcr0, so the current * save area size is correct as-is. */ regs[0] &= limits->xcr0_allowed; regs[2] = limits->xsave_max_size; regs[3] &= (limits->xcr0_allowed >> 32); break; case 1: /* Only permit XSAVEOPT. */ regs[0] &= CPUID_EXTSTATE_XSAVEOPT; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; default: /* * If the leaf is for a permitted feature, * pass through as-is, otherwise return * all zeroes. */ if (!(limits->xcr0_allowed & (1ul << param))) { regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; } break; } break; case CPUID_0000_000F: case CPUID_0000_0010: /* * Do not report any Resource Director Technology * capabilities. Exposing control of cache or memory * controller resource partitioning to the guest is not * at all sensible. * * This is already hidden at a high level by masking of * leaf 0x7. Even still, a guest may look here for * detailed capability information. */ regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; case CPUID_0000_0015: /* * Don't report CPU TSC/Crystal ratio and clock * values since guests may use these to derive the * local APIC frequency.. */ regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; case 0x40000000: regs[0] = CPUID_VM_HIGH; bcopy(bhyve_id, ®s[1], 4); bcopy(bhyve_id + 4, ®s[2], 4); bcopy(bhyve_id + 8, ®s[3], 4); break; default: default_leaf: /* * The leaf value has already been clamped so * simply pass this through, keeping count of * how many unhandled leaf values have been seen. */ atomic_add_long(&bhyve_xcpuids, 1); cpuid_count(func, param, regs); break; } /* * CPUID clears the upper 32-bits of the long-mode registers. */ *rax = regs[0]; *rbx = regs[1]; *rcx = regs[2]; *rdx = regs[3]; return (1); } bool vm_cpuid_capability(struct vcpu *vcpu, enum vm_cpuid_capability cap) { bool rv; KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d", __func__, cap)); /* * Simply passthrough the capabilities of the host cpu for now. */ rv = false; switch (cap) { case VCC_NO_EXECUTE: if (amd_feature & AMDID_NX) rv = true; break; case VCC_FFXSR: if (amd_feature & AMDID_FFXSR) rv = true; break; case VCC_TCE: if (amd_feature2 & AMDID2_TCE) rv = true; break; default: panic("%s: unknown vm_cpu_capability %d", __func__, cap); } return (rv); } int vm_rdmtrr(struct vm_mtrr *mtrr, u_int num, uint64_t *val) { switch (num) { case MSR_MTRRcap: *val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX; break; case MSR_MTRRdefType: *val = mtrr->def_type; break; case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: *val = mtrr->fixed4k[num - MSR_MTRR4kBase]; break; case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: *val = mtrr->fixed16k[num - MSR_MTRR16kBase]; break; case MSR_MTRR64kBase: *val = mtrr->fixed64k; break; case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { u_int offset = num - MSR_MTRRVarBase; if (offset % 2 == 0) { *val = mtrr->var[offset / 2].base; } else { *val = mtrr->var[offset / 2].mask; } break; } default: return (-1); } return (0); } int vm_wrmtrr(struct vm_mtrr *mtrr, u_int num, uint64_t val) { switch (num) { case MSR_MTRRcap: /* MTRRCAP is read only */ return (-1); case MSR_MTRRdefType: if (val & ~VMM_MTRR_DEF_MASK) { /* generate #GP on writes to reserved fields */ return (-1); } mtrr->def_type = val; break; case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: mtrr->fixed4k[num - MSR_MTRR4kBase] = val; break; case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: mtrr->fixed16k[num - MSR_MTRR16kBase] = val; break; case MSR_MTRR64kBase: mtrr->fixed64k = val; break; case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { u_int offset = num - MSR_MTRRVarBase; if (offset % 2 == 0) { if (val & ~VMM_MTRR_PHYSBASE_MASK) { /* generate #GP on writes to reserved fields */ return (-1); } mtrr->var[offset / 2].base = val; } else { if (val & ~VMM_MTRR_PHYSMASK_MASK) { /* generate #GP on writes to reserved fields */ return (-1); } mtrr->var[offset / 2].mask = val; } break; } default: return (-1); } return (0); }