diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -794,4 +794,52 @@ void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2); +/* + * Describes an entry for `cpuid` emulation. + * Used internally by bhyve (kernel) in addition to exposed ioctl(2) interface. + */ +struct vcpu_cpuid_entry { + uint32_t vce_function; + uint32_t vce_index; + uint32_t vce_flags; + uint32_t vce_eax; + uint32_t vce_ebx; + uint32_t vce_ecx; + uint32_t vce_edx; + uint32_t _pad; +}; + +/* + * Defined flags for vcpu_cpuid_entry`vce_flags are below. + */ + +/* Use index (ecx) input value when matching entry */ +#define VCE_FLAG_MATCH_INDEX (1 << 0) + +/* All valid flags for vcpu_cpuid_entry`vce_flags */ +#define VCE_FLAGS_VALID VCE_FLAG_MATCH_INDEX + +/* + * Defined flags for vcpu_cpuid configuration are below. + * These are used by both the ioctl(2) interface via vm_vcpu_cpuid_config and + * internally in the kernel vmm. + */ + +/* Use legacy hard-coded cpuid masking tables applied to the host CPU */ +#define VCC_FLAG_LEGACY_HANDLING (1 << 0) +/* + * Emulate Intel-style fallback behavior (emit highest "standard" entry) if the + * queried function/index do not match. If not set, emulate AMD-style, where + * all zeroes are returned in such cases. + */ +#define VCC_FLAG_INTEL_FALLBACK (1 << 1) + +/* All valid flags for vm_vcpu_cpuid_config`vvcc_flags */ +#define VCC_FLAGS_VALID \ + (VCC_FLAG_LEGACY_HANDLING | VCC_FLAG_INTEL_FALLBACK) + +/* Maximum vcpu_cpuid_entry records per vCPU */ +#define VMM_MAX_CPUID_ENTRIES 256 + + #endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -251,6 +251,23 @@ }; _Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI"); +struct vm_vcpu_cpuid_config { + int vvcc_vcpuid; + uint32_t vvcc_flags; + uint32_t vvcc_nent; + uint32_t _pad; + void *vvcc_entries; +}; + +/* Query the computed legacy cpuid value for a vcpuid with VM_LEGACY_CPUID */ +struct vm_legacy_cpuid { + int vlc_vcpuid; + uint32_t vlc_eax; + uint32_t vlc_ebx; + uint32_t vlc_ecx; + uint32_t vlc_edx; +}; + enum { /* general routines */ IOCNUM_ABIVERS = 0, @@ -318,6 +335,11 @@ IOCNUM_SET_TOPOLOGY = 63, IOCNUM_GET_TOPOLOGY = 64, + /* CPUID override */ + IOCNUM_GET_CPUID = 65, + IOCNUM_SET_CPUID = 66, + IOCNUM_LEGACY_CPUID = 67, + /* legacy interrupt injection */ IOCNUM_ISA_ASSERT_IRQ = 80, IOCNUM_ISA_DEASSERT_IRQ = 81, @@ -434,6 +456,12 @@ _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_TOPOLOGY \ _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_CPUID \ + _IOWR('v', IOCNUM_GET_CPUID, struct vm_vcpu_cpuid_config) +#define VM_SET_CPUID \ + _IOW('v', IOCNUM_SET_CPUID, struct vm_vcpu_cpuid_config) +#define VM_LEGACY_CPUID \ + _IOWR('v', IOCNUM_LEGACY_CPUID, struct vm_legacy_cpuid) #define VM_GET_GPA_PMAP \ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) #define VM_GLA2GPA \ diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -1592,9 +1592,9 @@ break; case VMCB_EXIT_CPUID: vmm_stat_incr(vcpu->vcpu, VMEXIT_CPUID, 1); - handled = x86_emulate_cpuid(vcpu->vcpu, - &state->rax, &ctx->sctx_rbx, &ctx->sctx_rcx, - &ctx->sctx_rdx); + vcpu_emulate_cpuid(vcpu->vcpu, &state->rax, + &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx); + handled = 1; break; case VMCB_EXIT_HLT: vmm_stat_incr(vcpu->vcpu, VMEXIT_HLT, 1); diff --git 
a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1247,17 +1247,6 @@ return (vcpu); } -static int -vmx_handle_cpuid(struct vmx_vcpu *vcpu, struct vmxctx *vmxctx) -{ - int handled; - - handled = x86_emulate_cpuid(vcpu->vcpu, (uint64_t *)&vmxctx->guest_rax, - (uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx, - (uint64_t *)&vmxctx->guest_rdx); - return (handled); -} - static __inline void vmx_run_trace(struct vmx_vcpu *vcpu) { @@ -2668,7 +2657,12 @@ case EXIT_REASON_CPUID: vmm_stat_incr(vcpu->vcpu, VMEXIT_CPUID, 1); SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpuid, vmexit); - handled = vmx_handle_cpuid(vcpu, vmxctx); + vcpu_emulate_cpuid(vcpu->vcpu, + (uint64_t *)&vmxctx->guest_rax, + (uint64_t *)&vmxctx->guest_rbx, + (uint64_t *)&vmxctx->guest_rcx, + (uint64_t *)&vmxctx->guest_rdx); + handled = HANDLED; break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vcpu->vcpu, VMEXIT_EXCEPTION, 1); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -86,6 +86,7 @@ #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" +#include "x86.h" #include "io/ppt.h" #include "io/iommu.h" @@ -123,6 +124,7 @@ cpuset_t exitinfo_cpuset; /* (x) storage for vmexit handlers */ uint64_t nextrip; /* (x) next instruction to execute */ uint64_t tsc_offset; /* (o) TSC offsetting */ + vcpu_cpuid_config_t cpuid_cfg; /* (x) cpuid configuration */ }; #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) @@ -335,6 +337,9 @@ vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); + + vcpu_cpuid_cleanup(&vcpu->cpuid_cfg); + fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); free(vcpu, M_VM); @@ -364,6 +369,8 @@ static void vcpu_init(struct vcpu *vcpu) { + vcpu_cpuid_init(&vcpu->cpuid_cfg); + vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie); vm_set_x2apic_state(vcpu, X2APIC_DISABLED); @@ -2132,6 +2139,12 @@ return (vmmops_setcap(vcpu->cookie, type, val)); } +vcpu_cpuid_config_t * +vm_cpuid_config(struct vcpu *vcpu) +{ + return (&vcpu->cpuid_cfg); +} + struct vm * vcpu_vm(struct vcpu *vcpu) { diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/vmm_cpuid.c copy from sys/amd64/vmm/x86.c copy to sys/amd64/vmm/vmm_cpuid.c --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/vmm_cpuid.c @@ -25,17 +25,23 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +/* + * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. + * Copyright 2022 Oxide Computer Company + */ #include -#include -#include +#include +#include #include +#include +#include #include -#include #include -#include #include + #include #include @@ -44,18 +50,273 @@ #include "vmm_util.h" #include "x86.h" -SYSCTL_DECL(_hw_vmm); -static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - NULL); +static MALLOC_DEFINE(M_CPUID, "cpuid", "cpuid"); -#define CPUID_VM_SIGNATURE 0x40000000 -#define CPUID_BHYVE_FEATURES 0x40000001 -#define CPUID_VM_HIGH CPUID_BHYVE_FEATURES +/* + * CPUID Emulation + * + * All CPUID instruction exits are handled by the in-kernel emulation. 
+ * + * ---------------- + * Legacy Emulation + * ---------------- + * + * Originally, the kernel vmm portion of bhyve relied on fixed logic to filter + * and/or generate CPUID results based on what was reported by the host CPU, as + * well as attributes of the VM (such as CPU topology, and enabled features). + * This is largely adequate to expose CPU capabilities to the guest in a manner + * which allows it to operate properly. + * + * ------------------------------ + * Userspace-Controlled Emulation + * ------------------------------ + * + * In certain situations, more control over the CPUID emulation results presented + * to the guest is desired. Live migration between physical hosts is one such + * example, where the underlying CPUs, or at least their microcode, may differ + * between the source and destination. In such cases, where changes to the + * CPUID results cannot be tolerated, the userspace portion of the VMM can be in + * complete control over the leaves which are presented to the guest. It may + * still consult the "legacy" CPUID data for guidance about which CPU features + * are safe to expose (due to hypervisor limitations, etc). This leaf + * information is configured on a per-vCPU basis. + * + * The emulation entries provided by userspace are expected to be in sorted + * order, running from lowest function and index to highest. + * + * For example: + * (func: 00h idx: 00h) -> + * (flags: 0, eax: highest std leaf, ebx-edx: vendor id) + * (func: 0Dh idx: 00h) -> + * (flags: VCE_FLAG_MATCH_INDEX, eax - edx: XCR0/XSAVE info) + * (func: 0Dh idx: 01h) -> + * (flags: VCE_FLAG_MATCH_INDEX, eax - edx: XSAVE/XSAVEOPT details) + * ... + * (func: 0Dh idx: 07h) -> + * (flags: VCE_FLAG_MATCH_INDEX, eax - edx: AVX-512 details) + * (func: 80000000h idx: 0h) -> + * (flags: 0, eax: highest extd leaf ...) + * ... + */ -/* Features advertised in CPUID_BHYVE_FEATURES %eax */ -#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) /* MSI Extended Dest ID */ -static const char bhyve_id[12] = "bhyve bhyve "; +#define CPUID_TYPE_MASK 0xf0000000 +#define CPUID_TYPE_STD 0x00000000 +#define CPUID_TYPE_EXTD 0x80000000 + +static const struct vcpu_cpuid_entry cpuid_empty_entry = { 0 }; + +/* + * Given the CPUID configuration for a vCPU, locate the entry which matches the + * provided function/index tuple. The entries list is walked in order, and the + * first valid match based on the function/index and flags will be emitted. + * + * If no match is found, but Intel-style fallback is configured, then the + * highest standard leaf encountered will be emitted. + */ +static const struct vcpu_cpuid_entry * +cpuid_find_entry(const vcpu_cpuid_config_t *cfg, uint32_t func, uint32_t idx) +{ + const struct vcpu_cpuid_entry *last_std = NULL; + const bool intel_fallback = + (cfg->vcc_flags & VCC_FLAG_INTEL_FALLBACK) != 0; + bool matched_leaf = false; + + KASSERT((cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING) == 0, + ("legacy CPUID handling enabled")); + + for (u_int i = 0; i < cfg->vcc_nent; i++) { + const struct vcpu_cpuid_entry *ent = &cfg->vcc_entries[i]; + const bool ent_is_std = + (ent->vce_function & CPUID_TYPE_MASK) == CPUID_TYPE_STD; + const bool ent_must_match_idx = + (ent->vce_flags & VCE_FLAG_MATCH_INDEX) != 0; + + if (ent_is_std) { + /* + * Keep track of the last "standard" leaf for + * Intel-style fallback behavior. + * + * This does not currently account for the sub-leaf + * index matching behavior for fallback described in the + * SDM.
It is not clear if any consumers rely on such + * matching when encountering fallback. + */ + last_std = ent; + } + if (ent->vce_function == func) { + if (ent->vce_index == idx || !ent_must_match_idx) { + return (ent); + } + /* + * Make note of when the top-level leaf matches, even + * when the index does not. + */ + matched_leaf = true; + } else if (ent->vce_function > func) { + if ((ent->vce_function & CPUID_TYPE_MASK) == + (func & CPUID_TYPE_MASK)) { + /* + * We are beyond a valid leaf to match, but have + * not exceeded the maximum leaf for this "type" + * (standard, extended, hvm, etc), so return an + * empty entry. + */ + return (&cpuid_empty_entry); + } else { + /* + * Otherwise, we can stop now, having gone + * beyond the last entry which could match the + * target function in a sorted list. + */ + break; + } + } + } + + if (matched_leaf || !intel_fallback) { + return (&cpuid_empty_entry); + } else { + return (last_std); + } +} + +void +vcpu_emulate_cpuid(struct vcpu *vcpu, uint64_t *rax, uint64_t *rbx, + uint64_t *rcx, uint64_t *rdx) +{ + const vcpu_cpuid_config_t *cfg = vm_cpuid_config(vcpu); + + KASSERT(rax != NULL, ("rax == NULL")); + KASSERT(rbx != NULL, ("rbx == NULL")); + KASSERT(rcx != NULL, ("rcx == NULL")); + KASSERT(rdx != NULL, ("rdx == NULL")); + + /* Fall back to legacy handling if specified */ + if ((cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { + uint32_t regs[4] = { *rax, 0, *rcx, 0 }; + + legacy_emulate_cpuid(vcpu, &regs[0], &regs[1], &regs[2], + &regs[3]); + /* CPUID clears the upper 32-bits of the long-mode registers. */ + *rax = regs[0]; + *rbx = regs[1]; + *rcx = regs[2]; + *rdx = regs[3]; + return; + } + + const struct vcpu_cpuid_entry *ent = cpuid_find_entry(cfg, *rax, *rcx); + KASSERT(ent != NULL, ("ent == NULL")); + /* CPUID clears the upper 32-bits of the long-mode registers. */ + *rax = ent->vce_eax; + *rbx = ent->vce_ebx; + *rcx = ent->vce_ecx; + *rdx = ent->vce_edx; +} + +/* + * Get the current CPUID emulation configuration for this vCPU. + * + * Only the existing flags will be emitted if the vCPU is configured for legacy + * operation via the VCC_FLAG_LEGACY_HANDLING flag. If in userspace-controlled + * mode, then we will attempt to copy the existing entries into vcc_entries, + * its size specified by vcc_nent. + * + * Regardless of whether vcc_entries is adequately sized (or even present), + * vcc_nent will be set to the number of existing entries. + */ +int +vm_get_cpuid(struct vcpu *vcpu, vcpu_cpuid_config_t *res) +{ + const vcpu_cpuid_config_t *src = vm_cpuid_config(vcpu); + if (src->vcc_nent > res->vcc_nent) { + res->vcc_nent = src->vcc_nent; + return (E2BIG); + } else if (src->vcc_nent != 0) { + bcopy(src->vcc_entries, res->vcc_entries, + src->vcc_nent * sizeof (struct vcpu_cpuid_entry)); + } + res->vcc_flags = src->vcc_flags; + res->vcc_nent = src->vcc_nent; + return (0); +} + +/* + * Set the CPUID emulation configuration for this vCPU. + * + * If VCC_FLAG_LEGACY_HANDLING is set in vcc_flags, then vcc_nent is expected to + * be set to 0, as configuring a list of entries would be useless when using the + * legacy handling. + * + * Any existing entries which are configured are freed, and the newly provided + * ones will be copied into their place.
+ */ +int +vm_set_cpuid(struct vcpu *vcpu, const vcpu_cpuid_config_t *src) +{ + if (src->vcc_nent > VMM_MAX_CPUID_ENTRIES) { + return (EINVAL); + } + if ((src->vcc_flags & ~VCC_FLAGS_VALID) != 0) { + return (EINVAL); + } + if ((src->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0 && + src->vcc_nent != 0) { + /* No entries should be provided if using legacy handling */ + return (EINVAL); + } + for (u_int i = 0; i < src->vcc_nent; i++) { + /* Ensure all entries carry valid flags */ + if ((src->vcc_entries[i].vce_flags & ~VCE_FLAGS_VALID) != 0) { + return (EINVAL); + } + } + + vcpu_cpuid_config_t *cfg = vm_cpuid_config(vcpu); + + /* Free any existing entries first */ + vcpu_cpuid_cleanup(cfg); + + /* Copy supplied entries into freshly allocated space */ + if (src->vcc_nent != 0) { + const size_t entries_sz = + src->vcc_nent * sizeof (struct vcpu_cpuid_entry); + + cfg->vcc_nent = src->vcc_nent; + cfg->vcc_entries = malloc(entries_sz, M_CPUID, M_WAITOK); + bcopy(src->vcc_entries, cfg->vcc_entries, entries_sz); + } + cfg->vcc_flags = src->vcc_flags; + + return (0); +} + +void +vcpu_cpuid_init(vcpu_cpuid_config_t *cfg) +{ + /* Default to legacy-style handling */ + cfg->vcc_flags = VCC_FLAG_LEGACY_HANDLING; + cfg->vcc_nent = 0; + cfg->vcc_entries = NULL; +} + +void +vcpu_cpuid_cleanup(vcpu_cpuid_config_t *cfg) +{ + if (cfg->vcc_nent != 0) { + KASSERT(cfg->vcc_entries != NULL, ("cfg->vcc_entries == NULL")); + + free(cfg->vcc_entries, M_CPUID); + + cfg->vcc_nent = 0; + cfg->vcc_entries = NULL; + } +} + +SYSCTL_DECL(_hw_vmm); +static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + NULL); static uint64_t bhyve_xcpuids; SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, @@ -65,6 +326,44 @@ SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, &cpuid_leaf_b, 0, NULL); +static const char bhyve_id[12] = "bhyve bhyve "; + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_0000_000F (0xF) +#define CPUID_0000_0010 (0x10) +#define CPUID_0000_0015 (0x15) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) +#define CPUID_8000_001D (0x8000001D) +#define CPUID_8000_001E (0x8000001E) + +#define CPUID_VM_SIGNATURE 0x40000000 +#define CPUID_BHYVE_FEATURES 0x40000001 +#define CPUID_VM_HIGH CPUID_BHYVE_FEATURES + +/* Features advertised in CPUID_BHYVE_FEATURES %eax */ +#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) /* MSI Extended Dest ID */ + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_SHIFT 24 + /* * Compute ceil(log2(x)). Returns -1 if x is zero. */ @@ -75,9 +374,13 @@ return (x == 0 ? -1 : order_base_2(x)); } -int -x86_emulate_cpuid(struct vcpu *vcpu, uint64_t *rax, uint64_t *rbx, - uint64_t *rcx, uint64_t *rdx) +/* + * The "legacy" bhyve cpuid emulation, which largely applies statically defined + * masks to the data provided by the host CPU.
+ */ +void +legacy_emulate_cpuid(struct vcpu *vcpu, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) { struct vm *vm = vcpu_vm(vcpu); int vcpu_id = vcpu_vcpuid(vcpu); @@ -93,8 +396,8 @@ * The function of CPUID is controlled through the provided value of * %eax (and secondarily %ecx, for certain leaf data). */ - func = (uint32_t)*rax; - param = (uint32_t)*rcx; + func = (uint32_t)*eax; + param = (uint32_t)*ecx; VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", func, param); @@ -155,8 +458,11 @@ * pkg_id_shift and other OSes may rely on it. */ width = MIN(0xF, log2(threads * cores)); + if (width < 0x4) + width = 0; logical_cpus = MIN(0xFF, threads * cores - 1); - regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus; + regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | + logical_cpus; } break; @@ -200,6 +506,7 @@ break; case CPUID_8000_0007: + cpuid_count(func, param, regs); /* * AMD uses this leaf to advertise the processor's * power monitoring and RAS capabilities. These @@ -236,7 +543,7 @@ goto default_leaf; /* - * Similar to Intel, generate a fictitious cache + * Similar to Intel, generate a fictitious cache * topology for the guest with L3 shared by the * package, and L1 and L2 local to a core. */ @@ -268,7 +575,7 @@ logical_cpus = MIN(0xfff, logical_cpus - 1); regs[0] = (logical_cpus << 14) | (1 << 8) | (level << 5) | func; - regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; + regs[1] = func > 0 ? CACHE_LINE_SIZE - 1 : 0; /* * ecx: Number of cache ways for non-fully @@ -360,7 +667,7 @@ */ regs[2] &= ~CPUID2_MON; - /* + /* * Hide the performance and debug features. */ regs[2] &= ~CPUID2_PDCM; @@ -615,9 +922,9 @@ case CPUID_BHYVE_FEATURES: regs[0] = CPUID_BHYVE_FEAT_EXT_DEST_ID; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; break; default: @@ -632,126 +939,8 @@ break; } - /* - * CPUID clears the upper 32-bits of the long-mode registers. - */ - *rax = regs[0]; - *rbx = regs[1]; - *rcx = regs[2]; - *rdx = regs[3]; - - return (1); -} - -bool -vm_cpuid_capability(struct vcpu *vcpu, enum vm_cpuid_capability cap) -{ - bool rv; - - KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d", - __func__, cap)); - - /* - * Simply passthrough the capabilities of the host cpu for now. - */ - rv = false; - switch (cap) { - case VCC_NO_EXECUTE: - if (amd_feature & AMDID_NX) - rv = true; - break; - case VCC_FFXSR: - if (amd_feature & AMDID_FFXSR) - rv = true; - break; - case VCC_TCE: - if (amd_feature2 & AMDID2_TCE) - rv = true; - break; - default: - panic("%s: unknown vm_cpu_capability %d", __func__, cap); - } - return (rv); -} - -int -vm_rdmtrr(struct vm_mtrr *mtrr, u_int num, uint64_t *val) -{ - switch (num) { - case MSR_MTRRcap: - *val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX; - break; - case MSR_MTRRdefType: - *val = mtrr->def_type; - break; - case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: - *val = mtrr->fixed4k[num - MSR_MTRR4kBase]; - break; - case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: - *val = mtrr->fixed16k[num - MSR_MTRR16kBase]; - break; - case MSR_MTRR64kBase: - *val = mtrr->fixed64k; - break; - case MSR_MTRRVarBase ...
MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { - u_int offset = num - MSR_MTRRVarBase; - if (offset % 2 == 0) { - *val = mtrr->var[offset / 2].base; - } else { - *val = mtrr->var[offset / 2].mask; - } - break; - } - default: - return (-1); - } - - return (0); -} - -int -vm_wrmtrr(struct vm_mtrr *mtrr, u_int num, uint64_t val) -{ - switch (num) { - case MSR_MTRRcap: - /* MTRRCAP is read only */ - return (-1); - case MSR_MTRRdefType: - if (val & ~VMM_MTRR_DEF_MASK) { - /* generate #GP on writes to reserved fields */ - return (-1); - } - mtrr->def_type = val; - break; - case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: - mtrr->fixed4k[num - MSR_MTRR4kBase] = val; - break; - case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: - mtrr->fixed16k[num - MSR_MTRR16kBase] = val; - break; - case MSR_MTRR64kBase: - mtrr->fixed64k = val; - break; - case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { - u_int offset = num - MSR_MTRRVarBase; - if (offset % 2 == 0) { - if (val & ~VMM_MTRR_PHYSBASE_MASK) { - /* generate #GP on writes to reserved fields */ - return (-1); - } - mtrr->var[offset / 2].base = val; - } else { - if (val & ~VMM_MTRR_PHYSMASK_MASK) { - /* generate #GP on writes to reserved fields */ - return (-1); - } - mtrr->var[offset / 2].mask = val; - } - break; - } - default: - return (-1); - } - - return (0); + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; } diff --git a/sys/amd64/vmm/vmm_dev_machdep.c b/sys/amd64/vmm/vmm_dev_machdep.c --- a/sys/amd64/vmm/vmm_dev_machdep.c +++ b/sys/amd64/vmm/vmm_dev_machdep.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,10 @@ #include "io/vioapic.h" #include "io/vhpet.h" #include "io/vrtc.h" +#include "x86.h" + + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); #ifdef COMPAT_FREEBSD13 struct vm_stats_13 { @@ -159,6 +164,9 @@ VMMDEV_IOCTL(VM_ISA_DEASSERT_IRQ, 0), VMMDEV_IOCTL(VM_ISA_PULSE_IRQ, 0), VMMDEV_IOCTL(VM_ISA_SET_IRQ_TRIGGER, 0), + VMMDEV_IOCTL(VM_GET_CPUID, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_SET_CPUID, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_LEGACY_CPUID, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_GET_GPA_PMAP, 0), VMMDEV_IOCTL(VM_GET_HPET_CAPABILITIES, 0), VMMDEV_IOCTL(VM_RTC_READ, 0), @@ -440,6 +448,110 @@ x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(vcpu, &x2apic->state); break; + case VM_GET_CPUID: { + struct vm_vcpu_cpuid_config *cfg = (void *)data; + struct vcpu_cpuid_entry *entries = NULL; + + if (cfg->vvcc_vcpuid != vcpu_vcpuid(vcpu)) { + error = EINVAL; + break; + } + if (cfg->vvcc_nent > VMM_MAX_CPUID_ENTRIES) { + error = EINVAL; + break; + } + + const size_t entries_size = + cfg->vvcc_nent * sizeof (struct vcpu_cpuid_entry); + if (entries_size != 0) { + entries = malloc(entries_size, M_VMMDEV, M_WAITOK); + bzero(entries, entries_size); + } + + vcpu_cpuid_config_t vm_cfg = { + .vcc_nent = cfg->vvcc_nent, + .vcc_entries = entries, + }; + error = vm_get_cpuid(vcpu, &vm_cfg); + + /* + * Only attempt to copy out the resultant entries if we were + * able to query them from the instance. The flags and number + * of entries are emitted regardless. + */ + cfg->vvcc_flags = vm_cfg.vcc_flags; + cfg->vvcc_nent = vm_cfg.vcc_nent; + if (entries != NULL) { + if (error == 0) + error = copyout(entries, cfg->vvcc_entries, + entries_size); + + free(entries, M_VMMDEV); + } + + /* + * If vm_get_cpuid() returned E2BIG, clear the error to allow + * flags and number of entries to be returned. 
+ */ + if (error == E2BIG) + error = 0; + break; + } + case VM_SET_CPUID: { + struct vm_vcpu_cpuid_config *cfg = (void *)data; + struct vcpu_cpuid_entry *entries = NULL; + size_t entries_size = 0; + + if (cfg->vvcc_vcpuid != vcpu_vcpuid(vcpu)) { + error = EINVAL; + break; + } + if (cfg->vvcc_nent > VMM_MAX_CPUID_ENTRIES) { + error = EFBIG; + break; + } + if ((cfg->vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { + /* + * If we are being instructed to use "legacy" handling, + * then no entries should be provided, since the static + * in-kernel masking will be used. + */ + if (cfg->vvcc_nent != 0) { + error = EINVAL; + break; + } + } else if (cfg->vvcc_nent != 0) { + entries_size = + cfg->vvcc_nent * sizeof (struct vcpu_cpuid_entry); + entries = malloc(entries_size, M_VMMDEV, M_WAITOK); + + error = copyin(cfg->vvcc_entries, entries, + entries_size); + if (error != 0) { + free(entries, M_VMMDEV); + break; + } + } + + vcpu_cpuid_config_t vm_cfg = { + .vcc_flags = cfg->vvcc_flags, + .vcc_nent = cfg->vvcc_nent, + .vcc_entries = entries, + }; + error = vm_set_cpuid(vcpu, &vm_cfg); + + if (entries != NULL) { + free(entries, M_VMMDEV); + } + break; + } + case VM_LEGACY_CPUID: { + struct vm_legacy_cpuid *vlc = (void *)data; + + legacy_emulate_cpuid(vcpu, &vlc->vlc_eax, &vlc->vlc_ebx, + &vlc->vlc_ecx, &vlc->vlc_edx); + break; + } case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; pmap_get_mapping(vmspace_pmap(vm_vmspace(vm)), diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h --- a/sys/amd64/vmm/x86.h +++ b/sys/amd64/vmm/x86.h @@ -64,9 +64,6 @@ */ #define CPUID_0000_0001_FEAT0_VMX (1<<5) -int x86_emulate_cpuid(struct vcpu *vcpu, uint64_t *rax, uint64_t *rbx, - uint64_t *rcx, uint64_t *rdx); - enum vm_cpuid_capability { VCC_NONE, VCC_NO_EXECUTE, @@ -75,6 +72,23 @@ VCC_LAST }; +/* Possible flags and entry count limit defined in sys/vmm.h */ +typedef struct vcpu_cpuid_config { + uint32_t vcc_flags; + uint32_t vcc_nent; + struct vcpu_cpuid_entry *vcc_entries; +} vcpu_cpuid_config_t; + +vcpu_cpuid_config_t *vm_cpuid_config(struct vcpu *); +int vm_get_cpuid(struct vcpu *, vcpu_cpuid_config_t *); +int vm_set_cpuid(struct vcpu *, const vcpu_cpuid_config_t *); +void vcpu_emulate_cpuid(struct vcpu *, uint64_t *, uint64_t *, uint64_t *, + uint64_t *); +void legacy_emulate_cpuid(struct vcpu *, uint32_t *, uint32_t *, uint32_t *, + uint32_t *); +void vcpu_cpuid_init(vcpu_cpuid_config_t *); +void vcpu_cpuid_cleanup(vcpu_cpuid_config_t *); + /* * Return 'true' if the capability 'cap' is enabled in this virtual cpu * and 'false' otherwise. diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c --- a/sys/amd64/vmm/x86.c +++ b/sys/amd64/vmm/x86.c @@ -44,605 +44,6 @@ #include "vmm_util.h" #include "x86.h" -SYSCTL_DECL(_hw_vmm); -static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - NULL); - -#define CPUID_VM_SIGNATURE 0x40000000 -#define CPUID_BHYVE_FEATURES 0x40000001 -#define CPUID_VM_HIGH CPUID_BHYVE_FEATURES - -/* Features advertised in CPUID_BHYVE_FEATURES %eax */ -#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) /* MSI Extended Dest ID */ - -static const char bhyve_id[12] = "bhyve bhyve "; - -static uint64_t bhyve_xcpuids; -SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, - "Number of times an unknown cpuid leaf was accessed"); - -static int cpuid_leaf_b = 1; -SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, - &cpuid_leaf_b, 0, NULL); - -/* - * Compute ceil(log2(x)). Returns -1 if x is zero.
- */ -static __inline int -log2(u_int x) -{ - - return (x == 0 ? -1 : order_base_2(x)); -} - -int -x86_emulate_cpuid(struct vcpu *vcpu, uint64_t *rax, uint64_t *rbx, - uint64_t *rcx, uint64_t *rdx) -{ - struct vm *vm = vcpu_vm(vcpu); - int vcpu_id = vcpu_vcpuid(vcpu); - const struct xsave_limits *limits; - uint64_t cr4; - int error, enable_invpcid, enable_rdpid, enable_rdtscp, level, - width, x2apic_id; - unsigned int func, regs[4], logical_cpus, param; - enum x2apic_state x2apic_state; - uint16_t cores, maxcpus, sockets, threads; - - /* - * The function of CPUID is controlled through the provided value of - * %eax (and secondarily %ecx, for certain leaf data). - */ - func = (uint32_t)*rax; - param = (uint32_t)*rcx; - - VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", func, param); - - /* - * Requests for invalid CPUID levels should map to the highest - * available level instead. - */ - if (cpu_exthigh != 0 && func >= 0x80000000) { - if (func > cpu_exthigh) - func = cpu_exthigh; - } else if (func >= CPUID_VM_SIGNATURE) { - if (func > CPUID_VM_HIGH) - func = CPUID_VM_HIGH; - } else if (func > cpu_high) { - func = cpu_high; - } - - /* - * In general the approach used for CPU topology is to - * advertise a flat topology where all CPUs are packages with - * no multi-core or SMT. - */ - switch (func) { - /* - * Pass these through to the guest - */ - case CPUID_0000_0000: - case CPUID_0000_0002: - case CPUID_0000_0003: - case CPUID_8000_0000: - case CPUID_8000_0002: - case CPUID_8000_0003: - case CPUID_8000_0004: - case CPUID_8000_0006: - cpuid_count(func, param, regs); - break; - case CPUID_8000_0008: - cpuid_count(func, param, regs); - if (vmm_is_svm()) { - /* - * As on Intel (0000_0007:0, EDX), mask out - * unsupported or unsafe AMD extended features - * (8000_0008 EBX). - */ - regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | - AMDFEID_XSAVEERPTR); - - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - /* - * Here, width is ApicIdCoreIdSize, present on - * at least Family 15h and newer. It - * represents the "number of bits in the - * initial apicid that indicate thread id - * within a package." - * - * Our topo_probe_amd() uses it for - * pkg_id_shift and other OSes may rely on it. - */ - width = MIN(0xF, log2(threads * cores)); - logical_cpus = MIN(0xFF, threads * cores - 1); - regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus; - } - break; - - case CPUID_8000_0001: - cpuid_count(func, param, regs); - - /* - * Hide SVM from guest. - */ - regs[2] &= ~AMDID2_SVM; - - /* - * Don't advertise extended performance counter MSRs - * to the guest. - */ - regs[2] &= ~AMDID2_PCXC; - regs[2] &= ~AMDID2_PNXC; - regs[2] &= ~AMDID2_PTSCEL2I; - - /* - * Don't advertise Instruction Based Sampling feature. - */ - regs[2] &= ~AMDID2_IBS; - - /* NodeID MSR not available */ - regs[2] &= ~AMDID2_NODE_ID; - - /* Don't advertise the OS visible workaround feature */ - regs[2] &= ~AMDID2_OSVW; - - /* Hide mwaitx/monitorx capability from the guest */ - regs[2] &= ~AMDID2_MWAITX; - - /* Advertise RDTSCP if it is enabled. */ - error = vm_get_capability(vcpu, - VM_CAP_RDTSCP, &enable_rdtscp); - if (error == 0 && enable_rdtscp) - regs[3] |= AMDID_RDTSCP; - else - regs[3] &= ~AMDID_RDTSCP; - break; - - case CPUID_8000_0007: - /* - * AMD uses this leaf to advertise the processor's - * power monitoring and RAS capabilities. These - * features are hardware-specific and exposing - * them to a guest doesn't make a lot of sense. 
- * - * Intel uses this leaf only to advertise the - * "Invariant TSC" feature with all other bits - * being reserved (set to zero). - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - - /* - * "Invariant TSC" can be advertised to the guest if: - * - host TSC frequency is invariant - * - host TSCs are synchronized across physical cpus - * - * XXX This still falls short because the vcpu - * can observe the TSC moving backwards as it - * migrates across physical cpus. But at least - * it should discourage the guest from using the - * TSC to keep track of time. - */ - if (tsc_is_invariant && smp_tsc) - regs[3] |= AMDPM_TSC_INVARIANT; - break; - - case CPUID_8000_001D: - /* AMD Cache topology, like 0000_0004 for Intel. */ - if (!vmm_is_svm()) - goto default_leaf; - - /* - * Similar to Intel, generate a fictitious cache - * topology for the guest with L3 shared by the - * package, and L1 and L2 local to a core. - */ - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - switch (param) { - case 0: - logical_cpus = threads; - level = 1; - func = 1; /* data cache */ - break; - case 1: - logical_cpus = threads; - level = 2; - func = 3; /* unified cache */ - break; - case 2: - logical_cpus = threads * cores; - level = 3; - func = 3; /* unified cache */ - break; - default: - logical_cpus = sockets * threads * cores; - level = 0; - func = 0; - break; - } - - logical_cpus = MIN(0xfff, logical_cpus - 1); - regs[0] = (logical_cpus << 14) | (1 << 8) | - (level << 5) | func; - regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; - - /* - * ecx: Number of cache ways for non-fully - * associative cache, minus 1. Reported value - * of zero means there is one way. - */ - regs[2] = 0; - - regs[3] = 0; - break; - - case CPUID_8000_001E: - /* - * AMD Family 16h+ and Hygon Family 18h additional - * identifiers. - */ - if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16) - goto default_leaf; - - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - regs[0] = vcpu_id; - threads = MIN(0xFF, threads - 1); - regs[1] = (threads << 8) | - (vcpu_id >> log2(threads + 1)); - /* - * XXX Bhyve topology cannot yet represent >1 node per - * processor. - */ - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_0001: - do_cpuid(1, regs); - - error = vm_get_x2apic_state(vcpu, &x2apic_state); - if (error) { - panic("x86_emulate_cpuid: error %d " - "fetching x2apic state", error); - } - - /* - * Override the APIC ID only in ebx - */ - regs[1] &= ~(CPUID_LOCAL_APIC_ID); - regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); - - /* - * Don't expose VMX, SpeedStep, TME or SMX capability. - * Advertise x2APIC capability and Hypervisor guest. - */ - regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); - regs[2] &= ~(CPUID2_SMX); - - regs[2] |= CPUID2_HV; - - if (x2apic_state != X2APIC_DISABLED) - regs[2] |= CPUID2_X2APIC; - else - regs[2] &= ~CPUID2_X2APIC; - - /* - * Only advertise CPUID2_XSAVE in the guest if - * the host is using XSAVE. - */ - if (!(regs[2] & CPUID2_OSXSAVE)) - regs[2] &= ~CPUID2_XSAVE; - - /* - * If CPUID2_XSAVE is being advertised and the - * guest has set CR4_XSAVE, set - * CPUID2_OSXSAVE. - */ - regs[2] &= ~CPUID2_OSXSAVE; - if (regs[2] & CPUID2_XSAVE) { - error = vm_get_register(vcpu, - VM_REG_GUEST_CR4, &cr4); - if (error) - panic("x86_emulate_cpuid: error %d " - "fetching %%cr4", error); - if (cr4 & CR4_XSAVE) - regs[2] |= CPUID2_OSXSAVE; - } - - /* - * Hide monitor/mwait until we know how to deal with - * these instructions. 
- */ - regs[2] &= ~CPUID2_MON; - - /* - * Hide the performance and debug features. - */ - regs[2] &= ~CPUID2_PDCM; - - /* - * No TSC deadline support in the APIC yet - */ - regs[2] &= ~CPUID2_TSCDLT; - - /* - * Hide thermal monitoring - */ - regs[3] &= ~(CPUID_ACPI | CPUID_TM); - - /* - * Hide the debug store capability. - */ - regs[3] &= ~CPUID_DS; - - /* - * Advertise the Machine Check and MTRR capability. - * - * Some guest OSes (e.g. Windows) will not boot if - * these features are absent. - */ - regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); - - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - logical_cpus = threads * cores; - regs[1] &= ~CPUID_HTT_CORES; - regs[1] |= (logical_cpus & 0xff) << 16; - regs[3] |= CPUID_HTT; - break; - - case CPUID_0000_0004: - cpuid_count(func, param, regs); - - if (regs[0] || regs[1] || regs[2] || regs[3]) { - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - regs[0] &= 0x3ff; - regs[0] |= (cores - 1) << 26; - /* - * Cache topology: - * - L1 and L2 are shared only by the logical - * processors in a single core. - * - L3 and above are shared by all logical - * processors in the package. - */ - logical_cpus = threads; - level = (regs[0] >> 5) & 0x7; - if (level >= 3) - logical_cpus *= cores; - regs[0] |= (logical_cpus - 1) << 14; - } - break; - - case CPUID_0000_0007: - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - - /* leaf 0 */ - if (param == 0) { - cpuid_count(func, param, regs); - - /* Only leaf 0 is supported */ - regs[0] = 0; - - /* - * Expose known-safe features. - */ - regs[1] &= CPUID_STDEXT_FSGSBASE | - CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | - CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP | - CPUID_STDEXT_BMI2 | - CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | - CPUID_STDEXT_AVX512F | - CPUID_STDEXT_AVX512DQ | - CPUID_STDEXT_RDSEED | - CPUID_STDEXT_SMAP | - CPUID_STDEXT_AVX512PF | - CPUID_STDEXT_AVX512ER | - CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA | - CPUID_STDEXT_AVX512BW | - CPUID_STDEXT_AVX512VL; - regs[2] &= CPUID_STDEXT2_VAES | - CPUID_STDEXT2_VPCLMULQDQ; - regs[3] &= CPUID_STDEXT3_MD_CLEAR; - - /* Advertise RDPID if it is enabled. */ - error = vm_get_capability(vcpu, VM_CAP_RDPID, - &enable_rdpid); - if (error == 0 && enable_rdpid) - regs[2] |= CPUID_STDEXT2_RDPID; - - /* Advertise INVPCID if it is enabled. 
*/ - error = vm_get_capability(vcpu, - VM_CAP_ENABLE_INVPCID, &enable_invpcid); - if (error == 0 && enable_invpcid) - regs[1] |= CPUID_STDEXT_INVPCID; - } - break; - - case CPUID_0000_0006: - regs[0] = CPUTPM1_ARAT; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_000A: - /* - * Handle the access, but report 0 for - * all options - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_000B: - /* - * Intel processor topology enumeration - */ - if (vmm_is_intel()) { - vm_get_topology(vm, &sockets, &cores, &threads, - &maxcpus); - if (param == 0) { - logical_cpus = threads; - width = log2(logical_cpus); - level = CPUID_TYPE_SMT; - x2apic_id = vcpu_id; - } - - if (param == 1) { - logical_cpus = threads * cores; - width = log2(logical_cpus); - level = CPUID_TYPE_CORE; - x2apic_id = vcpu_id; - } - - if (!cpuid_leaf_b || param >= 2) { - width = 0; - logical_cpus = 0; - level = 0; - x2apic_id = 0; - } - - regs[0] = width & 0x1f; - regs[1] = logical_cpus & 0xffff; - regs[2] = (level << 8) | (param & 0xff); - regs[3] = x2apic_id; - } else { - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - } - break; - - case CPUID_0000_000D: - limits = vmm_get_xsave_limits(); - if (!limits->xsave_enabled) { - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - } - - cpuid_count(func, param, regs); - switch (param) { - case 0: - /* - * Only permit the guest to use bits - * that are active in the host in - * %xcr0. Also, claim that the - * maximum save area size is - * equivalent to the host's current - * save area size. Since this runs - * "inside" of vmrun(), it runs with - * the guest's xcr0, so the current - * save area size is correct as-is. - */ - regs[0] &= limits->xcr0_allowed; - regs[2] = limits->xsave_max_size; - regs[3] &= (limits->xcr0_allowed >> 32); - break; - case 1: - /* Only permit XSAVEOPT. */ - regs[0] &= CPUID_EXTSTATE_XSAVEOPT; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - default: - /* - * If the leaf is for a permitted feature, - * pass through as-is, otherwise return - * all zeroes. - */ - if (!(limits->xcr0_allowed & (1ul << param))) { - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - } - break; - } - break; - - case CPUID_0000_000F: - case CPUID_0000_0010: - /* - * Do not report any Resource Director Technology - * capabilities. Exposing control of cache or memory - * controller resource partitioning to the guest is not - * at all sensible. - * - * This is already hidden at a high level by masking of - * leaf 0x7. Even still, a guest may look here for - * detailed capability information. - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_0000_0015: - /* - * Don't report CPU TSC/Crystal ratio and clock - * values since guests may use these to derive the - * local APIC frequency.. - */ - regs[0] = 0; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - case CPUID_VM_SIGNATURE: - regs[0] = CPUID_VM_HIGH; - bcopy(bhyve_id, ®s[1], 4); - bcopy(bhyve_id + 4, ®s[2], 4); - bcopy(bhyve_id + 8, ®s[3], 4); - break; - - case CPUID_BHYVE_FEATURES: - regs[0] = CPUID_BHYVE_FEAT_EXT_DEST_ID; - regs[1] = 0; - regs[2] = 0; - regs[3] = 0; - break; - - default: -default_leaf: - /* - * The leaf value has already been clamped so - * simply pass this through, keeping count of - * how many unhandled leaf values have been seen. 
- */ - atomic_add_long(&bhyve_xcpuids, 1); - cpuid_count(func, param, regs); - break; - } - - /* - * CPUID clears the upper 32-bits of the long-mode registers. - */ - *rax = regs[0]; - *rbx = regs[1]; - *rcx = regs[2]; - *rdx = regs[3]; - - return (1); -} - bool vm_cpuid_capability(struct vcpu *vcpu, enum vm_cpuid_capability cap) { diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -80,7 +80,8 @@ opt_bhyve_snapshot.h \ opt_ddb.h -SRCS+= vmm_host.c \ +SRCS+= vmm_cpuid.c \ + vmm_host.c \ vmm_ioport.c \ vmm_lapic.c \ vmm_mem_machdep.c \
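Below is a minimal userspace sketch (not part of the change itself) of how the new VM_SET_CPUID ioctl might be driven. The helper name, header locations, the already-open /dev/vmm/<name> descriptor, and the particular leaf values are illustrative assumptions only.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <string.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

/*
 * Hypothetical helper: install a two-entry CPUID table for one vCPU of an
 * already-open /dev/vmm/<name> descriptor 'vmfd'.  Entries are provided in
 * sorted function/index order, as described in the vmm_cpuid.c comment above.
 */
static int
set_cpuid_table(int vmfd, int vcpuid)
{
	struct vcpu_cpuid_entry entries[2];
	struct vm_vcpu_cpuid_config cfg;

	memset(entries, 0, sizeof (entries));
	memset(&cfg, 0, sizeof (cfg));

	/* Function 0h: report 0xD as the highest standard leaf (vendor id omitted). */
	entries[0].vce_function = 0x0;
	entries[0].vce_eax = 0xd;

	/* Function Dh, index 1h: only matched when %ecx == 1, per VCE_FLAG_MATCH_INDEX. */
	entries[1].vce_function = 0xd;
	entries[1].vce_index = 0x1;
	entries[1].vce_flags = VCE_FLAG_MATCH_INDEX;

	cfg.vvcc_vcpuid = vcpuid;
	cfg.vvcc_flags = VCC_FLAG_INTEL_FALLBACK;	/* userspace table, no legacy masking */
	cfg.vvcc_nent = 2;
	cfg.vvcc_entries = entries;

	return (ioctl(vmfd, VM_SET_CPUID, &cfg));
}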