Index: head/sys/amd64/vmm/amd/svm_msr.c
===================================================================
--- head/sys/amd64/vmm/amd/svm_msr.c	(revision 282280)
+++ head/sys/amd64/vmm/amd/svm_msr.c	(revision 282281)
@@ -1,136 +1,157 @@
 /*-
  * Copyright (c) 2014, Neel Natu (neel@freebsd.org)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <sys/types.h>
+#include <sys/param.h>
 #include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
 
 #include <machine/cpufunc.h>
 #include <machine/specialreg.h>
+#include <machine/vmm.h>
 
+#include "svm.h"
+#include "vmcb.h"
+#include "svm_softc.h"
 #include "svm_msr.h"
 
 #ifndef MSR_AMDK8_IPM
 #define	MSR_AMDK8_IPM	0xc0010055
 #endif
 
 enum {
 	IDX_MSR_LSTAR,
 	IDX_MSR_CSTAR,
 	IDX_MSR_STAR,
 	IDX_MSR_SF_MASK,
 	HOST_MSR_NUM		/* must be the last enumeration */
 };
 
 static uint64_t host_msrs[HOST_MSR_NUM];
 
 void
 svm_msr_init(void)
 {
 	/* 
 	 * It is safe to cache the values of the following MSRs because they
 	 * don't change based on curcpu, curproc or curthread.
 	 */
 	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
 	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
 	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
 	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
 }
 
 void
 svm_msr_guest_init(struct svm_softc *sc, int vcpu)
 {
 	/*
 	 * All the MSRs accessible to the guest are either saved/restored by
 	 * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored
 	 * by VMSAVE/VMLOAD (e.g., MSR_GSBASE).
 	 *
 	 * There are no guest MSRs that are saved/restored "by hand" so nothing
 	 * more to do here.
 	 */
 	return;
 }
 
 void
 svm_msr_guest_enter(struct svm_softc *sc, int vcpu)
 {
 	/*
 	 * Save host MSRs (if any) and restore guest MSRs (if any).
 	 */
 }
 
 void
 svm_msr_guest_exit(struct svm_softc *sc, int vcpu)
 {
 	/*
 	 * Save guest MSRs (if any) and restore host MSRs.
 	 */
 	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
 	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
 	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
 	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
 
 	/* MSR_KGSBASE will be restored on the way back to userspace */
 }
 
 int
 svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result,
     bool *retu)
 {
 	int error = 0;
 
 	switch (num) {
+	case MSR_MTRRcap:	/* reads as 0: no MTRR ranges advertised */
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:	/* 8 fixed 4K MTRRs */
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+		*result = 0;	/* all emulated MTRR MSRs read as zero */
+		break;
 	case MSR_AMDK8_IPM:
 		*result = 0;
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 int
 svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu)
 {
 	int error = 0;
 
 	switch (num) {
+	case MSR_MTRRcap:	/* read-only MSR: a write raises #GP */
+		vm_inject_gp(sc->vm, vcpu);
+		break;
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:	/* 8 fixed 4K MTRRs */
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+		break;		/* Ignore writes */
 	case MSR_AMDK8_IPM:
 		/*
 		 * Ignore writes to the "Interrupt Pending Message" MSR.
 		 */
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
Index: head/sys/amd64/vmm/intel/vmx_msr.c
===================================================================
--- head/sys/amd64/vmm/intel/vmx_msr.c	(revision 282280)
+++ head/sys/amd64/vmm/intel/vmx_msr.c	(revision 282281)
@@ -1,462 +1,477 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/cpuset.h>
 
 #include <machine/clock.h>
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <machine/vmm.h>
 
 #include "vmx.h"
 #include "vmx_msr.h"
 
 static boolean_t
 vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
 {
 
 	if (msr_val & (1UL << (bitpos + 32)))
 		return (TRUE);
 	else
 		return (FALSE);
 }
 
 static boolean_t
 vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
 {
 
 	if ((msr_val & (1UL << bitpos)) == 0)
 		return (TRUE);
 	else
 		return (FALSE);
 }
 
 uint32_t
 vmx_revision(void)
 {
 
 	return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
 }
 
 /*
  * Generate a bitmask to be used for the VMCS execution control fields.
  *
  * The caller specifies what bits should be set to one in 'ones_mask'
  * and what bits should be set to zero in 'zeros_mask'. The don't-care
  * bits are set to the default value. The default values are obtained
  * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
  * VMX Capabilities".
  *
  * Returns zero on success and non-zero on error.
  */
 int
 vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
 	       uint32_t zeros_mask, uint32_t *retval)
 {
 	int i;
 	uint64_t val, trueval;
 	boolean_t true_ctls_avail, one_allowed, zero_allowed;
 
 	/* We cannot ask the same bit to be set to both '1' and '0' */
 	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
 		return (EINVAL);
 
 	if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
 		true_ctls_avail = TRUE;
 	else
 		true_ctls_avail = FALSE;
 
 	val = rdmsr(ctl_reg);
 	if (true_ctls_avail)
 		trueval = rdmsr(true_ctl_reg);		/* step c */
 	else
 		trueval = val;				/* step a */
 
 	for (i = 0; i < 32; i++) {
 		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
 		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
 
 		KASSERT(one_allowed || zero_allowed,
 			("invalid zero/one setting for bit %d of ctl 0x%0x, "
 			 "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
 
 		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
 			if (ones_mask & (1 << i))
 				return (EINVAL);
 			*retval &= ~(1 << i);
 		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
 			if (zeros_mask & (1 << i))
 				return (EINVAL);
 			*retval |= 1 << i;
 		} else {
 			if (zeros_mask & (1 << i))	/* b(ii),c(ii) */
 				*retval &= ~(1 << i);
 			else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
 				*retval |= 1 << i;
 			else if (!true_ctls_avail)
 				*retval &= ~(1 << i);	/* b(iii) */
 			else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
 				*retval &= ~(1 << i);
 			else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
 				*retval |= 1 << i;
 			else {
 				panic("vmx_set_ctlreg: unable to determine "
 				      "correct value of ctl bit %d for msr "
 				      "0x%0x and true msr 0x%0x", i, ctl_reg,
 				      true_ctl_reg);
 			}
 		}
 	}
 
 	return (0);
 }
 
 void
 msr_bitmap_initialize(char *bitmap)
 {
 
 	memset(bitmap, 0xff, PAGE_SIZE);
 }
 
 int
 msr_bitmap_change_access(char *bitmap, u_int msr, int access)
 {
 	int byte, bit;
 
 	if (msr <= 0x00001FFF)
 		byte = msr / 8;
 	else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
 		byte = 1024 + (msr - 0xC0000000) / 8;
 	else
 		return (EINVAL);
 
 	bit = msr & 0x7;
 
 	if (access & MSR_BITMAP_ACCESS_READ)
 		bitmap[byte] &= ~(1 << bit);
 	else
 		bitmap[byte] |= 1 << bit;
 
 	byte += 2048;
 	if (access & MSR_BITMAP_ACCESS_WRITE)
 		bitmap[byte] &= ~(1 << bit);
 	else
 		bitmap[byte] |= 1 << bit;
 
 	return (0);
 }
 
 static uint64_t misc_enable;
 static uint64_t platform_info;
 static uint64_t turbo_ratio_limit;
 static uint64_t host_msrs[GUEST_MSR_NUM];
 
 static bool
 nehalem_cpu(void)
 {
 	u_int family, model;
 
 	/*
 	 * The family:model numbers belonging to the Nehalem microarchitecture
 	 * are documented in Section 35.5, Intel SDM dated Feb 2014.
 	 */
 	family = CPUID_TO_FAMILY(cpu_id);
 	model = CPUID_TO_MODEL(cpu_id);
 	if (family == 0x6) {
 		switch (model) {
 		case 0x1A:
 		case 0x1E:
 		case 0x1F:
 		case 0x2E:
 			return (true);
 		default:
 			break;
 		}
 	}
 	return (false);
 }
 
 static bool
 westmere_cpu(void)
 {
 	u_int family, model;
 
 	/*
 	 * The family:model numbers belonging to the Westmere microarchitecture
 	 * are documented in Section 35.6, Intel SDM dated Feb 2014.
 	 */
 	family = CPUID_TO_FAMILY(cpu_id);
 	model = CPUID_TO_MODEL(cpu_id);
 	if (family == 0x6) {
 		switch (model) {
 		case 0x25:
 		case 0x2C:
 			return (true);
 		default:
 			break;
 		}
 	}
 	return (false);
 }
 
 static bool
 pat_valid(uint64_t val)
 {
 	int i, pa;
 
 	/*
 	 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
 	 *
 	 * Extract PA0 through PA7 and validate that each one encodes a
 	 * valid memory type.
 	 */
 	for (i = 0; i < 8; i++) {
 		pa = (val >> (i * 8)) & 0xff;
 		if (pa == 2 || pa == 3 || pa >= 8)
 			return (false);
 	}
 	return (true);
 }
 
 void
 vmx_msr_init(void)
 {
 	uint64_t bus_freq, ratio;
 	int i;
 
 	/*
 	 * It is safe to cache the values of the following MSRs because
 	 * they don't change based on curcpu, curproc or curthread.
 	 */
 	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
 	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
 	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
 	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
 
 	/*
 	 * Initialize emulated MSRs
 	 */
 	misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
 	/*
 	 * Set mandatory bits
 	 *  11:   branch trace disabled
 	 *  12:   PEBS unavailable
 	 * Clear unsupported features
 	 *  16:   SpeedStep enable
 	 *  18:   enable MONITOR FSM
 	 */
 	misc_enable |= (1 << 12) | (1 << 11);
 	misc_enable &= ~((1 << 18) | (1 << 16));
 
 	if (nehalem_cpu() || westmere_cpu())
 		bus_freq = 133330000;		/* 133Mhz */
 	else
 		bus_freq = 100000000;		/* 100Mhz */
 
 	/*
 	 * XXXtime
 	 * The ratio should really be based on the virtual TSC frequency as
 	 * opposed to the host TSC.
 	 */
 	ratio = (tsc_freq / bus_freq) & 0xff;
 
 	/*
 	 * The register definition is based on the micro-architecture
 	 * but the following bits are always the same:
 	 * [15:8]  Maximum Non-Turbo Ratio
 	 * [28]    Programmable Ratio Limit for Turbo Mode
 	 * [29]    Programmable TDC-TDP Limit for Turbo Mode
 	 * [47:40] Maximum Efficiency Ratio
 	 *
 	 * The other bits can be safely set to 0 on all
 	 * micro-architectures up to Haswell.
 	 */
 	platform_info = (ratio << 8) | (ratio << 40);
 
 	/*
 	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
 	 * dependent on the maximum cores per package supported by the micro-
 	 * architecture. For e.g., Westmere supports 6 cores per package and
 	 * uses the low 48 bits. Sandybridge support 8 cores per package and
 	 * uses up all 64 bits.
 	 *
 	 * However, the unused bits are reserved so we pretend that all bits
 	 * in this MSR are valid.
 	 */
 	for (i = 0; i < 8; i++)
 		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
 }
 
 void
 vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
 {
 	uint64_t *guest_msrs;
 
 	guest_msrs = vmx->guest_msrs[vcpuid];
 
 	/*
 	 * The permissions bitmap is shared between all vcpus so initialize it
 	 * once when initializing the vBSP.
 	 */
 	if (vcpuid == 0) {
 		guest_msr_rw(vmx, MSR_LSTAR);
 		guest_msr_rw(vmx, MSR_CSTAR);
 		guest_msr_rw(vmx, MSR_STAR);
 		guest_msr_rw(vmx, MSR_SF_MASK);
 		guest_msr_rw(vmx, MSR_KGSBASE);
 	}
 
 	/*
 	 * Initialize guest IA32_PAT MSR with default value after reset.
 	 */
 	guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
 	    PAT_VALUE(2, PAT_UNCACHED)		|
 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
 	    PAT_VALUE(6, PAT_UNCACHED)		|
 	    PAT_VALUE(7, PAT_UNCACHEABLE);
 
 	return;
 }
 
 void
 vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
 {
 	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
 
 	/* Save host MSRs (if any) and restore guest MSRs */
 	wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
 	wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
 	wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
 	wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
 	wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
 }
 
 void
 vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
 {
 	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
 
 	/* Save guest MSRs */
 	guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
 	guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
 	guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
 	guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
 	guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);
 
 	/* Restore host MSRs */
 	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
 	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
 	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
 	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
 
 	/* MSR_KGSBASE will be restored on the way back to userspace */
 }
 
 int
 vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
 {
 	const uint64_t *guest_msrs;
 	int error;
 
 	guest_msrs = vmx->guest_msrs[vcpuid];
 	error = 0;
 
 	switch (num) {
+	case MSR_MTRRcap:	/* reads as 0: no MTRR ranges advertised */
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:	/* 8 fixed 4K MTRRs */
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+		*val = 0;	/* all emulated MTRR MSRs read as zero */
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		*val = misc_enable;
 		break;
 	case MSR_PLATFORM_INFO:
 		*val = platform_info;
 		break;
 	case MSR_TURBO_RATIO_LIMIT:
 	case MSR_TURBO_RATIO_LIMIT1:
 		*val = turbo_ratio_limit;
 		break;
 	case MSR_PAT:
 		*val = guest_msrs[IDX_MSR_PAT];
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 int
 vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
 {
 	uint64_t *guest_msrs;
 	uint64_t changed;
 	int error;
 	
 	guest_msrs = vmx->guest_msrs[vcpuid];
 	error = 0;
 
 	switch (num) {
+	case MSR_MTRRcap:	/* read-only MSR: a write raises #GP */
+		vm_inject_gp(vmx->vm, vcpuid);
+		break;
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:	/* 8 fixed 4K MTRRs */
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+		break;		/* Ignore writes */
 	case MSR_IA32_MISC_ENABLE:
 		changed = val ^ misc_enable;
 		/*
 		 * If the host has disabled the NX feature then the guest
 		 * also cannot use it. However, a Linux guest will try to
 		 * enable the NX feature by writing to the MISC_ENABLE MSR.
 		 *
 		 * This can be safely ignored because the memory management
 		 * code looks at CPUID.80000001H:EDX.NX to check if the
 		 * functionality is actually enabled.
 		 */
 		changed &= ~(1UL << 34);
 
 		/*
 		 * Punt to userspace if any other bits are being modified.
 		 */
 		if (changed)
 			error = EINVAL;
 
 		break;
 	case MSR_PAT:
 		if (pat_valid(val))
 			guest_msrs[IDX_MSR_PAT] = val;
 		else
 			vm_inject_gp(vmx->vm, vcpuid);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
Index: head/sys/amd64/vmm/x86.c
===================================================================
--- head/sys/amd64/vmm/x86.c	(revision 282280)
+++ head/sys/amd64/vmm/x86.c	(revision 282281)
@@ -1,488 +1,487 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
 #include <sys/cpuset.h>
 #include <sys/sysctl.h>
 
 #include <machine/clock.h>
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/segments.h>
 #include <machine/specialreg.h>
 
 #include <machine/vmm.h>
 
 #include "vmm_host.h"
 #include "vmm_ktr.h"
 #include "vmm_util.h"
 #include "x86.h"
 
 SYSCTL_DECL(_hw_vmm);
 static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
 
 #define	CPUID_VM_HIGH		0x40000000
 
 static const char bhyve_id[12] = "bhyve bhyve ";
 
 static uint64_t bhyve_xcpuids;
 SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
     "Number of times an unknown cpuid leaf was accessed");
 
 /*
  * The default CPU topology is a single thread per package.
  */
 static u_int threads_per_core = 1;
 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
     &threads_per_core, 0, NULL);
 
 static u_int cores_per_package = 1;
 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
     &cores_per_package, 0, NULL);
 
 static int cpuid_leaf_b = 1;
 SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
     &cpuid_leaf_b, 0, NULL);
 
 /*
  * Round up to the next power of two, if necessary, and then take log2.
  * Returns -1 if argument is zero.
  */
 static __inline int
 log2(u_int x)
 {
 
 	return (fls(x << (1 - powerof2(x))) - 1);
 }
 
 int
 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
 {
 	const struct xsave_limits *limits;
 	uint64_t cr4;
 	int error, enable_invpcid, level, width, x2apic_id;
 	unsigned int func, regs[4], logical_cpus;
 	enum x2apic_state x2apic_state;
 
 	VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);
 
 	/*
 	 * Requests for invalid CPUID levels should map to the highest
 	 * available level instead.
 	 */
 	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
 		if (*eax > cpu_exthigh)
 			*eax = cpu_exthigh;
 	} else if (*eax >= 0x40000000) {
 		if (*eax > CPUID_VM_HIGH)
 			*eax = CPUID_VM_HIGH;
 	} else if (*eax > cpu_high) {
 		*eax = cpu_high;
 	}
 
 	func = *eax;
 
 	/*
 	 * In general the approach used for CPU topology is to
 	 * advertise a flat topology where all CPUs are packages with
 	 * no multi-core or SMT.
 	 */
 	switch (func) {
 		/*
 		 * Pass these through to the guest
 		 */
 		case CPUID_0000_0000:
 		case CPUID_0000_0002:
 		case CPUID_0000_0003:
 		case CPUID_8000_0000:
 		case CPUID_8000_0002:
 		case CPUID_8000_0003:
 		case CPUID_8000_0004:
 		case CPUID_8000_0006:
 			cpuid_count(*eax, *ecx, regs);
 			break;
 		case CPUID_8000_0008:
 			cpuid_count(*eax, *ecx, regs);
 			if (vmm_is_amd()) {
 				/*
 				 * XXX this might appear silly because AMD
 				 * cpus don't have threads.
 				 *
 				 * However this matches the logical cpus as
 				 * advertised by leaf 0x1 and will work even
 				 * if the 'threads_per_core' tunable is set
 				 * incorrectly on an AMD host.
 				 */
 				logical_cpus = threads_per_core *
 				    cores_per_package;
 				regs[2] = logical_cpus - 1;
 			}
 			break;
 
 		case CPUID_8000_0001:
 			cpuid_count(*eax, *ecx, regs);
 
 			/*
 			 * Hide SVM and Topology Extension features from guest.
 			 */
 			regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY);
 
 			/*
 			 * Don't advertise extended performance counter MSRs
 			 * to the guest.
 			 */
 			regs[2] &= ~AMDID2_PCXC;
 			regs[2] &= ~AMDID2_PNXC;
 			regs[2] &= ~AMDID2_PTSCEL2I;
 
 			/*
 			 * Don't advertise Instruction Based Sampling feature.
 			 */
 			regs[2] &= ~AMDID2_IBS;
 
 			/* NodeID MSR not available */
 			regs[2] &= ~AMDID2_NODE_ID;
 
 			/* Don't advertise the OS visible workaround feature */
 			regs[2] &= ~AMDID2_OSVW;
 
 			/*
 			 * Hide rdtscp/ia32_tsc_aux until we know how
 			 * to deal with them.
 			 */
 			regs[3] &= ~AMDID_RDTSCP;
 			break;
 
 		case CPUID_8000_0007:
 			/*
 			 * AMD uses this leaf to advertise the processor's
 			 * power monitoring and RAS capabilities. These
 			 * features are hardware-specific and exposing
 			 * them to a guest doesn't make a lot of sense.
 			 *
 			 * Intel uses this leaf only to advertise the
 			 * "Invariant TSC" feature with all other bits
 			 * being reserved (set to zero).
 			 */
 			regs[0] = 0;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 
 			/*
 			 * "Invariant TSC" can be advertised to the guest if:
 			 * - host TSC frequency is invariant
 			 * - host TSCs are synchronized across physical cpus
 			 *
 			 * XXX This still falls short because the vcpu
 			 * can observe the TSC moving backwards as it
 			 * migrates across physical cpus. But at least
 			 * it should discourage the guest from using the
 			 * TSC to keep track of time.
 			 */
 			if (tsc_is_invariant && smp_tsc)
 				regs[3] |= AMDPM_TSC_INVARIANT;
 			break;
 
 		case CPUID_0000_0001:
 			do_cpuid(1, regs);
 
 			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
 			if (error) {
 				panic("x86_emulate_cpuid: error %d "
 				      "fetching x2apic state", error);
 			}
 
 			/*
 			 * Override the APIC ID only in ebx
 			 */
 			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
 			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
 
 			/*
 			 * Don't expose VMX, SpeedStep or TME capability.
 			 * Advertise x2APIC capability and Hypervisor guest.
 			 */
 			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
 
 			regs[2] |= CPUID2_HV;
 
 			if (x2apic_state != X2APIC_DISABLED)
 				regs[2] |= CPUID2_X2APIC;
 			else
 				regs[2] &= ~CPUID2_X2APIC;
 
 			/*
 			 * Only advertise CPUID2_XSAVE in the guest if
 			 * the host is using XSAVE.
 			 */
 			if (!(regs[2] & CPUID2_OSXSAVE))
 				regs[2] &= ~CPUID2_XSAVE;
 
 			/*
 			 * If CPUID2_XSAVE is being advertised and the
 			 * guest has set CR4_XSAVE, set
 			 * CPUID2_OSXSAVE.
 			 */
 			regs[2] &= ~CPUID2_OSXSAVE;
 			if (regs[2] & CPUID2_XSAVE) {
 				error = vm_get_register(vm, vcpu_id,
 				    VM_REG_GUEST_CR4, &cr4);
 				if (error)
 					panic("x86_emulate_cpuid: error %d "
 					      "fetching %%cr4", error);
 				if (cr4 & CR4_XSAVE)
 					regs[2] |= CPUID2_OSXSAVE;
 			}
 
 			/*
 			 * Hide monitor/mwait until we know how to deal with
 			 * these instructions.
 			 */
 			regs[2] &= ~CPUID2_MON;
 
                         /*
 			 * Hide the performance and debug features.
 			 */
 			regs[2] &= ~CPUID2_PDCM;
 
 			/*
 			 * No TSC deadline support in the APIC yet
 			 */
 			regs[2] &= ~CPUID2_TSCDLT;
 
 			/*
 			 * Hide thermal monitoring
 			 */
 			regs[3] &= ~(CPUID_ACPI | CPUID_TM);
 			
 			/*
 			 * Machine check handling is done in the host.
-			 * Hide MTRR capability.
 			 */
-			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+			regs[3] &= ~(CPUID_MCA | CPUID_MCE);
 
                         /*
                         * Hide the debug store capability.
                         */
 			regs[3] &= ~CPUID_DS;
 
 			logical_cpus = threads_per_core * cores_per_package;
 			regs[1] &= ~CPUID_HTT_CORES;
 			regs[1] |= (logical_cpus & 0xff) << 16;
 			regs[3] |= CPUID_HTT;
 			break;
 
 		case CPUID_0000_0004:
 			cpuid_count(*eax, *ecx, regs);
 
 			if (regs[0] || regs[1] || regs[2] || regs[3]) {
 				regs[0] &= 0x3ff;
 				regs[0] |= (cores_per_package - 1) << 26;
 				/*
 				 * Cache topology:
 				 * - L1 and L2 are shared only by the logical
 				 *   processors in a single core.
 				 * - L3 and above are shared by all logical
 				 *   processors in the package.
 				 */
 				logical_cpus = threads_per_core;
 				level = (regs[0] >> 5) & 0x7;
 				if (level >= 3)
 					logical_cpus *= cores_per_package;
 				regs[0] |= (logical_cpus - 1) << 14;
 			}
 			break;
 
 		case CPUID_0000_0007:
 			regs[0] = 0;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 
 			/* leaf 0 */
 			if (*ecx == 0) {
 				cpuid_count(*eax, *ecx, regs);
 
 				/* Only leaf 0 is supported */
 				regs[0] = 0;
 
 				/*
 				 * Expose known-safe features.
 				 */
 				regs[1] &= (CPUID_STDEXT_FSGSBASE |
 				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
 				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
 				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
 				    CPUID_STDEXT_AVX512F |
 				    CPUID_STDEXT_AVX512PF |
 				    CPUID_STDEXT_AVX512ER |
 				    CPUID_STDEXT_AVX512CD);
 				regs[2] = 0;
 				regs[3] = 0;
 
 				/* Advertise INVPCID if it is enabled. */
 				error = vm_get_capability(vm, vcpu_id,
 				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
 				if (error == 0 && enable_invpcid)
 					regs[1] |= CPUID_STDEXT_INVPCID;
 			}
 			break;
 
 		case CPUID_0000_0006:
 			regs[0] = CPUTPM1_ARAT;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 			break;
 
 		case CPUID_0000_000A:
 			/*
 			 * Handle the access, but report 0 for
 			 * all options
 			 */
 			regs[0] = 0;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 			break;
 
 		case CPUID_0000_000B:
 			/*
 			 * Processor topology enumeration
 			 */
 			if (*ecx == 0) {
 				logical_cpus = threads_per_core;
 				width = log2(logical_cpus);
 				level = CPUID_TYPE_SMT;
 				x2apic_id = vcpu_id;
 			}
 
 			if (*ecx == 1) {
 				logical_cpus = threads_per_core *
 				    cores_per_package;
 				width = log2(logical_cpus);
 				level = CPUID_TYPE_CORE;
 				x2apic_id = vcpu_id;
 			}
 
 			if (!cpuid_leaf_b || *ecx >= 2) {
 				width = 0;
 				logical_cpus = 0;
 				level = 0;
 				x2apic_id = 0;
 			}
 
 			regs[0] = width & 0x1f;
 			regs[1] = logical_cpus & 0xffff;
 			regs[2] = (level << 8) | (*ecx & 0xff);
 			regs[3] = x2apic_id;
 			break;
 
 		case CPUID_0000_000D:
 			limits = vmm_get_xsave_limits();
 			if (!limits->xsave_enabled) {
 				regs[0] = 0;
 				regs[1] = 0;
 				regs[2] = 0;
 				regs[3] = 0;
 				break;
 			}
 
 			cpuid_count(*eax, *ecx, regs);
 			switch (*ecx) {
 			case 0:
 				/*
 				 * Only permit the guest to use bits
 				 * that are active in the host in
 				 * %xcr0.  Also, claim that the
 				 * maximum save area size is
 				 * equivalent to the host's current
 				 * save area size.  Since this runs
 				 * "inside" of vmrun(), it runs with
 				 * the guest's xcr0, so the current
 				 * save area size is correct as-is.
 				 */
 				regs[0] &= limits->xcr0_allowed;
 				regs[2] = limits->xsave_max_size;
 				regs[3] &= (limits->xcr0_allowed >> 32);
 				break;
 			case 1:
 				/* Only permit XSAVEOPT. */
 				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
 				regs[1] = 0;
 				regs[2] = 0;
 				regs[3] = 0;
 				break;
 			default:
 				/*
 				 * If the leaf is for a permitted feature,
 				 * pass through as-is, otherwise return
 				 * all zeroes.
 				 */
 				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
 					regs[0] = 0;
 					regs[1] = 0;
 					regs[2] = 0;
 					regs[3] = 0;
 				}
 				break;
 			}
 			break;
 
 		case 0x40000000:
 			regs[0] = CPUID_VM_HIGH;
 			bcopy(bhyve_id, &regs[1], 4);
 			bcopy(bhyve_id + 4, &regs[2], 4);
 			bcopy(bhyve_id + 8, &regs[3], 4);
 			break;
 
 		default:
 			/*
 			 * The leaf value has already been clamped so
 			 * simply pass this through, keeping count of
 			 * how many unhandled leaf values have been seen.
 			 */
 			atomic_add_long(&bhyve_xcpuids, 1);
 			cpuid_count(*eax, *ecx, regs);
 			break;
 	}
 
 	*eax = regs[0];
 	*ebx = regs[1];
 	*ecx = regs[2];
 	*edx = regs[3];
 
 	return (1);
 }