Index: head/sys/amd64/vmm/amd/amdvi_hw.c
===================================================================
--- head/sys/amd64/vmm/amd/amdvi_hw.c	(revision 357973)
+++ head/sys/amd64/vmm/amd/amdvi_hw.c	(revision 357974)
@@ -1,1459 +1,1460 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2016, Anish Gupta (anish@freebsd.org)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/rman.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 
 #include <machine/resource.h>
 #include <machine/vmm.h>
 #include <machine/pmap.h>
 #include <machine/vmparam.h>
 #include <machine/pci_cfgreg.h>
 
 #include "pcib_if.h"
 
 #include "io/iommu.h"
 #include "amdvi_priv.h"
 
 SYSCTL_DECL(_hw_vmm);
-SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    NULL);
 
 /*
  * Ring offset arithmetic: move byte offset 'a' forward/backward by one
  * entry of 's' bytes inside a ring of 'm' entries.
  * NOTE(review): MOD_DEC relies on unsigned wrap-around when a < s, which
  * only yields the expected offset if (m * s) is a power of two.
  */
 #define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s)))
 #define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s)))
 
 /* Print RID or device ID in PCI string format. */
 #define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d)
 
 static void amdvi_dump_cmds(struct amdvi_softc *softc, int count);
 static void amdvi_print_dev_cap(struct amdvi_softc *softc);
 
 MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi");
 
 /* IVHD device array and its element count; both are defined elsewhere. */
 extern device_t *ivhd_devs;
 
 extern int ivhd_count;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count,
     0, NULL);
 
 /* Must be non-zero for amdvi_init() to succeed. */
 static int amdvi_enable_user = 0;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN,
     &amdvi_enable_user, 0, NULL);
 TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user);
 
 #ifdef AMDVI_ATS_ENABLE
 /* XXX: ATS is not tested. */
 static int amdvi_enable_iotlb = 1;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN,
     &amdvi_enable_iotlb, 0, NULL);
 TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb);
 #endif
 
 static int amdvi_host_ptp = 1;	/* Use page tables for host. */
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN,
     &amdvi_host_ptp, 0, NULL);
 TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp);
 
 /* Page table level used <= supported by h/w[v1=7]. */
 int amdvi_ptp_level = 4;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN,
     &amdvi_ptp_level, 0, NULL);
 TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level);
 
 /* Disable fault event reporting. */
 static int amdvi_disable_io_fault = 0;
 SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN,
     &amdvi_disable_io_fault, 0, NULL);
 TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault);
 
 /* Next domain id to hand out; see amdvi_domainId(). */
 static uint32_t amdvi_dom_id = 0;	/* 0 is reserved for host. */
 SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD,
     &amdvi_dom_id, 0, NULL);
 /*
  * Device table entry.
  * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes).
  *	= 256 * 2 * PAGE_SIZE.
  */
 static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE);
 CTASSERT(PCI_NUM_DEV_MAX == 0x10000);
 CTASSERT(sizeof(amdvi_dte) == 0x200000);
 
 /* List of domains created by amdvi_create_domain(). */
 static SLIST_HEAD (, amdvi_domain) dom_head;
 
 /*
  * Read the 4-byte PCI configuration register at offset 'off' of the
  * function identified by softc->pci_rid.
  */
 static inline uint32_t
 amdvi_pci_read(struct amdvi_softc *softc, int off)
 {
 	int bus, slot, func;
 
 	bus = PCI_RID2BUS(softc->pci_rid);
 	slot = PCI_RID2SLOT(softc->pci_rid);
 	func = PCI_RID2FUNC(softc->pci_rid);
 
 	return (pci_cfgregread(bus, slot, func, off, 4));
 }
 
 #ifdef AMDVI_ATS_ENABLE
 /* XXX: Should be in pci.c */
 /*
  * Report the ATS invalidate queue length of PCI device 'devid'.
  * Returns -1 when the device is not found, has no ATS extended
  * capability, or has ATS disabled. A raw queue-length field of 0 is
  * reported as 32.
  */
 static int
 amdvi_find_ats_qlen(uint16_t devid)
 {
 	device_t dev;
 	uint32_t off, cap;
 	int qlen;
 	bool enabled;
 
 	dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid),
 	    PCI_RID2FUNC(devid));
 	if (dev == NULL)
 		return (-1);
 
 #define PCIM_ATS_EN	BIT(31)
 
 	if (pci_find_extcap(dev, PCIZ_ATS, &off) != 0)
 		return (-1);
 
 	/* One 32-bit read at off + 4 carries both queue length and enable. */
 	cap = pci_read_config(dev, off + 4, 4);
 	enabled = (cap & PCIM_ATS_EN) != 0;
 	qlen = (cap & 0x1F);
 	if (qlen == 0)
 		qlen = 32;
 
 	printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n",
 	    RID2PCI_STR(devid), enabled ? "enabled" : "Disabled", qlen);
 
 	return (enabled ? qlen : -1);
 }
 
 /*
  * Check if an endpoint device support device IOTLB or ATS.
  *
  * Returns false as soon as the PCI-level ATS query fails. Otherwise the
  * IVHD device configuration ranges of 'softc' are searched for 'devid',
  * a warning is logged when the IVHD ATS setting disagrees with the PCI
  * one, and the PCI result is returned.
  */
 static inline bool
 amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid)
 {
 	struct ivhd_dev_cfg *cfg;
 	int qlen, i;
 	bool pci_ats, ivhd_ats;
 
 	qlen = amdvi_find_ats_qlen(devid);
 	if (qlen < 0)
 		return (false);
 
 	KASSERT(softc, ("softc is NULL"));
 	cfg = softc->dev_cfg;
 
 	/* Find the first IVHD range covering devid and take its ATS flag. */
 	ivhd_ats = false;
 	for (i = 0; i < softc->dev_cfg_cnt; i++) {
 		if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) {
 			ivhd_ats = cfg->enable_ats;
 			break;
 		}
 		cfg++;
 	}
 
 	/* qlen is >= 0 here because of the early return above. */
 	pci_ats = (qlen < 0) ? false : true;
 	if (pci_ats != ivhd_ats)
 		device_printf(softc->dev,
 		    "BIOS bug: mismatch in ATS setting for %d.%d.%d,"
 		    "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen);
 
 	/* Ignore IVRS setting and respect PCI setting. */
 	return (pci_ats);
 }
 #endif
 
 /*
  * Decide whether device IOTLB is used with this IOMMU and record the
  * decision in softc->iotlb. Without AMDVI_ATS_ENABLE it is always off;
  * otherwise it needs the PCI capability bit, the IVHD flag and the user
  * tunable to all agree.
  */
 static inline void
 amdvi_hw_enable_iotlb(struct amdvi_softc *softc)
 {
 #ifdef AMDVI_ATS_ENABLE
 	bool supported;
 
 	if ((softc->pci_cap & AMDVI_PCI_CAP_IOTLB) == 0) {
 		supported = false;
 	} else if ((softc->ivhd_flag & IVHD_FLAG_IOTLB) == 0) {
 		device_printf(softc->dev, "IOTLB disabled by BIOS.\n");
 		supported = false;
 	} else if (!amdvi_enable_iotlb) {
 		device_printf(softc->dev, "IOTLB disabled by user.\n");
 		supported = false;
 	} else {
 		supported = true;
 	}
 
 	softc->iotlb = supported;
 #else
 	softc->iotlb = false;
 #endif
 }
 
 /*
  * Allocate the zeroed command buffer and program its location and size
  * into the control block. Panics if the allocation is not page aligned.
  * Always returns 0.
  */
 static int
 amdvi_init_cmd(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl = softc->ctrl;
 
 	ctrl->cmd.len = 8;	/* Use 256 command buffer entries. */
 	softc->cmd_max = 1 << ctrl->cmd.len;
 
 	softc->cmd = malloc(sizeof(struct amdvi_cmd) *
 	    softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO);
 
 	if ((uintptr_t)softc->cmd & PAGE_MASK)
 		panic("AMDVi: Command buffer not aligned on page boundary.");
 
 	/* The base field holds the physical page frame number. */
 	ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE;
 	/*
 	 * XXX: Reset the h/w pointers in case IOMMU is restarting,
 	 * h/w doesn't clear these pointers based on empirical data.
 	 */
 	ctrl->cmd_tail = 0;
 	ctrl->cmd_head = 0;
 
 	return (0);
 }
 
 /*
  * Note: Update tail pointer after we have written the command since tail
  * pointer update cause h/w to execute new commands, see section 3.3
  * of AMD IOMMU spec ver 2.0.
  */
 /* Get the command tail pointer w/o updating it. */
 static struct amdvi_cmd *
 amdvi_get_cmd_tail(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	struct amdvi_cmd *tail;
 
 	KASSERT(softc, ("softc is NULL"));
 	KASSERT(softc->cmd != NULL, ("cmd is NULL"));
 
 	ctrl = softc->ctrl;
 	KASSERT(ctrl != NULL, ("ctrl is NULL"));
 
 	tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd +
 	    ctrl->cmd_tail);
 
 	return (tail);
 }
 
 /*
  * Update the command tail pointer which will start command execution.
  *
  * The tail offset advances by one command entry, wrapping at the end of
  * the ring, and the submitted-command counter is incremented.
  */
 static void
 amdvi_update_cmd_tail(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	int size;
 
 	size = sizeof(struct amdvi_cmd);
 	KASSERT(softc->cmd != NULL, ("cmd is NULL"));
 
 	ctrl = softc->ctrl;
 	KASSERT(ctrl != NULL, ("ctrl is NULL"));
 
 	ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max);
 	softc->total_cmd++;
 
 #ifdef AMDVI_DEBUG_CMD
 	/* The format must have exactly one conversion per argument. */
 	device_printf(softc->dev, "cmd_tail: Tail:0x%x, Head:0x%x.\n",
 	    ctrl->cmd_tail,
 	    ctrl->cmd_head);
 #endif
 
 }
 
 /*
  * Various commands supported by IOMMU.
  */
 
 /*
  * Completion wait command.
  *
  * Queues a command that carries the physical address of softc->cmp_data
  * and the value 'data'; presumably the hardware stores 'data' at that
  * address once earlier commands have completed (see amdvi_cmp_wait()).
  */
 static void
 amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data)
 {
 	struct amdvi_cmd *cmd;
 	uint64_t pa;
 
 	cmd = amdvi_get_cmd_tail(softc);
 	KASSERT(cmd != NULL, ("Cmd is NULL"));
 
 	pa = vtophys(&softc->cmp_data);
 	cmd->opcode = AMDVI_CMP_WAIT_OPCODE;
 	/* Low address bits (8-byte aligned) plus the store flag. */
 	cmd->word0 = (pa & 0xFFFFFFF8) | AMDVI_CMP_WAIT_STORE;
 	/* Upper 20 bits of the physical address. */
 	cmd->word1 = (pa >> 32) & 0xFFFFF;
 	cmd->addr = data;
 
 	/* Publish the command only after it is fully written. */
 	amdvi_update_cmd_tail(softc);
 }
 
 /*
  * Invalidate device table entry.
  *
  * Queues an AMDVI_INVD_DTE_OPCODE command for 'devid' and advances the
  * command tail.
  */
 static void
 amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid)
 {
 	struct amdvi_cmd *cmd;
 
 	cmd = amdvi_get_cmd_tail(softc);
 	KASSERT(cmd != NULL, ("Cmd is NULL"));
 	cmd->opcode = AMDVI_INVD_DTE_OPCODE;
 	cmd->word0 = devid;
 	/* Publish the command only after it is fully written. */
 	amdvi_update_cmd_tail(softc);
 #ifdef AMDVI_DEBUG_CMD
 	device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid);
 #endif
 }
 
 /*
  * Invalidate IOMMU page, use for invalidation of domain.
  *
  * Queues an AMDVI_INVD_PAGE_OPCODE command for 'domain_id' at 'addr'.
  * 'pde' and 'page' select the AMDVI_INVD_PAGE_PDE and AMDVI_INVD_PAGE_S
  * flags respectively. 'guest_nested' is accepted but not used here.
  */
 static void
 amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id,
 			  uint64_t addr, bool guest_nested,
 			  bool pde, bool page)
 {
 	struct amdvi_cmd *cmd;
 
 	cmd = amdvi_get_cmd_tail(softc);
 	KASSERT(cmd != NULL, ("Cmd is NULL"));
 
 
 	cmd->opcode = AMDVI_INVD_PAGE_OPCODE;
 	cmd->word1 = domain_id;
 	/*
 	 * Address to invalidate; callers pass AMDVI_INVD_PAGE_ALL_ADDR to
 	 * invalidate all addresses for this domain.
 	 */
 	cmd->addr = addr;
 	cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0;
 	cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0;
 
 	/* Publish the command only after it is fully written. */
 	amdvi_update_cmd_tail(softc);
 }
 
 #ifdef AMDVI_ATS_ENABLE
 /*
  * Invalidate device IOTLB.
  *
  * Does nothing when IOTLB is not enabled on this IOMMU. Panics if the
  * device reports no usable ATS queue length. Otherwise queues an
  * AMDVI_INVD_IOTLB_OPCODE command covering all addresses of 'devid'.
  */
 static void
 amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid)
 {
 	struct amdvi_cmd *cmd;
 	int qlen;
 
 	if (!softc->iotlb)
 		return;
 
 	qlen = amdvi_find_ats_qlen(devid);
 	if (qlen < 0) {
 		panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n",
 		      qlen, RID2PCI_STR(devid));
 	}
 	cmd = amdvi_get_cmd_tail(softc);
 	KASSERT(cmd != NULL, ("Cmd is NULL"));
 
 #ifdef AMDVI_DEBUG_CMD
 	device_printf(softc->dev, "Invalidate IOTLB devID 0x%x"
 		      " Qlen:%d\n", devid, qlen);
 #endif
 	cmd->opcode = AMDVI_INVD_IOTLB_OPCODE;
 	cmd->word0 = devid;
 	cmd->word1 = qlen;
 	cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR |
 		AMDVI_INVD_IOTLB_S;
 	/* Publish the command only after it is fully written. */
 	amdvi_update_cmd_tail(softc);
 }
 #endif
 
 #ifdef notyet				/* For Interrupt Remap. */
 /*
  * Queue an AMDVI_INVD_INTR_OPCODE command for 'devid' and advance the
  * command tail. Only compiled when 'notyet' is defined.
  */
 static void
 amdvi_cmd_inv_intr_map(struct amdvi_softc *softc,
 		       uint16_t devid)
 {
 	struct amdvi_cmd *cmd;
 
 	cmd = amdvi_get_cmd_tail(softc);
 	KASSERT(cmd != NULL, ("Cmd is NULL"));
 	cmd->opcode = AMDVI_INVD_INTR_OPCODE;
 	cmd->word0 = devid;
 	/* Publish the command only after it is fully written. */
 	amdvi_update_cmd_tail(softc);
 #ifdef AMDVI_DEBUG_CMD
 	device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid);
 #endif
 }
 #endif
 
 /*
  * Invalidate domain using INVALIDATE_IOMMU_PAGES command.
  *
  * Queues a single page-invalidation command for 'domain_id' that covers
  * all addresses, with both the PDE and size flags set.
  */
 static void
 amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id)
 {
 
 	/*
 	 * See section 3.3.3 of IOMMU spec rev 2.0, software note
 	 * for invalidating domain.
 	 *
 	 * The callee fetches and validates the command slot itself, so no
 	 * local command pointer is needed here.
 	 */
 	amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR,
 	    false, true, true);
 
 #ifdef AMDVI_DEBUG_CMD
 	device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id);
 
 #endif
 }
 
 /*
  * Queue a completion-wait command and poll softc->cmp_data for the marker
  * value, for at most 100 iterations of 1 ms each.
  * Returns true when the marker was observed, false on timeout.
  */
 static	bool
 amdvi_cmp_wait(struct amdvi_softc *softc)
 {
 #ifdef AMDVI_DEBUG_CMD
 	struct amdvi_ctrl *ctrl;
 #endif
 	const uint64_t VERIFY = 0xA5A5;
 	volatile uint64_t *read;
 	int i;
 	bool status;
 
 	read = &softc->cmp_data;
 	*read = 0;
 	amdvi_cmd_cmp(softc, VERIFY);
 	/* Wait for h/w to update completion data. */
 	for (i = 0; i < 100 && (*read != VERIFY); i++) {
 		DELAY(1000);		/* 1 ms */
 	}
 	/* Read through the volatile pointer so the load is not elided. */
 	status = (VERIFY == *read) ? true : false;
 
 #ifdef AMDVI_DEBUG_CMD
 	ctrl = softc->ctrl;
 	if (status)
 		device_printf(softc->dev, "CMD completion DONE Tail:0x%x, "
 			      "Head:0x%x, loop:%d.\n", ctrl->cmd_tail,
 			      ctrl->cmd_head, i);
 #endif
 	return (status);
 }
 
 /*
  * Wait for queued commands to complete by retrying amdvi_cmp_wait() up
  * to 10 times. Returns immediately when the IOMMU is not enabled. If all
  * attempts fail, logs an error and dumps the last command.
  */
 static void
 amdvi_wait(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	int i;
 
 	KASSERT(softc, ("softc is NULL"));
 
 	ctrl = softc->ctrl;
 	KASSERT(ctrl != NULL, ("ctrl is NULL"));
 	/* Don't wait if h/w is not enabled. */
 	if ((ctrl->control & AMDVI_CTRL_EN) == 0)
 		return;
 
 	for (i = 0; i < 10; i++) {
 		if (amdvi_cmp_wait(softc))
 			return;
 	}
 
 	device_printf(softc->dev, "Error: completion failed"
 		      " tail:0x%x, head:0x%x.\n",
 		      ctrl->cmd_tail, ctrl->cmd_head);
 	/* Dump the last command. */
 	amdvi_dump_cmds(softc, 1);
 }
 
 /*
  * Print up to 'count' command entries, starting one entry before the
  * current head offset and stopping early when the tail is reached.
  */
 static void
 amdvi_dump_cmds(struct amdvi_softc *softc, int count)
 {
 	struct amdvi_ctrl *ctrl;
 	struct amdvi_cmd *cmd;
 	int off, i;
 
 	ctrl = softc->ctrl;
 	device_printf(softc->dev, "Dump last %d command(s):\n", count);
 	/*
 	 * If h/w is stuck in completion, it is the previous command,
 	 * start dumping from previous command onward.
 	 */
 	off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd),
 	    softc->cmd_max);
 	for (i = 0; off != ctrl->cmd_tail && i < count; i++) {
 		/* 'off' is a byte offset into the command buffer. */
 		cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off);
 		printf("  [CMD%d, off:0x%x] opcode= 0x%x 0x%x"
 		    " 0x%x 0x%lx\n", i, off, cmd->opcode,
 		    cmd->word0, cmd->word1, cmd->addr);
 		/* Advance one entry, wrapping at the end of the ring. */
 		off = (off + sizeof(struct amdvi_cmd)) %
 		    (softc->cmd_max * sizeof(struct amdvi_cmd));
 	}
 }
 
 static int
 amdvi_init_event(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 
 	ctrl = softc->ctrl;
 	ctrl->event.len = 8;
 	softc->event_max = 1 << ctrl->event.len;
 	softc->event = malloc(sizeof(struct amdvi_event) *
 	    softc->event_max, M_AMDVI, M_WAITOK | M_ZERO);
 	if ((uintptr_t)softc->event & PAGE_MASK) {
 		device_printf(softc->dev, "Event buffer not aligned on page.");
 		return (false);
 	}
 	ctrl->event.base = vtophys(softc->event) / PAGE_SIZE;
 
 	/* Reset the pointers. */
 	ctrl->evt_head = 0;
 	ctrl->evt_tail = 0;
 
 	return (0);
 }
 
 /*
  * Print the event flag bits selected by AMDVI_EVENT_FLAG_MASK in
  * symbolic form, followed by the closing bracket of the event line.
  */
 static inline void
 amdvi_decode_evt_flag(uint16_t flag)
 {
 	/* Bit-name table for the kernel printf "%b" conversion (hex base). */
 	static const char flag_bits[] =
 	    "\020"
 	    "\001GN"
 	    "\002NX"
 	    "\003US"
 	    "\004I"
 	    "\005PR"
 	    "\006RW"
 	    "\007PE"
 	    "\010RZ"
 	    "\011TR";
 	uint16_t masked;
 
 	masked = flag & AMDVI_EVENT_FLAG_MASK;
 	printf(" 0x%b]\n", masked, flag_bits);
 }
 
 /*
  * See section 2.5.4 of AMD IOMMU spec ver 2.62.
  *
  * Print the error type encoded in an event flag word. 'flag' is the raw
  * 16-bit flag; the type field is extracted here with
  * AMDVI_EVENT_FLAG_TYPE(), so callers must pass the unmodified flag.
  * The parameter is 16 bits wide because the type field lies above bit 8
  * and would be lost in an 8-bit argument.
  */
 static inline void
 amdvi_decode_evt_flag_type(uint16_t flag)
 {
 
 	switch (AMDVI_EVENT_FLAG_TYPE(flag)) {
 	case 0:
 		printf("RSVD\n");
 		break;
 	case 1:
 		printf("Master Abort\n");
 		break;
 	case 2:
 		printf("Target Abort\n");
 		break;
 	case 3:
 		printf("Data Err\n");
 		break;
 	default:
 		break;
 	}
 }
 
 /* Print an invalid device table entry event. */
 static void
 amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr,
     uint16_t flag)
 {
 
 	printf("\t[INVALID_DTE EVT: devId:0x%x DomId:0x%x"
 	    " Addr:0x%lx",
 	    devid, domid, addr);
 	amdvi_decode_evt_flag(flag);
 }
 
 /* Print an I/O page fault event. */
 static void
 amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr,
     uint16_t flag)
 {
 
 	printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x"
 	    " Addr:0x%lx",
 	    devid, domid, addr);
 	amdvi_decode_evt_flag(flag);
 }
 
 /* Print a device table hardware error event, including its error type. */
 static void
 amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid,
     uint64_t addr, uint16_t flag)
 {
 
 	printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x"
 	    " Addr:0x%lx", devid, domid, addr);
 	amdvi_decode_evt_flag(flag);
 	amdvi_decode_evt_flag_type(flag);
 }
 
 /* Print a page table hardware error event, including its error type. */
 static void
 amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr,
     uint16_t flag)
 {
 
 	printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x"
 	    " Addr:0x%lx", devid, domid, addr);
 	amdvi_decode_evt_flag(flag);
 	/* Pass the raw flag; the callee extracts the type field itself. */
 	amdvi_decode_evt_flag_type(flag);
 }
 
 /*
  * Print a human-readable description of one event log entry, dispatching
  * on its opcode. Unknown opcodes are reported as unsupported.
  */
 static void
 amdvi_decode_evt(struct amdvi_event *evt)
 {
 	struct amdvi_cmd *cmd;
 
 	switch (evt->opcode) {
 	case AMDVI_EVENT_INVALID_DTE:
 		amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid,
 		    evt->addr, evt->flag);
 		break;
 
 	case AMDVI_EVENT_PFAULT:
 		amdvi_decode_pf_evt(evt->devid, evt->pasid_domid,
 		    evt->addr, evt->flag);
 		break;
 
 	case AMDVI_EVENT_DTE_HW_ERROR:
 		amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid,
 		    evt->addr, evt->flag);
 		break;
 
 	case AMDVI_EVENT_PAGE_HW_ERROR:
 		amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid,
 		    evt->addr, evt->flag);
 		break;
 
 	case AMDVI_EVENT_ILLEGAL_CMD:
 		/* FALL THROUGH */
 	case AMDVI_EVENT_CMD_HW_ERROR:
 		printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ?
 		    "ILLEGAL CMD" : "CMD HW ERR");
 		/*
 		 * evt->addr is treated as the physical address of the
 		 * offending command and read through the direct map.
 		 */
 		cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr);
 		printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n",
 		    cmd->opcode, cmd->word0, cmd->word1, cmd->addr);
 		break;
 
 	case AMDVI_EVENT_IOTLB_TIMEOUT:
 		printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n",
 		    evt->devid, evt->addr);
 		break;
 
 	case AMDVI_EVENT_INVALID_DTE_REQ:
 		/* Type is taken from flag bits 9 and up, tr from bit 8. */
 		printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n",
 		    evt->devid, evt->addr, evt->flag >> 9,
 		    (evt->flag >> 8) & 1);
 		break;
 
 	case AMDVI_EVENT_INVALID_PPR_REQ:
 	case AMDVI_EVENT_COUNTER_ZERO:
 		printf("AMD-Vi: v2 events.\n");
 		break;
 
 	default:
 		printf("Unsupported AMD-Vi event:%d\n", evt->opcode);
 	}
 }
 
 /*
  * Walk the event buffer from the current head, decoding each entry and
  * advancing the head, until an entry with a zero opcode is found or
  * event_max entries have been processed.
  */
 static void
 amdvi_print_events(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	struct amdvi_event *event;
 	int i, size;
 
 	ctrl = softc->ctrl;
 	size = sizeof(struct amdvi_event);
 	for (i = 0; i < softc->event_max; i++) {
 		/* evt_head is a byte offset; convert it to an entry index. */
 		event = &softc->event[ctrl->evt_head / size];
 		if (!event->opcode)
 			break;
 		device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n",
 		    i, ctrl->evt_head, ctrl->evt_tail);
 		amdvi_decode_evt(event);
 		ctrl->evt_head = MOD_INC(ctrl->evt_head, size,
 		    softc->event_max);
 	}
 }
 
 /*
  * Program the location and size of the static device table (amdvi_dte)
  * into the control block. Always returns 0.
  */
 static int
 amdvi_init_dte(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 
 	ctrl = softc->ctrl;
 	/* The base field holds the physical page frame number. */
 	ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE;
 	ctrl->dte.size = 0x1FF;		/* 2MB device table. */
 
 	return (0);
 }
 
 /*
  * Not all capabilities of IOMMU are available in ACPI IVHD flag
  * or EFR entry, read directly from device.
  *
  * Reads the capability header at softc->cap_off, stores its top byte in
  * softc->pci_cap and prints the decoded feature bits. Always returns 0.
  */
 static int
 amdvi_print_pci_cap(device_t dev)
 {
 	struct amdvi_softc *softc;
 	uint32_t off, cap;
 
 
 	softc = device_get_softc(dev);
 	off = softc->cap_off;
 
 	/*
 	 * Section 3.7.1 of IOMMU spec rev 2.0.
 	 * Read capability from device.
 	 */
 	cap = amdvi_pci_read(softc, off);
 
 	/* Make sure capability type[18:16] is 3. */
 	KASSERT((((cap >> 16) & 0x7) == 0x3),
 	    ("Not a IOMMU capability 0x%x@0x%x", cap, off));
 
 	/* Keep only bits 31:24 of the header as the feature bits. */
 	softc->pci_cap = cap >> 24;
 	device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n",
 	    cap, off, softc->pci_cap,
 	    "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt");
 
 	return (0);
 }
 
 /*
  * Event interrupt handler: 'arg' is the softc registered with
  * bus_setup_intr(). Logs the interrupt count and ring pointers, prints
  * all pending events, then updates the status register.
  */
 static void
 amdvi_event_intr(void *arg)
 {
 	struct amdvi_softc *softc;
 	struct amdvi_ctrl *ctrl;
 
 	softc = (struct amdvi_softc *)arg;
 	ctrl = softc->ctrl;
 	device_printf(softc->dev, "EVT INTR %ld Status:0x%x"
 	    " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++,
 	    ctrl->status, ctrl->evt_head, ctrl->evt_tail);
 	printf("  [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n",
 	    softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head);
 
 	amdvi_print_events(softc);
 	/*
 	 * NOTE(review): presumably these status bits are write-1-to-clear,
 	 * so writing back the masked value acknowledges them - confirm
 	 * against the IOMMU specification.
 	 */
 	ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR;
 }
 
 /*
  * Release the event interrupt: tear down the handler and release the IRQ
  * resource if they exist, then delete the resource entry and release the
  * MSI. The softc fields are not cleared afterwards.
  */
 static void
 amdvi_free_evt_intr_res(device_t dev)
 {
 
 	struct amdvi_softc *softc;
 
 	softc = device_get_softc(dev);
 	if (softc->event_tag != NULL) {
 		bus_teardown_intr(dev, softc->event_res, softc->event_tag);
 	}
 	if (softc->event_res != NULL) {
 		bus_release_resource(dev, SYS_RES_IRQ, softc->event_rid,
 		    softc->event_res);
 	}
 	bus_delete_resource(dev, SYS_RES_IRQ, softc->event_rid);
 	/* The MSI was allocated from the grandparent device. */
 	PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)),
 	    dev, 1, &softc->event_irq);
 }
 
 /*
  * Allocate and wire up the single MSI used for event reporting.
  *
  * Returns 0 on success or an errno value on failure. The return type is
  * int so that the errno values reach the caller intact; the caller
  * stores the result in an int and compares it against 0.
  */
 static int
 amdvi_alloc_intr_resources(struct amdvi_softc *softc)
 {
 	struct amdvi_ctrl *ctrl;
 	device_t dev, pcib;
 	device_t mmio_dev;
 	uint64_t msi_addr;
 	uint32_t msi_data;
 	int err;
 
 	dev = softc->dev;
 	pcib = device_get_parent(device_get_parent(dev));
 	mmio_dev = pci_find_bsf(PCI_RID2BUS(softc->pci_rid),
 	    PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid));
 	/* The PCI function is dereferenced below, so it must exist. */
 	if (mmio_dev == NULL) {
 		device_printf(dev, "Couldn't find IOMMU PCI device.\n");
 		return (ENXIO);
 	}
 	if (device_is_attached(mmio_dev)) {
 		device_printf(dev,
 		    "warning: IOMMU device is claimed by another driver %s\n",
 		    device_get_driver(mmio_dev)->name);
 	}
 
 	softc->event_irq = -1;
 	softc->event_rid = 0;
 
 	/*
 	 * Section 3.7.1 of IOMMU rev 2.0. With MSI, there is only one
 	 * interrupt. XXX: Enable MSI/X support.
 	 */
 	err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq);
 	if (err) {
 		device_printf(dev,
 		    "Couldn't find event MSI IRQ resource.\n");
 		return (ENOENT);
 	}
 
 	err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid,
 	    softc->event_irq, 1);
 	if (err) {
 		device_printf(dev, "Couldn't set event MSI resource.\n");
 		return (ENXIO);
 	}
 
 	softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
 	    &softc->event_rid, RF_ACTIVE);
 	if (!softc->event_res) {
 		device_printf(dev,
 		    "Unable to allocate event INTR resource.\n");
 		return (ENOMEM);
 	}
 
 	if (bus_setup_intr(dev, softc->event_res,
 	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr,
 	    softc, &softc->event_tag)) {
 		device_printf(dev, "Fail to setup event intr\n");
 		bus_release_resource(softc->dev, SYS_RES_IRQ,
 		    softc->event_rid, softc->event_res);
 		softc->event_res = NULL;
 		return (ENXIO);
 	}
 
 	bus_describe_intr(dev, softc->event_res, softc->event_tag,
 	    "fault");
 
 	err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr,
 	    &msi_data);
 	if (err) {
 		device_printf(dev,
 		    "Event interrupt config failed, err=%d.\n",
 		    err);
 		amdvi_free_evt_intr_res(softc->dev);
 		return (err);
 	}
 
 	/* Clear interrupt status bits. */
 	ctrl = softc->ctrl;
 	ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR;
 
 	/* Now enable MSI interrupt. */
 	pci_enable_msi(mmio_dev, msi_addr, msi_data);
 	return (0);
 }
 
 
 /*
  * Print every IVHD device configuration range of this IOMMU together
  * with its decoded setting bits and ATS state.
  */
 static void
 amdvi_print_dev_cap(struct amdvi_softc *softc)
 {
 	struct ivhd_dev_cfg *cfg;
 	int i;
 
 	cfg = softc->dev_cfg;
 	for (i = 0; i < softc->dev_cfg_cnt; i++) {
 		/*
 		 * Bit numbers in a "%b" description are octal escapes, so
 		 * bit 8 is written "\010". "\008" is not an octal escape:
 		 * it yields a NUL byte that cuts the description short.
 		 */
 		device_printf(softc->dev, "device [0x%x - 0x%x]"
 		    "config:%b%s\n", cfg->start_id, cfg->end_id,
 		    cfg->data,
 		    "\020\001INIT\002ExtInt\003NMI"
 		    "\007LINT0\010LINT1",
 		    cfg->enable_ats ? "ATS enabled" : "");
 		cfg++;
 	}
 }
 
 /*
  * Sysctl handler exporting one of the ring pointers of an IOMMU.
  * arg1 is the softc; arg2 selects the value: 0 = command head,
  * 1 = command tail, 2 = event head, 3 = event tail. Any other selector
  * is logged and reported as success without producing a value.
  */
 static int
 amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct amdvi_softc *softc;
 	int result, type;
 
 	softc = (struct amdvi_softc *)arg1;
 	type = arg2;
 
 	/* Snapshot the selected pointer, then hand it to the int handler. */
 	switch (type) {
 	case 0:
 		result = softc->ctrl->cmd_head;
 		break;
 	case 1:
 		result = softc->ctrl->cmd_tail;
 		break;
 	case 2:
 		result = softc->ctrl->evt_head;
 		break;
 	case 3:
 		result = softc->ctrl->evt_tail;
 		break;
 	default:
 		device_printf(softc->dev, "Unknown sysctl:%d\n", type);
 		return (0);
 	}
 
 	return (sysctl_handle_int(oidp, &result, 0, req));
 }
 
 /*
  * Register the per-device sysctl nodes: counters, RID information and
  * the four ring pointers served by amdvi_handle_sysctl(). The last
  * integer argument of each SYSCTL_ADD_PROC is the selector that handler
  * switches on.
  */
 static void
 amdvi_add_sysctl(struct amdvi_softc *softc)
 {
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx;
 	device_t dev;
 
 	dev = softc->dev;
 	ctx = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 
 	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD,
 	    &softc->event_intr_cnt, "Event interrupt count");
 	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD,
 	    &softc->total_cmd, "Command submitted count");
 	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD,
 	    &softc->pci_rid, 0, "IOMMU RID");
 	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD,
 	    &softc->start_dev_rid, 0, "Start of device under this IOMMU");
 	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD,
 	    &softc->end_dev_rid, 0, "End of device under this IOMMU");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head",
-	    CTLTYPE_UINT | CTLFLAG_RD, softc, 0,
+	    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 0,
 	    amdvi_handle_sysctl, "IU", "Command head");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail",
-	    CTLTYPE_UINT | CTLFLAG_RD, softc, 1,
+	    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 1,
 	    amdvi_handle_sysctl, "IU", "Command tail");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head",
-	    CTLTYPE_UINT | CTLFLAG_RD, softc, 2,
+	    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 2,
 	    amdvi_handle_sysctl, "IU", "Event head");
 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail",
-	    CTLTYPE_UINT | CTLFLAG_RD, softc, 3,
+	    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 3,
 	    amdvi_handle_sysctl, "IU", "Event tail");
 }
 
 /*
  * Bring up one IOMMU: decide on IOTLB use, print device and PCI
  * capabilities, set up the command buffer, event buffer and device
  * table, allocate the event interrupt and register sysctls.
  * Returns 0 on success or the status of the first step that failed.
  */
 int
 amdvi_setup_hw(struct amdvi_softc *softc)
 {
 	device_t dev;
 	int status;
 
 	dev = softc->dev;
 
 	amdvi_hw_enable_iotlb(softc);
 
 	amdvi_print_dev_cap(softc);
 
 	if ((status = amdvi_print_pci_cap(dev)) != 0) {
 		device_printf(dev, "PCI capability.\n");
 		return (status);
 	}
 	if ((status = amdvi_init_cmd(softc)) != 0) {
 		device_printf(dev, "Couldn't configure command buffer.\n");
 		return (status);
 	}
 	if ((status = amdvi_init_event(softc)) != 0) {
 		device_printf(dev, "Couldn't configure event buffer.\n");
 		return (status);
 	}
 	if ((status = amdvi_init_dte(softc)) != 0) {
 		device_printf(dev, "Couldn't configure device table.\n");
 		return (status);
 	}
 	/* The interrupt setup prints its own failure messages. */
 	if ((status = amdvi_alloc_intr_resources(softc)) != 0) {
 		return (status);
 	}
 	amdvi_add_sysctl(softc);
 	return (0);
 }
 
 int
 amdvi_teardown_hw(struct amdvi_softc *softc)
 {
 	device_t dev;
 
 	dev = softc->dev;
 
 	/* 
 	 * Called after disable, h/w is stopped by now, free all the resources. 
 	 */
 	amdvi_free_evt_intr_res(dev);
 
 	if (softc->cmd)
 		free(softc->cmd, M_AMDVI);
 
 	if (softc->event)
 		free(softc->event, M_AMDVI);
 
 	return (0);
 }
 
 /*********** bhyve interfaces *********************/
 /*
  * Report whether AMD-Vi can be used: EIO when no IVHD device was found,
  * EINVAL (with a hint on the console) when devices exist but the user
  * has not enabled the driver, 0 otherwise.
  */
 static int
 amdvi_init(void)
 {
 
 	if (ivhd_count == 0)
 		return (EIO);
 
 	if (amdvi_enable_user)
 		return (0);
 
 	printf("bhyve: Found %d AMD-Vi/IOMMU device(s), "
 	    "use hw.vmm.amdvi.enable=1 to enable pass-through.\n",
 	    ivhd_count);
 
 	return (EINVAL);
 }
 
 /* Counterpart of amdvi_init(); there is currently nothing to undo. */
 static void
 amdvi_cleanup(void)
 {
 	/* Nothing. */
 }
 
 static uint16_t
 amdvi_domainId(void)
 {
 
 	/*
 	 * If we hit maximum domain limit, rollover leaving host
 	 * domain(0).
 	 * XXX: make sure that this domain is not used.
 	 */
 	if (amdvi_dom_id == AMDVI_MAX_DOMAIN)
 		amdvi_dom_id = 1;
 
 	return ((uint16_t)amdvi_dom_id++);
 }
 
 /*
  * Invalidate 'domain_id' on every IOMMU in ivhd_devs and wait for each
  * one to complete. 'create' is only referenced by the disabled (#if 0)
  * code below.
  */
 static void
 amdvi_do_inv_domain(uint16_t domain_id, bool create)
 {
 	struct amdvi_softc *softc;
 	int i;
 
 	for (i = 0; i < ivhd_count; i++) {
 		softc = device_get_softc(ivhd_devs[i]);
 		KASSERT(softc, ("softc is NULL"));
 		/*
 		 * If not present pages are cached, invalidate page after
 		 * creating domain.
 		 */
 #if 0
 		if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0))
 			continue;
 #endif
 		amdvi_inv_domain(softc, domain_id);
 		amdvi_wait(softc);
 	}
 }
 
 /*
  * Allocate a new domain, assign it the next domain id, allocate its
  * top-level page table where applicable, invalidate the id on all
  * IOMMUs and link the domain into dom_head.
  * 'maxaddr' is currently unused. Returns the new domain.
  */
 static void *
 amdvi_create_domain(vm_paddr_t maxaddr)
 {
 	struct amdvi_domain *dom;
 
 	dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK);
 	dom->id = amdvi_domainId();
 	//dom->maxaddr = maxaddr;
 #ifdef AMDVI_DEBUG_CMD
 	printf("Created domain #%d\n", dom->id);
 #endif
 	/*
 	 * The host domain (#0) gets a translation table only when
 	 * amdvi_host_ptp is set; every other domain always gets one.
 	 */
 	if (dom->id || amdvi_host_ptp)
 		dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO);
 
 	dom->ptp_level = amdvi_ptp_level;
 
 	amdvi_do_inv_domain(dom->id, true);
 	SLIST_INSERT_HEAD(&dom_head, dom, next);
 
 	return (dom);
 }
 
 /*
  * Recursively free a page table page and the lower-level tables it
  * points to. 'level' is the level of 'ptp'; the recursion stops without
  * freeing anything once level drops below 1, so pages referenced by
  * level-1 entries are not freed here.
  */
 static void
 amdvi_free_ptp(uint64_t *ptp, int level)
 {
 	int i;
 
 	if (level < 1)
 		return;
 
 	for (i = 0; i < NPTEPG ; i++) {
 		if ((ptp[i] & AMDVI_PT_PRESENT) == 0)
 			continue;
 		/* XXX: Add super-page or PTE mapping > 4KB. */
 #ifdef notyet
 		/* Super-page mapping. */
 		if (AMDVI_PD_SUPER(ptp[i]))
 			continue;
 #endif
 
 		/* Follow the entry's physical address via the direct map. */
 		amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i]
 		    & AMDVI_PT_MASK), level - 1);
 
 	}
 
 	free(ptp, M_AMDVI);
 }
 
 /*
  * Destroy a domain created by amdvi_create_domain(): free its page
  * tables, invalidate its id on all IOMMUs, unlink it from dom_head and
  * free the domain itself.
  * NOTE(review): the page tables are freed before the invalidation is
  * issued - confirm no device can still be translating through them.
  */
 static void
 amdvi_destroy_domain(void *arg)
 {
 	struct amdvi_domain *domain;
 
 	domain = (struct amdvi_domain *)arg;
 	KASSERT(domain, ("domain is NULL"));
 #ifdef AMDVI_DEBUG_CMD
 	printf("Destroying domain %d\n", domain->id);
 #endif
 	if (domain->ptp)
 		amdvi_free_ptp(domain->ptp, domain->ptp_level);
 
 	amdvi_do_inv_domain(domain->id, false);
 	SLIST_REMOVE(&dom_head, domain, amdvi_domain, next);
 	free(domain, M_AMDVI);
 }
 
 /*
  * Install (create == true) or clear (create == false) one translation
  * of size 'pg_size' for 'gpa' in the page table rooted at 'pt'.
  *
  * Missing intermediate tables are allocated on the way down when
  * creating. When clearing, a missing intermediate table means nothing is
  * mapped at 'gpa'; the walk then stops touching memory instead of
  * following a zero entry to physical address 0.
  *
  * Returns the number of bytes covered by the leaf level reached, or 0
  * when pg_size is 0 or either address is not aligned to pg_size.
  */
 static uint64_t
 amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa,
     vm_paddr_t hpa, uint64_t pg_size, bool create)
 {
 	uint64_t *page, pa;
 	int shift, index;
 	const int PT_SHIFT = 9;
 	const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1;	/* Based on PT_SHIFT */
 
 	if (!pg_size)
 		return (0);
 
 	if (hpa & (pg_size - 1)) {
 		printf("HPA is not size aligned.\n");
 		return (0);
 	}
 	if (gpa & (pg_size - 1)) {
 		printf("GPA is not size aligned.\n");
 		return (0);
 	}
 #define PTE2PA(x)	((uint64_t)(x) & AMDVI_PT_MASK)
 	shift = PML4SHIFT;
 	while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) {
 		/* pt == NULL: a hole was found, only keep counting levels. */
 		if (pt != NULL) {
 			index = (gpa >> shift) & PT_INDEX_MASK;
 
 			if ((pt[index] == 0) && create) {
 				page = malloc(PAGE_SIZE, M_AMDVI,
 				    M_WAITOK | M_ZERO);
 				pa = vtophys(page);
 				pt[index] = pa | AMDVI_PT_PRESENT |
 				    AMDVI_PT_RW |
 				    ((level - 1) << AMDVI_PD_LEVEL_SHIFT);
 			}
 #ifdef AMDVI_DEBUG_PTE
 			if ((gpa % 0x1000000) == 0)
 				printf("[level%d, shift = %d]PTE:0x%lx\n",
 				    level, shift, pt[index]);
 #endif
 			if (pt[index] == 0) {
 				/* Only reachable when clearing a mapping. */
 				pt = NULL;
 			} else {
 				pa = PTE2PA(pt[index]);
 				pt = (uint64_t *)PHYS_TO_DMAP(pa);
 			}
 		}
 		shift -= PT_SHIFT;
 		level--;
 	}
 
 	/* Nothing was mapped here; report the range as handled. */
 	if (pt == NULL)
 		return (1ULL << shift);
 
 	/* Leaf entry. */
 	index = (gpa >> shift) & PT_INDEX_MASK;
 
 	if (create) {
 		pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT;
 	} else
 		pt[index] = 0;
 
 #ifdef AMDVI_DEBUG_PTE
 	if ((gpa % 0x1000000) == 0)
 		printf("[Last level%d, shift = %d]PTE:0x%lx\n",
 		    level, shift, pt[index]);
 #endif
 	return (1ULL << shift);
 }
 
 /*
  * Create or remove the mappings for [gpa, gpa + size) in 'domain', one
  * PAGE_SIZE request at a time. Returns the number of bytes processed, or
  * 0 as soon as a single page fails; pages already processed are not
  * rolled back.
  */
 static uint64_t
 amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa,
     vm_paddr_t hpa, uint64_t size, bool create)
 {
 	uint64_t mapped, *ptp, len;
 	int level;
 
 	KASSERT(domain, ("domain is NULL"));
 	level = domain->ptp_level;
 	KASSERT(level, ("Page table level is 0"));
 
 	ptp = domain->ptp;
 	KASSERT(ptp, ("PTP is NULL"));
 	mapped = 0;
 	while (mapped < size) {
 		/* Advance by whatever amdvi_set_pt() reports as covered. */
 		len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped,
 		    PAGE_SIZE, create);
 		if (!len) {
 			printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n",
 			    hpa, gpa);
 			return (0);
 		}
 		mapped += len;
 	}
 
 	return (mapped);
 }
 
 /*
  * Map [gpa, gpa + len) to 'hpa' in the domain passed as 'arg'.
  *
  * With a page table the request is forwarded to amdvi_update_mapping().
  * Without one, the host domain (id 0) reports 'len' as mapped without
  * doing any work, while any other domain is an error and yields
  * (uint64_t)-1.
  */
 static uint64_t
 amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa,
     uint64_t len)
 {
 	struct amdvi_domain *domain = arg;
 
 	if (domain->ptp != NULL)
 		return (amdvi_update_mapping(domain, gpa, hpa, len, true));
 
 	if (domain->id != 0) {
 		printf("ptp is NULL");
 		return (-1);
 	}
 
 	/* Host domain created w/o page table: nothing to set up. */
 	return (len);
 }
 
 /*
  * Remove the mapping of 'len' bytes starting at 'gpa' from the domain passed
  * as 'arg'.  Returns the value of amdvi_update_mapping(), or 'len' when the
  * domain has no page table.
  */
 static uint64_t
 amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
 {
 	struct amdvi_domain *dom;
 
 	dom = (struct amdvi_domain *)arg;
 
 	/*
 	 * A host domain created without a page table has no IOMMU page
 	 * table entries to clear.
 	 */
 	if (!dom->ptp)
 		return (len);
 
 	return (amdvi_update_mapping(dom, gpa, 0, len, false));
 }
 
 /*
  * Return the softc of the IVHD device whose [start_dev_rid, end_dev_rid]
  * range contains 'devid'.  If no range matches, a message is printed and
  * the softc of ivhd_devs[0] is returned.
  */
 static struct amdvi_softc *
 amdvi_find_iommu(uint16_t devid)
 {
 	struct amdvi_softc *sc;
 	int unit;
 
 	unit = 0;
 	while (unit < ivhd_count) {
 		sc = device_get_softc(ivhd_devs[unit]);
 		if (devid >= sc->start_dev_rid && devid <= sc->end_dev_rid)
 			return (sc);
 		unit++;
 	}
 
 	/*
 	 * XXX: BIOS bug, the device is not in the IVRS table; assume it
 	 * belongs to the first IOMMU.
 	 */
 	printf("BIOS bug device(%d.%d.%d) doesn't have IVHD entry.\n",
 	    RID2PCI_STR(devid));
 
 	return (device_get_softc(ivhd_devs[0]));
 }
 
 /*
  * Set-up device table entry.
  * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must
  * be set concurrently, e.g. read and write bits.
  *
  * 'domain' supplies the domain id and, when present, the page table root.
  * 'devid' indexes the global amdvi_dte[] table.
  * 'enable' selects whether the translation fields (pt_base, pt_level,
  * pt_valid, read_allow, write_allow) are written; when it is false only
  * the fault-suppression bits, dt_valid and domain_id are updated and the
  * translation fields keep whatever value they had.
  */
 static void
 amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable)
 {
 	struct amdvi_softc *softc;
 	struct amdvi_dte* temp;
 
 	KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid));
 	
 	/* softc is only consumed by the AMDVI_ATS_ENABLE code below. */
 	softc = amdvi_find_iommu(devid);
 	KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid));
 
 	temp = &amdvi_dte[devid];
 
 #ifdef AMDVI_ATS_ENABLE
 	/* If IOMMU and device support IOTLB, enable it. */
 	if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb)
 		temp->iotlb_enable = 1;
 #endif
 
 	/* Avoid duplicate I/O faults. */
 	temp->sup_second_io_fault = 1;
 	temp->sup_all_io_fault = amdvi_disable_io_fault;
 
 	temp->dt_valid = 1;
 	temp->domain_id = domain->id;
 
 	if (enable) {
 		if (domain->ptp) {
 			/* pt_base holds the root's physical address >> 12. */
 			temp->pt_base = vtophys(domain->ptp) >> 12;
 			temp->pt_level = amdvi_ptp_level;
 		}
 		/*
 		 * XXX: Page table valid[TV] bit must be set even if host domain
 		 * page tables are not enabled.
 		 */
 		temp->pt_valid = 1;
 		temp->read_allow = 1;
 		temp->write_allow = 1;
 	}
 }
 
 /*
  * Issue a device table entry invalidation command for 'devid' on the IOMMU
  * returned by amdvi_find_iommu(), then call amdvi_wait() on that IOMMU.
  * With AMDVI_ATS_ENABLE an IOTLB invalidation command is issued as well
  * when amdvi_dev_support_iotlb() says the device supports it.
  */
 static void
 amdvi_inv_device(uint16_t devid)
 {
 	struct amdvi_softc *sc;
 
 	sc = amdvi_find_iommu(devid);
 	KASSERT(sc, ("softc is NULL"));
 
 	amdvi_cmd_inv_dte(sc, devid);
 #ifdef AMDVI_ATS_ENABLE
 	if (amdvi_dev_support_iotlb(sc, devid))
 		amdvi_cmd_inv_iotlb(sc, devid);
 #endif
 	amdvi_wait(sc);
 }
 
 /*
  * Attach device 'devid' to the domain passed as 'arg': program its device
  * table entry with translation enabled and invalidate the old entry.
  */
 static void
 amdvi_add_device(void *arg, uint16_t devid)
 {
 	struct amdvi_domain *dom;
 
 	dom = (struct amdvi_domain *)arg;
 	KASSERT(dom != NULL, ("domain is NULL"));
 #ifdef AMDVI_DEBUG_CMD
 	printf("Assigning device(%d.%d.%d) to domain:%d\n",
 	    RID2PCI_STR(devid), dom->id);
 #endif
 	amdvi_set_dte(dom, devid, true);
 	amdvi_inv_device(devid);
 }
 
 /*
  * Detach device 'devid' from the domain passed as 'arg': rewrite its device
  * table entry with enable == false and invalidate the old entry.
  */
 static void
 amdvi_remove_device(void *arg, uint16_t devid)
 {
 	struct amdvi_domain *domain;
 
 	domain = (struct amdvi_domain *)arg;
 	/*
 	 * Same check as amdvi_add_device(); it must precede the debug
 	 * printf below, which dereferences 'domain'.
 	 */
 	KASSERT(domain != NULL, ("domain is NULL"));
 #ifdef AMDVI_DEBUG_CMD
 	printf("Remove device(0x%x) from domain:%d\n",
 	       devid, domain->id);
 #endif
 	amdvi_set_dte(domain, devid, false);
 	amdvi_inv_device(devid);
 }
 
 /*
  * Write the control register of every IVHD device.  The value always
  * contains the EN, CMD, ELOG, ELOGINT and INV_TO_1S bits; the COH, HTT,
  * RPPW, PPW and ISOC bits are added when the matching IVHD_FLAG_* bit is
  * set in the device's ivhd_flag.
  */
 static void
 amdvi_enable(void)
 {
 	struct amdvi_softc *softc;
 	struct amdvi_ctrl *ctrl;
 	uint64_t val;
 	int i;
 
 	i = 0;
 	while (i < ivhd_count) {
 		softc = device_get_softc(ivhd_devs[i]);
 		KASSERT(softc, ("softc is NULL\n"));
 		ctrl = softc->ctrl;
 		KASSERT(ctrl, ("ctrl is NULL\n"));
 
 		/* Bits that are set unconditionally. */
 		val = AMDVI_CTRL_EN | AMDVI_CTRL_CMD | AMDVI_CTRL_ELOG |
 		    AMDVI_CTRL_ELOGINT | AMDVI_CTRL_INV_TO_1S;
 
 		/* Bits mirrored from the IVHD flags. */
 		val |= (softc->ivhd_flag & IVHD_FLAG_COH) ? AMDVI_CTRL_COH : 0;
 		val |= (softc->ivhd_flag & IVHD_FLAG_HTT) ? AMDVI_CTRL_HTT : 0;
 		val |= (softc->ivhd_flag & IVHD_FLAG_RPPW) ?
 		    AMDVI_CTRL_RPPW : 0;
 		val |= (softc->ivhd_flag & IVHD_FLAG_PPW) ? AMDVI_CTRL_PPW : 0;
 		val |= (softc->ivhd_flag & IVHD_FLAG_ISOC) ?
 		    AMDVI_CTRL_ISOC : 0;
 
 		ctrl->control = val;
 		i++;
 	}
 }
 
 /*
  * Write 0 to the control register of every IVHD device.
  */
 static void
 amdvi_disable(void)
 {
 	struct amdvi_softc *softc;
 	struct amdvi_ctrl *ctrl;
 	int i;
 
 	i = 0;
 	while (i < ivhd_count) {
 		softc = device_get_softc(ivhd_devs[i]);
 		KASSERT(softc, ("softc is NULL\n"));
 		ctrl = softc->ctrl;
 		KASSERT(ctrl, ("ctrl is NULL\n"));
 
 		ctrl->control = 0;
 		i++;
 	}
 }
 
 /*
  * Invalidate the domain passed as 'arg' by calling amdvi_do_inv_domain()
  * with its id and a 'false' second argument.
  */
 static void
 amdvi_inv_tlb(void *arg)
 {
 	struct amdvi_domain *dom;
 
 	dom = (struct amdvi_domain *)arg;
 	KASSERT(dom, ("domain is NULL"));
 	amdvi_do_inv_domain(dom->id, false);
 }
 
 /*
  * AMD-Vi implementation of the iommu_ops interface.
  *
  * The initializers are positional, so their order must match the member
  * order of struct iommu_ops (declared outside this file).
  */
 struct iommu_ops iommu_ops_amd = {
 	amdvi_init,
 	amdvi_cleanup,
 	amdvi_enable,
 	amdvi_disable,
 	amdvi_create_domain,
 	amdvi_destroy_domain,
 	amdvi_create_mapping,
 	amdvi_destroy_mapping,
 	amdvi_add_device,
 	amdvi_remove_device,
 	amdvi_inv_tlb
 };
Index: head/sys/amd64/vmm/amd/npt.c
===================================================================
--- head/sys/amd64/vmm/amd/npt.c	(revision 357973)
+++ head/sys/amd64/vmm/amd/npt.c	(revision 357974)
@@ -1,87 +1,88 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 
 #include "npt.h"
 
 SYSCTL_DECL(_hw_vmm);
-SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW, NULL, NULL);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    NULL);
 
 static int npt_flags;
 SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD,
 	&npt_flags, 0, NULL);
 
 #define NPT_IPIMASK	0xFF
 
 /*
  * AMD nested page table init.
  *
  * Records the low NPT_IPIMASK bits of 'ipinum' in npt_flags and adds
  * PMAP_PDE_SUPERPAGE unless the "hw.vmm.npt.enable_superpage" tunable is
  * set to 0.  Always returns 0.
  */
 int
 svm_npt_init(int ipinum)
 {
 	int superpage;
 
 	/* Superpages are on by default; the tunable may override that. */
 	superpage = 1;
 	TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &superpage);
 
 	npt_flags = ipinum & NPT_IPIMASK;
 	if (superpage != 0)
 		npt_flags |= PMAP_PDE_SUPERPAGE;
 
 	return (0);
 }
 
 /*
  * pmap initializer handed to vmspace_alloc(): set up 'pmap' as a PT_RVI
  * pmap using the flags computed by svm_npt_init().
  */
 static int
 npt_pinit(pmap_t pmap)
 {
 	int error;
 
 	error = pmap_pinit_type(pmap, PT_RVI, npt_flags);
 	return (error);
 }
 
 /*
  * Allocate a vmspace covering [min, max] whose pmap is initialized by
  * npt_pinit().
  */
 struct vmspace *
 svm_npt_alloc(vm_offset_t min, vm_offset_t max)
 {
 	struct vmspace *vms;
 
 	vms = vmspace_alloc(min, max, npt_pinit);
 	return (vms);
 }
 
 /*
  * Release a vmspace obtained from svm_npt_alloc().
  */
 void
 svm_npt_free(struct vmspace *vmspace)
 {
 
 	vmspace_free(vmspace);
 }
Index: head/sys/amd64/vmm/amd/svm.c
===================================================================
--- head/sys/amd64/vmm/amd/svm.c	(revision 357973)
+++ head/sys/amd64/vmm/amd/svm.c	(revision 357974)
@@ -1,2304 +1,2305 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/cpufunc.h>
 #include <machine/psl.h>
 #include <machine/md_var.h>
 #include <machine/reg.h>
 #include <machine/specialreg.h>
 #include <machine/smp.h>
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
 
 #include "vmm_lapic.h"
 #include "vmm_stat.h"
 #include "vmm_ktr.h"
 #include "vmm_ioport.h"
 #include "vatpic.h"
 #include "vlapic.h"
 #include "vlapic_priv.h"
 
 #include "x86.h"
 #include "vmcb.h"
 #include "svm.h"
 #include "svm_softc.h"
 #include "svm_msr.h"
 #include "npt.h"
 
 SYSCTL_DECL(_hw_vmm);
-SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    NULL);
 
 /*
  * SVM CPUID function 0x8000_000A, edx bit decoding.
  */
 #define AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
 #define AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
 #define AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
 #define AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
 #define AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
 #define AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
 #define AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
 #define AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
 #define AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
 #define AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
 #define	AMD_CPUID_SVM_AVIC		BIT(13)	/* AVIC present */
 
 #define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID 	|	\
 				VMCB_CACHE_IOPM		|	\
 				VMCB_CACHE_I		|	\
 				VMCB_CACHE_TPR		|	\
 				VMCB_CACHE_CR2		|	\
 				VMCB_CACHE_CR		|	\
 				VMCB_CACHE_DR		|	\
 				VMCB_CACHE_DT		|	\
 				VMCB_CACHE_SEG		|	\
 				VMCB_CACHE_NP)
 
 static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
     0, NULL);
 
 static MALLOC_DEFINE(M_SVM, "svm", "svm");
 static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
 
 static uint32_t svm_feature = ~0U;	/* AMD SVM features. */
 SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0,
     "SVM features advertised by CPUID.8000000AH:EDX");
 
 static int disable_npf_assist;
 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
     &disable_npf_assist, 0, NULL);
 
 /* Maximum ASIDs supported by the processor */
 static uint32_t nasid;
 SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0,
     "Number of ASIDs supported by this processor");
 
 /* Current ASID generation for each host cpu */
 static struct asid asid[MAXCPU];
 
 /* 
  * SVM host state saved area of size 4KB for each core.
  */
 static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
 
 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
 
 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
 
 /*
  * Non-zero when the "Flush by ASID" bit is set in svm_feature.
  */
 static __inline int
 flush_by_asid(void)
 {
 
 	return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
 }
 
 /*
  * Non-zero when the "Decode assist" bit is set in svm_feature.
  */
 static __inline int
 decode_assist(void)
 {
 
 	return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
 }
 
 /*
  * Clear EFER_SVM in MSR_EFER on the calling CPU.
  */
 static void
 svm_disable(void *arg __unused)
 {
 
 	wrmsr(MSR_EFER, rdmsr(MSR_EFER) & ~EFER_SVM);
 }
 
 /*
  * Disable SVM on all CPUs by running svm_disable() through
  * smp_rendezvous().  Always returns 0.
  */
 static int
 svm_cleanup(void)
 {
 
 	smp_rendezvous(NULL, svm_disable, NULL, NULL);
 
 	return (0);
 }
 
 /*
  * Verify that all the features required by bhyve are available.
  *
  * Masks svm_feature with CPUID.8000000AH:EDX and clamps nasid to the value
  * reported in EBX.  Returns 0 on success, or ENXIO when Nested Paging or
  * NRIP Save is missing.
  */
 static int
 check_svm_features(void)
 {
 	u_int cpuid_regs[4];
 	u_int hw_asids;
 
 	/* CPUID Fn8000_000A is for SVM */
 	do_cpuid(0x8000000A, cpuid_regs);
 	hw_asids = cpuid_regs[1];
 	svm_feature &= cpuid_regs[3];
 
 	/*
 	 * nasid may have been configured lower than what the hardware
 	 * offers, but never higher; 0 means "use the hardware value".
 	 */
 	if (nasid == 0 || nasid > hw_asids)
 		nasid = hw_asids;
 	KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));
 
 	/* bhyve requires the Nested Paging feature */
 	if ((svm_feature & AMD_CPUID_SVM_NP) == 0) {
 		printf("SVM: Nested Paging feature not available.\n");
 		return (ENXIO);
 	}
 
 	/* bhyve requires the NRIP Save feature */
 	if ((svm_feature & AMD_CPUID_SVM_NRIP_SAVE) == 0) {
 		printf("SVM: NRIP Save feature not available.\n");
 		return (ENXIO);
 	}
 
 	return (0);
 }
 
 /*
  * Set EFER_SVM in MSR_EFER on the calling CPU and point MSR_VM_HSAVE_PA at
  * this CPU's page of the hsave[] array.
  */
 static void
 svm_enable(void *arg __unused)
 {
 
 	wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SVM);
 	wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
 }
 
 /*
  * Return 1 if SVM is enabled on this processor and 0 otherwise.
  * A message is printed for each reason SVM is found unusable.
  */
 static int
 svm_available(void)
 {
 
 	/* Section 15.4 Enabling SVM from APM2. */
 	if ((amd_feature2 & AMDID2_SVM) == 0) {
 		printf("SVM: not available.\n");
 		return (0);
 	}
 
 	if ((rdmsr(MSR_VM_CR) & VM_CR_SVMDIS) != 0) {
 		printf("SVM: disabled by BIOS.\n");
 		return (0);
 	}
 
 	return (1);
 }
 
 static int
 svm_init(int ipinum)
 {
 	int error, cpu;
 
 	if (!svm_available())
 		return (ENXIO);
 
 	error = check_svm_features();
 	if (error)
 		return (error);
 
 	vmcb_clean &= VMCB_CACHE_DEFAULT;
 
 	for (cpu = 0; cpu < MAXCPU; cpu++) {
 		/*
 		 * Initialize the host ASIDs to their "highest" valid values.
 		 *
 		 * The next ASID allocation will rollover both 'gen' and 'num'
 		 * and start off the sequence at {1,1}.
 		 */
 		asid[cpu].gen = ~0UL;
 		asid[cpu].num = nasid - 1;
 	}
 
 	svm_msr_init();
 	svm_npt_init(ipinum);
 
 	/* Enable SVM on all CPUs */
 	smp_rendezvous(NULL, svm_enable, NULL, NULL);
 
 	return (0);
 }
 
 /*
  * Re-run svm_enable() on the calling CPU.
  */
 static void
 svm_restore(void)
 {
 
 	svm_enable(NULL);
 }
 
 /* Pentium compatible MSRs */
 #define MSR_PENTIUM_START 	0
 #define MSR_PENTIUM_END 	0x1FFF
 /* AMD 6th generation and Intel compatible MSRs */
 #define MSR_AMD6TH_START 	0xC0000000UL
 #define MSR_AMD6TH_END 		0xC0001FFFUL
 /* AMD 7th and 8th generation compatible MSRs */
 #define MSR_AMD7TH_START 	0xC0010000UL
 #define MSR_AMD7TH_END 		0xC0011FFFUL
 
 /*
  * Get the index and bit position for a MSR in permission bitmap.
  * Two bits are used for each MSR: lower bit for read and higher bit for write.
  *
  * The three MSR ranges above are laid out back to back in the bitmap, four
  * MSRs per byte.  On success 0 is returned, '*index' is the byte offset and
  * '*bit' the position of the read bit within that byte.  For an MSR outside
  * all ranges EINVAL is returned and '*index' is -1.
  */
 static int
 svm_msr_index(uint64_t msr, int *index, int *bit)
 {
 	static const struct {
 		uint64_t first;
 		uint64_t last;
 	} ranges[] = {
 		{ MSR_PENTIUM_START, MSR_PENTIUM_END },
 		{ MSR_AMD6TH_START, MSR_AMD6TH_END },
 		{ MSR_AMD7TH_START, MSR_AMD7TH_END },
 	};
 	uint32_t base;
 	size_t i;
 
 	*index = -1;
 	*bit = (msr % 4) * 2;
 
 	/* 'base' counts the MSRs covered by the ranges already skipped. */
 	base = 0;
 	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
 		if (msr >= ranges[i].first && msr <= ranges[i].last) {
 			*index = (uint32_t)(msr - ranges[i].first + base) / 4;
 			return (0);
 		}
 		base += ranges[i].last - ranges[i].first + 1;
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
  *
  * Clears the read and/or write bit of 'msr' in 'perm_bitmap', as selected
  * by 'read' and 'write'.
  */
 static void
 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
 {
 	unsigned long clear;
 	int error, index, bit;
 
 	error = svm_msr_index(msr, &index, &bit);
 	KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr));
 	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
 	    ("%s: invalid index %d for msr %#lx", __func__, index, msr));
 	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
 	    "msr %#lx", __func__, bit, msr));
 
 	/* Collect the bits to drop, then update the byte once. */
 	clear = 0;
 	if (read)
 		clear |= 1UL << bit;
 	if (write)
 		clear |= 2UL << bit;
 
 	if (clear != 0)
 		perm_bitmap[index] &= ~clear;
 }
 
 /*
  * Let the guest both read and write 'msr' without an intercept.
  */
 static void
 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
 {
 
 	svm_msr_perm(perm_bitmap, msr, true, true);
 }
 
 /*
  * Let the guest read 'msr' without an intercept; the write bit is left
  * untouched.
  */
 static void
 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
 {
 
 	svm_msr_perm(perm_bitmap, msr, true, false);
 }
 
 /*
  * Return 1 if any bit of 'bitmask' is set in intercept word 'idx' of the
  * vcpu's VMCB control area, 0 otherwise.
  */
 static __inline int
 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
 {
 	struct vmcb_ctrl *ctrl;
 
 	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 	if ((ctrl->intercept[idx] & bitmask) != 0)
 		return (1);
 	return (0);
 }
 
 /*
  * Set ('enabled' != 0) or clear ('enabled' == 0) the bits of 'bitmask' in
  * intercept word 'idx' of the vcpu's VMCB control area.  When the word
  * actually changes, VMCB_CACHE_I is marked dirty.
  */
 static __inline void
 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
     int enabled)
 {
 	struct vmcb_ctrl *ctrl;
 	uint32_t before;
 
 	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 	before = ctrl->intercept[idx];
 
 	ctrl->intercept[idx] = enabled ? (before | bitmask) :
 	    (before & ~bitmask);
 
 	if (ctrl->intercept[idx] == before)
 		return;
 
 	svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
 	VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
 	    "from %#x to %#x", idx, before, ctrl->intercept[idx]);
 }
 
 /*
  * Clear the 'bitmask' bits in intercept word 'off'.
  */
 static __inline void
 svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
 {
 
 	svm_set_intercept(sc, vcpu, off, bitmask, 0);
 }
 
 /*
  * Set the 'bitmask' bits in intercept word 'off'.
  */
 static __inline void
 svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
 {
 
 	svm_set_intercept(sc, vcpu, off, bitmask, 1);
 }
 
 /*
  * Initialize the VMCB control and state areas of 'vcpu'.
  *
  * 'iopm_base_pa' and 'msrpm_base_pa' are stored as the I/O and MSR
  * permission map base addresses, and 'np_pml4' as the nested CR3.  The
  * function then programs the CR, exception and control intercepts, enables
  * interrupt masking and LBR virtualization, and sets EFER, PAT, DR6 and
  * DR7 in the state area.
  */
 static void
 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
     uint64_t msrpm_base_pa, uint64_t np_pml4)
 {
 	struct vmcb_ctrl *ctrl;
 	struct vmcb_state *state;
 	uint32_t mask;
 	int n;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 	state = svm_get_vmcb_state(sc, vcpu);
 
 	ctrl->iopm_base_pa = iopm_base_pa;
 	ctrl->msrpm_base_pa = msrpm_base_pa;
 
 	/* Enable nested paging */
 	ctrl->np_enable = 1;
 	ctrl->n_cr3 = np_pml4;
 
 	/*
 	 * Intercept accesses to the control registers that are not shadowed
 	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
 	 */
 	for (n = 0; n < 16; n++) {
 		/*
 		 * Bits n and n + 16 of the CR intercept word; presumably the
 		 * read and write intercepts for CRn - confirm against the
 		 * VMCB layout.
 		 */
 		mask = (BIT(n) << 16) | BIT(n);
 		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
 			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
 		else
 			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
 	}
 
 
 	/*
 	 * Intercept everything when tracing guest exceptions otherwise
 	 * just intercept machine check exception.
 	 */
 	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
 		for (n = 0; n < 32; n++) {
 			/*
 			 * Skip unimplemented vectors in the exception bitmap.
 			 */
 			if (n == 2 || n == 9) {
 				continue;
 			}
 			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
 		}
 	} else {
 		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
 	}
 
 	/* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 	    VMCB_INTCPT_FERR_FREEZE);
 
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
 
 	/*
 	 * From section "Canonicalization and Consistency Checks" in APMv2
 	 * the VMRUN intercept bit must be set to pass the consistency check.
 	 */
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
 
 	/*
 	 * The ASID will be set to a non-zero value just before VMRUN.
 	 */
 	ctrl->asid = 0;
 
 	/*
 	 * Section 15.21.1, Interrupt Masking in EFLAGS
 	 * Section 15.21.2, Virtualizing APIC.TPR
 	 *
 	 * This must be set for %rflag and %cr8 isolation of guest and host.
 	 */
 	ctrl->v_intr_masking = 1;
 
 	/* Enable Last Branch Record aka LBR for debugging */
 	ctrl->lbr_virt_en = 1;
 	state->dbgctl = BIT(0);
 
 	/* EFER_SVM must always be set when the guest is executing */
 	state->efer = EFER_SVM;
 
 	/* Set up the PAT to power-on state */
 	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
 	    PAT_VALUE(2, PAT_UNCACHED)		|
 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
 	    PAT_VALUE(6, PAT_UNCACHED)		|
 	    PAT_VALUE(7, PAT_UNCACHEABLE);
 
 	/* Set up DR6/7 to power-on state */
 	state->dr6 = DBREG_DR6_RESERVED1;
 	state->dr7 = DBREG_DR7_RESERVED1;
 }
 
 /*
  * Initialize a virtual machine.
  *
  * Allocates the per-VM svm_softc plus the MSR and I/O permission bitmaps,
  * sets both bitmaps to intercept everything, opens up the MSRs listed
  * below, and then initializes the VMCB and guest MSR state of every vcpu
  * slot reported by vm_get_maxcpus().
  *
  * 'vm' is stored in the softc; the physical address of 'pmap'->pm_pml4
  * becomes the nested page table root.  Returns the new softc.  Panics if
  * the softc is not page aligned or a bitmap allocation returns NULL.
  */
 static void *
 svm_vminit(struct vm *vm, pmap_t pmap)
 {
 	struct svm_softc *svm_sc;
 	struct svm_vcpu *vcpu;
 	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
 	int i;
 	uint16_t maxcpus;
 
 	svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO);
 	/* The softc is required to start on a page boundary. */
 	if (((uintptr_t)svm_sc & PAGE_MASK) != 0)
 		panic("malloc of svm_softc not aligned on page boundary");
 
 	/* Both bitmaps are requested physically contiguous and page aligned. */
 	svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM,
 	    M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
 	if (svm_sc->msr_bitmap == NULL)
 		panic("contigmalloc of SVM MSR bitmap failed");
 	svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM,
 	    M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
 	if (svm_sc->iopm_bitmap == NULL)
 		panic("contigmalloc of SVM IO bitmap failed");
 
 	svm_sc->vm = vm;
 	svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
 
 	/*
 	 * Intercept read and write accesses to all MSRs.
 	 */
 	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
 
 	/*
 	 * Access to the following MSRs is redirected to the VMCB when the
 	 * guest is executing. Therefore it is safe to allow the guest to
 	 * read/write these MSRs directly without hypervisor involvement.
 	 */
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
 
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
 
 	/* Reads of the TSC are allowed; writes stay intercepted. */
 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
 
 	/*
 	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
 	 */
 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
 
 	/* Intercept access to all I/O ports. */
 	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
 
 	iopm_pa = vtophys(svm_sc->iopm_bitmap);
 	msrpm_pa = vtophys(svm_sc->msr_bitmap);
 	pml4_pa = svm_sc->nptp;
 	maxcpus = vm_get_maxcpus(svm_sc->vm);
 	/* Every vcpu shares the same permission bitmaps and nested root. */
 	for (i = 0; i < maxcpus; i++) {
 		vcpu = svm_get_vcpu(svm_sc, i);
 		vcpu->nextrip = ~0;
 		vcpu->lastcpu = NOCPU;
 		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
 		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
 		svm_msr_guest_init(svm_sc, i);
 	}
 	return (svm_sc);
 }
 
 /*
  * Collateral for a generic SVM VM-exit: tag 'vme' as VM_EXITCODE_SVM and
  * record the raw exit code and both exit info words.
  */
 static void
 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
 {
 
 	vme->exitcode = VM_EXITCODE_SVM;
 
 	vme->u.svm.exitinfo2 = info2;
 	vme->u.svm.exitinfo1 = info1;
 	vme->u.svm.exitcode = code;
 }
 
 /*
  * Return the guest's current privilege level.
  *
  * From APMv2:
  *   "Retrieve the CPL from the CPL field in the VMCB, not
  *    from any segment DPL"
  */
 static int
 svm_cpl(struct vmcb_state *state)
 {
 
 	return (state->cpl);
 }
 
 static enum vm_cpu_mode
 svm_vcpu_mode(struct vmcb *vmcb)
 {
 	struct vmcb_segment seg;
 	struct vmcb_state *state;
 	int error;
 
 	state = &vmcb->state;
 
 	if (state->efer & EFER_LMA) {
 		error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
 		KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__,
 		    error));
 
 		/*
 		 * Section 4.8.1 for APM2, check if Code Segment has
 		 * Long attribute set in descriptor.
 		 */
 		if (seg.attrib & VMCB_CS_ATTRIB_L)
 			return (CPU_MODE_64BIT);
 		else
 			return (CPU_MODE_COMPATIBILITY);
 	} else  if (state->cr0 & CR0_PE) {
 		return (CPU_MODE_PROTECTED);
 	} else {
 		return (CPU_MODE_REAL);
 	}
 }
 
 /*
  * Classify the guest paging mode from CR0.PG, CR4.PAE and EFER.LME.
  */
 static enum vm_paging_mode
 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
 {
 	enum vm_paging_mode mode;
 
 	if ((cr0 & CR0_PG) == 0)
 		mode = PAGING_MODE_FLAT;
 	else if ((cr4 & CR4_PAE) == 0)
 		mode = PAGING_MODE_32;
 	else if (efer & EFER_LME)
 		mode = PAGING_MODE_64;
 	else
 		mode = PAGING_MODE_PAE;
 
 	return (mode);
 }
 
 /*
  * ins/outs utility routines
  */
 
 /*
  * Index register of a string I/O instruction: %rdi when 'in' is non-zero,
  * %rsi otherwise.
  */
 static uint64_t
 svm_inout_str_index(struct svm_regctx *regs, int in)
 {
 	uint64_t idx;
 
 	if (in)
 		idx = regs->sctx_rdi;
 	else
 		idx = regs->sctx_rsi;
 
 	return (idx);
 }
 
 /*
  * Iteration count of a string I/O instruction: %rcx when 'rep' is
  * non-zero, 1 otherwise.
  */
 static uint64_t
 svm_inout_str_count(struct svm_regctx *regs, int rep)
 {
 	uint64_t count;
 
 	if (rep)
 		count = regs->sctx_rcx;
 	else
 		count = 1;
 
 	return (count);
 }
 
 /*
  * Fill in 'vis->seg_name' and 'vis->seg_desc' for a string I/O exit.
  * For 'in' the segment is always ES; otherwise it is decoded from
  * bits 12:10 of 'info1'.
  */
 static void
 svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
     int in, struct vm_inout_str *vis)
 {
 	int enc, error;
 
 	if (!in) {
 		/* The segment field has standard encoding */
 		enc = (info1 >> 10) & 0x7;
 		vis->seg_name = vm_segment_name(enc);
 	} else {
 		vis->seg_name = VM_REG_GUEST_ES;
 	}
 
 	error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
 	KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
 }
 
 /*
  * Decode the address size in bytes from bits 9:7 of 'info1'.  Panics on
  * an encoding other than 1, 2 or 4.
  */
 static int
 svm_inout_str_addrsize(uint64_t info1)
 {
 	uint32_t size;
 
 	size = (info1 >> 7) & 0x7;
 	if (size == 1)
 		return (2);	/* 16 bit */
 	if (size == 2)
 		return (4);	/* 32 bit */
 	if (size == 4)
 		return (8);	/* 64 bit */
 
 	panic("%s: invalid size encoding %d", __func__, size);
 }
 
 /*
  * Fill 'paging' with the guest's cr3, CPL, CPU mode and paging mode as
  * found in the state area of 'vmcb'.
  */
 static void
 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
 {
 	struct vmcb_state *st;
 
 	st = &vmcb->state;
 
 	paging->cr3 = st->cr3;
 	paging->cpl = svm_cpl(st);
 	paging->cpu_mode = svm_vcpu_mode(vmcb);
 	paging->paging_mode = svm_paging_mode(st->cr0, st->cr4, st->efer);
 }
 
 #define	UNHANDLED 0
 
 /*
  * Handle guest I/O intercept.
  *
  * Decodes EXITINFO1 of the vcpu's VMCB into 'vmexit': direction, string
  * and rep flags, operand size, port number and the low 32 bits of %rax.
  * For string instructions the exit code becomes VM_EXITCODE_INOUT_STR and
  * the paging, rflags, cr0, index, count, address size and segment details
  * are filled in as well.
  *
  * Always returns UNHANDLED.  When a string instruction is seen on a CPU
  * without decode assist the function returns before touching 'vmexit'.
  */
 static int
 svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
 {
 	struct vmcb_ctrl *ctrl;
 	struct vmcb_state *state;
 	struct svm_regctx *regs;
 	struct vm_inout_str *vis;
 	uint64_t info1;
 	int inout_string;
 
 	state = svm_get_vmcb_state(svm_sc, vcpu);
 	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
 	regs  = svm_get_guest_regctx(svm_sc, vcpu);
 
 	info1 = ctrl->exitinfo1;
 	/* Bit 2 of EXITINFO1 marks a string (ins/outs) instruction. */
 	inout_string = info1 & BIT(2) ? 1 : 0;
 
 	/*
 	 * The effective segment number in EXITINFO1[12:10] is populated
 	 * only if the processor has the DecodeAssist capability.
 	 *
 	 * XXX this is not specified explicitly in APMv2 but can be verified
 	 * empirically.
 	 */
 	if (inout_string && !decode_assist())
 		return (UNHANDLED);
 
 	vmexit->exitcode 	= VM_EXITCODE_INOUT;
 	vmexit->u.inout.in 	= (info1 & BIT(0)) ? 1 : 0;
 	vmexit->u.inout.string 	= inout_string;
 	vmexit->u.inout.rep 	= (info1 & BIT(3)) ? 1 : 0;
 	vmexit->u.inout.bytes 	= (info1 >> 4) & 0x7;
 	vmexit->u.inout.port 	= (uint16_t)(info1 >> 16);
 	vmexit->u.inout.eax 	= (uint32_t)(state->rax);
 
 	if (inout_string) {
 		/*
 		 * NOTE(review): 'vis' points at u.inout_str while u.inout.in
 		 * and u.inout.rep are still being read below.  If 'u' is a
 		 * union these share storage - confirm the layout before
 		 * reordering any of these statements.
 		 */
 		vmexit->exitcode = VM_EXITCODE_INOUT_STR;
 		vis = &vmexit->u.inout_str;
 		svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
 		vis->rflags = state->rflags;
 		vis->cr0 = state->cr0;
 		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
 		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
 		vis->addrsize = svm_inout_str_addrsize(info1);
 		svm_inout_str_seginfo(svm_sc, vcpu, info1,
 		    vmexit->u.inout.in, vis);
 	}
 
 	return (UNHANDLED);
 }
 
 /*
  * Map the EXITINFO1 bits of a nested page fault to a VM_PROT_* fault
  * type.  The write bit takes precedence over the instruction-fetch bit;
  * anything else is a read.
  */
 static int
 npf_fault_type(uint64_t exitinfo1)
 {
 
 	if (exitinfo1 & VMCB_NPF_INFO1_W)
 		return (VM_PROT_WRITE);
 	if (exitinfo1 & VMCB_NPF_INFO1_ID)
 		return (VM_PROT_EXECUTE);
 	return (VM_PROT_READ);
 }
 
 /*
  * Return true when the VMCB_NPF_INFO1_ID and VMCB_NPF_INFO1_GPT bits of
  * 'exitinfo1' are both clear and the VMCB_NPF_INFO1_GPA bit is set;
  * false otherwise.
  */
 static bool
 svm_npf_emul_fault(uint64_t exitinfo1)
 {
 
 	return ((exitinfo1 & VMCB_NPF_INFO1_ID) == 0 &&
 	    (exitinfo1 & VMCB_NPF_INFO1_GPT) == 0 &&
 	    (exitinfo1 & VMCB_NPF_INFO1_GPA) != 0);
 }
 
 /*
  * Prepare 'vmexit' as a VM_EXITCODE_INST_EMUL exit for guest physical
  * address 'gpa'.
  *
  * Records the gpa (the linear address is marked invalid), the guest paging
  * state, the CS base and default operand size appropriate for the current
  * CPU mode, and hands any instruction bytes captured in the VMCB to
  * vie_init().
  */
 static void
 svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
 {
 	struct vm_guest_paging *paging;
 	struct vmcb_segment seg;
 	struct vmcb_ctrl *ctrl;
 	char *inst_bytes;
 	int error, inst_len;
 
 	ctrl = &vmcb->ctrl;
 	paging = &vmexit->u.inst_emul.paging;
 
 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 	vmexit->u.inst_emul.gpa = gpa;
 	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
 	svm_paging_info(vmcb, paging);
 
 	error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
 	KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
 
 	switch(paging->cpu_mode) {
 	case CPU_MODE_REAL:
 		vmexit->u.inst_emul.cs_base = seg.base;
 		vmexit->u.inst_emul.cs_d = 0;
 		break;
 	case CPU_MODE_PROTECTED:
 	case CPU_MODE_COMPATIBILITY:
 		vmexit->u.inst_emul.cs_base = seg.base;
 
 		/*
 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
 		 */
 		vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
 		    1 : 0;
 		break;
 	default:
 		/* Remaining modes: CS base and D bit are reported as 0. */
 		vmexit->u.inst_emul.cs_base = 0;
 		vmexit->u.inst_emul.cs_d = 0;
 		break;	
 	}
 
 	/*
 	 * Copy the instruction bytes into 'vie' if available.
 	 */
 	if (decode_assist() && !disable_npf_assist) {
 		inst_len = ctrl->inst_len;
 		inst_bytes = ctrl->inst_bytes;
 	} else {
 		/* No captured bytes: vie_init() gets an empty buffer. */
 		inst_len = 0;
 		inst_bytes = NULL;
 	}
 	vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
 }
 
 #ifdef KTR
 /*
  * Short name of an event injection type, for trace messages.  Panics on an
  * unknown type.
  */
 static const char *
 intrtype_to_str(int intr_type)
 {
 	const char *name;
 
 	switch (intr_type) {
 	case VMCB_EVENTINJ_TYPE_INTR:
 		name = "hwintr";
 		break;
 	case VMCB_EVENTINJ_TYPE_NMI:
 		name = "nmi";
 		break;
 	case VMCB_EVENTINJ_TYPE_INTn:
 		name = "swintr";
 		break;
 	case VMCB_EVENTINJ_TYPE_EXCEPTION:
 		name = "exception";
 		break;
 	default:
 		panic("%s: unknown intr_type %d", __func__, intr_type);
 	}
 
 	return (name);
 }
 #endif
 
 /*
  * Inject an event to vcpu as described in section 15.20, "Event injection".
  *
  * 'intr_type' and 'vector' are written to the VMCB eventinj field together
  * with the valid bit; when 'ec_valid' is true 'error' is stored as the
  * error code.  Panics on an unknown type, or on an exception whose vector
  * is outside 0-31 or equal to 2.
  */
 static void
 svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
 		 uint32_t error, bool ec_valid)
 {
 	struct vmcb_ctrl *ctrl;
 	bool acceptable;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 
 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
 	    ("%s: event already pending %#lx", __func__, ctrl->eventinj));
 
 	KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
 	    __func__, vector));
 
 	if (intr_type == VMCB_EVENTINJ_TYPE_EXCEPTION) {
 		acceptable = (vector >= 0 && vector <= 31 && vector != 2);
 	} else {
 		acceptable = (intr_type == VMCB_EVENTINJ_TYPE_INTR ||
 		    intr_type == VMCB_EVENTINJ_TYPE_NMI ||
 		    intr_type == VMCB_EVENTINJ_TYPE_INTn);
 	}
 	if (!acceptable) {
 		panic("%s: invalid intr_type/vector: %d/%d", __func__,
 		    intr_type, vector);
 	}
 
 	ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
 	if (!ec_valid) {
 		VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
 		    intrtype_to_str(intr_type), vector);
 	} else {
 		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
 		ctrl->eventinj |= (uint64_t)error << 32;
 		VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
 		    intrtype_to_str(intr_type), vector, error);
 	}
 }
 
 /*
  * Copy the task priority held in the VMCB (V_TPR) into the emulated local
  * APIC and sanity check that no virtual interrupt vector is programmed.
  */
 static void
 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
 {
 	struct vlapic *vlapic = vm_lapic(sc->vm, vcpu);
 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 
 	/* Keep the vlapic's view of %cr8 in sync with the VMCB. */
 	vlapic_set_cr8(vlapic, ctrl->v_tpr);
 
 	/* This driver never uses virtual interrupt injection. */
 	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
 	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
 }
 
 static void
 svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
 {
 	struct vmcb_ctrl *ctrl;
 	uint64_t intinfo;
 
 	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
 	intinfo = ctrl->exitintinfo;	
 	if (!VMCB_EXITINTINFO_VALID(intinfo))
 		return;
 
 	/*
 	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
 	 *
 	 * If a #VMEXIT happened during event delivery then record the event
 	 * that was being delivered.
 	 */
 	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
 		intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
 	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
 	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
 }
 
 #ifdef INVARIANTS
 /*
  * Report whether the VINTR intercept is currently set for the vcpu.
  * Only referenced from assertions, hence the INVARIANTS guard.
  */
 static __inline int
 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
 {
 	int enabled;
 
 	enabled = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 	    VMCB_INTCPT_VINTR);
 	return (enabled);
 }
 #endif
 
 static __inline void
 enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
 {
 	struct vmcb_ctrl *ctrl;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 
 	if (ctrl->v_irq && ctrl->v_intr_vector == 0) {
 		KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
 		KASSERT(vintr_intercept_enabled(sc, vcpu),
 		    ("%s: vintr intercept should be enabled", __func__));
 		return;
 	}
 
 	VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
 	ctrl->v_irq = 1;
 	ctrl->v_ign_tpr = 1;
 	ctrl->v_intr_vector = 0;
 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 }
 
 static __inline void
 disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
 {
 	struct vmcb_ctrl *ctrl;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 
 	if (!ctrl->v_irq && ctrl->v_intr_vector == 0) {
 		KASSERT(!vintr_intercept_enabled(sc, vcpu),
 		    ("%s: vintr intercept should be disabled", __func__));
 		return;
 	}
 
 	VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
 	ctrl->v_irq = 0;
 	ctrl->v_intr_vector = 0;
 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 }
 
 /*
  * Set the VMCB interrupt shadow to 1 if 'val' is non-zero and to 0
  * otherwise.  The field is only written when its value changes.
  * Always returns 0.
  */
 static int
 svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
 {
 	struct vmcb_ctrl *ctrl;
 	int current, wanted;
 
 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
 	current = ctrl->intr_shadow;
 	wanted = (val != 0) ? 1 : 0;
 
 	if (wanted == current)
 		return (0);
 
 	ctrl->intr_shadow = wanted;
 	VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", wanted);
 	return (0);
 }
 
 /*
  * Store the current VMCB interrupt shadow value in '*val'.
  * Always returns 0.
  */
 static int
 svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
 {
 
 	*val = svm_get_vmcb_ctrl(sc, vcpu)->intr_shadow;
 	return (0);
 }
 
 /*
  * An injected NMI blocks delivery of further NMIs until the guest's handler
  * executes an IRET.  The IRET intercept is enabled when an NMI is injected
  * in order to track when the vcpu is done handling it, so the state of
  * that intercept doubles as the "NMI blocked" indicator.
  */
 static int
 nmi_blocked(struct svm_softc *sc, int vcpu)
 {
 
 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
 	    VMCB_INTCPT_IRET));
 }
 
 /*
  * Start virtual NMI blocking by turning on the IRET intercept.
  * Blocking must not already be in effect.
  */
 static void
 enable_nmi_blocking(struct svm_softc *sc, int vcpu)
 {
 
 	KASSERT(nmi_blocked(sc, vcpu) == 0, ("vNMI already blocked"));
 
 	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
 }
 
 /*
  * End virtual NMI blocking by turning off the IRET intercept, then set the
  * interrupt shadow.  Blocking must currently be in effect.
  */
 static void
 clear_nmi_blocking(struct svm_softc *sc, int vcpu)
 {
 	int rc;
 
 	KASSERT(nmi_blocked(sc, vcpu) != 0, ("vNMI already unblocked"));
 	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
 
 	/*
 	 * With the IRET intercept cleared the vcpu will try to execute the
 	 * "iret" the next time it runs.  It is nevertheless possible for
 	 * another NMI to be injected before that "iret" has completed:
 	 * e.g. if the "iret" takes a #NPF while accessing the stack it
 	 * traps back into the hypervisor, and an NMI that is pending for
 	 * the vcpu at that point will be injected into the guest.
 	 *
 	 * XXX this needs to be fixed
 	 */
 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
 
 	/*
 	 * Setting 'intr_shadow' keeps an NMI from being injected on the
 	 * VMRUN that immediately follows.
 	 */
 	rc = svm_modify_intr_shadow(sc, vcpu, 1);
 	KASSERT(rc == 0, ("%s: error %d setting intr_shadow", __func__, rc));
 }
 
 #define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
 
 /*
  * Emulate a guest write of 'newval' to the EFER MSR.
  *
  * The value is validated first; an invalid value injects #GP into the
  * guest.  An attempt to set EFER_LMSLE is not emulated: it is reported as
  * an SVM MSR exit with '*retu' set to true.  Otherwise the new value is
  * stored in the guest EFER register.  Always returns 0.
  */
 static int
 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu)
 {
 	struct vm_exit *vme;
 	struct vmcb_state *state;
 	uint64_t expect_lma, oldval, toggled;
 	bool paging_on;
 	int error;
 
 	state = svm_get_vmcb_state(sc, vcpu);
 
 	oldval = state->efer;
 	VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval);
 
 	/* Bits 1-7 are Read-As-Zero (RAZ), so drop them from the value. */
 	newval &= ~0xFE;
 	toggled = oldval ^ newval;
 	paging_on = (state->cr0 & CR0_PG) != 0;
 
 	if ((newval & EFER_MBZ_BITS) != 0)
 		goto gpf;
 
 	/*
 	 * APMv2 Table 14-5 "Long-Mode Consistency Checks": LME cannot be
 	 * changed while paging is enabled.
 	 */
 	if ((toggled & EFER_LME) != 0 && paging_on)
 		goto gpf;
 
 	/* The guest must supply EFER.LMA = EFER.LME & CR0.PG. */
 	expect_lma = ((newval & EFER_LME) != 0 && paging_on) ? EFER_LMA : 0;
 	if ((newval & EFER_LMA) != expect_lma)
 		goto gpf;
 
 	if ((newval & EFER_NXE) != 0 &&
 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
 		goto gpf;
 
 	/*
 	 * XXX bhyve does not enforce segment limits in 64-bit mode. Until
 	 * this is fixed flag guest attempt to set EFER_LMSLE as an error.
 	 */
 	if ((newval & EFER_LMSLE) != 0) {
 		vme = vm_exitinfo(sc->vm, vcpu);
 		vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
 		*retu = true;
 		return (0);
 	}
 
 	if ((newval & EFER_FFXSR) != 0 &&
 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
 		goto gpf;
 
 	if ((newval & EFER_TCE) != 0 &&
 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
 		goto gpf;
 
 	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
 	KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
 	return (0);
 
 gpf:
 	vm_inject_gp(sc->vm, vcpu);
 	return (0);
 }
 
 /*
  * Dispatch a guest WRMSR of 'val' to MSR 'num': local APIC MSRs go to the
  * lapic emulation, EFER to svm_write_efer() and everything else to
  * svm_wrmsr().  The selected handler's return value is passed through.
  */
 static int
 emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
     bool *retu)
 {
 
 	if (lapic_msr(num))
 		return (lapic_wrmsr(sc->vm, vcpu, num, val, retu));
 	if (num == MSR_EFER)
 		return (svm_write_efer(sc, vcpu, val, retu));
 	return (svm_wrmsr(sc, vcpu, num, val, retu));
 }
 
 /*
  * Dispatch a guest RDMSR of MSR 'num' to the lapic emulation or to
  * svm_rdmsr().  On success (return value 0) the 64-bit result is split
  * into the guest's %rax (low 32 bits) and %rdx (high 32 bits).  A non-zero
  * handler return value is passed through and the registers are untouched.
  */
 static int
 emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu)
 {
 	uint64_t result;
 	int error;
 
 	error = lapic_msr(num) ?
 	    lapic_rdmsr(sc->vm, vcpu, num, &result, retu) :
 	    svm_rdmsr(sc, vcpu, num, &result, retu);
 	if (error != 0)
 		return (error);
 
 	svm_get_vmcb_state(sc, vcpu)->rax = result & 0xffffffff;
 	svm_get_guest_regctx(sc, vcpu)->sctx_rdx = result >> 32;
 	return (0);
 }
 
 #ifdef KTR
 /*
  * Return a short name for a #VMEXIT code, for use in KTR trace messages.
  * Codes without a dedicated name are formatted in hexadecimal into a
  * static buffer.
  *
  * NOTE(review): the fallback buffer is shared by every caller and is
  * overwritten on each unknown code, so a previously returned pointer may
  * later show a different code -- confirm whether that matters for the
  * trace consumers.
  */
 static const char *
 exit_reason_to_str(uint64_t reason)
 {
 	static const struct {
 		uint64_t	code;
 		const char	*name;
 	} known[] = {
 		{ VMCB_EXIT_INVALID,	"invalvmcb" },
 		{ VMCB_EXIT_SHUTDOWN,	"shutdown" },
 		{ VMCB_EXIT_NPF,	"nptfault" },
 		{ VMCB_EXIT_PAUSE,	"pause" },
 		{ VMCB_EXIT_HLT,	"hlt" },
 		{ VMCB_EXIT_CPUID,	"cpuid" },
 		{ VMCB_EXIT_IO,		"inout" },
 		{ VMCB_EXIT_MC,		"mchk" },
 		{ VMCB_EXIT_INTR,	"extintr" },
 		{ VMCB_EXIT_NMI,	"nmi" },
 		{ VMCB_EXIT_VINTR,	"vintr" },
 		{ VMCB_EXIT_MSR,	"msr" },
 		{ VMCB_EXIT_IRET,	"iret" },
 		{ VMCB_EXIT_MONITOR,	"monitor" },
 		{ VMCB_EXIT_MWAIT,	"mwait" },
 	};
 	static char reasonbuf[32];
 	size_t i;
 
 	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
 		if (known[i].code == reason)
 			return (known[i].name);
 	}
 
 	snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
 	return (reasonbuf);
 }
 #endif	/* KTR */
 
 /*
  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
  * that are due to instruction intercepts as well as MSR and IOIO intercepts
  * and exceptions caused by INT3, INTO and BOUND instructions.
  *
  * Return 1 if the nRIP is valid for 'exitcode' and 0 otherwise.
  */
 static int
 nrip_valid(uint64_t exitcode)
 {
 
 	/* 0x00-0x3F: reads and writes of CR0-CR15 and DR0-DR15 */
 	if (exitcode <= 0x3F)
 		return (1);
 
 	/* 0x43 INT3, 0x44 INTO, 0x45 BOUND */
 	if (exitcode >= 0x43 && exitcode <= 0x45)
 		return (1);
 
 	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
 	if (exitcode >= 0x65 && exitcode <= 0x7C)
 		return (1);
 
 	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
 	if (exitcode >= 0x80 && exitcode <= 0x8D)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Decode and process the #VMEXIT recorded in the vcpu's VMCB.
  *
  * 'vmexit' is filled in with the guest %rip, the instruction length (when
  * the nRIP is valid for the exit code) and, for exits that are not
  * completed here, an exit code and collateral for further processing.
  *
  * Returns 'handled': non-zero if the exit was fully handled, in which case
  * the guest %rip has been advanced past the intercepted instruction, and
  * zero if the exit has to be processed further by the caller.
  */
 static int
 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
 {
 	struct vmcb *vmcb;
 	struct vmcb_state *state;
 	struct vmcb_ctrl *ctrl;
 	struct svm_regctx *ctx;
 	uint64_t code, info1, info2, val;
 	uint32_t eax, ecx, edx;
 	int error, errcode_valid, handled, idtvec, reflect;
 	bool retu;
 
 	ctx = svm_get_guest_regctx(svm_sc, vcpu);
 	vmcb = svm_get_vmcb(svm_sc, vcpu);
 	state = &vmcb->state;
 	ctrl = &vmcb->ctrl;
 
 	handled = 0;
 	code = ctrl->exitcode;
 	info1 = ctrl->exitinfo1;
 	info2 = ctrl->exitinfo2;
 
 	vmexit->exitcode = VM_EXITCODE_BOGUS;
 	vmexit->rip = state->rip;
 	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
 
 	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
 
 	/*
 	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
 	 * in an inconsistent state and can trigger assertions that would
 	 * never happen otherwise.
 	 */
 	if (code == VMCB_EXIT_INVALID) {
 		vm_exit_svm(vmexit, code, info1, info2);
 		return (0);
 	}
 
 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
 	    "injection valid bit is set %#lx", __func__, ctrl->eventinj));
 
 	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
 	    ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
 	    vmexit->inst_length, code, info1, info2));
 
 	svm_update_virqinfo(svm_sc, vcpu);
 	svm_save_intinfo(svm_sc, vcpu);
 
 	switch (code) {
 	case VMCB_EXIT_IRET:
 		/*
 		 * Restart execution at "iret" but with the intercept cleared.
 		 */
 		vmexit->inst_length = 0;
 		clear_nmi_blocking(svm_sc, vcpu);
 		handled = 1;
 		break;
 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
 		handled = 1;
 		break;
 	case VMCB_EXIT_INTR:	/* external interrupt */
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
 		handled = 1;
 		break;
 	case VMCB_EXIT_NMI:	/* external NMI */
 		handled = 1;
 		break;
 	case 0x40 ... 0x5F:
 		/* Exception intercepts: the IDT vector is 'code - 0x40'. */
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
 		reflect = 1;
 		idtvec = code - 0x40;
 		switch (idtvec) {
 		case IDT_MC:
 			/*
 			 * Call the machine check handler by hand. Also don't
 			 * reflect the machine check back into the guest.
 			 */
 			reflect = 0;
 			VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler");
 			__asm __volatile("int $18");
 			break;
 		case IDT_PF:
 			/* EXITINFO2 is stored as the guest's %cr2. */
 			error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
 			    info2);
 			KASSERT(error == 0, ("%s: error %d updating cr2",
 			    __func__, error));
 			/* fallthru */
 		case IDT_NP:
 		case IDT_SS:
 		case IDT_GP:
 		case IDT_AC:
 		case IDT_TS:
 			/* EXITINFO1 is reflected as the error code. */
 			errcode_valid = 1;
 			break;
 
 		case IDT_DF:
 			errcode_valid = 1;
 			info1 = 0;
 			break;
 
 		case IDT_BP:
 		case IDT_OF:
 		case IDT_BR:
 			/*
 			 * The 'nrip' field is populated for INT3, INTO and
 			 * BOUND exceptions and this also implies that
 			 * 'inst_length' is non-zero.
 			 *
 			 * Reset 'inst_length' to zero so the guest %rip at
 			 * event injection is identical to what it was when
 			 * the exception originally happened.
 			 */
 			VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d "
 			    "to zero before injecting exception %d",
 			    vmexit->inst_length, idtvec);
 			vmexit->inst_length = 0;
 			/* fallthru */
 		default:
 			errcode_valid = 0;
 			info1 = 0;
 			break;
 		}
 		KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
 		    "when reflecting exception %d into guest",
 		    vmexit->inst_length, idtvec));
 
 		if (reflect) {
 			/* Reflect the exception back into the guest */
 			VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
 			    "%d/%#x into the guest", idtvec, (int)info1);
 			error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
 			    errcode_valid, info1, 0);
 			KASSERT(error == 0, ("%s: vm_inject_exception error %d",
 			    __func__, error));
 		}
 		handled = 1;
 		break;
 	case VMCB_EXIT_MSR:	/* MSR access. */
 		eax = state->rax;
 		ecx = ctx->sctx_rcx;
 		edx = ctx->sctx_rdx;
 		retu = false;
 
 		/* A non-zero EXITINFO1 is treated as WRMSR, zero as RDMSR. */
 		if (info1) {
 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
 			val = (uint64_t)edx << 32 | eax;
 			VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
 			    ecx, val);
 			if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) {
 				vmexit->exitcode = VM_EXITCODE_WRMSR;
 				vmexit->u.msr.code = ecx;
 				vmexit->u.msr.wval = val;
 			} else if (!retu) {
 				handled = 1;
 			} else {
 				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 				    ("emulate_wrmsr retu with bogus exitcode"));
 			}
 		} else {
 			VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
 			if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) {
 				vmexit->exitcode = VM_EXITCODE_RDMSR;
 				vmexit->u.msr.code = ecx;
 			} else if (!retu) {
 				handled = 1;
 			} else {
 				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 				    ("emulate_rdmsr retu with bogus exitcode"));
 			}
 		}
 		break;
 	case VMCB_EXIT_IO:
 		handled = svm_handle_io(svm_sc, vcpu, vmexit);
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
 		break;
 	case VMCB_EXIT_CPUID:
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
 		handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
 		    (uint32_t *)&state->rax,
 		    (uint32_t *)&ctx->sctx_rbx,
 		    (uint32_t *)&ctx->sctx_rcx,
 		    (uint32_t *)&ctx->sctx_rdx);
 		if (handled) {
 			/*
 			 * The emulation is handed 32-bit views of the
 			 * registers and therefore can only update their low
 			 * halves.  The CPUID instruction zero-extends its
 			 * results into the 64-bit registers, so clear the
 			 * upper halves to avoid exposing stale bits.
 			 */
 			state->rax &= 0xffffffff;
 			ctx->sctx_rbx &= 0xffffffff;
 			ctx->sctx_rcx &= 0xffffffff;
 			ctx->sctx_rdx &= 0xffffffff;
 		}
 		break;
 	case VMCB_EXIT_HLT:
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
 		vmexit->exitcode = VM_EXITCODE_HLT;
 		vmexit->u.hlt.rflags = state->rflags;
 		break;
 	case VMCB_EXIT_PAUSE:
 		vmexit->exitcode = VM_EXITCODE_PAUSE;
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
 		break;
 	case VMCB_EXIT_NPF:
 		/* EXITINFO2 contains the faulting guest physical address */
 		if (info1 & VMCB_NPF_INFO1_RSV) {
 			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
 			    "reserved bits set: info1(%#lx) info2(%#lx)",
 			    info1, info2);
 		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->u.paging.gpa = info2;
 			vmexit->u.paging.fault_type = npf_fault_type(info1);
 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
 			VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
 			    "on gpa %#lx/%#lx at rip %#lx",
 			    info2, info1, state->rip);
 		} else if (svm_npf_emul_fault(info1)) {
 			svm_handle_inst_emul(vmcb, info2, vmexit);
 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
 			VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
 			    "for gpa %#lx/%#lx at rip %#lx",
 			    info2, info1, state->rip);
 		}
 		break;
 	case VMCB_EXIT_MONITOR:
 		vmexit->exitcode = VM_EXITCODE_MONITOR;
 		break;
 	case VMCB_EXIT_MWAIT:
 		vmexit->exitcode = VM_EXITCODE_MWAIT;
 		break;
 	default:
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
 		break;
 	}
 
 	VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d",
 	    handled ? "handled" : "unhandled", exit_reason_to_str(code),
 	    vmexit->rip, vmexit->inst_length);
 
 	if (handled) {
 		/* Step the guest past the intercepted instruction. */
 		vmexit->rip += vmexit->inst_length;
 		vmexit->inst_length = 0;
 		state->rip = vmexit->rip;
 	} else {
 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
 			/*
 			 * If this VM exit was not claimed by anybody then
 			 * treat it as a generic SVM exit.
 			 */
 			vm_exit_svm(vmexit, code, info1, info2);
 		} else {
 			/*
 			 * The exitcode and collateral have been populated.
 			 * The VM exit will be processed further in userland.
 			 */
 		}
 	}
 	return (handled);
 }
 
 /*
  * If vm_entry_intinfo() reports an event that is pending for the vcpu,
  * inject it through the VMCB and account for it.  Nothing is done when no
  * such event exists.
  */
 static void
 svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
 {
 	uint64_t intinfo;
 
 	if (vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) {
 		KASSERT(VMCB_EXITINTINFO_VALID(intinfo),
 		    ("%s: entry intinfo is not valid: %#lx", __func__,
 		    intinfo));
 
 		/* Unpack the type, vector and error code from 'intinfo'. */
 		svm_eventinject(svm_sc, vcpu,
 		    VMCB_EXITINTINFO_TYPE(intinfo),
 		    VMCB_EXITINTINFO_VECTOR(intinfo),
 		    VMCB_EXITINTINFO_EC(intinfo),
 		    VMCB_EXITINTINFO_EC_VALID(intinfo));
 		vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
 		VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx",
 		    intinfo);
 	}
 }
 
 /*
  * Inject pending events into the virtual cpu before it is run.
  *
  * Events are considered in this order: a pending entry intinfo event, a
  * pending NMI, and then an interrupt from either the legacy PIC (ExtINT)
  * or the local APIC.  At most one event can be placed in the VMCB EVENTINJ
  * field; whenever something remains to be delivered later, interrupt
  * window exiting is requested so the hypervisor regains control once the
  * guest can accept it.
  *
  * The VMCB V_TPR is also refreshed from the vlapic on every call.
  */
 static void
 svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
 {
 	struct vmcb_ctrl *ctrl;
 	struct vmcb_state *state;
 	struct svm_vcpu *vcpustate;
 	uint8_t v_tpr;
 	int vector, need_intr_window;
 	int extint_pending;
 
 	state = svm_get_vmcb_state(sc, vcpu);
 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
 	vcpustate = svm_get_vcpu(sc, vcpu);
 
 	need_intr_window = 0;
 
 	/*
 	 * The interrupt shadow is dropped if the guest %rip is not the one
 	 * recorded in 'nextrip'.
 	 */
 	if (vcpustate->nextrip != state->rip) {
 		ctrl->intr_shadow = 0;
 		VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking "
 		    "cleared due to rip change: %#lx/%#lx",
 		    vcpustate->nextrip, state->rip);
 	}
 
 	/*
 	 * Inject pending events or exceptions for this vcpu.
 	 *
 	 * An event might be pending because the previous #VMEXIT happened
 	 * during event delivery (i.e. ctrl->exitintinfo).
 	 *
 	 * An event might also be pending because an exception was injected
 	 * by the hypervisor (e.g. #PF during instruction emulation).
 	 */
 	svm_inj_intinfo(sc, vcpu);
 
 	/* NMI event has priority over interrupts. */
 	if (vm_nmi_pending(sc->vm, vcpu)) {
 		if (nmi_blocked(sc, vcpu)) {
 			/*
 			 * Can't inject another NMI if the guest has not
 			 * yet executed an "iret" after the last NMI.
 			 */
 			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
 			    "to NMI-blocking");
 		} else if (ctrl->intr_shadow) {
 			/*
 			 * Can't inject an NMI if the vcpu is in an intr_shadow.
 			 */
 			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to "
 			    "interrupt shadow");
 			need_intr_window = 1;
 			goto done;
 		} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
 			/*
 			 * If there is already an exception/interrupt pending
 			 * then defer the NMI until after that.
 			 */
 			VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
 			    "eventinj %#lx", ctrl->eventinj);
 
 			/*
 			 * Use self-IPI to trigger a VM-exit as soon as
 			 * possible after the event injection is completed.
 			 *
 			 * This works only if the external interrupt exiting
 			 * is at a lower priority than the event injection.
 			 *
 			 * Although not explicitly specified in APMv2 the
 			 * relative priorities were verified empirically.
 			 */
 			ipi_cpu(curcpu, IPI_AST);	/* XXX vmm_ipinum? */
 		} else {
 			vm_nmi_clear(sc->vm, vcpu);
 
 			/* Inject NMI, vector number is not used */
 			svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
 			    IDT_NMI, 0, false);
 
 			/* virtual NMI blocking is now in effect */
 			enable_nmi_blocking(sc, vcpu);
 
 			VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
 		}
 	}
 
 	/*
 	 * Pick the interrupt source: the local APIC unless an ExtINT is
 	 * pending, in which case the legacy PIC supplies the vector.
 	 */
 	extint_pending = vm_extint_pending(sc->vm, vcpu);
 	if (!extint_pending) {
 		if (!vlapic_pending_intr(vlapic, &vector))
 			goto done;
 		KASSERT(vector >= 16 && vector <= 255,
 		    ("invalid vector %d from local APIC", vector));
 	} else {
 		/* Ask the legacy pic for a vector to inject */
 		vatpic_pending_intr(sc->vm, &vector);
 		KASSERT(vector >= 0 && vector <= 255,
 		    ("invalid vector %d from INTR", vector));
 	}
 
 	/*
 	 * If the guest has disabled interrupts or is in an interrupt shadow
 	 * then we cannot inject the pending interrupt.
 	 */
 	if ((state->rflags & PSL_I) == 0) {
 		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
 		    "rflags %#lx", vector, state->rflags);
 		need_intr_window = 1;
 		goto done;
 	}
 
 	if (ctrl->intr_shadow) {
 		VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
 		    "interrupt shadow", vector);
 		need_intr_window = 1;
 		goto done;
 	}
 
 	/* EVENTINJ is already occupied by an earlier event. */
 	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
 		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
 		    "eventinj %#lx", vector, ctrl->eventinj);
 		need_intr_window = 1;
 		goto done;
 	}
 
 	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);
 
 	/* Tell the source of the vector that it has been accepted. */
 	if (!extint_pending) {
 		vlapic_intr_accepted(vlapic, vector);
 	} else {
 		vm_extint_clear(sc->vm, vcpu);
 		vatpic_intr_accepted(sc->vm, vector);
 	}
 
 	/*
 	 * Force a VM-exit as soon as the vcpu is ready to accept another
 	 * interrupt. This is done because the PIC might have another vector
 	 * that it wants to inject. Also, if the APIC has a pending interrupt
 	 * that was preempted by the ExtInt then it allows us to inject the
 	 * APIC vector as soon as possible.
 	 */
 	need_intr_window = 1;
 done:
 	/*
 	 * The guest can modify the TPR by writing to %CR8. In guest mode
 	 * the processor reflects this write to V_TPR without hypervisor
 	 * intervention.
 	 *
 	 * The guest can also modify the TPR by writing to it via the memory
 	 * mapped APIC page. In this case, the write will be emulated by the
 	 * hypervisor. For this reason V_TPR must be updated before every
 	 * VMRUN.
 	 */
 	v_tpr = vlapic_get_cr8(vlapic);
 	KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
 	if (ctrl->v_tpr != v_tpr) {
 		VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x",
 		    ctrl->v_tpr, v_tpr);
 		ctrl->v_tpr = v_tpr;
 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
 	}
 
 	if (need_intr_window) {
 		/*
 		 * We use V_IRQ in conjunction with the VINTR intercept to
 		 * trap into the hypervisor as soon as a virtual interrupt
 		 * can be delivered.
 		 *
 		 * Since injected events are not subject to intercept checks
 		 * we need to ensure that the V_IRQ is not actually going to
 		 * be delivered on VM entry. The KASSERT below enforces this.
 		 */
 		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
 		    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
 		    ("Bogus intr_window_exiting: eventinj (%#lx), "
 		    "intr_shadow (%u), rflags (%#lx)",
 		    ctrl->eventinj, ctrl->intr_shadow, state->rflags));
 		enable_intr_window_exiting(sc, vcpu);
 	} else {
 		disable_intr_window_exiting(sc, vcpu);
 	}
 }
 
 /*
  * Reload the host task register.
  *
  * The host TSS descriptor was in use before the guest was launched and is
  * therefore marked busy, but 'ltr' only accepts a descriptor that is
  * marked available.  Rewrite the descriptor type to "64-bit available
  * TSS" first and then load the selector.
  */
 static __inline void
 restore_host_tss(void)
 {
 	struct system_segment_descriptor *desc = PCPU_GET(tss);
 
 	desc->sd_type = SDT_SYSTSS;
 	ltr(GSEL(GPROC0_SEL, SEL_KPL));
 }
 
 /*
  * Make sure the vcpu does not run with stale TLB entries by validating its
  * ASID against the host cpu's ASID generation and the nested pmap's
  * generation, allocating a new ASID and/or requesting a TLB flush from
  * VMRUN as needed.
  *
  * The TLB entries tagged with the vcpu's ASID are not valid if:
  *
  * 1. The vcpu's ASID generation differs from the host cpu's ASID
  *    generation.  This happens when the vcpu migrates to a new host cpu,
  *    or when more vcpus execute on a host cpu than there are ASIDs.
  *
  * 2. The pmap generation number differs from the value cached in the
  *    'vcpustate'.  This happens when the host invalidates pages belonging
  *    to the guest.
  *
  *	asidgen		eptgen	      Action
  *	mismatch	mismatch
  *	   0		   0		nothing to do
  *	   0		   1		flush this ASID if the cpu supports
  *					FlushByAsid, else allocate a new ASID
  *	   1		   0		allocate a new ASID
  *	   1		   1		allocate a new ASID
  */
 static void
 check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
 {
 	struct svm_vcpu *vcpustate;
 	struct vmcb_ctrl *ctrl;
 	long eptgen;
 	bool alloc_asid, asidgen_stale, eptgen_stale;
 
 	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
 	    "active on cpu %u", __func__, thiscpu));
 
 	vcpustate = svm_get_vcpu(sc, vcpuid);
 	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
 
 	/* Sample the pmap generation once; it is cached again below. */
 	eptgen = pmap->pm_eptgen;
 	asidgen_stale = (vcpustate->asid.gen != asid[thiscpu].gen);
 	eptgen_stale = (vcpustate->eptgen != eptgen);
 
 	alloc_asid = false;
 	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
 
 	if (asidgen_stale) {
 		alloc_asid = true;
 	} else if (eptgen_stale) {
 		if (flush_by_asid())
 			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
 		else
 			alloc_asid = true;
 	} else {
 		/* The common case: both generations are current. */
 		KASSERT(!alloc_asid, ("ASID allocation not necessary"));
 		KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
 		    ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
 	}
 
 	if (alloc_asid) {
 		asid[thiscpu].num++;
 		if (asid[thiscpu].num >= nasid) {
 			/*
 			 * The ASIDs of this generation are used up: restart
 			 * numbering at 1 in a new generation.  Generation 0
 			 * is skipped on wrap-around.
 			 */
 			asid[thiscpu].num = 1;
 			asid[thiscpu].gen++;
 			if (asid[thiscpu].gen == 0)
 				asid[thiscpu].gen = 1;
 
 			/*
 			 * Without "flush-by-asid" the entire TLB is flushed
 			 * on a generation bump.  Later allocations within
 			 * the same generation then need no TLB flush.
 			 */
 			if (!flush_by_asid())
 				ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
 		}
 		vcpustate->asid.gen = asid[thiscpu].gen;
 		vcpustate->asid.num = asid[thiscpu].num;
 
 		ctrl->asid = vcpustate->asid.num;
 		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
 
 		/*
 		 * With "flush-by-asid" the TLB was not flushed on the
 		 * generation bump; instead it is flushed selectively after
 		 * every new ASID allocation.
 		 */
 		if (flush_by_asid())
 			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
 	}
 	vcpustate->eptgen = eptgen;
 
 	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
 	KASSERT(ctrl->asid == vcpustate->asid.num,
 	    ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
 }
 
 /*
  * Execute the SVM "clgi" instruction.  svm_vmrun() uses this to disable
  * global interrupts while guest state is being loaded.
  */
 static __inline void
 disable_gintr(void)
 {
 
 	__asm __volatile("clgi");
 }
 
 /*
  * Execute the SVM "stgi" instruction, the counterpart of disable_gintr().
  * svm_vmrun() uses this to re-enable global interrupts.
  */
 static __inline void
 enable_gintr(void)
 {
 
 	__asm __volatile("stgi");
 }
 
 /*
  * Switch the debug registers from host to guest values before entering
  * the guest.  The host values are stashed in 'gctx' (host_dr*,
  * host_debugctl) and DR0-DR3 are loaded from the guest values kept in
  * 'gctx' (sctx_dr0-sctx_dr3).
  */
 static __inline void
 svm_dr_enter_guest(struct svm_regctx *gctx)
 {
 
 	/* Save host control debug registers. */
 	gctx->host_dr7 = rdr7();
 	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
 
 	/*
 	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
 	 * exceptions in the host based on the guest DRx values.  The
 	 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
 	 * VMCB.
 	 */
 	load_dr7(0);
 	wrmsr(MSR_DEBUGCTLMSR, 0);
 
 	/* Save host debug registers. */
 	gctx->host_dr0 = rdr0();
 	gctx->host_dr1 = rdr1();
 	gctx->host_dr2 = rdr2();
 	gctx->host_dr3 = rdr3();
 	gctx->host_dr6 = rdr6();
 
 	/* Restore guest debug registers. */
 	load_dr0(gctx->sctx_dr0);
 	load_dr1(gctx->sctx_dr1);
 	load_dr2(gctx->sctx_dr2);
 	load_dr3(gctx->sctx_dr3);
 }
 
 /*
  * Switch the debug registers back from guest to host values after leaving
  * the guest; the inverse of svm_dr_enter_guest().  The guest DR0-DR3 are
  * saved in 'gctx' and the host values saved there earlier are reloaded.
  */
 static __inline void
 svm_dr_leave_guest(struct svm_regctx *gctx)
 {
 
 	/* Save guest debug registers. */
 	gctx->sctx_dr0 = rdr0();
 	gctx->sctx_dr1 = rdr1();
 	gctx->sctx_dr2 = rdr2();
 	gctx->sctx_dr3 = rdr3();
 
 	/*
 	 * Restore host debug registers.  Restore DR7 and DEBUGCTL
 	 * last.
 	 */
 	load_dr0(gctx->host_dr0);
 	load_dr1(gctx->host_dr1);
 	load_dr2(gctx->host_dr2);
 	load_dr3(gctx->host_dr3);
 	load_dr6(gctx->host_dr6);
 	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
 	load_dr7(gctx->host_dr7);
 }
 
 /*
  * Start vcpu with specified RIP.
  */
 static int
 svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, 
 	struct vm_eventinfo *evinfo)
 {
 	struct svm_regctx *gctx;
 	struct svm_softc *svm_sc;
 	struct svm_vcpu *vcpustate;
 	struct vmcb_state *state;
 	struct vmcb_ctrl *ctrl;
 	struct vm_exit *vmexit;
 	struct vlapic *vlapic;
 	struct vm *vm;
 	uint64_t vmcb_pa;
 	int handled;
 	uint16_t ldt_sel;
 
 	svm_sc = arg;
 	vm = svm_sc->vm;
 
 	vcpustate = svm_get_vcpu(svm_sc, vcpu);
 	state = svm_get_vmcb_state(svm_sc, vcpu);
 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
 	vmexit = vm_exitinfo(vm, vcpu);
 	vlapic = vm_lapic(vm, vcpu);
 
 	gctx = svm_get_guest_regctx(svm_sc, vcpu);
 	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
 
 	if (vcpustate->lastcpu != curcpu) {
 		/*
 		 * Force new ASID allocation by invalidating the generation.
 		 */
 		vcpustate->asid.gen = 0;
 
 		/*
 		 * Invalidate the VMCB state cache by marking all fields dirty.
 		 */
 		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
 
 		/*
 		 * XXX
 		 * Setting 'vcpustate->lastcpu' here is bit premature because
 		 * we may return from this function without actually executing
 		 * the VMRUN  instruction. This could happen if a rendezvous
 		 * or an AST is pending on the first time through the loop.
 		 *
 		 * This works for now but any new side-effects of vcpu
 		 * migration should take this case into account.
 		 */
 		vcpustate->lastcpu = curcpu;
 		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
 	}
 
 	svm_msr_guest_enter(svm_sc, vcpu);
 
 	/* Update Guest RIP */
 	state->rip = rip;
 
 	do {
 		/*
 		 * Disable global interrupts to guarantee atomicity during
 		 * loading of guest state. This includes not only the state
 		 * loaded by the "vmrun" instruction but also software state
 		 * maintained by the hypervisor: suspended and rendezvous
 		 * state, NPT generation number, vlapic interrupts etc.
 		 */
 		disable_gintr();
 
 		if (vcpu_suspended(evinfo)) {
 			enable_gintr();
 			vm_exit_suspended(vm, vcpu, state->rip);
 			break;
 		}
 
 		if (vcpu_rendezvous_pending(evinfo)) {
 			enable_gintr();
 			vm_exit_rendezvous(vm, vcpu, state->rip);
 			break;
 		}
 
 		if (vcpu_reqidle(evinfo)) {
 			enable_gintr();
 			vm_exit_reqidle(vm, vcpu, state->rip);
 			break;
 		}
 
 		/* We are asked to give the cpu by scheduler. */
 		if (vcpu_should_yield(vm, vcpu)) {
 			enable_gintr();
 			vm_exit_astpending(vm, vcpu, state->rip);
 			break;
 		}
 
 		if (vcpu_debugged(vm, vcpu)) {
 			enable_gintr();
 			vm_exit_debug(vm, vcpu, state->rip);
 			break;
 		}
 
 		/*
 		 * #VMEXIT resumes the host with the guest LDTR, so
 		 * save the current LDT selector so it can be restored
 		 * after an exit.  The userspace hypervisor probably
 		 * doesn't use a LDT, but save and restore it to be
 		 * safe.
 		 */
 		ldt_sel = sldt();
 
 		svm_inj_interrupts(svm_sc, vcpu, vlapic);
 
 		/* Activate the nested pmap on 'curcpu' */
 		CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active);
 
 		/*
 		 * Check the pmap generation and the ASID generation to
 		 * ensure that the vcpu does not use stale TLB mappings.
 		 */
 		check_asid(svm_sc, vcpu, pmap, curcpu);
 
 		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
 		vcpustate->dirty = 0;
 		VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
 
 		/* Launch Virtual Machine. */
 		VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
 		svm_dr_enter_guest(gctx);
 		svm_launch(vmcb_pa, gctx, get_pcpu());
 		svm_dr_leave_guest(gctx);
 
 		CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
 
 		/*
 		 * The host GDTR and IDTR are saved by VMRUN and restored
 		 * automatically on #VMEXIT. However, the host TSS needs
 		 * to be restored explicitly.
 		 */
 		restore_host_tss();
 
 		/* Restore host LDTR. */
 		lldt(ldt_sel);
 
 		/* #VMEXIT disables interrupts so re-enable them here. */ 
 		enable_gintr();
 
 		/* Update 'nextrip' */
 		vcpustate->nextrip = state->rip;
 
 		/* Handle #VMEXIT and if required return to user space. */
 		handled = svm_vmexit(svm_sc, vcpu, vmexit);
 	} while (handled);
 
 	svm_msr_guest_exit(svm_sc, vcpu);
 
 	return (0);
 }
 
 /*
  * Tear down the per-VM SVM state.
  *
  * 'arg' is the struct svm_softc of the VM.  Its iopm_bitmap and msr_bitmap
  * buffers are released with contigfree() and then the softc itself is
  * freed.
  */
 static void
 svm_vmcleanup(void *arg)
 {
 	struct svm_softc *svm_sc;
 
 	svm_sc = arg;
 
 	contigfree(svm_sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM);
 	contigfree(svm_sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM);
 	free(svm_sc, M_SVM);
 }
 
 /*
  * Map a VM_REG_GUEST_* identifier to the field of 'regctx' that stores
  * that register.
  *
  * Returns a pointer to the matching sctx_* field, or NULL when 'reg' has
  * no field in struct svm_regctx.
  */
 static register_t *
 swctx_regptr(struct svm_regctx *regctx, int reg)
 {
 	register_t *regp;
 
 	switch (reg) {
 	case VM_REG_GUEST_RBX:
 		regp = &regctx->sctx_rbx;
 		break;
 	case VM_REG_GUEST_RCX:
 		regp = &regctx->sctx_rcx;
 		break;
 	case VM_REG_GUEST_RDX:
 		regp = &regctx->sctx_rdx;
 		break;
 	case VM_REG_GUEST_RDI:
 		regp = &regctx->sctx_rdi;
 		break;
 	case VM_REG_GUEST_RSI:
 		regp = &regctx->sctx_rsi;
 		break;
 	case VM_REG_GUEST_RBP:
 		regp = &regctx->sctx_rbp;
 		break;
 	case VM_REG_GUEST_R8:
 		regp = &regctx->sctx_r8;
 		break;
 	case VM_REG_GUEST_R9:
 		regp = &regctx->sctx_r9;
 		break;
 	case VM_REG_GUEST_R10:
 		regp = &regctx->sctx_r10;
 		break;
 	case VM_REG_GUEST_R11:
 		regp = &regctx->sctx_r11;
 		break;
 	case VM_REG_GUEST_R12:
 		regp = &regctx->sctx_r12;
 		break;
 	case VM_REG_GUEST_R13:
 		regp = &regctx->sctx_r13;
 		break;
 	case VM_REG_GUEST_R14:
 		regp = &regctx->sctx_r14;
 		break;
 	case VM_REG_GUEST_R15:
 		regp = &regctx->sctx_r15;
 		break;
 	case VM_REG_GUEST_DR0:
 		regp = &regctx->sctx_dr0;
 		break;
 	case VM_REG_GUEST_DR1:
 		regp = &regctx->sctx_dr1;
 		break;
 	case VM_REG_GUEST_DR2:
 		regp = &regctx->sctx_dr2;
 		break;
 	case VM_REG_GUEST_DR3:
 		regp = &regctx->sctx_dr3;
 		break;
 	default:
 		regp = NULL;
 		break;
 	}
 
 	return (regp);
 }
 
 /*
  * Read guest register 'ident' of 'vcpu' into '*val'.
  *
  * The lookup is tried in this order: the interrupt shadow pseudo-register,
  * the VMCB (vmcb_read), and finally the software register context
  * (swctx_regptr).
  *
  * Returns 0 on success, EINVAL when no source knows the register, or
  * whatever svm_get_intr_shadow() returns for VM_REG_GUEST_INTR_SHADOW.
  */
 static int
 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
 {
 	struct svm_softc *sc = arg;
 	register_t *regp;
 
 	if (ident == VM_REG_GUEST_INTR_SHADOW)
 		return (svm_get_intr_shadow(sc, vcpu, val));
 
 	if (vmcb_read(sc, vcpu, ident, val) == 0)
 		return (0);
 
 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
 	if (regp == NULL) {
 		VCPU_CTR1(sc->vm, vcpu, "svm_getreg: unknown register %#x",
 		    ident);
 		return (EINVAL);
 	}
 
 	*val = *regp;
 	return (0);
 }
 
 /*
  * Write 'val' to guest register 'ident' of 'vcpu'.
  *
  * The destinations are tried in this order: the interrupt shadow
  * pseudo-register, the VMCB (vmcb_write), and the software register
  * context (swctx_regptr).  A write to VM_REG_GUEST_ENTRY_INST_LENGTH that
  * none of those accepted is silently ignored.
  *
  * Returns 0 on success, EINVAL for an unknown register, or whatever
  * svm_modify_intr_shadow() returns for VM_REG_GUEST_INTR_SHADOW.
  */
 static int
 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
 {
 	struct svm_softc *sc = arg;
 	register_t *regp;
 
 	if (ident == VM_REG_GUEST_INTR_SHADOW)
 		return (svm_modify_intr_shadow(sc, vcpu, val));
 
 	if (vmcb_write(sc, vcpu, ident, val) == 0)
 		return (0);
 
 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
 	if (regp != NULL) {
 		*regp = val;
 		return (0);
 	}
 
 	/* Accepted but deliberately not stored anywhere. */
 	if (ident == VM_REG_GUEST_ENTRY_INST_LENGTH)
 		return (0);
 
 	/*
 	 * XXX deal with CR3 and invalidate TLB entries tagged with the
 	 * vcpu's ASID. This needs to be treated differently depending on
 	 * whether 'running' is true/false.
 	 */
 
 	VCPU_CTR1(sc->vm, vcpu, "svm_setreg: unknown register %#x", ident);
 	return (EINVAL);
 }
 
 /*
  * Set capability 'type' of 'vcpu' to 'val'.
  *
  * VM_CAP_HALT_EXIT and VM_CAP_PAUSE_EXIT pass 'val' to svm_set_intercept()
  * for the HLT and PAUSE intercepts respectively.
  * VM_CAP_UNRESTRICTED_GUEST accepts any non-zero 'val' and rejects 0.
  *
  * Returns 0 on success, EINVAL for an attempt to clear
  * VM_CAP_UNRESTRICTED_GUEST, and ENOENT for any other capability type.
  */
 static int
 svm_setcap(void *arg, int vcpu, int type, int val)
 {
 	struct svm_softc *svm_sc = arg;
 
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		svm_set_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_HLT, val);
 		return (0);
 	case VM_CAP_PAUSE_EXIT:
 		svm_set_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_PAUSE, val);
 		return (0);
 	case VM_CAP_UNRESTRICTED_GUEST:
 		/* Unrestricted guest execution cannot be disabled in SVM */
 		return (val == 0 ? EINVAL : 0);
 	default:
 		return (ENOENT);
 	}
 }
 
 /*
  * Report the current value of capability 'type' of 'vcpu' in '*retval'.
  *
  * VM_CAP_HALT_EXIT and VM_CAP_PAUSE_EXIT report what svm_get_intercept()
  * returns for the HLT and PAUSE intercepts; VM_CAP_UNRESTRICTED_GUEST
  * always reports 1.
  *
  * Returns 0 on success and ENOENT for any other capability type, in which
  * case '*retval' is left untouched.
  */
 static int
 svm_getcap(void *arg, int vcpu, int type, int *retval)
 {
 	struct svm_softc *svm_sc = arg;
 
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		*retval = svm_get_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_HLT);
 		return (0);
 	case VM_CAP_PAUSE_EXIT:
 		*retval = svm_get_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
 		    VMCB_INTCPT_PAUSE);
 		return (0);
 	case VM_CAP_UNRESTRICTED_GUEST:
 		/* unrestricted guest is always enabled */
 		*retval = 1;
 		return (0);
 	default:
 		return (ENOENT);
 	}
 }
 
 /*
  * Allocate and initialize the vlapic of vcpu 'vcpuid'.
  *
  * The zeroed vlapic is linked to the VM, given its vcpu id and pointed at
  * the vcpu's entry in the softc's apic_page array before vlapic_init() is
  * called on it.  The allocation uses M_WAITOK.
  */
 static struct vlapic *
 svm_vlapic_init(void *arg, int vcpuid)
 {
 	struct svm_softc *sc = arg;
 	struct vlapic *vl;
 
 	vl = malloc(sizeof(*vl), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
 
 	vl->vm = sc->vm;
 	vl->vcpuid = vcpuid;
 	vl->apic_page = (struct LAPIC *)&sc->apic_page[vcpuid];
 
 	vlapic_init(vl);
 
 	return (vl);
 }
 
 /*
  * Undo svm_vlapic_init(): run vlapic_cleanup() on 'vlapic' and release the
  * memory that was allocated for it.  'arg' is not used.
  */
 static void
 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
 {
 
 	vlapic_cleanup(vlapic);
 	free(vlapic, M_SVM_VLAPIC);
 }
 
 /*
  * Operations table for the AMD SVM backend.  Each member of struct vmm_ops
  * is bound to the svm_* / vmcb_* function that implements it.
  */
 struct vmm_ops vmm_ops_amd = {
 	.init		= svm_init,
 	.cleanup	= svm_cleanup,
 	.resume		= svm_restore,
 	.vminit		= svm_vminit,
 	.vmrun		= svm_vmrun,
 	.vmcleanup	= svm_vmcleanup,
 	.vmgetreg	= svm_getreg,
 	.vmsetreg	= svm_setreg,
 	.vmgetdesc	= vmcb_getdesc,
 	.vmsetdesc	= vmcb_setdesc,
 	.vmgetcap	= svm_getcap,
 	.vmsetcap	= svm_setcap,
 	.vmspace_alloc	= svm_npt_alloc,
 	.vmspace_free	= svm_npt_free,
 	.vlapic_init	= svm_vlapic_init,
 	.vlapic_cleanup	= svm_vlapic_cleanup,
 };
Index: head/sys/amd64/vmm/intel/ept.c
===================================================================
--- head/sys/amd64/vmm/intel/ept.c	(revision 357973)
+++ head/sys/amd64/vmm/intel/ept.c	(revision 357974)
@@ -1,207 +1,208 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 
 #include <machine/vmm.h>
 
 #include "vmx_cpufunc.h"
 #include "ept.h"
 
 /*
  * Single-bit tests on a capability word.  ept_init() applies them to the
  * value it reads from MSR_VMX_EPT_VPID_CAP.  Each macro evaluates to a
  * non-zero value when the corresponding bit is set.
  */
 #define	EPT_SUPPORTS_EXEC_ONLY(cap)	((cap) & (1UL << 0))
 #define	EPT_PWL4(cap)			((cap) & (1UL << 6))
 #define	EPT_MEMORY_TYPE_WB(cap)		((cap) & (1UL << 14))
 #define	EPT_PDE_SUPERPAGE(cap)		((cap) & (1UL << 16))	/* 2MB pages */
 #define	EPT_PDPTE_SUPERPAGE(cap)	((cap) & (1UL << 17))	/* 1GB pages */
 #define	INVEPT_SUPPORTED(cap)		((cap) & (1UL << 20))
 #define	AD_BITS_SUPPORTED(cap)		((cap) & (1UL << 21))
 #define	INVVPID_SUPPORTED(cap)		((cap) & (1UL << 32))
 
 /* True only when every bit of INVVPID_ALL_TYPES_MASK is set in 'cap'. */
 #define	INVVPID_ALL_TYPES_MASK		0xF0000000000UL
 #define	INVVPID_ALL_TYPES_SUPPORTED(cap)	\
 	(((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
 
 /* True only when every bit of INVEPT_ALL_TYPES_MASK is set in 'cap'. */
 #define	INVEPT_ALL_TYPES_MASK		0x6000000UL
 #define	INVEPT_ALL_TYPES_SUPPORTED(cap)		\
 	(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
 
 /* Both are used by eptp() when it composes the EPT pointer value. */
 #define	EPT_PWLEVELS		4		/* page walk levels */
 #define	EPT_ENABLE_AD_BITS	(1 << 6)
 
 SYSCTL_DECL(_hw_vmm);
-SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    NULL);
 
 static int ept_enable_ad_bits;
 
 static int ept_pmap_flags;
 SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD,
     &ept_pmap_flags, 0, NULL);
 
 /*
  * Probe the EPT/VPID capabilities of the processor and derive the pmap
  * flags used for guest address spaces.
  *
  * 'ipinum' is masked with PMAP_NESTED_IPIMASK and becomes the initial
  * value of ept_pmap_flags.  Superpages, hardware A/D bits and exec-only
  * mappings are each enabled only when the matching hw.vmm.ept.* tunable
  * is non-zero (default 1) and the capability bit is present.
  *
  * Returns 0 on success or EINVAL when a mandatory capability is missing.
  */
 int
 ept_init(int ipinum)
 {
 	int want_superpages = 1, want_hw_ad_bits = 1, want_exec_only = 1;
 	uint64_t ept_cap;
 
 	ept_cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
 
 	/*
 	 * All of the following are mandatory:
 	 * - page walk length is 4 steps
 	 * - extended page tables can be laid out in write-back memory
 	 * - invvpid instruction with all possible types is supported
 	 * - invept instruction with all possible types is supported
 	 */
 	if (!(EPT_PWL4(ept_cap) &&
 	    EPT_MEMORY_TYPE_WB(ept_cap) &&
 	    INVVPID_SUPPORTED(ept_cap) &&
 	    INVVPID_ALL_TYPES_SUPPORTED(ept_cap) &&
 	    INVEPT_SUPPORTED(ept_cap) &&
 	    INVEPT_ALL_TYPES_SUPPORTED(ept_cap)))
 		return (EINVAL);
 
 	ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK;
 
 	/* 2MB superpages. */
 	TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &want_superpages);
 	if (want_superpages && EPT_PDE_SUPERPAGE(ept_cap))
 		ept_pmap_flags |= PMAP_PDE_SUPERPAGE;
 
 	/* Emulate accessed/dirty bits unless the hardware ones are used. */
 	TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &want_hw_ad_bits);
 	if (!want_hw_ad_bits || !AD_BITS_SUPPORTED(ept_cap))
 		ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
 	else
 		ept_enable_ad_bits = 1;
 
 	/* Execute-only mappings. */
 	TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &want_exec_only);
 	if (want_exec_only && EPT_SUPPORTS_EXEC_ONLY(ept_cap))
 		ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
 
 	return (0);
 }
 
 #if 0
 /*
  * Debugging aid, compiled out by the surrounding '#if 0'.
  *
  * Prints the address of the page-table page 'ptp' followed by every
  * non-zero entry among its 512 slots, indented by one tab per level
  * already descended.  For an entry without EPT_PG_SUPERPAGE set, and while
  * levels remain, the next-level page is located through PHYS_TO_DMAP() and
  * dumped recursively.  'nlevels' is the number of levels still to print.
  */
 static void
 ept_dump(uint64_t *ptp, int nlevels)
 {
 	int i, t, tabs;
 	uint64_t *ptpnext, ptpval;
 
 	/* Stop once the requested number of levels has been printed. */
 	if (--nlevels < 0)
 		return;
 
 	tabs = 3 - nlevels;
 	for (t = 0; t < tabs; t++)
 		printf("\t");
 	printf("PTP = %p\n", ptp);
 
 	for (i = 0; i < 512; i++) {
 		ptpval = ptp[i];
 
 		/* Empty slots are not printed. */
 		if (ptpval == 0)
 			continue;
 		
 		for (t = 0; t < tabs; t++)
 			printf("\t");
 		printf("%3d 0x%016lx\n", i, ptpval);
 
 		if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
 			ptpnext = (uint64_t *)
 				  PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
 			ept_dump(ptpnext, nlevels);
 		}
 	}
 }
 #endif
 
 /*
  * Rendezvous action used by ept_invalidate_mappings(): issue a
  * single-context invept using a private copy of the descriptor that 'arg'
  * points to.
  */
 static void
 invept_single_context(void *arg)
 {
 	struct invept_desc desc;
 
 	desc = *(struct invept_desc *)arg;
 	invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
 }
 
 /*
  * Run invept_single_context() through smp_rendezvous() with a descriptor
  * whose 'eptp' member is the given EPT pointer and whose remaining members
  * are zero.
  */
 void
 ept_invalidate_mappings(u_long eptp)
 {
 	struct invept_desc desc = { .eptp = eptp };
 
 	smp_rendezvous(NULL, invept_single_context, NULL, &desc);
 }
 
 /*
  * Initialize 'pmap' as a PT_EPT pmap using the flags that ept_init()
  * stored in ept_pmap_flags.  Returns the result of pmap_pinit_type().
  * ept_vmspace_alloc() hands this function to vmspace_alloc().
  */
 static int
 ept_pinit(pmap_t pmap)
 {
 
 	return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags));
 }
 
 /*
  * Thin wrapper around vmspace_alloc(): forwards 'min' and 'max' and passes
  * ept_pinit as the pinit callback.  Returns whatever vmspace_alloc()
  * returns.
  */
 struct vmspace *
 ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
 {
 
 	return (vmspace_alloc(min, max, ept_pinit));
 }
 
 /*
  * Counterpart of ept_vmspace_alloc(): hands 'vmspace' to vmspace_free().
  */
 void
 ept_vmspace_free(struct vmspace *vmspace)
 {
 
 	vmspace_free(vmspace);
 }
 
 /*
  * Compose an EPT pointer value from 'pml4'.
  *
  * The result is 'pml4' OR-ed with (EPT_PWLEVELS - 1) shifted left by 3
  * and with PAT_WRITE_BACK.  EPT_ENABLE_AD_BITS is added when ept_init()
  * enabled hardware accessed/dirty bits.
  */
 uint64_t
 eptp(uint64_t pml4)
 {
 	uint64_t val;
 
 	val = pml4;
 	val |= (EPT_PWLEVELS - 1) << 3;
 	val |= PAT_WRITE_BACK;
 
 	if (ept_enable_ad_bits)
 		val |= EPT_ENABLE_AD_BITS;
 
 	return (val);
 }
Index: head/sys/amd64/vmm/intel/vmx.c
===================================================================
--- head/sys/amd64/vmm/intel/vmx.c	(revision 357973)
+++ head/sys/amd64/vmm/intel/vmx.c	(revision 357974)
@@ -1,3836 +1,3839 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  * Copyright (c) 2018 Joyent, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/psl.h>
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/reg.h>
 #include <machine/segments.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/vmparam.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
 #include "vmm_lapic.h"
 #include "vmm_host.h"
 #include "vmm_ioport.h"
 #include "vmm_ktr.h"
 #include "vmm_stat.h"
 #include "vatpic.h"
 #include "vlapic.h"
 #include "vlapic_priv.h"
 
 #include "ept.h"
 #include "vmx_cpufunc.h"
 #include "vmx.h"
 #include "vmx_msr.h"
 #include "x86.h"
 #include "vmx_controls.h"
 
 #define	PINBASED_CTLS_ONE_SETTING					\
 	(PINBASED_EXTINT_EXITING	|				\
 	 PINBASED_NMI_EXITING		|				\
 	 PINBASED_VIRTUAL_NMI)
 #define	PINBASED_CTLS_ZERO_SETTING	0
 
 #define PROCBASED_CTLS_WINDOW_SETTING					\
 	(PROCBASED_INT_WINDOW_EXITING	|				\
 	 PROCBASED_NMI_WINDOW_EXITING)
 
 #define	PROCBASED_CTLS_ONE_SETTING					\
 	(PROCBASED_SECONDARY_CONTROLS	|				\
 	 PROCBASED_MWAIT_EXITING	|				\
 	 PROCBASED_MONITOR_EXITING	|				\
 	 PROCBASED_IO_EXITING		|				\
 	 PROCBASED_MSR_BITMAPS		|				\
 	 PROCBASED_CTLS_WINDOW_SETTING	|				\
 	 PROCBASED_CR8_LOAD_EXITING	|				\
 	 PROCBASED_CR8_STORE_EXITING)
 #define	PROCBASED_CTLS_ZERO_SETTING	\
 	(PROCBASED_CR3_LOAD_EXITING |	\
 	PROCBASED_CR3_STORE_EXITING |	\
 	PROCBASED_IO_BITMAPS)
 
 #define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
 #define	PROCBASED_CTLS2_ZERO_SETTING	0
 
 #define	VM_EXIT_CTLS_ONE_SETTING					\
 	(VM_EXIT_SAVE_DEBUG_CONTROLS		|			\
 	VM_EXIT_HOST_LMA			|			\
 	VM_EXIT_SAVE_EFER			|			\
 	VM_EXIT_LOAD_EFER			|			\
 	VM_EXIT_ACKNOWLEDGE_INTERRUPT)
 
 #define	VM_EXIT_CTLS_ZERO_SETTING	0
 
 #define	VM_ENTRY_CTLS_ONE_SETTING					\
 	(VM_ENTRY_LOAD_DEBUG_CONTROLS		|			\
 	VM_ENTRY_LOAD_EFER)
 
 #define	VM_ENTRY_CTLS_ZERO_SETTING					\
 	(VM_ENTRY_INTO_SMM			|			\
 	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
 
 #define	HANDLED		1
 #define	UNHANDLED	0
 
 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
 
 SYSCTL_DECL(_hw_vmm);
-SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    NULL);
 
 int vmxon_enabled[MAXCPU];
 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
 
 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
 static uint32_t exit_ctls, entry_ctls;
 
 static uint64_t cr0_ones_mask, cr0_zeros_mask;
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
 	     &cr0_ones_mask, 0, NULL);
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
 	     &cr0_zeros_mask, 0, NULL);
 
 static uint64_t cr4_ones_mask, cr4_zeros_mask;
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
 	     &cr4_ones_mask, 0, NULL);
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
 	     &cr4_zeros_mask, 0, NULL);
 
 static int vmx_initialized;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
 	   &vmx_initialized, 0, "Intel VMX initialized");
 
 /*
  * Optional capabilities
  */
-static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
+static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap,
+    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    NULL);
 
 static int cap_halt_exit;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
     "HLT triggers a VM-exit");
 
 static int cap_pause_exit;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
     0, "PAUSE triggers a VM-exit");
 
 static int cap_unrestricted_guest;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
     &cap_unrestricted_guest, 0, "Unrestricted guests");
 
 static int cap_monitor_trap;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
     &cap_monitor_trap, 0, "Monitor trap flag");
 
 static int cap_invpcid;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
     0, "Guests are allowed to use INVPCID");
 
 static int virtual_interrupt_delivery;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
 
 static int posted_interrupts;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
     &posted_interrupts, 0, "APICv posted interrupt support");
 
 static int pirvec = -1;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
     &pirvec, 0, "APICv posted interrupt vector");
 
 static struct unrhdr *vpid_unr;
 static u_int vpid_alloc_failed;
 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
 	    &vpid_alloc_failed, 0, NULL);
 
 int guest_l1d_flush;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
     &guest_l1d_flush, 0, NULL);
 int guest_l1d_flush_sw;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD,
     &guest_l1d_flush_sw, 0, NULL);
 
 static struct msr_entry msr_load_list[1] __aligned(16);
 
 /*
  * The definitions of SDT probes for VMX.
  */
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, entry,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch,
     "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *");
 
 SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess,
     "struct vmx *", "int", "struct vm_exit *", "uint64_t");
 
 SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr,
     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
 
 SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr,
     "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, pause,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt,
     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, inout,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE5(vmm, vmx, exit, exception,
     "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int");
 
 SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault,
     "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t");
 
 SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault,
     "struct vmx *", "int", "struct vm_exit *", "uint64_t");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite,
     "struct vmx *", "int", "struct vm_exit *", "struct vlapic *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn,
     "struct vmx *", "int", "struct vm_exit *");
 
 SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown,
     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
 
 SDT_PROBE_DEFINE4(vmm, vmx, exit, return,
     "struct vmx *", "int", "struct vm_exit *", "int");
 
 /*
  * Use the last page below 4GB as the APIC access address. This address is
  * occupied by the boot firmware so it is guaranteed that it will not conflict
  * with a page in system memory.
  */
 #define	APIC_ACCESS_ADDRESS	0xFFFFF000
 
 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
 static void vmx_inject_pir(struct vlapic *vlapic);
 
 #ifdef KTR
 /*
  * Translate a VMX exit reason into a short name.
  *
  * Reasons listed in the table map to a string literal.  Any other value is
  * formatted as a decimal number into a static buffer, so the pointer
  * returned for such a value is overwritten by the next unknown reason.
  */
 static const char *
 exit_reason_to_str(int reason)
 {
 	static const struct {
 		int		code;
 		const char	*name;
 	} reason_names[] = {
 		{ EXIT_REASON_EXCEPTION,	"exception" },
 		{ EXIT_REASON_EXT_INTR,		"extint" },
 		{ EXIT_REASON_TRIPLE_FAULT,	"triplefault" },
 		{ EXIT_REASON_INIT,		"init" },
 		{ EXIT_REASON_SIPI,		"sipi" },
 		{ EXIT_REASON_IO_SMI,		"iosmi" },
 		{ EXIT_REASON_SMI,		"smi" },
 		{ EXIT_REASON_INTR_WINDOW,	"intrwindow" },
 		{ EXIT_REASON_NMI_WINDOW,	"nmiwindow" },
 		{ EXIT_REASON_TASK_SWITCH,	"taskswitch" },
 		{ EXIT_REASON_CPUID,		"cpuid" },
 		{ EXIT_REASON_GETSEC,		"getsec" },
 		{ EXIT_REASON_HLT,		"hlt" },
 		{ EXIT_REASON_INVD,		"invd" },
 		{ EXIT_REASON_INVLPG,		"invlpg" },
 		{ EXIT_REASON_RDPMC,		"rdpmc" },
 		{ EXIT_REASON_RDTSC,		"rdtsc" },
 		{ EXIT_REASON_RSM,		"rsm" },
 		{ EXIT_REASON_VMCALL,		"vmcall" },
 		{ EXIT_REASON_VMCLEAR,		"vmclear" },
 		{ EXIT_REASON_VMLAUNCH,		"vmlaunch" },
 		{ EXIT_REASON_VMPTRLD,		"vmptrld" },
 		{ EXIT_REASON_VMPTRST,		"vmptrst" },
 		{ EXIT_REASON_VMREAD,		"vmread" },
 		{ EXIT_REASON_VMRESUME,		"vmresume" },
 		{ EXIT_REASON_VMWRITE,		"vmwrite" },
 		{ EXIT_REASON_VMXOFF,		"vmxoff" },
 		{ EXIT_REASON_VMXON,		"vmxon" },
 		{ EXIT_REASON_CR_ACCESS,	"craccess" },
 		{ EXIT_REASON_DR_ACCESS,	"draccess" },
 		{ EXIT_REASON_INOUT,		"inout" },
 		{ EXIT_REASON_RDMSR,		"rdmsr" },
 		{ EXIT_REASON_WRMSR,		"wrmsr" },
 		{ EXIT_REASON_INVAL_VMCS,	"invalvmcs" },
 		{ EXIT_REASON_INVAL_MSR,	"invalmsr" },
 		{ EXIT_REASON_MWAIT,		"mwait" },
 		{ EXIT_REASON_MTF,		"mtf" },
 		{ EXIT_REASON_MONITOR,		"monitor" },
 		{ EXIT_REASON_PAUSE,		"pause" },
 		{ EXIT_REASON_MCE_DURING_ENTRY,	"mce-during-entry" },
 		{ EXIT_REASON_TPR,		"tpr" },
 		{ EXIT_REASON_APIC_ACCESS,	"apic-access" },
 		{ EXIT_REASON_GDTR_IDTR,	"gdtridtr" },
 		{ EXIT_REASON_LDTR_TR,		"ldtrtr" },
 		{ EXIT_REASON_EPT_FAULT,	"eptfault" },
 		{ EXIT_REASON_EPT_MISCONFIG,	"eptmisconfig" },
 		{ EXIT_REASON_INVEPT,		"invept" },
 		{ EXIT_REASON_RDTSCP,		"rdtscp" },
 		{ EXIT_REASON_VMX_PREEMPT,	"vmxpreempt" },
 		{ EXIT_REASON_INVVPID,		"invvpid" },
 		{ EXIT_REASON_WBINVD,		"wbinvd" },
 		{ EXIT_REASON_XSETBV,		"xsetbv" },
 		{ EXIT_REASON_APIC_WRITE,	"apic-write" },
 	};
 	static char reasonbuf[32];
 	size_t idx;
 
 	for (idx = 0; idx < sizeof(reason_names) / sizeof(reason_names[0]);
 	    idx++) {
 		if (reason_names[idx].code == reason)
 			return (reason_names[idx].name);
 	}
 
 	/* Not in the table: fall back to the numeric value. */
 	snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
 	return (reasonbuf);
 }
 #endif	/* KTR */
 
 static int
 vmx_allow_x2apic_msrs(struct vmx *vmx)
 {
 	int i, error;
 
 	error = 0;
 
 	/*
 	 * Allow readonly access to the following x2APIC MSRs from the guest.
 	 */
 	error += guest_msr_ro(vmx, MSR_APIC_ID);
 	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
 	error += guest_msr_ro(vmx, MSR_APIC_LDR);
 	error += guest_msr_ro(vmx, MSR_APIC_SVR);
 
 	for (i = 0; i < 8; i++)
 		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
 
 	for (i = 0; i < 8; i++)
 		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
 
 	for (i = 0; i < 8; i++)
 		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
 
 	error += guest_msr_ro(vmx, MSR_APIC_ESR);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
 	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
 	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
 	error += guest_msr_ro(vmx, MSR_APIC_ICR);
 
 	/*
 	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
 	 *
 	 * These registers get special treatment described in the section
 	 * "Virtualizing MSR-Based APIC Accesses".
 	 */
 	error += guest_msr_rw(vmx, MSR_APIC_TPR);
 	error += guest_msr_rw(vmx, MSR_APIC_EOI);
 	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
 
 	return (error);
 }
 
 /*
  * Return 'cr0' with every bit of cr0_ones_mask set and every bit of
  * cr0_zeros_mask cleared.
  */
 u_long
 vmx_fix_cr0(u_long cr0)
 {
 	u_long fixed;
 
 	fixed = cr0 | cr0_ones_mask;
 	fixed &= ~cr0_zeros_mask;
 
 	return (fixed);
 }
 
 /*
  * Return 'cr4' with every bit of cr4_ones_mask set and every bit of
  * cr4_zeros_mask cleared.
  */
 u_long
 vmx_fix_cr4(u_long cr4)
 {
 	u_long fixed;
 
 	fixed = cr4 | cr4_ones_mask;
 	fixed &= ~cr4_zeros_mask;
 
 	return (fixed);
 }
 
 /*
  * Release 'vpid'.
  *
  * Panics when 'vpid' is outside [0, 0xffff].  Only values above VM_MAXCPU
  * are handed back to the unit number allocator.
  */
 static void
 vpid_free(int vpid)
 {
 	if (vpid < 0 || vpid > 0xffff)
 		panic("vpid_free: invalid vpid %d", vpid);
 
 	/*
 	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
 	 * the unit number allocator, so there is nothing to release.
 	 */
 	if (vpid <= VM_MAXCPU)
 		return;
 
 	free_unr(vpid_unr, vpid);
 }
 
 /*
  * Fill vpid[0 .. num-1] with the VPIDs to use for the vcpus of a VM.
  *
  * Panics when 'num' is outside [1, VM_MAXCPU].  The array is filled in one
  * of three ways: all zeros when PROCBASED2_ENABLE_VPID is not set in
  * procbased_ctls2, values from the unit number allocator when it can
  * supply all 'num' of them, and otherwise the fixed values 1 .. num.
  */
 static void
 vpid_alloc(uint16_t *vpid, int num)
 {
 	int idx, nalloc, unit;
 
 	if (num <= 0 || num > VM_MAXCPU)
 		panic("invalid number of vpids requested: %d", num);
 
 	/*
 	 * If the "enable vpid" execution control is not enabled then the
 	 * VPID is required to be 0 for all vcpus.
 	 */
 	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
 		for (idx = 0; idx < num; idx++)
 			vpid[idx] = 0;
 		return;
 	}
 
 	/*
 	 * Allocate a unique VPID for each vcpu from the unit number allocator.
 	 */
 	for (nalloc = 0; nalloc < num; nalloc++) {
 		unit = alloc_unr(vpid_unr);
 		if (unit == -1)
 			break;
 		vpid[nalloc] = unit;
 	}
 
 	/* Every vcpu received a unique VPID. */
 	if (nalloc >= num)
 		return;
 
 	atomic_add_int(&vpid_alloc_failed, 1);
 
 	/*
 	 * If the unit number allocator does not have enough unique
 	 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
 	 *
 	 * These VPIDs are not unique across VMs but this does not
 	 * affect correctness because the combined mappings are also
 	 * tagged with the EP4TA which is unique for each VM.
 	 *
 	 * It is still sub-optimal because the invvpid will invalidate
 	 * combined mappings for a particular VPID across all EP4TAs.
 	 */
 
 	/* Return what was obtained so far, most recent first. */
 	for (idx = nalloc - 1; idx >= 0; idx--)
 		vpid_free(vpid[idx]);
 
 	for (idx = 0; idx < num; idx++)
 		vpid[idx] = idx + 1;
 }
 
 /*
  * Create the unit number allocator behind vpid_alloc() and vpid_free().
  * It is created with the bounds VM_MAXCPU + 1 and 0xffff and stored in
  * vpid_unr.
  */
 static void
 vpid_init(void)
 {
 	/*
 	 * VPID 0 is required when the "enable VPID" execution control is
 	 * disabled.
 	 *
 	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
 	 * unit number allocator does not have sufficient unique VPIDs to
 	 * satisfy the allocation.
 	 *
 	 * The remaining VPIDs are managed by the unit number allocator.
 	 */
 	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
 }
 
 /*
  * Per-cpu teardown, dispatched by vmx_cleanup() through smp_rendezvous().
  *
  * When vmxon_enabled[] records that VMX operation was entered on the
  * current cpu, all VPID and EPT contexts are invalidated and vmxoff() is
  * executed.  CR4_VMXE is cleared in %cr4 in either case.
  */
 static void
 vmx_disable(void *arg __unused)
 {
 	/* Zeroed descriptors: the all-contexts invalidations name no target. */
 	struct invvpid_desc invvpid_desc = { 0 };
 	struct invept_desc invept_desc = { 0 };
 
 	if (vmxon_enabled[curcpu]) {
 		/*
 		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
 		 *
 		 * VMXON or VMXOFF are not required to invalidate any TLB
 		 * caching structures. This prevents potential retention of
 		 * cached information in the TLB between distinct VMX episodes.
 		 */
 		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
 		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
 		vmxoff();
 	}
 	load_cr4(rcr4() & ~CR4_VMXE);
 }
 
 /*
  * Module-wide VMX teardown.
  *
  * Releases the posted interrupt vector and the VPID allocator if they
  * exist, resets nmi_flush_l1d_sw from 1 to 0, and then dispatches
  * vmx_disable() through smp_rendezvous().  Always returns 0.
  */
 static int
 vmx_cleanup(void)
 {
 
 	/* pirvec stays at -1 when no vector was obtained. */
 	if (pirvec >= 0) {
 		lapic_ipi_free(pirvec);
 	}
 
 	if (vpid_unr != NULL) {
 		delete_unrhdr(vpid_unr);
 		vpid_unr = NULL;
 	}
 
 	if (nmi_flush_l1d_sw == 1) {
 		nmi_flush_l1d_sw = 0;
 	}
 
 	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
 
 	return (0);
 }
 
 /*
  * Per-cpu VMX bring-up.
  *
  * Unless both the lock and the VMX enable bits are already set in
  * MSR_IA32_FEATURE_CONTROL, the MSR is rewritten with both bits added.
  * CR4_VMXE is then set in %cr4, the VMX revision is stored at the start of
  * this cpu's vmxon region, and vmxon() is executed on that region.
  * vmxon_enabled[] is set to 1 for this cpu only when vmxon() returns 0.
  */
 static void
 vmx_enable(void *arg __unused)
 {
 	uint64_t fctl;
 	int rc;
 
 	fctl = rdmsr(MSR_IA32_FEATURE_CONTROL);
 	if (!((fctl & IA32_FEATURE_CONTROL_LOCK) != 0 &&
 	    (fctl & IA32_FEATURE_CONTROL_VMX_EN) != 0)) {
 		wrmsr(MSR_IA32_FEATURE_CONTROL,
 		    fctl | IA32_FEATURE_CONTROL_VMX_EN |
 		    IA32_FEATURE_CONTROL_LOCK);
 	}
 
 	load_cr4(rcr4() | CR4_VMXE);
 
 	/* The region must start with the revision before vmxon runs. */
 	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
 
 	rc = vmxon(vmxon_region[curcpu]);
 	if (rc != 0)
 		return;
 
 	vmxon_enabled[curcpu] = 1;
 }
 
 /*
  * Execute vmxon() again on the current cpu's vmxon region, but only when
  * vmx_enable() previously recorded success for this cpu in
  * vmxon_enabled[].  The return value of vmxon() is not checked here.
  */
 static void
 vmx_restore(void)
 {
 
 	if (vmxon_enabled[curcpu])
 		vmxon(vmxon_region[curcpu]);
 }
 
 static int
 vmx_init(int ipinum)
 {
 	int error, use_tpr_shadow;
 	uint64_t basic, fixed0, fixed1, feature_control;
 	uint32_t tmp, procbased2_vid_bits;
 
 	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
 	if (!(cpu_feature2 & CPUID2_VMX)) {
 		printf("vmx_init: processor does not support VMX operation\n");
 		return (ENXIO);
 	}
 
 	/*
 	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
 	 * are set (bits 0 and 2 respectively).
 	 */
 	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
 	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
 	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
 		printf("vmx_init: VMX operation disabled by BIOS\n");
 		return (ENXIO);
 	}
 
 	/*
 	 * Verify capabilities MSR_VMX_BASIC:
 	 * - bit 54 indicates support for INS/OUTS decoding
 	 */
 	basic = rdmsr(MSR_VMX_BASIC);
 	if ((basic & (1UL << 54)) == 0) {
 		printf("vmx_init: processor does not support desired basic "
 		    "capabilities\n");
 		return (EINVAL);
 	}
 
 	/* Check support for primary processor-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 			       MSR_VMX_TRUE_PROCBASED_CTLS,
 			       PROCBASED_CTLS_ONE_SETTING,
 			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
 	if (error) {
 		printf("vmx_init: processor does not support desired primary "
 		       "processor-based controls\n");
 		return (error);
 	}
 
 	/* Clear the processor-based ctl bits that are set on demand */
 	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
 
 	/* Check support for secondary processor-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 			       MSR_VMX_PROCBASED_CTLS2,
 			       PROCBASED_CTLS2_ONE_SETTING,
 			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
 	if (error) {
 		printf("vmx_init: processor does not support desired secondary "
 		       "processor-based controls\n");
 		return (error);
 	}
 
 	/* Check support for VPID */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
 			       PROCBASED2_ENABLE_VPID, 0, &tmp);
 	if (error == 0)
 		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
 
 	/* Check support for pin-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 			       MSR_VMX_TRUE_PINBASED_CTLS,
 			       PINBASED_CTLS_ONE_SETTING,
 			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
 	if (error) {
 		printf("vmx_init: processor does not support desired "
 		       "pin-based controls\n");
 		return (error);
 	}
 
 	/* Check support for VM-exit controls */
 	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
 			       VM_EXIT_CTLS_ONE_SETTING,
 			       VM_EXIT_CTLS_ZERO_SETTING,
 			       &exit_ctls);
 	if (error) {
 		printf("vmx_init: processor does not support desired "
 		    "exit controls\n");
 		return (error);
 	}
 
 	/* Check support for VM-entry controls */
 	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
 	    VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
 	    &entry_ctls);
 	if (error) {
 		printf("vmx_init: processor does not support desired "
 		    "entry controls\n");
 		return (error);
 	}
 
 	/*
 	 * Check support for optional features by testing them
 	 * as individual bits
 	 */
 	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 					MSR_VMX_TRUE_PROCBASED_CTLS,
 					PROCBASED_HLT_EXITING, 0,
 					&tmp) == 0);
 
 	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 					MSR_VMX_PROCBASED_CTLS,
 					PROCBASED_MTF, 0,
 					&tmp) == 0);
 
 	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 					 MSR_VMX_TRUE_PROCBASED_CTLS,
 					 PROCBASED_PAUSE_EXITING, 0,
 					 &tmp) == 0);
 
 	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 					MSR_VMX_PROCBASED_CTLS2,
 					PROCBASED2_UNRESTRICTED_GUEST, 0,
 				        &tmp) == 0);
 
 	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
 	    &tmp) == 0);
 
 	/*
 	 * Check support for virtual interrupt delivery.
 	 */
 	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
 	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
 	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
 	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
 
 	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
 	    &tmp) == 0);
 
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
 	    procbased2_vid_bits, 0, &tmp);
 	if (error == 0 && use_tpr_shadow) {
 		virtual_interrupt_delivery = 1;
 		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
 		    &virtual_interrupt_delivery);
 	}
 
 	if (virtual_interrupt_delivery) {
 		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
 		procbased_ctls2 |= procbased2_vid_bits;
 		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
 
 		/*
 		 * No need to emulate accesses to %CR8 if virtual
 		 * interrupt delivery is enabled.
 		 */
 		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
 		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
 
 		/*
 		 * Check for Posted Interrupts only if Virtual Interrupt
 		 * Delivery is enabled.
 		 */
 		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
 		    &tmp);
 		if (error == 0) {
 			pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
 			    &IDTVEC(justreturn));
 			if (pirvec < 0) {
 				if (bootverbose) {
 					printf("vmx_init: unable to allocate "
 					    "posted interrupt vector\n");
 				}
 			} else {
 				posted_interrupts = 1;
 				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
 				    &posted_interrupts);
 			}
 		}
 	}
 
 	if (posted_interrupts)
 		    pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
 
 	/* Initialize EPT */
 	error = ept_init(ipinum);
 	if (error) {
 		printf("vmx_init: ept initialization failed (%d)\n", error);
 		return (error);
 	}
 
 	guest_l1d_flush = (cpu_ia32_arch_caps &
 	    IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0;
 	TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
 
 	/*
 	 * L1D cache flush is enabled.  Use IA32_FLUSH_CMD MSR when
 	 * available.  Otherwise fall back to the software flush
 	 * method which loads enough data from the kernel text to
 	 * flush existing L1D content, both on VMX entry and on NMI
 	 * return.
 	 */
 	if (guest_l1d_flush) {
 		if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
 			guest_l1d_flush_sw = 1;
 			TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
 			    &guest_l1d_flush_sw);
 		}
 		if (guest_l1d_flush_sw) {
 			if (nmi_flush_l1d_sw <= 1)
 				nmi_flush_l1d_sw = 1;
 		} else {
 			msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
 			msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
 		}
 	}
 
 	/*
 	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
 	 */
 	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
 	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
 	cr0_ones_mask = fixed0 & fixed1;
 	cr0_zeros_mask = ~fixed0 & ~fixed1;
 
 	/*
 	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
 	 * if unrestricted guest execution is allowed.
 	 */
 	if (cap_unrestricted_guest)
 		cr0_ones_mask &= ~(CR0_PG | CR0_PE);
 
 	/*
 	 * Do not allow the guest to set CR0_NW or CR0_CD.
 	 */
 	cr0_zeros_mask |= (CR0_NW | CR0_CD);
 
 	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
 	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
 	cr4_ones_mask = fixed0 & fixed1;
 	cr4_zeros_mask = ~fixed0 & ~fixed1;
 
 	vpid_init();
 
 	vmx_msr_init();
 
 	/* enable VMX operation */
 	smp_rendezvous(NULL, vmx_enable, NULL, NULL);
 
 	vmx_initialized = 1;
 
 	return (0);
 }
 
 /*
  * Call the host handler registered in the IDT for 'vector'.
  *
  * The vector and the fields of its gate descriptor (present bit, type,
  * DPL, selector and IST) are asserted before the handler address is
  * rebuilt from the descriptor's high and low offset fields and passed to
  * vmx_call_isr().
  */
 static void
 vmx_trigger_hostintr(int vector)
 {
 	struct gate_descriptor *gd;
 	uintptr_t handler;
 
 	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
 	    "invalid vector %d", vector));
 
 	gd = &idt[vector];
 
 	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
 	    vector));
 	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
 	    "has invalid type %d", vector, gd->gd_type));
 	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
 	    "has invalid dpl %d", vector, gd->gd_dpl));
 	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
 	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
 	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
 	    "IST %d", vector, gd->gd_ist));
 
 	/* The handler address is split across two descriptor fields. */
 	handler = (long)gd->gd_hioffset << 16;
 	handler |= gd->gd_looffset;
 	vmx_call_isr(handler);
 }
 
 /*
  * Program the guest/host mask and the read shadow of %cr0 or %cr4.
  *
  * 'which' selects the control register and must be 0 or 4; any other
  * value panics.  The mask is the union of the bits that are forced to one
  * and the bits that are forced to zero, and the shadow is set to
  * 'initial'.  Returns 0 on success or the first vmcs_setreg() error.
  */
 static int
 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
 {
 	uint64_t fixed_bits;
 	int mask_field, shadow_field, rc;
 
 	switch (which) {
 	case 0:
 		mask_field = VMCS_CR0_MASK;
 		shadow_field = VMCS_CR0_SHADOW;
 		fixed_bits = cr0_ones_mask | cr0_zeros_mask;
 		break;
 	case 4:
 		mask_field = VMCS_CR4_MASK;
 		shadow_field = VMCS_CR4_SHADOW;
 		fixed_bits = cr4_ones_mask | cr4_zeros_mask;
 		break;
 	default:
 		panic("vmx_setup_cr_shadow: unknown cr%d", which);
 	}
 
 	rc = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_field), fixed_bits);
 	if (rc == 0)
 		rc = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_field), initial);
 
 	return (rc);
 }
 #define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
 #define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))
 
 /*
  * Allocate and initialize the VT-x state for a new virtual machine.
  *
  * A zeroed 'struct vmx' is allocated, the EPT pointer is derived from the
  * pmap's PML4 page, the MSR bitmap is populated and VPIDs are allocated.
  * The VMCS of every vcpu reported by vm_get_maxcpus() is then cleared,
  * initialized and customized with the file-level control values
  * (pinbased_ctls, procbased_ctls, procbased_ctls2, exit_ctls, entry_ctls).
  *
  * Returns the new 'struct vmx'.  Failures of the individual setup steps
  * are handled with panic() or KASSERT() rather than being reported to the
  * caller.
  */
 static void *
 vmx_vminit(struct vm *vm, pmap_t pmap)
 {
 	uint16_t vpid[VM_MAXCPU];
 	int i, error;
 	struct vmx *vmx;
 	struct vmcs *vmcs;
 	uint32_t exc_bitmap;
 	uint16_t maxcpus;
 
 	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
 	/* The allocation is required to start on a page boundary. */
 	if ((uintptr_t)vmx & PAGE_MASK) {
 		panic("malloc of struct vmx not aligned on %d byte boundary",
 		      PAGE_SIZE);
 	}
 	vmx->vm = vm;
 
 	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
 
 	/*
 	 * Clean up EPTP-tagged guest physical and combined mappings
 	 *
 	 * VMX transitions are not required to invalidate any guest physical
 	 * mappings. So, it may be possible for stale guest physical mappings
 	 * to be present in the processor TLBs.
 	 *
 	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
 	 */
 	ept_invalidate_mappings(vmx->eptp);
 
 	msr_bitmap_initialize(vmx->msr_bitmap);
 
 	/*
 	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
 	 * The guest FSBASE and GSBASE are saved and restored during
 	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
 	 * always restored from the vmcs host state area on vm-exit.
 	 *
 	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
 	 * how they are saved/restored so can be directly accessed by the
 	 * guest.
 	 *
 	 * MSR_EFER is saved and restored in the guest VMCS area on a
 	 * VM exit and entry respectively. It is also restored from the
 	 * host VMCS area on a VM exit.
 	 *
 	 * The TSC MSR is exposed read-only. Writes are disallowed as
 	 * that will impact the host TSC.  If the guest does a write
 	 * the "use TSC offsetting" execution control is enabled and the
 	 * difference between the host TSC and the guest TSC is written
 	 * into the TSC offset in the VMCS.
 	 */
 	if (guest_msr_rw(vmx, MSR_GSBASE) ||
 	    guest_msr_rw(vmx, MSR_FSBASE) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
 	    guest_msr_rw(vmx, MSR_EFER) ||
 	    guest_msr_ro(vmx, MSR_TSC))
 		panic("vmx_vminit: error setting guest msr access");
 
 	/* VPIDs are allocated for VM_MAXCPU slots, one entry per vcpu index. */
 	vpid_alloc(vpid, VM_MAXCPU);
 
 	if (virtual_interrupt_delivery) {
 		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
 		    APIC_ACCESS_ADDRESS);
 		/* XXX this should really return an error to the caller */
 		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
 	}
 
 	maxcpus = vm_get_maxcpus(vm);
 	for (i = 0; i < maxcpus; i++) {
 		vmcs = &vmx->vmcs[i];
 		vmcs->identifier = vmx_revision();
 		error = vmclear(vmcs);
 		if (error != 0) {
 			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
 			      error, i);
 		}
 
 		vmx_msr_guest_init(vmx, i);
 
 		error = vmcs_init(vmcs);
 		KASSERT(error == 0, ("vmcs_init error %d", error));
 
 		/*
 		 * Between VMPTRLD and VMCLEAR the vmwrite() results are summed
 		 * into 'error' and checked once after VMCLEAR below.
 		 */
 		VMPTRLD(vmcs);
 		error = 0;
 		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
 		error += vmwrite(VMCS_EPTP, vmx->eptp);
 		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
 		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
 		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
 		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
 		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
 		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
 		error += vmwrite(VMCS_VPID, vpid[i]);
 
 		/*
 		 * When the L1D flush is enabled and the software flush is not
 		 * in use, point the VM-entry MSR load area at msr_load_list.
 		 */
 		if (guest_l1d_flush && !guest_l1d_flush_sw) {
 			vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract(
 			    (vm_offset_t)&msr_load_list[0]));
 			vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
 			    nitems(msr_load_list));
 			vmcs_write(VMCS_EXIT_MSR_STORE, 0);
 			vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0);
 		}
 
 		/* exception bitmap */
 		if (vcpu_trace_exceptions(vm, i))
 			exc_bitmap = 0xffffffff;
 		else
 			exc_bitmap = 1 << IDT_MC;
 		error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);
 
 		vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1;
 		error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);
 
 		if (virtual_interrupt_delivery) {
 			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
 			error += vmwrite(VMCS_VIRTUAL_APIC,
 			    vtophys(&vmx->apic_page[i]));
 			error += vmwrite(VMCS_EOI_EXIT0, 0);
 			error += vmwrite(VMCS_EOI_EXIT1, 0);
 			error += vmwrite(VMCS_EOI_EXIT2, 0);
 			error += vmwrite(VMCS_EOI_EXIT3, 0);
 		}
 		if (posted_interrupts) {
 			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
 			error += vmwrite(VMCS_PIR_DESC,
 			    vtophys(&vmx->pir_desc[i]));
 		}
 		VMCLEAR(vmcs);
 		KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
 
 		/* Remember the per-vcpu copies of the controls written above. */
 		vmx->cap[i].set = 0;
 		vmx->cap[i].proc_ctls = procbased_ctls;
 		vmx->cap[i].proc_ctls2 = procbased_ctls2;
 		vmx->cap[i].exc_bitmap = exc_bitmap;
 
 		vmx->state[i].nextrip = ~0;
 		vmx->state[i].lastcpu = NOCPU;
 		vmx->state[i].vpid = vpid[i];
 
 		/*
 		 * Set up the CR0/4 shadows, and init the read shadow
 		 * to the power-on register value from the Intel Sys Arch.
 		 *  CR0 - 0x60000010
 		 *  CR4 - 0
 		 */
 		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
 		if (error != 0)
 			panic("vmx_setup_cr0_shadow %d", error);
 
 		error = vmx_setup_cr4_shadow(vmcs, 0);
 		if (error != 0)
 			panic("vmx_setup_cr4_shadow %d", error);
 
 		vmx->ctx[i].pmap = pmap;
 	}
 
 	return (vmx);
 }
 
 /*
  * Handle a guest CPUID exit.
  *
  * Pointers to the saved guest %rax, %rbx, %rcx and %rdx slots in 'vmxctx'
  * are passed (as 32-bit pointers) to x86_emulate_cpuid(); its return
  * value is returned unchanged.
  */
 static int
 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
 {
 	int handled;
 
 	handled = x86_emulate_cpuid(vm, vcpu,
 				    (uint32_t*)(&vmxctx->guest_rax),
 				    (uint32_t*)(&vmxctx->guest_rbx),
 				    (uint32_t*)(&vmxctx->guest_rcx),
 				    (uint32_t*)(&vmxctx->guest_rdx));
 	return (handled);
 }
 
 /*
  * Trace the guest %rip (as returned by vmcs_guest_rip()) at which the vcpu
  * is about to resume.  The body is compiled only when KTR is defined.
  */
 static __inline void
 vmx_run_trace(struct vmx *vmx, int vcpu)
 {
 #ifdef KTR
 	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
 #endif
 }
 
 /*
  * Trace a VM exit: whether it was handled, the exit reason rendered by
  * exit_reason_to_str() and the guest %rip.  The body is compiled only
  * when KTR is defined.
  */
 static __inline void
 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
 	       int handled)
 {
 #ifdef KTR
 	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
 		 handled ? "handled" : "unhandled",
 		 exit_reason_to_str(exit_reason), rip);
 #endif
 }
 
 /*
  * Trace an "astpending" VM exit together with the guest %rip.  The body is
  * compiled only when KTR is defined.
  */
 static __inline void
 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
 {
 #ifdef KTR
 	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
 #endif
 }
 
 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
 
 /*
  * Invalidate the TLB mappings tagged with the vpid of 'vcpu'.
  *
  * Nothing is done when the vcpu's vpid is 0.  When the vcpu is not
  * 'running' the invalidation is deferred by resetting 'lastcpu'.
  * Otherwise an invvpid is issued, unless the pmap's EPT generation differs
  * from the one recorded for the current cpu, in which case it is skipped.
  */
 static __inline void
 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
 {
 	struct invvpid_desc desc;
 	struct vmxstate *state;
 
 	state = &vmx->state[vcpu];
 	if (state->vpid == 0)
 		return;
 
 	if (!running) {
 		/*
 		 * Mark 'lastcpu' as an invalid host cpu so that
 		 * vmx_set_pcpu_defaults() invalidates the TLB entries tagged
 		 * with this vcpu's vpid the next time the vcpu runs.
 		 */
 		state->lastcpu = NOCPU;
 		return;
 	}
 
 	KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
 	    "critical section", __func__, vcpu));
 
 	if (pmap->pm_eptgen != vmx->eptgen[curcpu]) {
 		/*
 		 * An invept is going to be performed before entering the
 		 * guest and it invalidates the combined mappings tagged with
 		 * 'vmx->eptp' for all vpids, so the invvpid can be skipped.
 		 */
 		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
 		return;
 	}
 
 	/*
 	 * Invalidate all mappings tagged with 'vpid'.
 	 *
 	 * The vcpu last ran on a different host cpu and it is not tracked
 	 * whether it invalidated the mappings associated with its 'vpid'
 	 * during that run, so the mappings for 'vpid' on 'curcpu' have to be
 	 * assumed stale.
 	 *
 	 * This penalty is only incurred when the scheduler moves the thread
 	 * associated with this vcpu between host cpus.  Note that mappings
 	 * tagged with 'vpid' are invalidated for "all" EP4TAs.
 	 */
 	desc.vpid = state->vpid;
 	desc.linear_addr = 0;
 	desc._res1 = 0;
 	desc._res2 = 0;
 	invvpid(INVVPID_TYPE_SINGLE_CONTEXT, desc);
 	vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
 }
 
 /*
  * Refresh the host-cpu dependent VMCS fields when the vcpu runs on a host
  * cpu other than the one recorded in 'lastcpu'.
  *
  * The migration is counted, the host TR, GDTR and GS bases are rewritten
  * and the vcpu's vpid mappings are invalidated via vmx_invvpid().
  */
 static void
 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
 {
 	struct vmxstate *state;
 
 	state = &vmx->state[vcpu];
 	if (state->lastcpu != curcpu) {
 		state->lastcpu = curcpu;
 
 		vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
 
 		vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
 		vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
 		vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
 		vmx_invvpid(vmx, vcpu, pmap, 1);
 	}
 }
 
 /*
  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
  */
 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
 
 /*
  * Turn on "interrupt window exiting" for 'vcpu' unless the cached primary
  * processor-based controls already have it set.
  */
 static void __inline
 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0)
 		return;
 
 	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 	VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
 }
 
 /*
  * Turn off "interrupt window exiting" for 'vcpu'.
  *
  * The control is asserted to be currently set in the cached copy; the
  * updated value is stored back in the cache and written to the VMCS.
  */
 static void __inline
 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
 	    ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 	VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
 }
 
 /*
  * Turn on "NMI window exiting" for 'vcpu' unless the cached primary
  * processor-based controls already have it set.
  */
 static void __inline
 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0)
 		return;
 
 	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 	VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
 }
 
 /*
  * Turn off "NMI window exiting" for 'vcpu'.
  *
  * The control is asserted to be currently set in the cached copy; the
  * updated value is stored back in the cache and written to the VMCS.
  */
 static void __inline
 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
 	    ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
 }
 
 /*
  * Set the TSC offset of 'vcpu' to 'offset'.
  *
  * The "TSC offsetting" processor-based control is enabled first if the
  * cached controls do not have it set yet.  Returns the result of the
  * vmwrite() of the offset.
  */
 int
 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
 {
 
 	if (!(vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET)) {
 		vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
 		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 		VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
 	}
 
 	return (vmwrite(VMCS_TSC_OFFSET, offset));
 }
 
 /*
  * Masks over the guest interruptibility-state: NMI_BLOCKING combines the
  * NMI and MOV-SS blocking bits, HWINTR_BLOCKING combines the STI and
  * MOV-SS blocking bits.
  */
 #define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
 #define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
 
 /*
  * Inject a virtual NMI into 'vcpu' and clear the pending NMI request.
  *
  * The caller must have verified that no NMI blocking is in effect and
  * that the VM-entry interruption information is not already valid; both
  * conditions are asserted here.
  */
 static void
 vmx_inject_nmi(struct vmx *vmx, int vcpu)
 {
 	uint32_t intr_state, entry_info, nmi_info;
 
 	intr_state = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	KASSERT((intr_state & NMI_BLOCKING) == 0,
 	    ("vmx_inject_nmi: invalid guest "
 	    "interruptibility-state %#x", intr_state));
 
 	entry_info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 	KASSERT((entry_info & VMCS_INTR_VALID) == 0,
 	    ("vmx_inject_nmi: invalid "
 	    "VM-entry interruption information %#x", entry_info));
 
 	/*
 	 * The vector of the injected event must be the NMI IDT entry,
 	 * otherwise the VMCS entry check will fail.
 	 */
 	nmi_info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
 	vmcs_write(VMCS_ENTRY_INTR_INFO, nmi_info);
 
 	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
 
 	/* The request has been serviced. */
 	vm_nmi_clear(vmx->vm, vcpu);
 }
 
 /*
  * Inject pending events into 'vcpu' before it is resumed at 'guestrip'.
  *
  * In order, this:
  * - clears stale STI/MOV-SS blocking if 'guestrip' differs from the
  *   recorded 'nextrip',
  * - injects a pending event reported by vm_entry_intinfo(),
  * - injects a pending NMI or arms NMI window exiting,
  * - hands off to vmx_inject_pir() when virtual interrupt delivery is in
  *   use and no ExtINT is pending,
  * - otherwise injects a maskable interrupt obtained from the legacy PIC
  *   (ExtINT) or the local APIC, or arms interrupt window exiting when the
  *   interrupt cannot be injected right now.
  */
 static void
 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
     uint64_t guestrip)
 {
 	int vector, need_nmi_exiting, extint_pending;
 	uint64_t rflags, entryinfo;
 	uint32_t gi, info;
 
 	if (vmx->state[vcpu].nextrip != guestrip) {
 		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 		if (gi & HWINTR_BLOCKING) {
 			VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
 			    "cleared due to rip change: %#lx/%#lx",
 			    vmx->state[vcpu].nextrip, guestrip);
 			gi &= ~HWINTR_BLOCKING;
 			vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 		}
 	}
 
 	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
 		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
 		    "intinfo is not valid: %#lx", __func__, entryinfo));
 
 		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
 		     "pending exception: %#lx/%#x", __func__, entryinfo, info));
 
 		/* The low 32 bits of 'entryinfo' form the interruption info. */
 		info = entryinfo;
 		vector = info & 0xff;
 		if (vector == IDT_BP || vector == IDT_OF) {
 			/*
 			 * VT-x requires #BP and #OF to be injected as software
 			 * exceptions.
 			 */
 			info &= ~VMCS_INTR_T_MASK;
 			info |= VMCS_INTR_T_SWEXCEPTION;
 		}
 
 		/* The error code, if any, is in the upper 32 bits. */
 		if (info & VMCS_INTR_DEL_ERRCODE)
 			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
 
 		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 	}
 
 	if (vm_nmi_pending(vmx->vm, vcpu)) {
 		/*
 		 * If there are no conditions blocking NMI injection then
 		 * inject it directly here otherwise enable "NMI window
 		 * exiting" to inject it as soon as we can.
 		 *
 		 * We also check for STI_BLOCKING because some implementations
 		 * don't allow NMI injection in this case. If we are running
 		 * on a processor that doesn't have this restriction it will
 		 * immediately exit and the NMI will be injected in the
 		 * "NMI window exiting" handler.
 		 */
 		need_nmi_exiting = 1;
 		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
 			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 			if ((info & VMCS_INTR_VALID) == 0) {
 				vmx_inject_nmi(vmx, vcpu);
 				need_nmi_exiting = 0;
 			} else {
 				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
 				    "due to VM-entry intr info %#x", info);
 			}
 		} else {
 			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
 			    "Guest Interruptibility-state %#x", gi);
 		}
 
 		if (need_nmi_exiting)
 			vmx_set_nmi_window_exiting(vmx, vcpu);
 	}
 
 	extint_pending = vm_extint_pending(vmx->vm, vcpu);
 
 	/*
 	 * With virtual interrupt delivery and no pending ExtINT the rest of
 	 * this function is bypassed in favor of vmx_inject_pir().
 	 */
 	if (!extint_pending && virtual_interrupt_delivery) {
 		vmx_inject_pir(vlapic);
 		return;
 	}
 
 	/*
 	 * If interrupt-window exiting is already in effect then don't bother
 	 * checking for pending interrupts. This is just an optimization and
 	 * not needed for correctness.
 	 */
 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
 		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
 		    "pending int_window_exiting");
 		return;
 	}
 
 	if (!extint_pending) {
 		/* Ask the local apic for a vector to inject */
 		if (!vlapic_pending_intr(vlapic, &vector))
 			return;
 
 		/*
 		 * From the Intel SDM, Volume 3, Section "Maskable
 		 * Hardware Interrupts":
 		 * - maskable interrupt vectors [16,255] can be delivered
 		 *   through the local APIC.
 		 */
 		KASSERT(vector >= 16 && vector <= 255,
 		    ("invalid vector %d from local APIC", vector));
 	} else {
 		/* Ask the legacy pic for a vector to inject */
 		vatpic_pending_intr(vmx->vm, &vector);
 
 		/*
 		 * From the Intel SDM, Volume 3, Section "Maskable
 		 * Hardware Interrupts":
 		 * - maskable interrupt vectors [0,255] can be delivered
 		 *   through the INTR pin.
 		 */
 		KASSERT(vector >= 0 && vector <= 255,
 		    ("invalid vector %d from INTR", vector));
 	}
 
 	/* Check RFLAGS.IF and the interruptibility state of the guest */
 	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 	if ((rflags & PSL_I) == 0) {
 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 		    "rflags %#lx", vector, rflags);
 		goto cantinject;
 	}
 
 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	if (gi & HWINTR_BLOCKING) {
 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 		    "Guest Interruptibility-state %#x", vector, gi);
 		goto cantinject;
 	}
 
 	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 	if (info & VMCS_INTR_VALID) {
 		/*
 		 * This is expected and could happen for multiple reasons:
 		 * - A vectoring VM-entry was aborted due to astpending
 		 * - A VM-exit happened during event injection.
 		 * - An exception was injected above.
 		 * - An NMI was injected above or after "NMI window exiting"
 		 */
 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 		    "VM-entry intr info %#x", vector, info);
 		goto cantinject;
 	}
 
 	/* Inject the interrupt */
 	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
 	info |= vector;
 	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 
 	if (!extint_pending) {
 		/* Update the Local APIC ISR */
 		vlapic_intr_accepted(vlapic, vector);
 	} else {
 		vm_extint_clear(vmx->vm, vcpu);
 		vatpic_intr_accepted(vmx->vm, vector);
 
 		/*
 		 * After we accepted the current ExtINT the PIC may
 		 * have posted another one.  If that is the case, set
 		 * the Interrupt Window Exiting execution control so
 		 * we can inject that one too.
 		 *
 		 * Also, interrupt window exiting allows us to inject any
 		 * pending APIC vector that was preempted by the ExtINT
 		 * as soon as possible. This applies both for the software
 		 * emulated vlapic and the hardware assisted virtual APIC.
 		 */
 		vmx_set_int_window_exiting(vmx, vcpu);
 	}
 
 	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
 
 	return;
 
 cantinject:
 	/*
 	 * Set the Interrupt Window Exiting execution control so we can inject
 	 * the interrupt as soon as blocking condition goes away.
 	 */
 	vmx_set_int_window_exiting(vmx, vcpu);
 }
 
 /*
  * When the Virtual NMIs execution control is '1' the logical processor
  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
  * the VMCS, and an IRET executed in VMX non-root operation removes any
  * virtual-NMI blocking.
  *
  * That unblocking happens even if the IRET faults, in which case the
  * hypervisor has to re-establish virtual-NMI blocking before the guest is
  * resumed.  This function sets the NMI blocking bit again.
  */
 static void
 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
 {
 	uint32_t intr_state;
 
 	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
 	intr_state = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	intr_state = intr_state | VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, intr_state);
 }
 
 /*
  * Clear the virtual-NMI blocking bit in the guest interruptibility-state.
  */
 static void
 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
 {
 	uint32_t intr_state;
 
 	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
 	intr_state = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	intr_state = intr_state & ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, intr_state);
 }
 
 /*
  * Assert that the virtual-NMI blocking bit is set in the guest
  * interruptibility-state.  'vmx' and 'vcpuid' are not used by the body.
  */
 static void
 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
 {
 	uint32_t gi;
 
 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
 	    ("NMI blocking is not in effect %#x", gi));
 }
 
 /*
  * Emulate the XSETBV instruction for 'vcpu'.
  *
  * The requested value is assembled from the guest's %rdx:%rax and checked
  * against the host limits and the dependencies between XSAVE features.
  * An invalid request results in #GP (or #UD when XSAVE is not enabled on
  * both host and guest); a valid one is loaded into xcr0.  Always returns
  * HANDLED.  'vmexit' is not used.
  */
 static int
 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 {
 	const struct xsave_limits *xlim;
 	struct vmxctx *ctx;
 	uint64_t newxcr0;
 
 	ctx = &vmx->ctx[vcpu];
 	xlim = vmm_get_xsave_limits();
 
 	/*
 	 * The processor itself raises a GP# fault when xsetbv is executed
 	 * with CPL != 0, so that fault does not have to be emulated here.
 	 */
 
 	/* xcr0 is the only extended control register that is supported. */
 	if (ctx->guest_rcx != 0)
 		goto inject_gp;
 
 	/* xcr0 is handled only if host and guest both have XSAVE enabled. */
 	if (!xlim->xsave_enabled ||
 	    !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
 		vm_inject_ud(vmx->vm, vcpu);
 		return (HANDLED);
 	}
 
 	newxcr0 = ctx->guest_rdx << 32 | (ctx->guest_rax & 0xffffffff);
 
 	/* Reject bits the host does not allow; x87 state must be enabled. */
 	if ((newxcr0 & ~xlim->xcr0_allowed) != 0 ||
 	    !(newxcr0 & XFEATURE_ENABLED_X87))
 		goto inject_gp;
 
 	/* AVX (YMM_Hi128) requires SSE. */
 	if ((newxcr0 & XFEATURE_ENABLED_AVX) &&
 	    (newxcr0 & XFEATURE_AVX) != XFEATURE_AVX)
 		goto inject_gp;
 
 	/*
 	 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
 	 * ZMM_Hi256, and Hi16_ZMM.
 	 */
 	if ((newxcr0 & XFEATURE_AVX512) &&
 	    (newxcr0 & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
 	    (XFEATURE_AVX512 | XFEATURE_AVX))
 		goto inject_gp;
 
 	/* Intel MPX requires both bound register state flags to be set. */
 	if (((newxcr0 & XFEATURE_ENABLED_BNDREGS) != 0) !=
 	    ((newxcr0 & XFEATURE_ENABLED_BNDCSR) != 0))
 		goto inject_gp;
 
 	/*
 	 * This runs "inside" vmrun() with the guest's FPU state, so
 	 * modifying xcr0 directly modifies the guest's xcr0, not the
 	 * host's.
 	 */
 	load_xcr(0, newxcr0);
 	return (HANDLED);
 
 inject_gp:
 	vm_inject_gp(vmx->vm, vcpu);
 	return (HANDLED);
 }
 
 /*
  * Return the guest general purpose register selected by 'ident' (0-15).
  *
  * Register 4 (%rsp) is read from the VMCS, all others come from the saved
  * guest context.  Any other 'ident' panics.
  */
 static uint64_t
 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
 {
 	const struct vmxctx *ctx;
 	uint64_t value;
 
 	ctx = &vmx->ctx[vcpu];
 
 	switch (ident) {
 	case 0:
 		value = ctx->guest_rax;
 		break;
 	case 1:
 		value = ctx->guest_rcx;
 		break;
 	case 2:
 		value = ctx->guest_rdx;
 		break;
 	case 3:
 		value = ctx->guest_rbx;
 		break;
 	case 4:
 		value = vmcs_read(VMCS_GUEST_RSP);
 		break;
 	case 5:
 		value = ctx->guest_rbp;
 		break;
 	case 6:
 		value = ctx->guest_rsi;
 		break;
 	case 7:
 		value = ctx->guest_rdi;
 		break;
 	case 8:
 		value = ctx->guest_r8;
 		break;
 	case 9:
 		value = ctx->guest_r9;
 		break;
 	case 10:
 		value = ctx->guest_r10;
 		break;
 	case 11:
 		value = ctx->guest_r11;
 		break;
 	case 12:
 		value = ctx->guest_r12;
 		break;
 	case 13:
 		value = ctx->guest_r13;
 		break;
 	case 14:
 		value = ctx->guest_r14;
 		break;
 	case 15:
 		value = ctx->guest_r15;
 		break;
 	default:
 		panic("invalid vmx register %d", ident);
 	}
 
 	return (value);
 }
 
 /*
  * Store 'regval' into the guest general purpose register selected by
  * 'ident' (0-15).
  *
  * Register 4 (%rsp) is written to the VMCS, all others go to the saved
  * guest context.  Any other 'ident' panics.
  */
 static void
 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
 {
 	struct vmxctx *ctx;
 
 	ctx = &vmx->ctx[vcpu];
 
 	/* %rsp is the only register here that is kept in the VMCS. */
 	if (ident == 4) {
 		vmcs_write(VMCS_GUEST_RSP, regval);
 		return;
 	}
 
 	switch (ident) {
 	case 0:
 		ctx->guest_rax = regval;
 		break;
 	case 1:
 		ctx->guest_rcx = regval;
 		break;
 	case 2:
 		ctx->guest_rdx = regval;
 		break;
 	case 3:
 		ctx->guest_rbx = regval;
 		break;
 	case 5:
 		ctx->guest_rbp = regval;
 		break;
 	case 6:
 		ctx->guest_rsi = regval;
 		break;
 	case 7:
 		ctx->guest_rdi = regval;
 		break;
 	case 8:
 		ctx->guest_r8 = regval;
 		break;
 	case 9:
 		ctx->guest_r9 = regval;
 		break;
 	case 10:
 		ctx->guest_r10 = regval;
 		break;
 	case 11:
 		ctx->guest_r11 = regval;
 		break;
 	case 12:
 		ctx->guest_r12 = regval;
 		break;
 	case 13:
 		ctx->guest_r13 = regval;
 		break;
 	case 14:
 		ctx->guest_r14 = regval;
 		break;
 	case 15:
 		ctx->guest_r15 = regval;
 		break;
 	default:
 		panic("invalid vmx register %d", ident);
 	}
 }
 
 /*
  * Emulate a guest "mov to %cr0" described by exit qualification 'exitqual'.
  *
  * The source register value becomes the %cr0 read shadow, while the value
  * written to the guest %cr0 has the must-be-one bits set and the
  * must-be-zero bits cleared.  Returns UNHANDLED for any other kind of %cr0
  * access and HANDLED otherwise.
  */
 static int
 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 {
 	uint64_t newcr0, srcval, guest_efer, entry;
 
 	/* We only handle mov to %cr0 at this time */
 	if ((exitqual & 0xf0) != 0x00)
 		return (UNHANDLED);
 
 	srcval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 	vmcs_write(VMCS_CR0_SHADOW, srcval);
 
 	newcr0 = (srcval | cr0_ones_mask) & ~cr0_zeros_mask;
 	vmcs_write(VMCS_GUEST_CR0, newcr0);
 
 	if ((srcval & CR0_PG) == 0)
 		return (HANDLED);
 
 	/*
 	 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and the
 	 * "IA-32e mode guest" bit in VM-entry control must be equal.
 	 */
 	guest_efer = vmcs_read(VMCS_GUEST_IA32_EFER);
 	if ((guest_efer & EFER_LME) == 0)
 		return (HANDLED);
 
 	vmcs_write(VMCS_GUEST_IA32_EFER, guest_efer | EFER_LMA);
 	entry = vmcs_read(VMCS_ENTRY_CTLS);
 	vmcs_write(VMCS_ENTRY_CTLS, entry | VM_ENTRY_GUEST_LMA);
 
 	return (HANDLED);
 }
 
 /*
  * Emulate a guest "mov to %cr4" described by exit qualification 'exitqual'.
  *
  * The source register value becomes the %cr4 read shadow, while the value
  * written to the guest %cr4 has the must-be-one bits set and the
  * must-be-zero bits cleared.  Returns UNHANDLED for any other kind of %cr4
  * access and HANDLED otherwise.
  */
 static int
 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 {
 	uint64_t newcr4, srcval;
 
 	/* We only handle mov to %cr4 at this time */
 	if ((exitqual & 0xf0) != 0x00)
 		return (UNHANDLED);
 
 	srcval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 	vmcs_write(VMCS_CR4_SHADOW, srcval);
 
 	newcr4 = (srcval | cr4_ones_mask) & ~cr4_zeros_mask;
 	vmcs_write(VMCS_GUEST_CR4, newcr4);
 
 	return (HANDLED);
 }
 
 /*
  * Emulate a guest move between %cr8 and a general purpose register.
  *
  * Bit 4 of 'exitqual' selects the direction: when set the vlapic's %cr8
  * value is copied into the guest register, otherwise the guest register
  * is stored into the vlapic.  Returns UNHANDLED for other access types.
  */
 static int
 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 {
 	struct vlapic *vlapic;
 	int gpr;
 
 	/* We only handle mov %cr8 to/from a register at this time. */
 	if ((exitqual & 0xe0) != 0x00)
 		return (UNHANDLED);
 
 	vlapic = vm_lapic(vmx->vm, vcpu);
 	gpr = (exitqual >> 8) & 0xf;
 
 	if ((exitqual & 0x10) == 0) {
 		/* Guest register -> %cr8. */
 		vlapic_set_cr8(vlapic, vmx_get_guest_reg(vmx, vcpu, gpr));
 	} else {
 		/* %cr8 -> guest register. */
 		vmx_set_guest_reg(vmx, vcpu, gpr, vlapic_get_cr8(vlapic));
 	}
 
 	return (HANDLED);
 }
 
 /*
  * Return the guest's current privilege level.
  *
  * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL.
  * The value is taken from bits 5-6 of the guest SS access rights.
  */
 static int
 vmx_cpl(void)
 {
 	uint32_t ss_attr = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
 
 	return ((ss_attr >> 5) & 0x3);
 }
 
 /*
  * Classify the guest's CPU mode from the current VMCS:
  * - EFER.LMA clear: protected mode if CR0.PE is set, real mode otherwise.
  * - EFER.LMA set: 64-bit mode if bit 13 (CS.L) of the CS access rights
  *   is set, compatibility mode otherwise.
  */
 static enum vm_cpu_mode
 vmx_cpu_mode(void)
 {
 	uint32_t cs_access;
 
 	if ((vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) == 0) {
 		return ((vmcs_read(VMCS_GUEST_CR0) & CR0_PE) != 0 ?
 		    CPU_MODE_PROTECTED : CPU_MODE_REAL);
 	}
 
 	cs_access = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 	return ((cs_access & 0x2000) != 0 ?	/* CS.L = 1 */
 	    CPU_MODE_64BIT : CPU_MODE_COMPATIBILITY);
 }
 
 static enum vm_paging_mode
 vmx_paging_mode(void)
 {
 
 	if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
 		return (PAGING_MODE_FLAT);
 	if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
 		return (PAGING_MODE_32);
 	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
 		return (PAGING_MODE_64);
 	else
 		return (PAGING_MODE_PAE);
 }
 
 static uint64_t
 inout_str_index(struct vmx *vmx, int vcpuid, int in)
 {
 	uint64_t val;
 	int error;
 	enum vm_reg_name reg;
 
 	reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
 	error = vmx_getreg(vmx, vcpuid, reg, &val);
 	KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
 	return (val);
 }
 
 static uint64_t
 inout_str_count(struct vmx *vmx, int vcpuid, int rep)
 {
 	uint64_t val;
 	int error;
 
 	if (rep) {
 		error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
 		KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
 	} else {
 		val = 1;
 	}
 	return (val);
 }
 
 /*
  * Decode the address size, in bytes, of a string I/O instruction from
  * bits 9:7 of the VM-exit instruction-information value.
  *
  * Encodings 0, 1 and 2 map to 2, 4 and 8 bytes respectively; any other
  * encoding panics.
  */
 static int
 inout_str_addrsize(uint32_t inst_info)
 {
 	uint32_t size;
 
 	size = (inst_info >> 7) & 0x7;
 	switch (size) {
 	case 0:
 		return (2);	/* 16 bit */
 	case 1:
 		return (4);	/* 32 bit */
 	case 2:
 		return (8);	/* 64 bit */
 	default:
 		/* 'size' is unsigned, so print it with %u rather than %d. */
 		panic("%s: invalid size encoding %u", __func__, size);
 	}
 }
 
 /*
  * Fill in the segment name and descriptor of a string I/O instruction.
  *
  * For an "in" the segment is always %es.  For an "out" the segment is
  * looked up with vm_segment_name() from bits 17:15 of the
  * instruction-information value.  A failure of vmx_getdesc() trips a
  * KASSERT.
  */
 static void
 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
     struct vm_inout_str *vis)
 {
 	int rc, segidx;
 
 	if (!in) {
 		segidx = (inst_info >> 15) & 0x7;
 		vis->seg_name = vm_segment_name(segidx);
 	} else {
 		vis->seg_name = VM_REG_GUEST_ES;
 	}
 
 	rc = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
 	KASSERT(rc == 0, ("%s: vmx_getdesc error %d", __func__, rc));
 }
 
 /*
  * Populate 'paging' with the guest's %cr3, privilege level, CPU mode and
  * paging mode, each obtained from the corresponding helper.
  */
 static void
 vmx_paging_info(struct vm_guest_paging *paging)
 {
 
 	/* Page-table root. */
 	paging->cr3 = vmcs_guest_cr3();
 
 	/* Privilege level and operating mode. */
 	paging->cpl = vmx_cpl();
 	paging->cpu_mode = vmx_cpu_mode();
 
 	/* Translation scheme in effect. */
 	paging->paging_mode = vmx_paging_mode();
 }
 
 /*
  * Convert 'vmexit' into an instruction-emulation exit for an access to
  * guest-physical address 'gpa' / guest-linear address 'gla'.
  *
  * The instruction length is zeroed, the guest paging state is captured,
  * and the CS base and default operand size (cs_d) are recorded according
  * to the CPU mode:
  * - real mode: CS base from the VMCS, cs_d = 0
  * - protected/compatibility mode: CS base from the VMCS, cs_d derived
  *   from the CS access rights via SEG_DESC_DEF32()
  * - anything else: CS base 0, cs_d = 0
  * Finally the instruction decoder state is reset with vie_init().
  */
 static void
 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
 {
 	struct vm_guest_paging *paging;
 	uint32_t cs_access;
 
 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 	vmexit->inst_length = 0;
 	vmexit->u.inst_emul.gpa = gpa;
 	vmexit->u.inst_emul.gla = gla;
 
 	paging = &vmexit->u.inst_emul.paging;
 	vmx_paging_info(paging);
 
 	if (paging->cpu_mode == CPU_MODE_REAL) {
 		vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
 		vmexit->u.inst_emul.cs_d = 0;
 	} else if (paging->cpu_mode == CPU_MODE_PROTECTED ||
 	    paging->cpu_mode == CPU_MODE_COMPATIBILITY) {
 		vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
 		cs_access = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 		vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(cs_access);
 	} else {
 		vmexit->u.inst_emul.cs_base = 0;
 		vmexit->u.inst_emul.cs_d = 0;
 	}
 
 	vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
 }
 
 /*
  * Map an EPT-violation exit qualification to a VM_PROT_* fault type.
  * A data write takes precedence over an instruction fetch; anything else
  * is reported as a read.
  */
 static int
 ept_fault_type(uint64_t ept_qual)
 {
 
 	if ((ept_qual & EPT_VIOLATION_DATA_WRITE) != 0)
 		return (VM_PROT_WRITE);
 	if ((ept_qual & EPT_VIOLATION_INST_FETCH) != 0)
 		return (VM_PROT_EXECUTE);
 	return (VM_PROT_READ);
 }
 
 /*
  * Decide whether an EPT violation should be treated as an
  * instruction-emulation fault.  All of the following must hold:
  * - it is not an instruction fetch
  * - it is a data read and/or a data write
  * - both the "GLA valid" and "translation valid" bits are set, i.e. the
  *   faulting guest-physical address is the translation of a
  *   guest-linear address
  */
 static bool
 ept_emulation_fault(uint64_t ept_qual)
 {
 	bool gla_valid, xlat_valid;
 
 	/* EPT fault on an instruction fetch doesn't make sense here */
 	if ((ept_qual & EPT_VIOLATION_INST_FETCH) != 0)
 		return (false);
 
 	/* EPT fault must be a read fault or a write fault */
 	if ((ept_qual & EPT_VIOLATION_DATA_READ) == 0 &&
 	    (ept_qual & EPT_VIOLATION_DATA_WRITE) == 0)
 		return (false);
 
 	/*
 	 * The faulting guest-physical address must be the result of
 	 * translating a guest-linear address.
 	 */
 	gla_valid = (ept_qual & EPT_VIOLATION_GLA_VALID) != 0;
 	xlat_valid = (ept_qual & EPT_VIOLATION_XLAT_VALID) != 0;
 
 	return (gla_valid && xlat_valid);
 }
 
 /*
  * Return 1 if PROCBASED2_VIRTUALIZE_APIC_ACCESSES is set in the
  * secondary processor-based controls recorded for 'vcpuid', else 0.
  */
 static __inline int
 apic_access_virtualization(struct vmx *vmx, int vcpuid)
 {
 	const uint32_t ctls2 = vmx->cap[vcpuid].proc_ctls2;
 
 	return ((ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0);
 }
 
 /*
  * Return 1 if PROCBASED2_VIRTUALIZE_X2APIC_MODE is set in the secondary
  * processor-based controls recorded for 'vcpuid', else 0.
  */
 static __inline int
 x2apic_virtualization(struct vmx *vmx, int vcpuid)
 {
 	const uint32_t ctls2 = vmx->cap[vcpuid].proc_ctls2;
 
 	return ((ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) != 0);
 }
 
 /*
  * Handle an APIC-write VM-exit by dispatching to the vlapic write
  * handler for the register offset encoded in 'qual'.
  *
  * Returns HANDLED when the write was processed here, and UNHANDLED when
  * the offset is not recognized, when the ICR-low handler reports an
  * error or asks for a return to userland, or when APIC-access
  * virtualization is off and the write is not a virtualized self-IPI.
  */
 static int
 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
     uint64_t qual)
 {
 	uint32_t *regs, vec;
 	int rc, reg_off;
 	bool to_user;
 
 	reg_off = APIC_WRITE_OFFSET(qual);
 
 	if (!apic_access_virtualization(vmx, vcpuid)) {
 		/*
 		 * In general there should not be any APIC write VM-exits
 		 * unless APIC-access virtualization is enabled.
 		 *
 		 * However self-IPI virtualization can legitimately trigger
 		 * an APIC-write VM-exit so treat it specially.
 		 */
 		if (!x2apic_virtualization(vmx, vcpuid) ||
 		    reg_off != APIC_OFFSET_SELF_IPI)
 			return (UNHANDLED);
 
 		/* The vector is read back from the APIC page. */
 		regs = (uint32_t *)(vlapic->apic_page);
 		vec = regs[APIC_OFFSET_SELF_IPI / 4];
 		vlapic_self_ipi_handler(vlapic, vec);
 		return (HANDLED);
 	}
 
 	switch (reg_off) {
 	case APIC_OFFSET_ID:
 		vlapic_id_write_handler(vlapic);
 		return (HANDLED);
 	case APIC_OFFSET_LDR:
 		vlapic_ldr_write_handler(vlapic);
 		return (HANDLED);
 	case APIC_OFFSET_DFR:
 		vlapic_dfr_write_handler(vlapic);
 		return (HANDLED);
 	case APIC_OFFSET_SVR:
 		vlapic_svr_write_handler(vlapic);
 		return (HANDLED);
 	case APIC_OFFSET_ESR:
 		vlapic_esr_write_handler(vlapic);
 		return (HANDLED);
 	case APIC_OFFSET_ICR_LOW:
 		to_user = false;
 		rc = vlapic_icrlo_write_handler(vlapic, &to_user);
 		return ((rc != 0 || to_user) ? UNHANDLED : HANDLED);
 	case APIC_OFFSET_CMCI_LVT:
 	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
 		vlapic_lvt_write_handler(vlapic, reg_off);
 		return (HANDLED);
 	case APIC_OFFSET_TIMER_ICR:
 		vlapic_icrtmr_write_handler(vlapic);
 		return (HANDLED);
 	case APIC_OFFSET_TIMER_DCR:
 		vlapic_dcr_write_handler(vlapic);
 		return (HANDLED);
 	default:
 		return (UNHANDLED);
 	}
 }
 
 /*
  * Return true when APIC-access virtualization is enabled for the vcpu
  * and 'gpa' lies within the page starting at DEFAULT_APIC_BASE.
  */
 static bool
 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
 {
 
 	if (!apic_access_virtualization(vmx, vcpuid))
 		return (false);
 
 	return (gpa >= DEFAULT_APIC_BASE &&
 	    gpa < DEFAULT_APIC_BASE + PAGE_SIZE);
 }
 
 static int
 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 {
 	uint64_t qual;
 	int access_type, offset, allowed;
 
 	if (!apic_access_virtualization(vmx, vcpuid))
 		return (UNHANDLED);
 
 	qual = vmexit->u.vmx.exit_qualification;
 	access_type = APIC_ACCESS_TYPE(qual);
 	offset = APIC_ACCESS_OFFSET(qual);
 
 	allowed = 0;
 	if (access_type == 0) {
 		/*
 		 * Read data access to the following registers is expected.
 		 */
 		switch (offset) {
 		case APIC_OFFSET_APR:
 		case APIC_OFFSET_PPR:
 		case APIC_OFFSET_RRR:
 		case APIC_OFFSET_CMCI_LVT:
 		case APIC_OFFSET_TIMER_CCR:
 			allowed = 1;
 			break;
 		default:
 			break;
 		}
 	} else if (access_type == 1) {
 		/*
 		 * Write data access to the following registers is expected.
 		 */
 		switch (offset) {
 		case APIC_OFFSET_VER:
 		case APIC_OFFSET_APR:
 		case APIC_OFFSET_PPR:
 		case APIC_OFFSET_RRR:
 		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
 		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
 		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
 		case APIC_OFFSET_CMCI_LVT:
 		case APIC_OFFSET_TIMER_CCR:
 			allowed = 1;
 			break;
 		default:
 			break;
 		}
 	}
 
 	if (allowed) {
 		vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
 		    VIE_INVALID_GLA);
 	}
 
 	/*
 	 * Regardless of whether the APIC-access is allowed this handler
 	 * always returns UNHANDLED:
 	 * - if the access is allowed then it is handled by emulating the
 	 *   instruction that caused the VM-exit (outside the critical section)
 	 * - if the access is not allowed then it will be converted to an
 	 *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
 	 */
 	return (UNHANDLED);
 }
 
 /*
  * Translate bits 31:30 of a task-switch exit qualification into a
  * task_switch_reason: 0 = CALL, 1 = IRET, 2 = JMP, 3 = IDT gate.
  */
 static enum task_switch_reason
 vmx_task_switch_reason(uint64_t qual)
 {
 	static const enum task_switch_reason reasons[] = {
 		TSR_CALL,
 		TSR_IRET,
 		TSR_JMP,
 		TSR_IDT_GATE,
 	};
 	int reason;
 
 	reason = (qual >> 30) & 0x3;
 	if (reason < 0 ||
 	    reason >= (int)(sizeof(reasons) / sizeof(reasons[0])))
 		panic("%s: invalid reason %d", __func__, reason);
 
 	return (reasons[reason]);
 }
 
 /*
  * Emulate a guest WRMSR of 'val' to MSR 'num'.  MSRs for which
  * lapic_msr() is true are routed to lapic_wrmsr(); all others go to
  * vmx_wrmsr().  The callee's return value is passed through, and the
  * callee may set '*retu'.
  */
 static int
 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
 {
 
 	if (!lapic_msr(num))
 		return (vmx_wrmsr(vmx, vcpuid, num, val, retu));
 
 	return (lapic_wrmsr(vmx->vm, vcpuid, num, val, retu));
 }
 
 /*
  * Emulate a guest RDMSR of MSR 'num'.  MSRs for which lapic_msr() is
  * true are read with lapic_rdmsr(); all others with vmx_rdmsr().
  *
  * On success the low 32 bits of the result are stored in the guest %rax
  * and the high 32 bits in the guest %rdx.  On failure the error from the
  * read is returned and the guest registers are left alone.
  */
 static int
 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
 {
 	struct vmxctx *ctx;
 	uint64_t msrval;
 	uint32_t lo, hi;
 	int rc;
 
 	rc = lapic_msr(num) ?
 	    lapic_rdmsr(vmx->vm, vcpuid, num, &msrval, retu) :
 	    vmx_rdmsr(vmx, vcpuid, num, &msrval, retu);
 	if (rc != 0)
 		return (rc);
 
 	ctx = &vmx->ctx[vcpuid];
 
 	lo = msrval;
 	rc = vmxctx_setreg(ctx, VM_REG_GUEST_RAX, lo);
 	KASSERT(rc == 0, ("vmxctx_setreg(rax) error %d", rc));
 
 	hi = msrval >> 32;
 	rc = vmxctx_setreg(ctx, VM_REG_GUEST_RDX, hi);
 	KASSERT(rc == 0, ("vmxctx_setreg(rdx) error %d", rc));
 
 	return (rc);
 }
 
 static int
 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 {
 	int error, errcode, errcode_valid, handled, in;
 	struct vmxctx *vmxctx;
 	struct vlapic *vlapic;
 	struct vm_inout_str *vis;
 	struct vm_task_switch *ts;
 	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
 	uint32_t intr_type, intr_vec, reason;
 	uint64_t exitintinfo, qual, gpa;
 	bool retu;
 
 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
 
 	handled = UNHANDLED;
 	vmxctx = &vmx->ctx[vcpu];
 
 	qual = vmexit->u.vmx.exit_qualification;
 	reason = vmexit->u.vmx.exit_reason;
 	vmexit->exitcode = VM_EXITCODE_BOGUS;
 
 	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
 	SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit);
 
 	/*
 	 * VM-entry failures during or after loading guest state.
 	 *
 	 * These VM-exits are uncommon but must be handled specially
 	 * as most VM-exit fields are not populated as usual.
 	 */
 	if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
 		VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
 		__asm __volatile("int $18");
 		return (1);
 	}
 
 	/*
 	 * VM exits that can be triggered during event delivery need to
 	 * be handled specially by re-injecting the event if the IDT
 	 * vectoring information field's valid bit is set.
 	 *
 	 * See "Information for VM Exits During Event Delivery" in Intel SDM
 	 * for details.
 	 */
 	idtvec_info = vmcs_idt_vectoring_info();
 	if (idtvec_info & VMCS_IDT_VEC_VALID) {
 		idtvec_info &= ~(1 << 12); /* clear undefined bit */
 		exitintinfo = idtvec_info;
 		if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 			idtvec_err = vmcs_idt_vectoring_err();
 			exitintinfo |= (uint64_t)idtvec_err << 32;
 		}
 		error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
 		KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
 		    __func__, error));
 
 		/*
 		 * If 'virtual NMIs' are being used and the VM-exit
 		 * happened while injecting an NMI during the previous
 		 * VM-entry, then clear "blocking by NMI" in the
 		 * Guest Interruptibility-State so the NMI can be
 		 * reinjected on the subsequent VM-entry.
 		 *
 		 * However, if the NMI was being delivered through a task
 		 * gate, then the new task must start execution with NMIs
 		 * blocked so don't clear NMI blocking in this case.
 		 */
 		intr_type = idtvec_info & VMCS_INTR_T_MASK;
 		if (intr_type == VMCS_INTR_T_NMI) {
 			if (reason != EXIT_REASON_TASK_SWITCH)
 				vmx_clear_nmi_blocking(vmx, vcpu);
 			else
 				vmx_assert_nmi_blocking(vmx, vcpu);
 		}
 
 		/*
 		 * Update VM-entry instruction length if the event being
 		 * delivered was a software interrupt or software exception.
 		 */
 		if (intr_type == VMCS_INTR_T_SWINTR ||
 		    intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
 		    intr_type == VMCS_INTR_T_SWEXCEPTION) {
 			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
 		}
 	}
 
 	switch (reason) {
 	case EXIT_REASON_TASK_SWITCH:
 		ts = &vmexit->u.task_switch;
 		ts->tsssel = qual & 0xffff;
 		ts->reason = vmx_task_switch_reason(qual);
 		ts->ext = 0;
 		ts->errcode_valid = 0;
 		vmx_paging_info(&ts->paging);
 		/*
 		 * If the task switch was due to a CALL, JMP, IRET, software
 		 * interrupt (INT n) or software exception (INT3, INTO),
 		 * then the saved %rip references the instruction that caused
 		 * the task switch. The instruction length field in the VMCS
 		 * is valid in this case.
 		 *
 		 * In all other cases (e.g., NMI, hardware exception) the
 		 * saved %rip is one that would have been saved in the old TSS
 		 * had the task switch completed normally so the instruction
 		 * length field is not needed in this case and is explicitly
 		 * set to 0.
 		 */
 		if (ts->reason == TSR_IDT_GATE) {
 			KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
 			    ("invalid idtvec_info %#x for IDT task switch",
 			    idtvec_info));
 			intr_type = idtvec_info & VMCS_INTR_T_MASK;
 			if (intr_type != VMCS_INTR_T_SWINTR &&
 			    intr_type != VMCS_INTR_T_SWEXCEPTION &&
 			    intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
 				/* Task switch triggered by external event */
 				ts->ext = 1;
 				vmexit->inst_length = 0;
 				if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 					ts->errcode_valid = 1;
 					ts->errcode = vmcs_idt_vectoring_err();
 				}
 			}
 		}
 		vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
 		SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts);
 		VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
 		    "%s errcode 0x%016lx", ts->reason, ts->tsssel,
 		    ts->ext ? "external" : "internal",
 		    ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
 		break;
 	case EXIT_REASON_CR_ACCESS:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
 		SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual);
 		switch (qual & 0xf) {
 		case 0:
 			handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
 			break;
 		case 4:
 			handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
 			break;
 		case 8:
 			handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
 			break;
 		}
 		break;
 	case EXIT_REASON_RDMSR:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
 		retu = false;
 		ecx = vmxctx->guest_rcx;
 		VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
 		SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx);
 		error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
 		if (error) {
 			vmexit->exitcode = VM_EXITCODE_RDMSR;
 			vmexit->u.msr.code = ecx;
 		} else if (!retu) {
 			handled = HANDLED;
 		} else {
 			/* Return to userspace with a valid exitcode */
 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 			    ("emulate_rdmsr retu with bogus exitcode"));
 		}
 		break;
 	case EXIT_REASON_WRMSR:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
 		retu = false;
 		eax = vmxctx->guest_rax;
 		ecx = vmxctx->guest_rcx;
 		edx = vmxctx->guest_rdx;
 		VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
 		    ecx, (uint64_t)edx << 32 | eax);
 		SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx,
 		    (uint64_t)edx << 32 | eax);
 		error = emulate_wrmsr(vmx, vcpu, ecx,
 		    (uint64_t)edx << 32 | eax, &retu);
 		if (error) {
 			vmexit->exitcode = VM_EXITCODE_WRMSR;
 			vmexit->u.msr.code = ecx;
 			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
 		} else if (!retu) {
 			handled = HANDLED;
 		} else {
 			/* Return to userspace with a valid exitcode */
 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 			    ("emulate_wrmsr retu with bogus exitcode"));
 		}
 		break;
 	case EXIT_REASON_HLT:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
 		SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit);
 		vmexit->exitcode = VM_EXITCODE_HLT;
 		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 		if (virtual_interrupt_delivery)
 			vmexit->u.hlt.intr_status =
 			    vmcs_read(VMCS_GUEST_INTR_STATUS);
 		else
 			vmexit->u.hlt.intr_status = 0;
 		break;
 	case EXIT_REASON_MTF:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
 		SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit);
 		vmexit->exitcode = VM_EXITCODE_MTRAP;
 		vmexit->inst_length = 0;
 		break;
 	case EXIT_REASON_PAUSE:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
 		SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit);
 		vmexit->exitcode = VM_EXITCODE_PAUSE;
 		break;
 	case EXIT_REASON_INTR_WINDOW:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
 		SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit);
 		vmx_clear_int_window_exiting(vmx, vcpu);
 		return (1);
 	case EXIT_REASON_EXT_INTR:
 		/*
 		 * External interrupts serve only to cause VM exits and allow
 		 * the host interrupt handler to run.
 		 *
 		 * If this external interrupt triggers a virtual interrupt
 		 * to a VM, then that state will be recorded by the
 		 * host interrupt handler in the VM's softc. We will inject
 		 * this virtual interrupt during the subsequent VM enter.
 		 */
 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 		SDT_PROBE4(vmm, vmx, exit, interrupt,
 		    vmx, vcpu, vmexit, intr_info);
 
 		/*
 		 * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
 		 * This appears to be a bug in VMware Fusion?
 		 */
 		if (!(intr_info & VMCS_INTR_VALID))
 			return (1);
 		KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
 		    (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
 		    ("VM exit interruption info invalid: %#x", intr_info));
 		vmx_trigger_hostintr(intr_info & 0xff);
 
 		/*
 		 * This is special. We want to treat this as an 'handled'
 		 * VM-exit but not increment the instruction pointer.
 		 */
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
 		return (1);
 	case EXIT_REASON_NMI_WINDOW:
 		SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit);
 		/* Exit to allow the pending virtual NMI to be injected */
 		if (vm_nmi_pending(vmx->vm, vcpu))
 			vmx_inject_nmi(vmx, vcpu);
 		vmx_clear_nmi_window_exiting(vmx, vcpu);
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
 		return (1);
 	case EXIT_REASON_INOUT:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
 		vmexit->exitcode = VM_EXITCODE_INOUT;
 		vmexit->u.inout.bytes = (qual & 0x7) + 1;
 		vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
 		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
 		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
 		vmexit->u.inout.port = (uint16_t)(qual >> 16);
 		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
 		if (vmexit->u.inout.string) {
 			inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
 			vmexit->exitcode = VM_EXITCODE_INOUT_STR;
 			vis = &vmexit->u.inout_str;
 			vmx_paging_info(&vis->paging);
 			vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 			vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
 			vis->index = inout_str_index(vmx, vcpu, in);
 			vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
 			vis->addrsize = inout_str_addrsize(inst_info);
 			inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
 		}
 		SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit);
 		break;
 	case EXIT_REASON_CPUID:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
 		SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit);
 		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
 		break;
 	case EXIT_REASON_EXCEPTION:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
 		    ("VM exit interruption info invalid: %#x", intr_info));
 
 		intr_vec = intr_info & 0xff;
 		intr_type = intr_info & VMCS_INTR_T_MASK;
 
 		/*
 		 * If Virtual NMIs control is 1 and the VM-exit is due to a
 		 * fault encountered during the execution of IRET then we must
 		 * restore the state of "virtual-NMI blocking" before resuming
 		 * the guest.
 		 *
 		 * See "Resuming Guest Software after Handling an Exception".
 		 * See "Information for VM Exits Due to Vectored Events".
 		 */
 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 		    (intr_vec != IDT_DF) &&
 		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
 			vmx_restore_nmi_blocking(vmx, vcpu);
 
 		/*
 		 * The NMI has already been handled in vmx_exit_handle_nmi().
 		 */
 		if (intr_type == VMCS_INTR_T_NMI)
 			return (1);
 
 		/*
 		 * Call the machine check handler by hand. Also don't reflect
 		 * the machine check back into the guest.
 		 */
 		if (intr_vec == IDT_MC) {
 			VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
 			__asm __volatile("int $18");
 			return (1);
 		}
 
 		/*
 		 * If the hypervisor has requested user exits for
 		 * debug exceptions, bounce them out to userland.
 		 */
 		if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP &&
 		    (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) {
 			vmexit->exitcode = VM_EXITCODE_BPT;
 			vmexit->u.bpt.inst_length = vmexit->inst_length;
 			vmexit->inst_length = 0;
 			break;
 		}
 
 		if (intr_vec == IDT_PF) {
 			error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
 			KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
 			    __func__, error));
 		}
 
 		/*
 		 * Software exceptions exhibit trap-like behavior. This in
 		 * turn requires populating the VM-entry instruction length
 		 * so that the %rip in the trap frame is past the INT3/INTO
 		 * instruction.
 		 */
 		if (intr_type == VMCS_INTR_T_SWEXCEPTION)
 			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
 
 		/* Reflect all other exceptions back into the guest */
 		errcode_valid = errcode = 0;
 		if (intr_info & VMCS_INTR_DEL_ERRCODE) {
 			errcode_valid = 1;
 			errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
 		}
 		VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
 		    "the guest", intr_vec, errcode);
 		SDT_PROBE5(vmm, vmx, exit, exception,
 		    vmx, vcpu, vmexit, intr_vec, errcode);
 		error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
 		    errcode_valid, errcode, 0);
 		KASSERT(error == 0, ("%s: vm_inject_exception error %d",
 		    __func__, error));
 		return (1);
 
 	case EXIT_REASON_EPT_FAULT:
 		/*
 		 * If 'gpa' lies within the address space allocated to
 		 * memory then this must be a nested page fault otherwise
 		 * this must be an instruction that accesses MMIO space.
 		 */
 		gpa = vmcs_gpa();
 		if (vm_mem_allocated(vmx->vm, vcpu, gpa) ||
 		    apic_access_fault(vmx, vcpu, gpa)) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->inst_length = 0;
 			vmexit->u.paging.gpa = gpa;
 			vmexit->u.paging.fault_type = ept_fault_type(qual);
 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
 			SDT_PROBE5(vmm, vmx, exit, nestedfault,
 			    vmx, vcpu, vmexit, gpa, qual);
 		} else if (ept_emulation_fault(qual)) {
 			vmexit_inst_emul(vmexit, gpa, vmcs_gla());
 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
 			SDT_PROBE4(vmm, vmx, exit, mmiofault,
 			    vmx, vcpu, vmexit, gpa);
 		}
 		/*
 		 * If Virtual NMIs control is 1 and the VM-exit is due to an
 		 * EPT fault during the execution of IRET then we must restore
 		 * the state of "virtual-NMI blocking" before resuming.
 		 *
 		 * See description of "NMI unblocking due to IRET" in
 		 * "Exit Qualification for EPT Violations".
 		 */
 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 		    (qual & EXIT_QUAL_NMIUDTI) != 0)
 			vmx_restore_nmi_blocking(vmx, vcpu);
 		break;
 	case EXIT_REASON_VIRTUALIZED_EOI:
 		vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
 		vmexit->u.ioapic_eoi.vector = qual & 0xFF;
 		SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit);
 		vmexit->inst_length = 0;	/* trap-like */
 		break;
 	case EXIT_REASON_APIC_ACCESS:
 		SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit);
 		handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
 		break;
 	case EXIT_REASON_APIC_WRITE:
 		/*
 		 * APIC-write VM exit is trap-like so the %rip is already
 		 * pointing to the next instruction.
 		 */
 		vmexit->inst_length = 0;
 		vlapic = vm_lapic(vmx->vm, vcpu);
 		SDT_PROBE4(vmm, vmx, exit, apicwrite,
 		    vmx, vcpu, vmexit, vlapic);
 		handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
 		break;
 	case EXIT_REASON_XSETBV:
 		SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit);
 		handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
 		break;
 	case EXIT_REASON_MONITOR:
 		SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit);
 		vmexit->exitcode = VM_EXITCODE_MONITOR;
 		break;
 	case EXIT_REASON_MWAIT:
 		SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit);
 		vmexit->exitcode = VM_EXITCODE_MWAIT;
 		break;
 	case EXIT_REASON_VMCALL:
 	case EXIT_REASON_VMCLEAR:
 	case EXIT_REASON_VMLAUNCH:
 	case EXIT_REASON_VMPTRLD:
 	case EXIT_REASON_VMPTRST:
 	case EXIT_REASON_VMREAD:
 	case EXIT_REASON_VMRESUME:
 	case EXIT_REASON_VMWRITE:
 	case EXIT_REASON_VMXOFF:
 	case EXIT_REASON_VMXON:
 		SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit);
 		vmexit->exitcode = VM_EXITCODE_VMINSN;
 		break;
 	default:
 		SDT_PROBE4(vmm, vmx, exit, unknown,
 		    vmx, vcpu, vmexit, reason);
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
 		break;
 	}
 
 	if (handled) {
 		/*
 		 * It is possible that control is returned to userland
 		 * even though we were able to handle the VM exit in the
 		 * kernel.
 		 *
 		 * In such a case we want to make sure that the userland
 		 * restarts guest execution at the instruction *after*
 		 * the one we just processed. Therefore we update the
 		 * guest rip in the VMCS and in 'vmexit'.
 		 */
 		vmexit->rip += vmexit->inst_length;
 		vmexit->inst_length = 0;
 		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
 	} else {
 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
 			/*
 			 * If this VM exit was not claimed by anybody then
 			 * treat it as a generic VMX exit.
 			 */
 			vmexit->exitcode = VM_EXITCODE_VMX;
 			vmexit->u.vmx.status = VM_SUCCESS;
 			vmexit->u.vmx.inst_type = 0;
 			vmexit->u.vmx.inst_error = 0;
 		} else {
 			/*
 			 * The exitcode and collateral have been populated.
 			 * The VM exit will be processed further in userland.
 			 */
 		}
 	}
 
 	SDT_PROBE4(vmm, vmx, exit, return,
 	    vmx, vcpu, vmexit, handled);
 	return (handled);
 }
 
 /*
  * Populate 'vmexit' after vmx_enter_guest() failed with a VMX
  * instruction error.
  *
  * 'rc' is the return value of vmx_enter_guest() and identifies the
  * failing instruction (VMRESUME, VMLAUNCH or INVEPT); any other value
  * panics.  The exit is reported as VM_EXITCODE_VMX with the failure
  * status from 'vmxctx', the VM-instruction error read from the VMCS, and
  * the exit reason and qualification set to all ones.
  */
 static __inline void
 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
 {
 
 	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
 	    ("vmx_exit_inst_error: invalid inst_fail_status %d",
 	    vmxctx->inst_fail_status));
 
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_VMX;
 	vmexit->u.vmx.status = vmxctx->inst_fail_status;
 	vmexit->u.vmx.inst_error = vmcs_instruction_error();
 	vmexit->u.vmx.exit_reason = ~0;
 	vmexit->u.vmx.exit_qualification = ~0;
 
 	switch (rc) {
 	case VMX_VMRESUME_ERROR:
 	case VMX_VMLAUNCH_ERROR:
 	case VMX_INVEPT_ERROR:
 		vmexit->u.vmx.inst_type = rc;
 		break;
 	default:
 		/* Use __func__ so the message names this function correctly. */
 		panic("%s: vmx_enter_guest returned %d", __func__, rc);
 	}
 }
 
 /*
  * Vector to the host NMI handler when the VM-exit was caused by an NMI.
  *
  * If the NMI-exiting VM execution control is set to '1' then an NMI in
  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
  * sufficient to simply vector to the NMI handler via a software interrupt.
  * However, this must be done before maskable interrupts are enabled
  * otherwise the "iret" issued by an interrupt handler will incorrectly
  * clear NMI blocking.
  *
  * Must be called with interrupts disabled (asserted below).
  */
 static __inline void
 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 {
 	uint32_t info;
 
 	KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
 
 	/* Nothing to do unless the exit reason is an exception. */
 	if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
 		return;
 
 	info = vmcs_read(VMCS_EXIT_INTR_INFO);
 	KASSERT((info & VMCS_INTR_VALID) != 0,
 	    ("VM exit interruption info invalid: %#x", info));
 
 	/* Exceptions of any type other than NMI are ignored here. */
 	if ((info & VMCS_INTR_T_MASK) != VMCS_INTR_T_NMI)
 		return;
 
 	KASSERT((info & 0xff) == IDT_NMI, ("VM exit due "
 	    "to NMI has invalid vector: %#x", info));
 	VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
 	__asm __volatile("int $2");
 }
 
 /*
  * Switch the debug-register state from host to guest.
  *
  * The host DR7, DEBUGCTL MSR, PSL_T flag and DR0-DR3/DR6 are saved in
  * 'vmxctx', host debugging is turned off (DR7 and DEBUGCTL zeroed, PSL_T
  * cleared), and the guest DR0-DR3/DR6 values held in 'vmxctx' are loaded
  * into the hardware registers.
  *
  * The order of the steps below is deliberate: host debugging is disabled
  * before any guest DRx value is loaded.
  */
 static __inline void
 vmx_dr_enter_guest(struct vmxctx *vmxctx)
 {
 	register_t rflags;
 
 	/* Save host control debug registers. */
 	vmxctx->host_dr7 = rdr7();
 	vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
 
 	/*
 	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
 	 * exceptions in the host based on the guest DRx values.  The
 	 * guest DR7 and DEBUGCTL are saved/restored in the VMCS.
 	 */
 	load_dr7(0);
 	wrmsr(MSR_DEBUGCTLMSR, 0);
 
 	/*
 	 * Disable single stepping the kernel to avoid corrupting the
 	 * guest DR6.  A debugger might still be able to corrupt the
 	 * guest DR6 by setting a breakpoint after this point and then
 	 * single stepping.
 	 */
 	rflags = read_rflags();
 	/* Remember only the trap flag so it can be OR-ed back in later. */
 	vmxctx->host_tf = rflags & PSL_T;
 	write_rflags(rflags & ~PSL_T);
 
 	/* Save host debug registers. */
 	vmxctx->host_dr0 = rdr0();
 	vmxctx->host_dr1 = rdr1();
 	vmxctx->host_dr2 = rdr2();
 	vmxctx->host_dr3 = rdr3();
 	vmxctx->host_dr6 = rdr6();
 
 	/* Restore guest debug registers. */
 	load_dr0(vmxctx->guest_dr0);
 	load_dr1(vmxctx->guest_dr1);
 	load_dr2(vmxctx->guest_dr2);
 	load_dr3(vmxctx->guest_dr3);
 	load_dr6(vmxctx->guest_dr6);
 }
 
 /*
  * Switch the debug-register state from guest back to host.
  *
  * The current DR0-DR3/DR6 are saved into 'vmxctx' as the guest values,
  * then the host values saved in 'vmxctx' are reloaded.  The host
  * DEBUGCTL, DR7 and trap flag are restored last, in that order.
  */
 static __inline void
 vmx_dr_leave_guest(struct vmxctx *vmxctx)
 {
 
 	/* Save guest debug registers. */
 	vmxctx->guest_dr0 = rdr0();
 	vmxctx->guest_dr1 = rdr1();
 	vmxctx->guest_dr2 = rdr2();
 	vmxctx->guest_dr3 = rdr3();
 	vmxctx->guest_dr6 = rdr6();
 
 	/*
 	 * Restore host debug registers.  Restore DR7, DEBUGCTL, and
 	 * PSL_T last.
 	 */
 	load_dr0(vmxctx->host_dr0);
 	load_dr1(vmxctx->host_dr1);
 	load_dr2(vmxctx->host_dr2);
 	load_dr3(vmxctx->host_dr3);
 	load_dr6(vmxctx->host_dr6);
 	wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl);
 	load_dr7(vmxctx->host_dr7);
 	/* host_tf holds only the PSL_T bit, so OR-ing it in restores TF. */
 	write_rflags(read_rflags() | vmxctx->host_tf);
 }
 
 /*
  * Run guest vcpu 'vcpu' starting at 'rip' and keep re-entering the guest
  * for as long as each VM exit is handled in the kernel.
  *
  * 'arg' is the per-VM 'struct vmx', 'pmap' must match the pmap recorded in
  * the vcpu context, and 'evinfo' is consulted for suspend/rendezvous/reqidle
  * requests before every guest entry.
  *
  * Always returns 0; the reason for returning is reported through the
  * 'struct vm_exit' obtained from vm_exitinfo().
  */
 static int
 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
     struct vm_eventinfo *evinfo)
 {
 	int rc, handled, launched;
 	struct vmx *vmx;
 	struct vm *vm;
 	struct vmxctx *vmxctx;
 	struct vmcs *vmcs;
 	struct vm_exit *vmexit;
 	struct vlapic *vlapic;
 	uint32_t exit_reason;
 	struct region_descriptor gdtr, idtr;
 	uint16_t ldt_sel;
 
 	vmx = arg;
 	vm = vmx->vm;
 	vmcs = &vmx->vmcs[vcpu];
 	vmxctx = &vmx->ctx[vcpu];
 	vlapic = vm_lapic(vm, vcpu);
 	vmexit = vm_exitinfo(vm, vcpu);
 	/*
 	 * 'launched' is 0 for the first guest entry of this call and 1 for
 	 * every later one; it is passed through to vmx_enter_guest().
 	 */
 	launched = 0;
 
 	KASSERT(vmxctx->pmap == pmap,
 	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
 
 	vmx_msr_guest_enter(vmx, vcpu);
 
 	VMPTRLD(vmcs);
 
 	/*
 	 * XXX
 	 * We do this every time because we may setup the virtual machine
 	 * from a different process than the one that actually runs it.
 	 *
 	 * If the life of a virtual machine was spent entirely in the context
 	 * of a single process we could do this once in vmx_vminit().
 	 */
 	vmcs_write(VMCS_HOST_CR3, rcr3());
 
 	vmcs_write(VMCS_GUEST_RIP, rip);
 	vmx_set_pcpu_defaults(vmx, vcpu, pmap);
 	do {
 		KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
 		    "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
 
 		handled = UNHANDLED;
 		/*
 		 * Interrupts are disabled from this point on until the
 		 * guest starts executing. This is done for the following
 		 * reasons:
 		 *
 		 * If an AST is asserted on this thread after the check below,
 		 * then the IPI_AST notification will not be lost, because it
 		 * will cause a VM exit due to external interrupt as soon as
 		 * the guest state is loaded.
 		 *
 		 * A posted interrupt after 'vmx_inject_interrupts()' will
 		 * not be "lost" because it will be held pending in the host
 		 * APIC because interrupts are disabled. The pending interrupt
 		 * will be recognized as soon as the guest state is loaded.
 		 *
 		 * The same reasoning applies to the IPI generated by
 		 * pmap_invalidate_ept().
 		 */
 		disable_intr();
 		vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
 
 		/*
 		 * Check for vcpu suspension after injecting events because
 		 * vmx_inject_interrupts() can suspend the vcpu due to a
 		 * triple fault.
 		 */
 		if (vcpu_suspended(evinfo)) {
 			enable_intr();
 			vm_exit_suspended(vmx->vm, vcpu, rip);
 			break;
 		}
 
 		if (vcpu_rendezvous_pending(evinfo)) {
 			enable_intr();
 			vm_exit_rendezvous(vmx->vm, vcpu, rip);
 			break;
 		}
 
 		if (vcpu_reqidle(evinfo)) {
 			enable_intr();
 			vm_exit_reqidle(vmx->vm, vcpu, rip);
 			break;
 		}
 
 		/*
 		 * This is the only early exit from the loop that reports
 		 * HANDLED.  NOTE(review): the consistency check after the
 		 * loop then requires vm_exit_astpending() to leave the
 		 * exitcode as VM_EXITCODE_BOGUS -- confirm in vmm.c.
 		 */
 		if (vcpu_should_yield(vm, vcpu)) {
 			enable_intr();
 			vm_exit_astpending(vmx->vm, vcpu, rip);
 			vmx_astpending_trace(vmx, vcpu, rip);
 			handled = HANDLED;
 			break;
 		}
 
 		if (vcpu_debugged(vm, vcpu)) {
 			enable_intr();
 			vm_exit_debug(vmx->vm, vcpu, rip);
 			break;
 		}
 
 		/*
 		 * VM exits restore the base address but not the
 		 * limits of GDTR and IDTR.  The VMCS only stores the
 		 * base address, so VM exits set the limits to 0xffff.
 		 * Save and restore the full GDTR and IDTR to restore
 		 * the limits.
 		 *
 		 * The VMCS does not save the LDTR at all, and VM
 		 * exits clear LDTR as if a NULL selector were loaded.
 		 * The userspace hypervisor probably doesn't use a
 		 * LDT, but save and restore it to be safe.
 		 */
 		sgdt(&gdtr);
 		sidt(&idtr);
 		ldt_sel = sldt();
 
 		vmx_run_trace(vmx, vcpu);
 		vmx_dr_enter_guest(vmxctx);
 		rc = vmx_enter_guest(vmxctx, vmx, launched);
 		vmx_dr_leave_guest(vmxctx);
 
 		bare_lgdt(&gdtr);
 		lidt(&idtr);
 		lldt(ldt_sel);
 
 		/* Collect some information for VM exit processing */
 		vmexit->rip = rip = vmcs_guest_rip();
 		vmexit->inst_length = vmexit_instruction_length();
 		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
 		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
 
 		/* Update 'nextrip' */
 		vmx->state[vcpu].nextrip = rip;
 
 		/*
 		 * Interrupts are re-enabled on both paths; on a real VM exit
 		 * vmx_exit_handle_nmi() runs first, while they are still
 		 * disabled.
 		 */
 		if (rc == VMX_GUEST_VMEXIT) {
 			vmx_exit_handle_nmi(vmx, vcpu, vmexit);
 			enable_intr();
 			handled = vmx_exit_process(vmx, vcpu, vmexit);
 		} else {
 			enable_intr();
 			vmx_exit_inst_error(vmxctx, rc, vmexit);
 		}
 		launched = 1;
 		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
 		/*
 		 * Resume from whatever rip the exit processing left in
 		 * 'vmexit' (it may differ from the rip read from the VMCS).
 		 */
 		rip = vmexit->rip;
 	} while (handled);
 
 	/*
 	 * If a VM exit has been handled then the exitcode must be BOGUS
 	 * If a VM exit is not handled then the exitcode must not be BOGUS
 	 */
 	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
 	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
 		panic("Mismatch between handled (%d) and exitcode (%d)",
 		      handled, vmexit->exitcode);
 	}
 
 	if (!handled)
 		vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
 
 	VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
 	    vmexit->exitcode);
 
 	/* Undo the VMPTRLD() and vmx_msr_guest_enter() done on entry. */
 	VMCLEAR(vmcs);
 	vmx_msr_guest_exit(vmx, vcpu);
 
 	return (0);
 }
 
 /*
  * Tear down the per-VM VMX state passed in 'arg': unmap the APIC access
  * page if APIC access virtualization is in use for vcpu 0, give back the
  * VPID of every possible vcpu and release the 'struct vmx' itself.
  */
 static void
 vmx_vmcleanup(void *arg)
 {
 	struct vmx *sc;
 	uint16_t ncpus;
 	int cpu;
 
 	sc = arg;
 
 	if (apic_access_virtualization(sc, 0))
 		vm_unmap_mmio(sc->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 
 	ncpus = vm_get_maxcpus(sc->vm);
 	for (cpu = 0; cpu < ncpus; cpu++)
 		vpid_free(sc->state[cpu].vpid);
 
 	free(sc, M_VMX);
 }
 
 /*
  * Map a VM_REG_GUEST_* identifier onto the field of 'vmxctx' that holds
  * that register.  Returns NULL for registers that are not kept in the
  * software context.
  */
 static register_t *
 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
 {
 	register_t *slot;
 
 	switch (reg) {
 	case VM_REG_GUEST_RAX:
 		slot = &vmxctx->guest_rax;
 		break;
 	case VM_REG_GUEST_RBX:
 		slot = &vmxctx->guest_rbx;
 		break;
 	case VM_REG_GUEST_RCX:
 		slot = &vmxctx->guest_rcx;
 		break;
 	case VM_REG_GUEST_RDX:
 		slot = &vmxctx->guest_rdx;
 		break;
 	case VM_REG_GUEST_RSI:
 		slot = &vmxctx->guest_rsi;
 		break;
 	case VM_REG_GUEST_RDI:
 		slot = &vmxctx->guest_rdi;
 		break;
 	case VM_REG_GUEST_RBP:
 		slot = &vmxctx->guest_rbp;
 		break;
 	case VM_REG_GUEST_R8:
 		slot = &vmxctx->guest_r8;
 		break;
 	case VM_REG_GUEST_R9:
 		slot = &vmxctx->guest_r9;
 		break;
 	case VM_REG_GUEST_R10:
 		slot = &vmxctx->guest_r10;
 		break;
 	case VM_REG_GUEST_R11:
 		slot = &vmxctx->guest_r11;
 		break;
 	case VM_REG_GUEST_R12:
 		slot = &vmxctx->guest_r12;
 		break;
 	case VM_REG_GUEST_R13:
 		slot = &vmxctx->guest_r13;
 		break;
 	case VM_REG_GUEST_R14:
 		slot = &vmxctx->guest_r14;
 		break;
 	case VM_REG_GUEST_R15:
 		slot = &vmxctx->guest_r15;
 		break;
 	case VM_REG_GUEST_CR2:
 		slot = &vmxctx->guest_cr2;
 		break;
 	case VM_REG_GUEST_DR0:
 		slot = &vmxctx->guest_dr0;
 		break;
 	case VM_REG_GUEST_DR1:
 		slot = &vmxctx->guest_dr1;
 		break;
 	case VM_REG_GUEST_DR2:
 		slot = &vmxctx->guest_dr2;
 		break;
 	case VM_REG_GUEST_DR3:
 		slot = &vmxctx->guest_dr3;
 		break;
 	case VM_REG_GUEST_DR6:
 		slot = &vmxctx->guest_dr6;
 		break;
 	default:
 		slot = NULL;
 		break;
 	}
 	return (slot);
 }
 
 /*
  * Read register 'reg' from the software context into '*retval'.
  * Returns 0 on success or EINVAL if 'reg' is not held in 'vmxctx'.
  */
 static int
 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
 {
 	register_t *slot;
 
 	slot = vmxctx_regptr(vmxctx, reg);
 	if (slot == NULL)
 		return (EINVAL);
 
 	*retval = *slot;
 	return (0);
 }
 
 /*
  * Store 'val' into register 'reg' of the software context.
  * Returns 0 on success or EINVAL if 'reg' is not held in 'vmxctx'.
  */
 static int
 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
 {
 	register_t *slot;
 
 	slot = vmxctx_regptr(vmxctx, reg);
 	if (slot == NULL)
 		return (EINVAL);
 
 	*slot = val;
 	return (0);
 }
 
 /*
  * Report whether the vcpu is currently in an interrupt shadow.
  *
  * Reads the guest interruptibility field from the vcpu's VMCS and stores
  * 1 in '*retval' if any HWINTR_BLOCKING bit is set, 0 otherwise.  Returns
  * the error from vmcs_getreg(); on failure '*retval' is set to 0.
  */
 static int
 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
 {
 	uint64_t gi;
 	int error;
 
 	/*
 	 * Start from a known value so that a failed vmcs_getreg() does not
 	 * make us derive '*retval' from an uninitialized variable.
 	 */
 	gi = 0;
 	error = vmcs_getreg(&vmx->vmcs[vcpu], running,
 	    VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
 	*retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
 	return (error);
 }
 
 /*
  * Clear the vcpu's interrupt shadow.
  *
  * Only 'val' == 0 is accepted; asking for the shadow to be set fails with
  * EINVAL.  Otherwise the HWINTR_BLOCKING bits are removed from the guest
  * interruptibility field and the result of the VMCS access is returned.
  */
 static int
 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
 {
 	struct vmcs *vmcs;
 	uint64_t intr_state;
 	int field, rv;
 
 	if (val != 0) {
 		/* Forcing the vcpu into an interrupt shadow is not supported. */
 		rv = EINVAL;
 	} else {
 		vmcs = &vmx->vmcs[vcpu];
 		field = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
 		rv = vmcs_getreg(vmcs, running, field, &intr_state);
 		if (rv == 0) {
 			intr_state &= ~HWINTR_BLOCKING;
 			rv = vmcs_setreg(vmcs, running, field, intr_state);
 		}
 	}
 
 	VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
 	    rv ? "failed" : "succeeded");
 	return (rv);
 }
 
 /*
  * Return the VMCS read-shadow field that accompanies guest register 'reg',
  * or -1 if the register has no shadow.
  */
 static int
 vmx_shadow_reg(int reg)
 {
 
 	switch (reg) {
 	case VM_REG_GUEST_CR0:
 		return (VMCS_CR0_SHADOW);
 	case VM_REG_GUEST_CR4:
 		return (VMCS_CR4_SHADOW);
 	default:
 		return (-1);
 	}
 }
 
 /*
  * Fetch guest register 'reg' of 'vcpu' into '*retval'.
  *
  * The interrupt shadow pseudo-register is handled first, then registers
  * kept in the software context, and finally the VMCS.  Panics if the vcpu
  * is running on a cpu other than the current one.
  */
 static int
 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
 {
 	struct vmx *sc;
 	int active, cpu;
 
 	sc = arg;
 	active = vcpu_is_running(sc->vm, vcpu, &cpu);
 	if (active && cpu != curcpu)
 		panic("vmx_getreg: %s%d is running", vm_name(sc->vm), vcpu);
 
 	if (reg == VM_REG_GUEST_INTR_SHADOW)
 		return (vmx_get_intr_shadow(sc, vcpu, active, retval));
 
 	if (vmxctx_getreg(&sc->ctx[vcpu], reg, retval) != 0)
 		return (vmcs_getreg(&sc->vmcs[vcpu], active, reg, retval));
 
 	return (0);
 }
 
 /*
  * Set guest register 'reg' of 'vcpu' to 'val'.
  *
  * The interrupt shadow pseudo-register and registers kept in the software
  * context are handled directly.  Everything else is written to the VMCS,
  * followed by the side effects that keep related VMCS state consistent
  * (EFER.LMA entry control, CR0/CR4 read shadows, TLB invalidation on CR3).
  * Panics if the vcpu is running on a cpu other than the current one.
  */
 static int
 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
 {
 	struct vmx *sc;
 	struct vmcs *vmcs;
 	uint64_t entry;
 	int active, cpu, rv, shreg;
 
 	sc = arg;
 	active = vcpu_is_running(sc->vm, vcpu, &cpu);
 	if (active && cpu != curcpu)
 		panic("vmx_setreg: %s%d is running", vm_name(sc->vm), vcpu);
 
 	if (reg == VM_REG_GUEST_INTR_SHADOW)
 		return (vmx_modify_intr_shadow(sc, vcpu, active, val));
 
 	if (vmxctx_setreg(&sc->ctx[vcpu], reg, val) == 0)
 		return (0);
 
 	vmcs = &sc->vmcs[vcpu];
 	rv = vmcs_setreg(vmcs, active, reg, val);
 	if (rv != 0)
 		return (rv);
 
 	/*
 	 * If the "load EFER" VM-entry control is 1 then the value of
 	 * EFER.LMA must be identical to "IA-32e mode guest" bit in the
 	 * VM-entry control.
 	 */
 	if (reg == VM_REG_GUEST_EFER &&
 	    (entry_ctls & VM_ENTRY_LOAD_EFER) != 0) {
 		vmcs_getreg(vmcs, active, VMCS_IDENT(VMCS_ENTRY_CTLS), &entry);
 		if (val & EFER_LMA)
 			entry |= VM_ENTRY_GUEST_LMA;
 		else
 			entry &= ~VM_ENTRY_GUEST_LMA;
 		vmcs_setreg(vmcs, active, VMCS_IDENT(VMCS_ENTRY_CTLS), entry);
 	}
 
 	/* Store the unmodified value in the shadow, if there is one. */
 	shreg = vmx_shadow_reg(reg);
 	if (shreg > 0)
 		rv = vmcs_setreg(vmcs, active, VMCS_IDENT(shreg), val);
 
 	/*
 	 * Invalidate the guest vcpu's TLB mappings to emulate the behavior
 	 * of updating %cr3.
 	 *
 	 * XXX the processor retains global mappings when %cr3 is updated
 	 * but vmx_invvpid() does not.
 	 */
 	if (reg == VM_REG_GUEST_CR3)
 		vmx_invvpid(sc, vcpu, sc->ctx[vcpu].pmap, active);
 
 	return (rv);
 }
 
 /*
  * Read segment descriptor 'reg' of 'vcpu' from its VMCS into '*desc'.
  * Panics if the vcpu is running on a cpu other than the current one.
  */
 static int
 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 {
 	struct vmx *sc;
 	int active, cpu;
 
 	sc = arg;
 	active = vcpu_is_running(sc->vm, vcpu, &cpu);
 	if (active && cpu != curcpu)
 		panic("vmx_getdesc: %s%d is running", vm_name(sc->vm), vcpu);
 
 	return (vmcs_getdesc(&sc->vmcs[vcpu], active, reg, desc));
 }
 
 /*
  * Write segment descriptor 'reg' of 'vcpu' from '*desc' into its VMCS.
  * Panics if the vcpu is running on a cpu other than the current one.
  */
 static int
 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 {
 	struct vmx *sc;
 	int active, cpu;
 
 	sc = arg;
 	active = vcpu_is_running(sc->vm, vcpu, &cpu);
 	if (active && cpu != curcpu)
 		panic("vmx_setdesc: %s%d is running", vm_name(sc->vm), vcpu);
 
 	return (vmcs_setdesc(&sc->vmcs[vcpu], active, reg, desc));
 }
 
 /*
  * Query capability 'type' for 'vcpu'.
  *
  * Returns ENOENT if the capability is unknown or not available on this
  * host.  Otherwise returns 0 and stores in '*retval' whether the
  * capability is currently enabled (1) or disabled (0) for the vcpu.
  */
 static int
 vmx_getcap(void *arg, int vcpu, int type, int *retval)
 {
 	struct vmx *sc;
 	int supported;
 
 	sc = arg;
 
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		supported = (cap_halt_exit != 0);
 		break;
 	case VM_CAP_PAUSE_EXIT:
 		supported = (cap_pause_exit != 0);
 		break;
 	case VM_CAP_MTRAP_EXIT:
 		supported = (cap_monitor_trap != 0);
 		break;
 	case VM_CAP_UNRESTRICTED_GUEST:
 		supported = (cap_unrestricted_guest != 0);
 		break;
 	case VM_CAP_ENABLE_INVPCID:
 		supported = (cap_invpcid != 0);
 		break;
 	case VM_CAP_BPT_EXIT:
 		/* Breakpoint exits do not depend on a host capability flag. */
 		supported = 1;
 		break;
 	default:
 		supported = 0;
 		break;
 	}
 
 	if (!supported)
 		return (ENOENT);
 
 	*retval = (sc->cap[vcpu].set & (1 << type)) ? 1 : 0;
 	return (0);
 }
 
 /*
  * Enable ('val' != 0) or disable ('val' == 0) capability 'type' on 'vcpu'.
  *
  * Returns ENOENT if the capability is unknown or not available on this
  * host, or the vmwrite() error if updating the VMCS fails.  On success the
  * cached control word and the per-vcpu 'set' bitmask are updated and 0 is
  * returned.
  */
 static int
 vmx_setcap(void *arg, int vcpu, int type, int val)
 {
 	struct vmx *sc = arg;
 	struct vmcs *vmcs = &sc->vmcs[vcpu];
 	uint32_t *ctlp;
 	uint32_t newval;
 	int bit, field, rv, supported;
 
 	ctlp = NULL;
 	bit = 0;
 	field = 0;
 	supported = 0;
 
 	/*
 	 * Pick the cached control word, the bit within it and the VMCS
 	 * field that implement the requested capability.
 	 */
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		if (cap_halt_exit) {
 			supported = 1;
 			ctlp = &sc->cap[vcpu].proc_ctls;
 			bit = PROCBASED_HLT_EXITING;
 			field = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_MTRAP_EXIT:
 		if (cap_monitor_trap) {
 			supported = 1;
 			ctlp = &sc->cap[vcpu].proc_ctls;
 			bit = PROCBASED_MTF;
 			field = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_PAUSE_EXIT:
 		if (cap_pause_exit) {
 			supported = 1;
 			ctlp = &sc->cap[vcpu].proc_ctls;
 			bit = PROCBASED_PAUSE_EXITING;
 			field = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_UNRESTRICTED_GUEST:
 		if (cap_unrestricted_guest) {
 			supported = 1;
 			ctlp = &sc->cap[vcpu].proc_ctls2;
 			bit = PROCBASED2_UNRESTRICTED_GUEST;
 			field = VMCS_SEC_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_ENABLE_INVPCID:
 		if (cap_invpcid) {
 			supported = 1;
 			ctlp = &sc->cap[vcpu].proc_ctls2;
 			bit = PROCBASED2_ENABLE_INVPCID;
 			field = VMCS_SEC_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_BPT_EXIT:
 		supported = 1;
 
 		/* Don't change the bitmap if we are tracing all exceptions. */
 		if (sc->cap[vcpu].exc_bitmap != 0xffffffff) {
 			ctlp = &sc->cap[vcpu].exc_bitmap;
 			bit = (1 << IDT_BP);
 			field = VMCS_EXCEPTION_BITMAP;
 		}
 		break;
 	default:
 		break;
 	}
 
 	if (!supported)
 		return (ENOENT);
 
 	if (ctlp != NULL) {
 		newval = val ? (*ctlp | bit) : (*ctlp & ~bit);
 
 		VMPTRLD(vmcs);
 		rv = vmwrite(field, newval);
 		VMCLEAR(vmcs);
 
 		if (rv)
 			return (rv);
 
 		/* The VMCS accepted the value; remember it. */
 		*ctlp = newval;
 	}
 
 	/* Record the requested setting for vmx_getcap(). */
 	if (val)
 		sc->cap[vcpu].set |= (1 << type);
 	else
 		sc->cap[vcpu].set &= ~(1 << type);
 
 	return (0);
 }
 
 /*
  * VT-x specific wrapper around the generic vlapic.  'vlapic' must stay the
  * first member because the code in this file casts between
  * 'struct vlapic *' and 'struct vlapic_vtx *'.
  */
 struct vlapic_vtx {
 	struct vlapic	vlapic;
 	struct pir_desc	*pir_desc;	/* posted interrupt descriptor of the vcpu */
 	struct vmx	*vmx;		/* owning per-VM VMX state */
 	u_int	pending_prio;		/* VPR_PRIO_BIT()s seen while 'pending' is set */
 };
 
 /*
  * Bit representing the priority class of 'vpr': the class is the upper
  * nibble (vpr >> 4) and it selects one bit in a bitmask.
  */
 #define VPR_PRIO_BIT(vpr)	(1 << ((vpr) >> 4))
 
 /*
  * Trace the contents of a posted interrupt descriptor along with the vector
  * being asserted and whether a notification will be sent.  'msg' must be a
  * string literal because it is concatenated with the format strings.
  */
 #define	VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)	\
 do {									\
 	VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",	\
 	    level ? "level" : "edge", vector);				\
 	VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);	\
 	VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);	\
 	VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);	\
 	VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);	\
 	VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
 } while (0)
 
 /*
  * vlapic->ops handlers that utilize the APICv hardware assist described in
  * Chapter 29 of the Intel SDM.
  */
 /*
  * Record 'vector' as pending in the vcpu's posted interrupt descriptor.
  *
  * Returns 1 if the caller should notify the vcpu about the new interrupt
  * and 0 if no notification is needed.  'level' is only used for tracing
  * here.
  */
 static int
 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct pir_desc *pir_desc;
 	uint64_t mask;
 	int idx, notify = 0;
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	pir_desc = vlapic_vtx->pir_desc;
 
 	/*
 	 * Keep track of interrupt requests in the PIR descriptor. This is
 	 * because the virtual APIC page pointed to by the VMCS cannot be
 	 * modified if the vcpu is running.
 	 */
 	idx = vector / 64;
 	mask = 1UL << (vector % 64);
 	atomic_set_long(&pir_desc->pir[idx], mask);
 
 	/*
 	 * A notification is required whenever the 'pending' bit makes a
 	 * transition from 0->1.
 	 *
 	 * Even if the 'pending' bit is already asserted, notification about
 	 * the incoming interrupt may still be necessary.  For example, if a
 	 * vCPU is HLTed with a high PPR, a low priority interrupt would cause
 	 * the 0->1 'pending' transition with a notification, but the vCPU
 	 * would ignore the interrupt for the time being.  The same vCPU would
 	 * need to then be notified if a high-priority interrupt arrived which
 	 * satisfied the PPR.
 	 *
 	 * The priorities of interrupts injected while 'pending' is asserted
 	 * are tracked in a custom bitfield 'pending_prio'.  Should the
 	 * to-be-injected interrupt exceed the priorities already present, the
 	 * notification is sent.  The priorities recorded in 'pending_prio' are
 	 * cleared whenever the 'pending' bit makes another 0->1 transition.
 	 */
 	if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) {
 		/* We performed the 0->1 transition of 'pending'. */
 		notify = 1;
 		vlapic_vtx->pending_prio = 0;
 	} else {
 		const u_int old_prio = vlapic_vtx->pending_prio;
 		const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT);
 
 		/*
 		 * Notify only if this priority class has not been recorded
 		 * yet and it is higher than every class recorded so far.
 		 */
 		if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) {
 			atomic_set_int(&vlapic_vtx->pending_prio, prio_bit);
 			notify = 1;
 		}
 	}
 
 	VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
 	    level, "vmx_set_intr_ready");
 	return (notify);
 }
 
 /*
  * Report whether the vcpu has an interrupt that it would recognize now.
  *
  * Returns 1 if a deliverable interrupt is pending and 0 otherwise.
  * 'vecptr' must be NULL; the pending vector itself is never reported.
  */
 static int
 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct pir_desc *pir_desc;
 	struct LAPIC *lapic;
 	uint64_t pending, pirval;
 	uint32_t ppr, vpr;
 	int i;
 
 	/*
 	 * This function is only expected to be called from the 'HLT' exit
 	 * handler which does not care about the vector that is pending.
 	 */
 	KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	pir_desc = vlapic_vtx->pir_desc;
 
 	pending = atomic_load_acq_long(&pir_desc->pending);
 	if (!pending) {
 		/*
 		 * While a virtual interrupt may have already been
 		 * processed the actual delivery maybe pending the
 		 * interruptibility of the guest.  Recognize a pending
 		 * interrupt by reevaluating virtual interrupts
 		 * following Section 29.2.1 in the Intel SDM Volume 3.
 		 */
 		struct vm_exit *vmexit;
 		/* NB: this 'ppr' shadows the function-scope 'ppr' above. */
 		uint8_t rvi, ppr;
 
 		vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
 		KASSERT(vmexit->exitcode == VM_EXITCODE_HLT,
 		    ("vmx_pending_intr: exitcode not 'HLT'"));
 		/* Compare priority classes only (bits selected by APIC_TPR_INT). */
 		rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT;
 		lapic = vlapic->apic_page;
 		ppr = lapic->ppr & APIC_TPR_INT;
 		if (rvi > ppr) {
 			return (1);
 		}
 
 		return (0);
 	}
 
 	/*
 	 * If there is an interrupt pending then it will be recognized only
 	 * if its priority is greater than the processor priority.
 	 *
 	 * Special case: if the processor priority is zero then any pending
 	 * interrupt will be recognized.
 	 */
 	lapic = vlapic->apic_page;
 	ppr = lapic->ppr & APIC_TPR_INT;
 	if (ppr == 0)
 		return (1);
 
 	VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
 	    lapic->ppr);
 
 	/*
 	 * Find the priority class of the highest vector posted in the PIR,
 	 * scanning from the most significant word down.  'vpr' stays 0 if
 	 * no PIR bit is set.
 	 */
 	vpr = 0;
 	for (i = 3; i >= 0; i--) {
 		pirval = pir_desc->pir[i];
 		if (pirval != 0) {
 			vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT;
 			break;
 		}
 	}
 
 	/*
 	 * If the highest-priority pending interrupt falls short of the
 	 * processor priority of this vCPU, ensure that 'pending_prio' does not
 	 * have any stale bits which would preclude a higher-priority interrupt
 	 * from incurring a notification later.
 	 */
 	if (vpr <= ppr) {
 		const u_int prio_bit = VPR_PRIO_BIT(vpr);
 		const u_int old = vlapic_vtx->pending_prio;
 
 		if (old > prio_bit && (old & prio_bit) == 0) {
 			vlapic_vtx->pending_prio = prio_bit;
 		}
 		return (0);
 	}
 	return (1);
 }
 
 /*
  * vlapic 'intr_accepted' handler for the APICv case.  It is installed by
  * vmx_vlapic_init() but must never be reached; calling it panics.
  */
 static void
 vmx_intr_accepted(struct vlapic *vlapic, int vector)
 {
 
 	panic("vmx_intr_accepted: not expected to be called");
 }
 
 /*
  * Update the EOI-exit bitmap bit for 'vector' in the vcpu's VMCS: set it
  * for a level-triggered interrupt and clear it for an edge-triggered one.
  * The vcpu must not be running.
  */
 static void
 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
 {
 	struct vmx *sc;
 	struct vmcs *vmcs;
 	uint64_t bit, eoi_exit;
 
 	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
 	KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
 	    ("vmx_set_tmr: vcpu cannot be running"));
 
 	sc = ((struct vlapic_vtx *)vlapic)->vmx;
 	vmcs = &sc->vmcs[vlapic->vcpuid];
 	bit = 1UL << (vector % 64);
 
 	VMPTRLD(vmcs);
 	eoi_exit = vmcs_read(VMCS_EOI_EXIT(vector));
 	eoi_exit = level ? (eoi_exit | bit) : (eoi_exit & ~bit);
 	vmcs_write(VMCS_EOI_EXIT(vector), eoi_exit);
 	VMCLEAR(vmcs);
 }
 
 /*
  * Switch the vcpu from "virtualize APIC accesses" to "virtualize x2APIC
  * mode" in its secondary processor-based controls.  State that is shared
  * between all vcpus (the APIC access page mapping and the MSR bitmap) is
  * adjusted only when called for vcpu 0.
  */
 static void
 vmx_enable_x2apic_mode(struct vlapic *vlapic)
 {
 	struct vmx *sc;
 	struct vmcs *vmcs;
 	uint32_t ctls2;
 	int id, rv;
 
 	id = vlapic->vcpuid;
 	sc = ((struct vlapic_vtx *)vlapic)->vmx;
 	vmcs = &sc->vmcs[id];
 
 	ctls2 = sc->cap[id].proc_ctls2;
 	KASSERT((ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
 	    ("%s: invalid proc_ctls2 %#x", __func__, ctls2));
 
 	ctls2 = (ctls2 & ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES) |
 	    PROCBASED2_VIRTUALIZE_X2APIC_MODE;
 	sc->cap[id].proc_ctls2 = ctls2;
 
 	VMPTRLD(vmcs);
 	vmcs_write(VMCS_SEC_PROC_BASED_CTLS, ctls2);
 	VMCLEAR(vmcs);
 
 	if (id != 0)
 		return;
 
 	/*
 	 * The nested page table mappings are shared by all vcpus
 	 * so unmap the APIC access page just once.
 	 */
 	rv = vm_unmap_mmio(sc->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 	KASSERT(rv == 0, ("%s: vm_unmap_mmio error %d", __func__, rv));
 
 	/*
 	 * The MSR bitmap is shared by all vcpus so modify it only
 	 * once in the context of vcpu 0.
 	 */
 	rv = vmx_allow_x2apic_msrs(sc);
 	KASSERT(rv == 0, ("%s: vmx_allow_x2apic_msrs error %d",
 	    __func__, rv));
 }
 
 /*
  * vlapic 'post_intr' handler: send an IPI with vector 'pirvec' to
  * 'hostcpu'.  'vlapic' is not used.
  */
 static void
 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
 {
 
 	ipi_cpu(hostcpu, pirvec);
 }
 
 /*
  * Transfer the pending interrupts in the PIR descriptor to the IRR
  * in the virtual APIC page.
  *
  * Does nothing unless the descriptor's 'pending' flag was set; the flag is
  * cleared atomically before the PIR words are harvested.  If any vector was
  * transferred, RVI in the guest interrupt status is raised when needed.
  */
 static void
 vmx_inject_pir(struct vlapic *vlapic)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct pir_desc *pir_desc;
 	struct LAPIC *lapic;
 	uint64_t val, pirval;
 	int rvi, pirbase = -1;
 	uint16_t intr_status_old, intr_status_new;
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	pir_desc = vlapic_vtx->pir_desc;
 	if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
 		VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 		    "no posted interrupt pending");
 		return;
 	}
 
 	/*
 	 * 'pirval'/'pirbase' end up describing the highest non-zero PIR
 	 * word because the words are visited in ascending order and each
 	 * non-zero word overwrites the previous values.
 	 */
 	pirval = 0;
 	pirbase = -1;
 	lapic = vlapic->apic_page;
 
 	/*
 	 * Each 64-bit PIR word is split across two IRR registers: the low
 	 * half goes to the even register and the high half to the odd one.
 	 * NOTE(review): this relies on the irrN fields being 32 bits wide
 	 * so that 'irrN |= val' keeps only the low half -- confirm in the
 	 * LAPIC structure definition.
 	 */
 	val = atomic_readandclear_long(&pir_desc->pir[0]);
 	if (val != 0) {
 		lapic->irr0 |= val;
 		lapic->irr1 |= val >> 32;
 		pirbase = 0;
 		pirval = val;
 	}
 
 	val = atomic_readandclear_long(&pir_desc->pir[1]);
 	if (val != 0) {
 		lapic->irr2 |= val;
 		lapic->irr3 |= val >> 32;
 		pirbase = 64;
 		pirval = val;
 	}
 
 	val = atomic_readandclear_long(&pir_desc->pir[2]);
 	if (val != 0) {
 		lapic->irr4 |= val;
 		lapic->irr5 |= val >> 32;
 		pirbase = 128;
 		pirval = val;
 	}
 
 	val = atomic_readandclear_long(&pir_desc->pir[3]);
 	if (val != 0) {
 		lapic->irr6 |= val;
 		lapic->irr7 |= val >> 32;
 		pirbase = 192;
 		pirval = val;
 	}
 
 	VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
 
 	/*
 	 * Update RVI so the processor can evaluate pending virtual
 	 * interrupts on VM-entry.
 	 *
 	 * It is possible for pirval to be 0 here, even though the
 	 * pending bit has been set. The scenario is:
 	 * CPU-Y is sending a posted interrupt to CPU-X, which
 	 * is running a guest and processing posted interrupts in h/w.
 	 * CPU-X will eventually exit and the state seen in s/w is
 	 * the pending bit set, but no PIR bits set.
 	 *
 	 *      CPU-X                      CPU-Y
 	 *   (vm running)                (host running)
 	 *   rx posted interrupt
 	 *   CLEAR pending bit
 	 *				 SET PIR bit
 	 *   READ/CLEAR PIR bits
 	 *				 SET pending bit
 	 *   (vm exit)
 	 *   pending bit set, PIR 0
 	 */
 	if (pirval != 0) {
 		/* Highest vector that was just moved into the IRR. */
 		rvi = pirbase + flsl(pirval) - 1;
 		intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
 		/* Keep the upper byte and replace the low byte with 'rvi'. */
 		intr_status_new = (intr_status_old & 0xFF00) | rvi;
 		/* Only ever raise the value, never lower it. */
 		if (intr_status_new > intr_status_old) {
 			vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
 			VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 			    "guest_intr_status changed from 0x%04x to 0x%04x",
 			    intr_status_old, intr_status_new);
 		}
 	}
 }
 
 /*
  * Allocate and initialize the vlapic for 'vcpuid'.
  *
  * The allocation is a zeroed 'struct vlapic_vtx' whose embedded vlapic is
  * returned.  The APICv handlers are installed when virtual interrupt
  * delivery is enabled and the posted interrupt handler when posted
  * interrupts are enabled, before vlapic_init() is called.
  */
 static struct vlapic *
 vmx_vlapic_init(void *arg, int vcpuid)
 {
 	struct vmx *sc;
 	struct vlapic_vtx *vtx;
 	struct vlapic *vlapic;
 
 	sc = arg;
 
 	vtx = malloc(sizeof(*vtx), M_VLAPIC, M_WAITOK | M_ZERO);
 	vtx->pir_desc = &sc->pir_desc[vcpuid];
 	vtx->vmx = sc;
 
 	/* The generic vlapic is the first member of the VT-x wrapper. */
 	vlapic = &vtx->vlapic;
 	vlapic->vm = sc->vm;
 	vlapic->vcpuid = vcpuid;
 	vlapic->apic_page = (struct LAPIC *)&sc->apic_page[vcpuid];
 
 	if (virtual_interrupt_delivery) {
 		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
 		vlapic->ops.pending_intr = vmx_pending_intr;
 		vlapic->ops.intr_accepted = vmx_intr_accepted;
 		vlapic->ops.set_tmr = vmx_set_tmr;
 		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
 	}
 
 	if (posted_interrupts)
 		vlapic->ops.post_intr = vmx_post_intr;
 
 	vlapic_init(vlapic);
 
 	return (vlapic);
 }
 
 /*
  * Counterpart of vmx_vlapic_init(): run the generic vlapic cleanup and
  * release the memory that was allocated with M_VLAPIC.  'arg' is not used.
  */
 static void
 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
 {
 
 	vlapic_cleanup(vlapic);
 	free(vlapic, M_VLAPIC);
 }
 
 /*
  * Operations vector exporting the Intel VT-x implementation to the rest
  * of vmm.  The 'vmspace_*' entries come from the EPT code rather than
  * from the vmx_* functions.
  */
 struct vmm_ops vmm_ops_intel = {
 	.init		= vmx_init,
 	.cleanup	= vmx_cleanup,
 	.resume		= vmx_restore,
 	.vminit		= vmx_vminit,
 	.vmrun		= vmx_run,
 	.vmcleanup	= vmx_vmcleanup,
 	.vmgetreg	= vmx_getreg,
 	.vmsetreg	= vmx_setreg,
 	.vmgetdesc	= vmx_getdesc,
 	.vmsetdesc	= vmx_setdesc,
 	.vmgetcap	= vmx_getcap,
 	.vmsetcap	= vmx_setcap,
 	.vmspace_alloc	= ept_vmspace_alloc,
 	.vmspace_free	= ept_vmspace_free,
 	.vlapic_init	= vmx_vlapic_init,
 	.vlapic_cleanup	= vmx_vlapic_cleanup,
 };
Index: head/sys/amd64/vmm/io/iommu.c
===================================================================
--- head/sys/amd64/vmm/io/iommu.c	(revision 357973)
+++ head/sys/amd64/vmm/io/iommu.c	(revision 357974)
@@ -1,341 +1,342 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 
 #include <machine/cpu.h>
 #include <machine/md_var.h>
 
 #include "vmm_util.h"
 #include "vmm_mem.h"
 #include "iommu.h"
 
 SYSCTL_DECL(_hw_vmm);
-SYSCTL_NODE(_hw_vmm, OID_AUTO, iommu, CTLFLAG_RW, 0, "bhyve iommu parameters");
+SYSCTL_NODE(_hw_vmm, OID_AUTO, iommu, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "bhyve iommu parameters");
 
 /*
  * Set to 1 by iommu_init() once the backend's init callback has succeeded
  * and reset to 0 if the host domain cannot be created.  Most IOMMU_*()
  * wrappers below are no-ops while this is 0.
  */
 static int iommu_avail;
 SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, initialized, CTLFLAG_RD, &iommu_avail,
     0, "bhyve iommu initialized?");
 
 /* Tunable: when 0, iommu_init() returns without selecting a backend. */
 static int iommu_enable = 1;
 SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, enable, CTLFLAG_RDTUN, &iommu_enable, 0,
     "Enable use of I/O MMU (required for PCI passthrough).");
 
 /* Backend operations chosen by iommu_init(); NULL if none is in use. */
 static struct iommu_ops *ops;
 /* Domain holding the devices that remain owned by the host. */
 static void *host_domain;
 /* Handles for the PCI add/delete event handlers set up by iommu_init(). */
 static eventhandler_tag add_tag, delete_tag;
 
 /*
  * Invoke the backend's init callback.  Returns ENXIO when no backend has
  * been selected, otherwise whatever the callback returns.
  */
 static __inline int
 IOMMU_INIT(void)
 {
 
 	if (ops == NULL)
 		return (ENXIO);
 	return (ops->init());
 }
 
 /* Invoke the backend's cleanup callback if a backend is in use. */
 static __inline void
 IOMMU_CLEANUP(void)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return;
 	ops->cleanup();
 }
 
 /*
  * Ask the backend for a new domain covering addresses up to 'maxaddr'.
  * Returns NULL if no backend is in use.
  */
 static __inline void *
 IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return (NULL);
 	return (ops->create_domain(maxaddr));
 }
 
 /* Hand 'dom' back to the backend for destruction, if a backend is in use. */
 static __inline void
 IOMMU_DESTROY_DOMAIN(void *dom)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return;
 	ops->destroy_domain(dom);
 }
 
 /*
  * Map 'gpa' to 'hpa' in 'domain' and return the number of bytes the
  * backend actually mapped.  Without a backend the whole request is
  * reported as mapped so that callers looping over a range terminate.
  */
 static __inline uint64_t
 IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return (len);		/* XXX */
 	return (ops->create_mapping(domain, gpa, hpa, len));
 }
 
 /*
  * Remove the mapping at 'gpa' from 'domain' and return the number of
  * bytes the backend actually unmapped.  Without a backend the whole
  * request is reported as unmapped so that callers looping over a range
  * terminate.
  */
 static __inline uint64_t
 IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return (len);		/* XXX */
 	return (ops->remove_mapping(domain, gpa, len));
 }
 
 /* Attach the device identified by 'rid' to 'domain', if a backend is in use. */
 static __inline void
 IOMMU_ADD_DEVICE(void *domain, uint16_t rid)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return;
 	ops->add_device(domain, rid);
 }
 
 /* Detach the device identified by 'rid' from 'domain', if a backend is in use. */
 static __inline void
 IOMMU_REMOVE_DEVICE(void *domain, uint16_t rid)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return;
 	ops->remove_device(domain, rid);
 }
 
 /* Ask the backend to invalidate the TLB for 'domain', if a backend is in use. */
 static __inline void
 IOMMU_INVALIDATE_TLB(void *domain)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return;
 	ops->invalidate_tlb(domain);
 }
 
 /* Invoke the backend's enable callback, if a backend is in use. */
 static __inline void
 IOMMU_ENABLE(void)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return;
 	ops->enable();
 }
 
 /* Invoke the backend's disable callback, if a backend is in use. */
 static __inline void
 IOMMU_DISABLE(void)
 {
 
 	if (ops == NULL || !iommu_avail)
 		return;
 	ops->disable();
 }
 
 /*
  * 'pci_add_device' event handler registered by iommu_init().  'arg' is
  * not used.
  */
 static void
 iommu_pci_add(void *arg, device_t dev)
 {
 
 	/* Add new devices to the host domain. */
 	iommu_add_device(host_domain, pci_get_rid(dev));
 }
 
 /*
  * 'pci_delete_device' event handler registered by iommu_init(): remove
  * the departing device from the host domain.  'arg' is not used.
  */
 static void
 iommu_pci_delete(void *arg, device_t dev)
 {
 
 	iommu_remove_device(host_domain, pci_get_rid(dev));
 }
 
 /*
  * One-time initialization of the IOMMU layer.
  *
  * Selects the Intel or AMD backend, initializes it, creates the host
  * domain with a 1:1 mapping of [0, maxaddr), registers for PCI device
  * add/delete events, places every existing PCI device in domain 0 that is
  * not a passthrough ("ppt") device into the host domain and finally
  * enables the IOMMU.
  *
  * Returns silently, leaving 'iommu_avail' at 0, if the IOMMU is disabled
  * by the tunable, no backend initializes successfully or the host domain
  * cannot be created.
  */
 static void
 iommu_init(void)
 {
 	int error, bus, slot, func;
 	vm_paddr_t maxaddr;
 	devclass_t dc;
 	device_t dev;
 
 	if (!iommu_enable)
 		return;
 
 	if (vmm_is_intel())
 		ops = &iommu_ops_intel;
 	else if (vmm_is_svm())
 		ops = &iommu_ops_amd;
 	else
 		ops = NULL;
 
 	error = IOMMU_INIT();
 	if (error)
 		return;
 
 	/* From here on the IOMMU_*() wrappers reach the backend. */
 	iommu_avail = 1;
 
 	/*
 	 * Create a domain for the devices owned by the host
 	 */
 	maxaddr = vmm_mem_maxaddr();
 	host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
 	if (host_domain == NULL) {
 		/*
 		 * Terminate the message with a newline so that the next
 		 * console message does not run onto the same line.
 		 */
 		printf("iommu_init: unable to create a host domain\n");
 		/*
 		 * IOMMU_CLEANUP() must run before 'ops' and 'iommu_avail'
 		 * are reset, otherwise it would be a no-op.
 		 */
 		IOMMU_CLEANUP();
 		ops = NULL;
 		iommu_avail = 0;
 		return;
 	}
 
 	/*
 	 * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to
 	 * the host
 	 */
 	iommu_create_mapping(host_domain, 0, 0, maxaddr);
 
 	add_tag = EVENTHANDLER_REGISTER(pci_add_device, iommu_pci_add, NULL, 0);
 	delete_tag = EVENTHANDLER_REGISTER(pci_delete_device, iommu_pci_delete,
 	    NULL, 0);
 	dc = devclass_find("ppt");
 	for (bus = 0; bus <= PCI_BUSMAX; bus++) {
 		for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
 			for (func = 0; func <= PCI_FUNCMAX; func++) {
 				dev = pci_find_dbsf(0, bus, slot, func);
 				if (dev == NULL)
 					continue;
 
 				/* Skip passthrough devices. */
 				if (dc != NULL &&
 				    device_get_devclass(dev) == dc)
 					continue;
 
 				/*
 				 * Everything else belongs to the host
 				 * domain.
 				 */
 				iommu_add_device(host_domain,
 				    pci_get_rid(dev));
 			}
 		}
 	}
 	IOMMU_ENABLE();
 
 }
 
 /*
  * Undo iommu_init(): deregister the PCI event handlers (if they were
  * registered), disable the IOMMU, destroy the host domain and run the
  * backend's cleanup.  The IOMMU_*() calls are no-ops when no backend is in
  * use.
  */
 void
 iommu_cleanup(void)
 {
 
 	if (add_tag != NULL) {
 		EVENTHANDLER_DEREGISTER(pci_add_device, add_tag);
 		add_tag = NULL;
 	}
 	if (delete_tag != NULL) {
 		EVENTHANDLER_DEREGISTER(pci_delete_device, delete_tag);
 		delete_tag = NULL;
 	}
 	/* Disable before tearing down the domain and the backend state. */
 	IOMMU_DISABLE();
 	IOMMU_DESTROY_DOMAIN(host_domain);
 	IOMMU_CLEANUP();
 }
 
 /*
  * Create a new IOMMU domain covering addresses up to 'maxaddr'.
  *
  * The first caller also performs the one-time iommu_init().  Returns NULL
  * if no backend is in use.
  */
 void *
 iommu_create_domain(vm_paddr_t maxaddr)
 {
 	/* 0: not initialized, 1: initialization in progress, 2: done. */
 	static volatile int iommu_initted;
 
 	if (iommu_initted < 2) {
 		if (atomic_cmpset_int(&iommu_initted, 0, 1)) {
 			/* We won the 0->1 transition, so we initialize. */
 			iommu_init();
 			atomic_store_rel_int(&iommu_initted, 2);
 		} else
 			/* Someone else is initializing; wait until done. */
 			while (iommu_initted == 1)
 				cpu_spinwait();
 	}
 	return (IOMMU_CREATE_DOMAIN(maxaddr));
 }
 
 /* Destroy domain 'dom'; a no-op if no backend is in use. */
 void
 iommu_destroy_domain(void *dom)
 {
 
 	IOMMU_DESTROY_DOMAIN(dom);
 }
 
 /*
  * Map 'len' bytes starting at guest physical address 'gpa' to host
  * physical address 'hpa' in domain 'dom'.  The backend maps one chunk per
  * call and reports its size, so keep going until the range is covered.
  */
 void
 iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
 {
 	uint64_t chunk, left;
 
 	for (left = len; left > 0; left -= chunk) {
 		chunk = IOMMU_CREATE_MAPPING(dom, gpa, hpa, left);
 		gpa += chunk;
 		hpa += chunk;
 	}
 }
 
 /*
  * Unmap 'len' bytes starting at guest physical address 'gpa' from domain
  * 'dom'.  The backend unmaps one chunk per call and reports its size, so
  * keep going until the range is covered.
  */
 void
 iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len)
 {
 	uint64_t chunk, left;
 
 	for (left = len; left > 0; left -= chunk) {
 		chunk = IOMMU_REMOVE_MAPPING(dom, gpa, left);
 		gpa += chunk;
 	}
 }
 
 /*
  * Return the host domain created by iommu_init(); the value is whatever
  * 'host_domain' currently holds (it is never assigned outside
  * iommu_init()).
  */
 void *
 iommu_host_domain(void)
 {
 
 	return (host_domain);
 }
 
 /*
  * Attach the device with PCI requester id 'rid' to domain 'dom'; a no-op
  * if no backend is in use.
  */
 void
 iommu_add_device(void *dom, uint16_t rid)
 {
 
 	IOMMU_ADD_DEVICE(dom, rid);
 }
 
 /*
  * Detach the device with PCI requester id 'rid' from domain 'dom'; a no-op
  * if no backend is in use.
  */
 void
 iommu_remove_device(void *dom, uint16_t rid)
 {
 
 	IOMMU_REMOVE_DEVICE(dom, rid);
 }
 
 /* Invalidate the IOMMU TLB for 'domain'; a no-op if no backend is in use. */
 void
 iommu_invalidate_tlb(void *domain)
 {
 
 	IOMMU_INVALIDATE_TLB(domain);
 }
Index: head/sys/amd64/vmm/io/ppt.c
===================================================================
--- head/sys/amd64/vmm/io/ppt.c	(revision 357973)
+++ head/sys/amd64/vmm/io/ppt.c	(revision 357974)
@@ -1,701 +1,702 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/pciio.h>
 #include <sys/rman.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 
 #include <machine/resource.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 
 #include "vmm_lapic.h"
 #include "vmm_ktr.h"
 
 #include "iommu.h"
 #include "ppt.h"
 
 /* XXX locking */
 
 #define	MAX_MSIMSGS	32
 
 /*
  * If the MSI-X table is located in the middle of a BAR then that MMIO
  * region gets split into two segments - one segment above the MSI-X table
  * and the other segment below the MSI-X table - with a hole in place of
  * the MSI-X table so accesses to it can be trapped and emulated.
  *
  * So, allocate a MMIO segment for each BAR register + 1 additional segment.
  */
 #define	MAX_MMIOSEGS	((PCIR_MAX_BAR_0 + 1) + 1)
 
 MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
 
 struct pptintr_arg {				/* pptintr(pptintr_arg) */
 	struct pptdev	*pptdev;
 	uint64_t	addr;
 	uint64_t	msg_data;
 };
 
 struct pptseg {
 	vm_paddr_t	gpa;
 	size_t		len;
 	int		wired;
 };
 
 struct pptdev {
 	device_t	dev;
 	struct vm	*vm;			/* owner of this device */
 	TAILQ_ENTRY(pptdev)	next;
 	struct pptseg mmio[MAX_MMIOSEGS];
 	struct {
 		int	num_msgs;		/* guest state */
 
 		int	startrid;		/* host state */
 		struct resource *res[MAX_MSIMSGS];
 		void	*cookie[MAX_MSIMSGS];
 		struct pptintr_arg arg[MAX_MSIMSGS];
 	} msi;
 
 	struct {
 		int num_msgs;
 		int startrid;
 		int msix_table_rid;
 		int msix_pba_rid;
 		struct resource *msix_table_res;
 		struct resource *msix_pba_res;
 		struct resource **res;
 		void **cookie;
 		struct pptintr_arg *arg;
 	} msix;
 };
 
 SYSCTL_DECL(_hw_vmm);
-SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");
+SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "bhyve passthru devices");
 
 static int num_pptdevs;
 SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
     "number of pci passthru devices");
 
 static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
 
 static int
 ppt_probe(device_t dev)
 {
 	int bus, slot, func;
 	struct pci_devinfo *dinfo;
 
 	dinfo = (struct pci_devinfo *)device_get_ivars(dev);
 
 	bus = pci_get_bus(dev);
 	slot = pci_get_slot(dev);
 	func = pci_get_function(dev);
 
 	/*
 	 * To qualify as a pci passthrough device a device must:
 	 * - be allowed by administrator to be used in this role
 	 * - be an endpoint device
 	 */
 	if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
 		return (ENXIO);
 	else if (vmm_is_pptdev(bus, slot, func))
 		return (0);
 	else
 		/*
 		 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
 		 * SR-IOV infrastructure specified as "ppt" passthrough devices.
 		 * All normal devices that did not have "ppt" specified as their
 		 * driver will not be matched by this.
 		 */
 		return (BUS_PROBE_NOWILDCARD);
 }
 
 static int
 ppt_attach(device_t dev)
 {
 	struct pptdev *ppt;
 
 	ppt = device_get_softc(dev);
 
 	iommu_remove_device(iommu_host_domain(), pci_get_rid(dev));
 	num_pptdevs++;
 	TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
 	ppt->dev = dev;
 
 	if (bootverbose)
 		device_printf(dev, "attached\n");
 
 	return (0);
 }
 
 static int
 ppt_detach(device_t dev)
 {
 	struct pptdev *ppt;
 
 	ppt = device_get_softc(dev);
 
 	if (ppt->vm != NULL)
 		return (EBUSY);
 	num_pptdevs--;
 	TAILQ_REMOVE(&pptdev_list, ppt, next);
 	pci_disable_busmaster(dev);
 	iommu_add_device(iommu_host_domain(), pci_get_rid(dev));
 
 	return (0);
 }
 
 static device_method_t ppt_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		ppt_probe),
 	DEVMETHOD(device_attach,	ppt_attach),
 	DEVMETHOD(device_detach,	ppt_detach),
 	{0, 0}
 };
 
 static devclass_t ppt_devclass;
 DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
 DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
 
 static struct pptdev *
 ppt_find(int bus, int slot, int func)
 {
 	device_t dev;
 	struct pptdev *ppt;
 	int b, s, f;
 
 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
 		dev = ppt->dev;
 		b = pci_get_bus(dev);
 		s = pci_get_slot(dev);
 		f = pci_get_function(dev);
 		if (bus == b && slot == s && func == f)
 			return (ppt);
 	}
 	return (NULL);
 }
 
 static void
 ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
 {
 	int i;
 	struct pptseg *seg;
 
 	for (i = 0; i < MAX_MMIOSEGS; i++) {
 		seg = &ppt->mmio[i];
 		if (seg->len == 0)
 			continue;
 		(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
 		bzero(seg, sizeof(struct pptseg));
 	}
 }
 
 static void
 ppt_teardown_msi(struct pptdev *ppt)
 {
 	int i, rid;
 	void *cookie;
 	struct resource *res;
 
 	if (ppt->msi.num_msgs == 0)
 		return;
 
 	for (i = 0; i < ppt->msi.num_msgs; i++) {
 		rid = ppt->msi.startrid + i;
 		res = ppt->msi.res[i];
 		cookie = ppt->msi.cookie[i];
 
 		if (cookie != NULL)
 			bus_teardown_intr(ppt->dev, res, cookie);
 
 		if (res != NULL)
 			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
 		
 		ppt->msi.res[i] = NULL;
 		ppt->msi.cookie[i] = NULL;
 	}
 
 	if (ppt->msi.startrid == 1)
 		pci_release_msi(ppt->dev);
 
 	ppt->msi.num_msgs = 0;
 }
 
 static void 
 ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
 {
 	int rid;
 	struct resource *res;
 	void *cookie;
 
 	rid = ppt->msix.startrid + idx;
 	res = ppt->msix.res[idx];
 	cookie = ppt->msix.cookie[idx];
 
 	if (cookie != NULL) 
 		bus_teardown_intr(ppt->dev, res, cookie);
 
 	if (res != NULL) 
 		bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
 
 	ppt->msix.res[idx] = NULL;
 	ppt->msix.cookie[idx] = NULL;
 }
 
 static void 
 ppt_teardown_msix(struct pptdev *ppt)
 {
 	int i;
 
 	if (ppt->msix.num_msgs == 0) 
 		return;
 
 	for (i = 0; i < ppt->msix.num_msgs; i++) 
 		ppt_teardown_msix_intr(ppt, i);
 
 	free(ppt->msix.res, M_PPTMSIX);
 	free(ppt->msix.cookie, M_PPTMSIX);
 	free(ppt->msix.arg, M_PPTMSIX);
 
 	pci_release_msi(ppt->dev);
 
 	if (ppt->msix.msix_table_res) {
 		bus_release_resource(ppt->dev, SYS_RES_MEMORY, 
 				     ppt->msix.msix_table_rid,
 				     ppt->msix.msix_table_res);
 		ppt->msix.msix_table_res = NULL;
 		ppt->msix.msix_table_rid = 0;
 	}
 	if (ppt->msix.msix_pba_res) {
 		bus_release_resource(ppt->dev, SYS_RES_MEMORY, 
 				     ppt->msix.msix_pba_rid,
 				     ppt->msix.msix_pba_res);
 		ppt->msix.msix_pba_res = NULL;
 		ppt->msix.msix_pba_rid = 0;
 	}
 
 	ppt->msix.num_msgs = 0;
 }
 
 int
 ppt_avail_devices(void)
 {
 
 	return (num_pptdevs);
 }
 
 int
 ppt_assigned_devices(struct vm *vm)
 {
 	struct pptdev *ppt;
 	int num;
 
 	num = 0;
 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
 		if (ppt->vm == vm)
 			num++;
 	}
 	return (num);
 }
 
 bool
 ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
 {
 	int i;
 	struct pptdev *ppt;
 	struct pptseg *seg;
 
 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
 		if (ppt->vm != vm)
 			continue;
 
 		for (i = 0; i < MAX_MMIOSEGS; i++) {
 			seg = &ppt->mmio[i];
 			if (seg->len == 0)
 				continue;
 			if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
 				return (true);
 		}
 	}
 
 	return (false);
 }
 
 static void
 ppt_pci_reset(device_t dev)
 {
 
 	if (pcie_flr(dev,
 	     max(pcie_get_max_completion_timeout(dev) / 1000, 10), true))
 		return;
 
 	pci_power_reset(dev);
 }
 
 int
 ppt_assign_device(struct vm *vm, int bus, int slot, int func)
 {
 	struct pptdev *ppt;
 
 	ppt = ppt_find(bus, slot, func);
 	if (ppt != NULL) {
 		/*
 		 * If this device is owned by a different VM then we
 		 * cannot change its owner.
 		 */
 		if (ppt->vm != NULL && ppt->vm != vm)
 			return (EBUSY);
 
 		pci_save_state(ppt->dev);
 		ppt_pci_reset(ppt->dev);
 		pci_restore_state(ppt->dev);
 		ppt->vm = vm;
 		iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
 		return (0);
 	}
 	return (ENOENT);
 }
 
 int
 ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
 {
 	struct pptdev *ppt;
 
 	ppt = ppt_find(bus, slot, func);
 	if (ppt != NULL) {
 		/*
 		 * If this device is not owned by this 'vm' then bail out.
 		 */
 		if (ppt->vm != vm)
 			return (EBUSY);
 
 		pci_save_state(ppt->dev);
 		ppt_pci_reset(ppt->dev);
 		pci_restore_state(ppt->dev);
 		ppt_unmap_mmio(vm, ppt);
 		ppt_teardown_msi(ppt);
 		ppt_teardown_msix(ppt);
 		iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
 		ppt->vm = NULL;
 		return (0);
 	}
 	return (ENOENT);
 }
 
 int
 ppt_unassign_all(struct vm *vm)
 {
 	struct pptdev *ppt;
 	int bus, slot, func;
 	device_t dev;
 
 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
 		if (ppt->vm == vm) {
 			dev = ppt->dev;
 			bus = pci_get_bus(dev);
 			slot = pci_get_slot(dev);
 			func = pci_get_function(dev);
 			vm_unassign_pptdev(vm, bus, slot, func);
 		}
 	}
 
 	return (0);
 }
 
 int
 ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
 	     vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 {
 	int i, error;
 	struct pptseg *seg;
 	struct pptdev *ppt;
 
 	ppt = ppt_find(bus, slot, func);
 	if (ppt != NULL) {
 		if (ppt->vm != vm)
 			return (EBUSY);
 
 		for (i = 0; i < MAX_MMIOSEGS; i++) {
 			seg = &ppt->mmio[i];
 			if (seg->len == 0) {
 				error = vm_map_mmio(vm, gpa, len, hpa);
 				if (error == 0) {
 					seg->gpa = gpa;
 					seg->len = len;
 				}
 				return (error);
 			}
 		}
 		return (ENOSPC);
 	}
 	return (ENOENT);
 }
 
 static int
 pptintr(void *arg)
 {
 	struct pptdev *ppt;
 	struct pptintr_arg *pptarg;
 	
 	pptarg = arg;
 	ppt = pptarg->pptdev;
 
 	if (ppt->vm != NULL)
 		lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
 	else {
 		/*
 		 * XXX
 		 * This is not expected to happen - panic?
 		 */
 	}
 
 	/*
 	 * For legacy interrupts give other filters a chance in case
 	 * the interrupt was not generated by the passthrough device.
 	 */
 	if (ppt->msi.startrid == 0)
 		return (FILTER_STRAY);
 	else
 		return (FILTER_HANDLED);
 }
 
 int
 ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
 	      uint64_t addr, uint64_t msg, int numvec)
 {
 	int i, rid, flags;
 	int msi_count, startrid, error, tmp;
 	struct pptdev *ppt;
 
 	if (numvec < 0 || numvec > MAX_MSIMSGS)
 		return (EINVAL);
 
 	ppt = ppt_find(bus, slot, func);
 	if (ppt == NULL)
 		return (ENOENT);
 	if (ppt->vm != vm)		/* Make sure we own this device */
 		return (EBUSY);
 
 	/* Free any allocated resources */
 	ppt_teardown_msi(ppt);
 
 	if (numvec == 0)		/* nothing more to do */
 		return (0);
 
 	flags = RF_ACTIVE;
 	msi_count = pci_msi_count(ppt->dev);
 	if (msi_count == 0) {
 		startrid = 0;		/* legacy interrupt */
 		msi_count = 1;
 		flags |= RF_SHAREABLE;
 	} else
 		startrid = 1;		/* MSI */
 
 	/*
 	 * The device must be capable of supporting the number of vectors
 	 * the guest wants to allocate.
 	 */
 	if (numvec > msi_count)
 		return (EINVAL);
 
 	/*
 	 * Make sure that we can allocate all the MSI vectors that are needed
 	 * by the guest.
 	 */
 	if (startrid == 1) {
 		tmp = numvec;
 		error = pci_alloc_msi(ppt->dev, &tmp);
 		if (error)
 			return (error);
 		else if (tmp != numvec) {
 			pci_release_msi(ppt->dev);
 			return (ENOSPC);
 		} else {
 			/* success */
 		}
 	}
 	
 	ppt->msi.startrid = startrid;
 
 	/*
 	 * Allocate the irq resource and attach it to the interrupt handler.
 	 */
 	for (i = 0; i < numvec; i++) {
 		ppt->msi.num_msgs = i + 1;
 		ppt->msi.cookie[i] = NULL;
 
 		rid = startrid + i;
 		ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
 							 &rid, flags);
 		if (ppt->msi.res[i] == NULL)
 			break;
 
 		ppt->msi.arg[i].pptdev = ppt;
 		ppt->msi.arg[i].addr = addr;
 		ppt->msi.arg[i].msg_data = msg + i;
 
 		error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
 				       INTR_TYPE_NET | INTR_MPSAFE,
 				       pptintr, NULL, &ppt->msi.arg[i],
 				       &ppt->msi.cookie[i]);
 		if (error != 0)
 			break;
 	}
 	
 	if (i < numvec) {
 		ppt_teardown_msi(ppt);
 		return (ENXIO);
 	}
 
 	return (0);
 }
 
 int
 ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
 	       int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
 {
 	struct pptdev *ppt;
 	struct pci_devinfo *dinfo;
 	int numvec, alloced, rid, error;
 	size_t res_size, cookie_size, arg_size;
 
 	ppt = ppt_find(bus, slot, func);
 	if (ppt == NULL)
 		return (ENOENT);
 	if (ppt->vm != vm)		/* Make sure we own this device */
 		return (EBUSY);
 
 	dinfo = device_get_ivars(ppt->dev);
 	if (!dinfo) 
 		return (ENXIO);
 
 	/* 
 	 * First-time configuration:
 	 * 	Allocate the MSI-X table
 	 *	Allocate the IRQ resources
 	 *	Set up some variables in ppt->msix
 	 */
 	if (ppt->msix.num_msgs == 0) {
 		numvec = pci_msix_count(ppt->dev);
 		if (numvec <= 0)
 			return (EINVAL);
 
 		ppt->msix.startrid = 1;
 		ppt->msix.num_msgs = numvec;
 
 		res_size = numvec * sizeof(ppt->msix.res[0]);
 		cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
 		arg_size = numvec * sizeof(ppt->msix.arg[0]);
 
 		ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
 		ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
 					  M_WAITOK | M_ZERO);
 		ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
 
 		rid = dinfo->cfg.msix.msix_table_bar;
 		ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
 					       SYS_RES_MEMORY, &rid, RF_ACTIVE);
 
 		if (ppt->msix.msix_table_res == NULL) {
 			ppt_teardown_msix(ppt);
 			return (ENOSPC);
 		}
 		ppt->msix.msix_table_rid = rid;
 
 		if (dinfo->cfg.msix.msix_table_bar !=
 		    dinfo->cfg.msix.msix_pba_bar) {
 			rid = dinfo->cfg.msix.msix_pba_bar;
 			ppt->msix.msix_pba_res = bus_alloc_resource_any(
 			    ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
 
 			if (ppt->msix.msix_pba_res == NULL) {
 				ppt_teardown_msix(ppt);
 				return (ENOSPC);
 			}
 			ppt->msix.msix_pba_rid = rid;
 		}
 
 		alloced = numvec;
 		error = pci_alloc_msix(ppt->dev, &alloced);
 		if (error || alloced != numvec) {
 			ppt_teardown_msix(ppt);
 			return (error == 0 ? ENOSPC: error);
 		}
 	}
 
 	if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
 		/* Tear down the IRQ if it's already set up */
 		ppt_teardown_msix_intr(ppt, idx);
 
 		/* Allocate the IRQ resource */
 		ppt->msix.cookie[idx] = NULL;
 		rid = ppt->msix.startrid + idx;
 		ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
 							    &rid, RF_ACTIVE);
 		if (ppt->msix.res[idx] == NULL)
 			return (ENXIO);
 	
 		ppt->msix.arg[idx].pptdev = ppt;
 		ppt->msix.arg[idx].addr = addr;
 		ppt->msix.arg[idx].msg_data = msg;
 	
 		/* Setup the MSI-X interrupt */
 		error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
 				       INTR_TYPE_NET | INTR_MPSAFE,
 				       pptintr, NULL, &ppt->msix.arg[idx],
 				       &ppt->msix.cookie[idx]);
 	
 		if (error != 0) {
 			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
 			ppt->msix.cookie[idx] = NULL;
 			ppt->msix.res[idx] = NULL;
 			return (ENXIO);
 		}
 	} else {
 		/* Masked, tear it down if it's already been set up */
 		ppt_teardown_msix_intr(ppt, idx);
 	}
 
 	return (0);
 }
Index: head/sys/amd64/vmm/io/vrtc.c
===================================================================
--- head/sys/amd64/vmm/io/vrtc.c	(revision 357973)
+++ head/sys/amd64/vmm/io/vrtc.c	(revision 357974)
@@ -1,1021 +1,1022 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2014, Neel Natu (neel@freebsd.org)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/queue.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/clock.h>
 #include <sys/sysctl.h>
 
 #include <machine/vmm.h>
 
 #include <isa/rtc.h>
 
 #include "vmm_ktr.h"
 #include "vatpic.h"
 #include "vioapic.h"
 #include "vrtc.h"
 
 /* Register layout of the RTC */
 struct rtcdev {
 	uint8_t	sec;
 	uint8_t	alarm_sec;
 	uint8_t	min;
 	uint8_t	alarm_min;
 	uint8_t	hour;
 	uint8_t	alarm_hour;
 	uint8_t	day_of_week;
 	uint8_t	day_of_month;
 	uint8_t	month;
 	uint8_t	year;
 	uint8_t	reg_a;
 	uint8_t	reg_b;
 	uint8_t	reg_c;
 	uint8_t	reg_d;
 	uint8_t	nvram[36];
 	uint8_t	century;
 	uint8_t	nvram2[128 - 51];
 } __packed;
 CTASSERT(sizeof(struct rtcdev) == 128);
 CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY);
 
 struct vrtc {
 	struct vm	*vm;
 	struct mtx	mtx;
 	struct callout	callout;
 	u_int		addr;		/* RTC register to read or write */
 	sbintime_t	base_uptime;
 	time_t		base_rtctime;
 	struct rtcdev	rtcdev;
 };
 
 #define	VRTC_LOCK(vrtc)		mtx_lock(&((vrtc)->mtx))
 #define	VRTC_UNLOCK(vrtc)	mtx_unlock(&((vrtc)->mtx))
 #define	VRTC_LOCKED(vrtc)	mtx_owned(&((vrtc)->mtx))
 
 /*
  * RTC time is considered "broken" if:
  * - RTC updates are halted by the guest
  * - RTC date/time fields have invalid values
  */
 #define	VRTC_BROKEN_TIME	((time_t)-1)
 
 #define	RTC_IRQ			8
 #define	RTCSB_BIN		0x04
 #define	RTCSB_ALL_INTRS		(RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR)
 #define	rtc_halted(vrtc)	((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0)
 #define	aintr_enabled(vrtc)	(((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0)
 #define	pintr_enabled(vrtc)	(((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0)
 #define	uintr_enabled(vrtc)	(((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0)
 
 static void vrtc_callout_handler(void *arg);
 static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval);
 
 static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc");
 
 SYSCTL_DECL(_hw_vmm);
-SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    NULL);
 
 static int rtc_flag_broken_time = 1;
 SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN,
     &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected");
 
 static __inline bool
 divider_enabled(int reg_a)
 {
 	/*
 	 * The RTC is counting only when dividers are not held in reset.
 	 */
 	return ((reg_a & 0x70) == 0x20);
 }
 
 static __inline bool
 update_enabled(struct vrtc *vrtc)
 {
 	/*
 	 * RTC date/time can be updated only if:
 	 * - divider is not held in reset
 	 * - guest has not disabled updates
 	 * - the date/time fields have valid contents
 	 */
 	if (!divider_enabled(vrtc->rtcdev.reg_a))
 		return (false);
 
 	if (rtc_halted(vrtc))
 		return (false);
 
 	if (vrtc->base_rtctime == VRTC_BROKEN_TIME)
 		return (false);
 
 	return (true);
 }
 
 static time_t
 vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime)
 {
 	sbintime_t now, delta;
 	time_t t, secs;
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	t = vrtc->base_rtctime;
 	*basetime = vrtc->base_uptime;
 	if (update_enabled(vrtc)) {
 		now = sbinuptime();
 		delta = now - vrtc->base_uptime;
 		KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: "
 		    "%#lx to %#lx", vrtc->base_uptime, now));
 		secs = delta / SBT_1S;
 		t += secs;
 		*basetime += secs * SBT_1S;
 	}
 	return (t);
 }
 
 static __inline uint8_t
 rtcset(struct rtcdev *rtc, int val)
 {
 
 	KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d",
 	    __func__, val));
 
 	return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]);
 }
 
 static void
 secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update)
 {
 	struct clocktime ct;
 	struct timespec ts;
 	struct rtcdev *rtc;
 	int hour;
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	if (rtctime < 0) {
 		KASSERT(rtctime == VRTC_BROKEN_TIME,
 		    ("%s: invalid vrtc time %#lx", __func__, rtctime));
 		return;
 	}
 
 	/*
 	 * If the RTC is halted then the guest has "ownership" of the
 	 * date/time fields. Don't update the RTC date/time fields in
 	 * this case (unless forced).
 	 */
 	if (rtc_halted(vrtc) && !force_update)
 		return;
 
 	ts.tv_sec = rtctime;
 	ts.tv_nsec = 0;
 	clock_ts_to_ct(&ts, &ct);
 
 	KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d",
 	    ct.sec));
 	KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d",
 	    ct.min));
 	KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d",
 	    ct.hour));
 	KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d",
 	    ct.dow));
 	KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d",
 	    ct.day));
 	KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d",
 	    ct.mon));
 	KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d",
 	    ct.year));
 
 	rtc = &vrtc->rtcdev;
 	rtc->sec = rtcset(rtc, ct.sec);
 	rtc->min = rtcset(rtc, ct.min);
 
 	if (rtc->reg_b & RTCSB_24HR) {
 		hour = ct.hour;
 	} else {
 		/*
 		 * Convert to the 12-hour format.
 		 */
 		switch (ct.hour) {
 		case 0:			/* 12 AM */
 		case 12:		/* 12 PM */
 			hour = 12;
 			break;
 		default:
 			/*
 			 * The remaining 'ct.hour' values are interpreted as:
 			 * [1  - 11] ->  1 - 11 AM
 			 * [13 - 23] ->  1 - 11 PM
 			 */
 			hour = ct.hour % 12;
 			break;
 		}
 	}
 
 	rtc->hour = rtcset(rtc, hour);
 
 	if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12)
 		rtc->hour |= 0x80;	    /* set MSB to indicate PM */
 
 	rtc->day_of_week = rtcset(rtc, ct.dow + 1);
 	rtc->day_of_month = rtcset(rtc, ct.day);
 	rtc->month = rtcset(rtc, ct.mon);
 	rtc->year = rtcset(rtc, ct.year % 100);
 	rtc->century = rtcset(rtc, ct.year / 100);
 }
 
 static int
 rtcget(struct rtcdev *rtc, int val, int *retval)
 {
 	uint8_t upper, lower;
 
 	if (rtc->reg_b & RTCSB_BIN) {
 		*retval = val;
 		return (0);
 	}
 
 	lower = val & 0xf;
 	upper = (val >> 4) & 0xf;
 
 	if (lower > 9 || upper > 9)
 		return (-1);
 
 	*retval = upper * 10 + lower;
 	return (0);
 }
 
 static time_t
 rtc_to_secs(struct vrtc *vrtc)
 {
 	struct clocktime ct;
 	struct timespec ts;
 	struct rtcdev *rtc;
 	struct vm *vm;
 	int century, error, hour, pm, year;
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	vm = vrtc->vm;
 	rtc = &vrtc->rtcdev;
 
 	bzero(&ct, sizeof(struct clocktime));
 
 	error = rtcget(rtc, rtc->sec, &ct.sec);
 	if (error || ct.sec < 0 || ct.sec > 59) {
 		VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec);
 		goto fail;
 	}
 
 	error = rtcget(rtc, rtc->min, &ct.min);
 	if (error || ct.min < 0 || ct.min > 59) {
 		VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min);
 		goto fail;
 	}
 
 	pm = 0;
 	hour = rtc->hour;
 	if ((rtc->reg_b & RTCSB_24HR) == 0) {
 		if (hour & 0x80) {
 			hour &= ~0x80;
 			pm = 1;
 		}
 	}
 	error = rtcget(rtc, hour, &ct.hour);
 	if ((rtc->reg_b & RTCSB_24HR) == 0) {
 		if (ct.hour >= 1 && ct.hour <= 12) {
 			/*
 			 * Convert from 12-hour format to internal 24-hour
 			 * representation as follows:
 			 *
 			 *    12-hour format		ct.hour
 			 *	12	AM		0
 			 *	1 - 11	AM		1 - 11
 			 *	12	PM		12
 			 *	1 - 11	PM		13 - 23
 			 */
 			if (ct.hour == 12)
 				ct.hour = 0;
 			if (pm)
 				ct.hour += 12;
 		} else {
 			VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d",
 			    rtc->hour, ct.hour);
 			goto fail;
 		}
 	}
 
 	if (error || ct.hour < 0 || ct.hour > 23) {
 		VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour);
 		goto fail;
 	}
 
 	/*
 	 * Ignore 'rtc->dow' because some guests like Linux don't bother
 	 * setting it at all while others like OpenBSD/i386 set it incorrectly. 
 	 *
 	 * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it.
 	 */
 	ct.dow = -1;
 
 	error = rtcget(rtc, rtc->day_of_month, &ct.day);
 	if (error || ct.day < 1 || ct.day > 31) {
 		VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month,
 		    ct.day);
 		goto fail;
 	}
 
 	error = rtcget(rtc, rtc->month, &ct.mon);
 	if (error || ct.mon < 1 || ct.mon > 12) {
 		VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon);
 		goto fail;
 	}
 
 	error = rtcget(rtc, rtc->year, &year);
 	if (error || year < 0 || year > 99) {
 		VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year);
 		goto fail;
 	}
 
 	error = rtcget(rtc, rtc->century, &century);
 	ct.year = century * 100 + year;
 	if (error || ct.year < POSIX_BASE_YEAR) {
 		VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century,
 		    ct.year);
 		goto fail;
 	}
 
 	error = clock_ct_to_ts(&ct, &ts);
 	if (error || ts.tv_sec < 0) {
 		VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d",
 		    ct.year, ct.mon, ct.day);
 		VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d",
 		    ct.hour, ct.min, ct.sec);
 		goto fail;
 	}
 	return (ts.tv_sec);		/* success */
 fail:
 	/*
 	 * Stop updating the RTC if the date/time fields programmed by
 	 * the guest are invalid.
 	 */
 	VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected");
 	return (VRTC_BROKEN_TIME);
 }
 
 static int
 vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase)
 {
 	struct rtcdev *rtc;
 	sbintime_t oldbase;
 	time_t oldtime;
 	uint8_t alarm_sec, alarm_min, alarm_hour;
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	rtc = &vrtc->rtcdev;
 	alarm_sec = rtc->alarm_sec;
 	alarm_min = rtc->alarm_min;
 	alarm_hour = rtc->alarm_hour;
 
 	oldtime = vrtc->base_rtctime;
 	VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx",
 	    oldtime, newtime);
 
 	oldbase = vrtc->base_uptime;
 	VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx",
 	    oldbase, newbase);
 	vrtc->base_uptime = newbase;
 
 	if (newtime == oldtime)
 		return (0);
 
 	/*
 	 * If 'newtime' indicates that RTC updates are disabled then just
 	 * record that and return. There is no need to do alarm interrupt
 	 * processing in this case.
 	 */
 	if (newtime == VRTC_BROKEN_TIME) {
 		vrtc->base_rtctime = VRTC_BROKEN_TIME;
 		return (0);
 	}
 
 	/*
 	 * Return an error if RTC updates are halted by the guest.
 	 */
 	if (rtc_halted(vrtc)) {
 		VM_CTR0(vrtc->vm, "RTC update halted by guest");
 		return (EBUSY);
 	}
 
 	do {
 		/*
 		 * If the alarm interrupt is enabled and 'oldtime' is valid
 		 * then visit all the seconds between 'oldtime' and 'newtime'
 		 * to check for the alarm condition.
 		 *
 		 * Otherwise move the RTC time forward directly to 'newtime'.
 		 */
 		if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME)
 			vrtc->base_rtctime++;
 		else
 			vrtc->base_rtctime = newtime;
 
 		if (aintr_enabled(vrtc)) {
 			/*
 			 * Update the RTC date/time fields before checking
 			 * if the alarm conditions are satisfied.
 			 */
 			secs_to_rtc(vrtc->base_rtctime, vrtc, 0);
 
 			if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) &&
 			    (alarm_min >= 0xC0 || alarm_min == rtc->min) &&
 			    (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) {
 				vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM);
 			}
 		}
 	} while (vrtc->base_rtctime != newtime);
 
 	if (uintr_enabled(vrtc))
 		vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE);
 
 	return (0);
 }
 
 static sbintime_t
 vrtc_freq(struct vrtc *vrtc)
 {
 	int ratesel;
 
 	static sbintime_t pf[16] = {
 		0,
 		SBT_1S / 256,
 		SBT_1S / 128,
 		SBT_1S / 8192,
 		SBT_1S / 4096,
 		SBT_1S / 2048,
 		SBT_1S / 1024,
 		SBT_1S / 512,
 		SBT_1S / 256,
 		SBT_1S / 128,
 		SBT_1S / 64,
 		SBT_1S / 32,
 		SBT_1S / 16,
 		SBT_1S / 8,
 		SBT_1S / 4,
 		SBT_1S / 2,
 	};
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	/*
 	 * If both periodic and alarm interrupts are enabled then use the
 	 * periodic frequency to drive the callout. The minimum periodic
 	 * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so
 	 * piggyback the alarm on top of it. The same argument applies to
 	 * the update interrupt.
 	 */
 	if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) {
 		ratesel = vrtc->rtcdev.reg_a & 0xf;
 		return (pf[ratesel]);
 	} else if (aintr_enabled(vrtc) && update_enabled(vrtc)) {
 		return (SBT_1S);
 	} else if (uintr_enabled(vrtc) && update_enabled(vrtc)) {
 		return (SBT_1S);
 	} else {
 		return (0);
 	}
 }
 
 static void
 vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt)
 {
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	if (freqsbt == 0) {
 		if (callout_active(&vrtc->callout)) {
 			VM_CTR0(vrtc->vm, "RTC callout stopped");
 			callout_stop(&vrtc->callout);
 		}
 		return;
 	}
 	VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt);
 	callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler,
 	    vrtc, 0);
 }
 
 /*
  * Callout handler for the virtual RTC ('arg' is the struct vrtc).
  *
  * With the vrtc lock held it raises the periodic interrupt flag when the
  * periodic interrupt is enabled, brings the RTC time up to date when the
  * alarm or update interrupt is enabled, and then re-arms the callout at
  * the period returned by vrtc_freq().
  */
 static void
 vrtc_callout_handler(void *arg)
 {
 	struct vrtc *vrtc = arg;
 	sbintime_t freqsbt, basetime;
 	time_t rtctime;
 	int error;
 
 	VM_CTR0(vrtc->vm, "vrtc callout fired");
 
 	VRTC_LOCK(vrtc);
 	/* Bail out if the callout was re-armed or stopped before we ran. */
 	if (callout_pending(&vrtc->callout))	/* callout was reset */
 		goto done;
 
 	if (!callout_active(&vrtc->callout))	/* callout was stopped */
 		goto done;
 
 	callout_deactivate(&vrtc->callout);
 
 	/* The callout is only expected to run while some interrupt is enabled. */
 	KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0,
 	    ("gratuitous vrtc callout"));
 
 	if (pintr_enabled(vrtc))
 		vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD);
 
 	if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) {
 		rtctime = vrtc_curtime(vrtc, &basetime);
 		error = vrtc_time_update(vrtc, rtctime, basetime);
 		KASSERT(error == 0, ("%s: vrtc_time_update error %d",
 		    __func__, error));
 	}
 
 	/* Re-arm for the next tick; the frequency must still be non-zero. */
 	freqsbt = vrtc_freq(vrtc);
 	KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__));
 	vrtc_callout_reset(vrtc, freqsbt);
 done:
 	VRTC_UNLOCK(vrtc);
 }
 
 /*
  * Assert that the callout state agrees with 'freq': the callout must be
  * active exactly when the frequency is non-zero.
  */
 static __inline void
 vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq)
 {
 	int running;
 
 	running = (callout_active(&vrtc->callout) != 0);
 	KASSERT((freq != 0) == (running != 0),
 	    ("vrtc callout %s with frequency %#lx",
 	    running ? "active" : "inactive", freq));
 }
 
 /*
  * Store new interrupt flags in status register C and recompute the IRQF
  * summary bit (RTCIR_INT).  IRQF is set only when a flag in 'newval' has its
  * matching interrupt enabled.  A 0 -> 1 transition of IRQF pulses RTC_IRQ
  * on both the atpic and the ioapic.  The caller must hold the vrtc lock.
  */
 static void
 vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval)
 {
 	struct rtcdev *dev;
 	uint8_t before, delta;
 	int irq_before, irq_after;
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	dev = &vrtc->rtcdev;
 	before = dev->reg_c;
 	irq_before = before & RTCIR_INT;
 
 	/* Only the three interrupt flag bits can be set by the caller. */
 	newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE;
 
 	irq_after = 0;
 	if (aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0)
 		irq_after = RTCIR_INT;
 	else if (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0)
 		irq_after = RTCIR_INT;
 	else if (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)
 		irq_after = RTCIR_INT;
 
 	dev->reg_c = irq_after | newval;
 	delta = before ^ dev->reg_c;
 	if (delta != 0) {
 		VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x",
 		    before, dev->reg_c);
 	}
 
 	if (irq_before == 0 && irq_after != 0) {
 		VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ);
 		vatpic_pulse_irq(vrtc->vm, RTC_IRQ);
 		vioapic_pulse_irq(vrtc->vm, RTC_IRQ);
 	} else if (irq_before != 0 && irq_after == 0) {
 		VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ);
 	}
 }
 
 /*
  * Handle a write of 'newval' to status register B.
  *
  * Applies the side effects of toggling RTCSB_HALT, re-evaluates the
  * interrupt state when any interrupt enable bit changed, and reprograms
  * the callout if the resulting frequency differs from the old one.
  *
  * Returns 0 on success.  Returns -1 when HALT is being cleared, the time
  * computed by rtc_to_secs() is VRTC_BROKEN_TIME and 'rtc_flag_broken_time'
  * is set; in that case reg_b has already been overwritten with 'newval'.
  * The caller must hold the vrtc lock.
  */
 static int
 vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval)
 {
 	struct rtcdev *rtc;
 	sbintime_t oldfreq, newfreq, basetime;
 	time_t curtime, rtctime;
 	int error;
 	uint8_t oldval, changed;
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	rtc = &vrtc->rtcdev;
 	oldval = rtc->reg_b;
 	/* Sample the frequency before reg_b changes so it can be compared. */
 	oldfreq = vrtc_freq(vrtc);
 
 	rtc->reg_b = newval;
 	changed = oldval ^ newval;
 	if (changed) {
 		VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x",
 		    oldval, newval);
 	}
 
 	if (changed & RTCSB_HALT) {
 		if ((newval & RTCSB_HALT) == 0) {
 			/*
 			 * HALT was cleared: take the new time from
 			 * rtc_to_secs() (presumably derived from the
 			 * date/time fields -- confirm) and restart from now.
 			 */
 			rtctime = rtc_to_secs(vrtc);
 			basetime = sbinuptime();
 			if (rtctime == VRTC_BROKEN_TIME) {
 				if (rtc_flag_broken_time)
 					return (-1);
 			}
 		} else {
 			curtime = vrtc_curtime(vrtc, &basetime);
 			KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch "
 			    "between vrtc basetime (%#lx) and curtime (%#lx)",
 			    __func__, vrtc->base_rtctime, curtime));
 
 			/*
 			 * Force a refresh of the RTC date/time fields so
 			 * they reflect the time right before the guest set
 			 * the HALT bit.
 			 */
 			secs_to_rtc(curtime, vrtc, 1);
 
 			/*
 			 * Updates are halted so mark 'base_rtctime' to denote
 			 * that the RTC date/time is in flux.
 			 */
 			rtctime = VRTC_BROKEN_TIME;
 			/* Setting HALT also clears the update interrupt enable. */
 			rtc->reg_b &= ~RTCSB_UINTR;
 		}
 		error = vrtc_time_update(vrtc, rtctime, basetime);
 		KASSERT(error == 0, ("vrtc_time_update error %d", error));
 	}
 
 	/*
 	 * Side effect of changes to the interrupt enable bits.
 	 */
 	if (changed & RTCSB_ALL_INTRS)
 		vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c);
 
 	/*
 	 * Change the callout frequency if it has changed.
 	 */
 	newfreq = vrtc_freq(vrtc);
 	if (newfreq != oldfreq)
 		vrtc_callout_reset(vrtc, newfreq);
 	else
 		vrtc_callout_check(vrtc, newfreq);
 
 	/*
 	 * The side effect of bits that control the RTC date/time format
 	 * is handled lazily when those fields are actually read.
 	 */
 	return (0);
 }
 
 /*
  * Handle a write of 'newval' to status register A.  The RTCSA_TUP bit is
  * masked off before the value is stored.  Divider reset transitions are
  * traced, and the callout is reprogrammed if the frequency changed.
  * The caller must hold the vrtc lock.
  */
 static void
 vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval)
 {
 	sbintime_t freq_before, freq_after;
 	uint8_t prev;
 	int was_on, now_on;
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
 	newval &= ~RTCSA_TUP;
 	prev = vrtc->rtcdev.reg_a;
 	freq_before = vrtc_freq(vrtc);
 
 	was_on = divider_enabled(prev);
 	now_on = divider_enabled(newval);
 	if (was_on && !now_on) {
 		VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx",
 		    vrtc->base_rtctime, vrtc->base_uptime);
 	} else if (!was_on && now_on) {
 		/*
 		 * The dividers are leaving reset: restart 'base_uptime'
 		 * from now so the RTC date/time appears to have been
 		 * frozen for as long as the dividers were disabled.
 		 */
 		vrtc->base_uptime = sbinuptime();
 		VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx",
 		    vrtc->base_rtctime, vrtc->base_uptime);
 	}
 
 	vrtc->rtcdev.reg_a = newval;
 	if (prev != newval) {
 		VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x",
 		    prev, newval);
 	}
 
 	/*
 	 * Rate select and divider enable bits feed into the callout
 	 * frequency, so re-evaluate it.
 	 */
 	freq_after = vrtc_freq(vrtc);
 	if (freq_after == freq_before)
 		vrtc_callout_check(vrtc, freq_after);
 	else
 		vrtc_callout_reset(vrtc, freq_after);
 }
 
 /*
  * Set the RTC time of 'vm' to 'secs', anchored at the current uptime.
  * Returns the value produced by vrtc_time_update().
  */
 int
 vrtc_set_time(struct vm *vm, time_t secs)
 {
 	struct vrtc *vrtc = vm_rtc(vm);
 	int error;
 
 	VRTC_LOCK(vrtc);
 	error = vrtc_time_update(vrtc, secs, sbinuptime());
 	VRTC_UNLOCK(vrtc);
 
 	if (error == 0) {
 		VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs);
 	} else {
 		VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error,
 		    secs);
 	}
 
 	return (error);
 }
 
 /*
  * Return the current RTC time of 'vm' as reported by vrtc_curtime().
  */
 time_t
 vrtc_get_time(struct vm *vm)
 {
 	struct vrtc *vrtc = vm_rtc(vm);
 	sbintime_t base;	/* filled in by vrtc_curtime(), not used here */
 	time_t now;
 
 	VRTC_LOCK(vrtc);
 	now = vrtc_curtime(vrtc, &base);
 	VRTC_UNLOCK(vrtc);
 
 	return (now);
 }
 
 /*
  * Write 'value' to the RTC nvram byte at 'offset'.
  *
  * Returns EINVAL for offsets below the nvram area, for RTC_CENTURY, and
  * for offsets past the end of struct rtcdev; returns 0 otherwise.
  */
 int
 vrtc_nvram_write(struct vm *vm, int offset, uint8_t value)
 {
 	struct vrtc *vrtc = vm_rtc(vm);
 
 	/*
 	 * The RTC control registers and the date/time fields are not
 	 * writable through this interface.
 	 */
 	if (offset < offsetof(struct rtcdev, nvram[0]) ||
 	    offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) {
 		VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d",
 		    offset);
 		return (EINVAL);
 	}
 
 	VRTC_LOCK(vrtc);
 	*((uint8_t *)&vrtc->rtcdev + offset) = value;
 	VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset);
 	VRTC_UNLOCK(vrtc);
 
 	return (0);
 }
 
 /*
  * Read the RTC byte at 'offset' into '*retval'.  Every offset within
  * struct rtcdev is readable; anything else yields EINVAL.
  */
 int
 vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval)
 {
 	struct vrtc *vrtc;
 	sbintime_t base;
 	const uint8_t *regs;
 
 	if (offset < 0 || offset >= sizeof(struct rtcdev))
 		return (EINVAL);
 
 	vrtc = vm_rtc(vm);
 	VRTC_LOCK(vrtc);
 
 	/*
 	 * Bring the date/time fields up to date before one of them is read.
 	 */
 	if (offset < 10 || offset == RTC_CENTURY)
 		secs_to_rtc(vrtc_curtime(vrtc, &base), vrtc, 0);
 
 	regs = (const uint8_t *)(&vrtc->rtcdev);
 	*retval = regs[offset];
 
 	VRTC_UNLOCK(vrtc);
 	return (0);
 }
 
 /*
  * I/O handler for the RTC address (index) port.  Only one-byte accesses are
  * accepted (-1 otherwise).  Reads return 0xff; writes latch the low seven
  * bits of '*val' as the register index.
  */
 int
 vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
     uint32_t *val)
 {
 	struct vrtc *vrtc = vm_rtc(vm);
 
 	if (bytes != 1)
 		return (-1);
 
 	if (!in) {
 		VRTC_LOCK(vrtc);
 		vrtc->addr = *val & 0x7f;
 		VRTC_UNLOCK(vrtc);
 	} else {
 		*val = 0xff;
 	}
 
 	return (0);
 }
 
 /*
  * I/O handler for the RTC data port: reads or writes the register selected
  * by the previously latched index 'vrtc->addr'.
  *
  * Only one-byte accesses are accepted.  Returns -1 for any other size or if
  * the latched index is outside struct rtcdev.  Otherwise returns 0, or the
  * error from vrtc_set_reg_b(), or -1 when a century write outside HALT
  * produces VRTC_BROKEN_TIME while 'rtc_flag_broken_time' is set.
  */
 int
 vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
     uint32_t *val)
 {
 	struct vrtc *vrtc;
 	struct rtcdev *rtc;
 	sbintime_t basetime;
 	time_t curtime;
 	int error, offset;
 
 	vrtc = vm_rtc(vm);
 	rtc = &vrtc->rtcdev;
 
 	if (bytes != 1)
 		return (-1);
 
 	VRTC_LOCK(vrtc);
 	offset = vrtc->addr;
 	if (offset >= sizeof(struct rtcdev)) {
 		VRTC_UNLOCK(vrtc);
 		return (-1);
 	}
 
 	error = 0;
 	/* Bring the RTC time up to date; the return value is not checked. */
 	curtime = vrtc_curtime(vrtc, &basetime);
 	vrtc_time_update(vrtc, curtime, basetime);
 
 	/*
 	 * Update RTC date/time fields if necessary.
 	 *
 	 * This is not just for reads of the RTC. The side-effect of writing
 	 * the century byte requires other RTC date/time fields (e.g. sec)
 	 * to be updated here.
 	 */
 	if (offset < 10 || offset == RTC_CENTURY)
 		secs_to_rtc(curtime, vrtc, 0);
 
 	if (in) {
 		if (offset == 12) {
 			/*
 			 * XXX
 			 * reg_c interrupt flags are updated only if the
 			 * corresponding interrupt enable bit in reg_b is set.
 			 */
 			/* Reading reg_c returns it and then clears it. */
 			*val = vrtc->rtcdev.reg_c;
 			vrtc_set_reg_c(vrtc, 0);
 		} else {
 			*val = *((uint8_t *)rtc + offset);
 		}
 		VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x",
 		    *val, offset);
 	} else {
 		/* Offsets 10-13 are status registers A-D; C and D ignore writes. */
 		switch (offset) {
 		case 10:
 			VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val);
 			vrtc_set_reg_a(vrtc, *val);
 			break;
 		case 11:
 			VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val);
 			error = vrtc_set_reg_b(vrtc, *val);
 			break;
 		case 12:
 			VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)",
 			    *val);
 			break;
 		case 13:
 			VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)",
 			    *val);
 			break;
 		case 0:
 			/*
 			 * High order bit of 'seconds' is readonly.
 			 */
 			*val &= 0x7f;
 			/* FALLTHRU */
 		default:
 			VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x",
 			    offset, *val);
 			*((uint8_t *)rtc + offset) = *val;
 			break;
 		}
 
 		/*
 		 * XXX some guests (e.g. OpenBSD) write the century byte
 		 * outside of RTCSB_HALT so re-calculate the RTC date/time.
 		 */
 		if (offset == RTC_CENTURY && !rtc_halted(vrtc)) {
 			curtime = rtc_to_secs(vrtc);
 			error = vrtc_time_update(vrtc, curtime, sbinuptime());
 			KASSERT(!error, ("vrtc_time_update error %d", error));
 			if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time)
 				error = -1;
 		}
 	}
 	VRTC_UNLOCK(vrtc);
 	return (error);
 }
 
 /*
  * Reset the virtual RTC: clear all interrupt enable bits and RTCSB_SQWE in
  * reg_b, then clear reg_c.  The callout must be inactive afterwards.
  */
 void
 vrtc_reset(struct vrtc *vrtc)
 {
 	uint8_t regb;
 
 	VRTC_LOCK(vrtc);
 
 	regb = vrtc->rtcdev.reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE);
 	vrtc_set_reg_b(vrtc, regb);
 	vrtc_set_reg_c(vrtc, 0);
 	KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active"));
 
 	VRTC_UNLOCK(vrtc);
 }
 
 /*
  * Allocate and initialize the virtual RTC for 'vm'.  The device starts with
  * the dividers running, 24-hour mode selected, no interrupts enabled, and
  * the time set to 00:00:00 Jan 1, 1970.
  */
 struct vrtc *
 vrtc_init(struct vm *vm)
 {
 	struct vrtc *vrtc;
 	struct rtcdev *regs;
 	const time_t epoch = 0;
 
 	vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO);
 	vrtc->vm = vm;
 	mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF);
 	callout_init(&vrtc->callout, 1);
 
 	/* Let the dividers keep time; everything else starts disabled. */
 	regs = &vrtc->rtcdev;
 	regs->reg_a = 0x20;
 	regs->reg_b = RTCSB_24HR;
 	regs->reg_c = 0;
 	regs->reg_d = RTCSD_PWR;
 
 	/* Point the index register at a safe register. */
 	vrtc->addr = RTC_STATUSD;
 
 	VRTC_LOCK(vrtc);
 	vrtc->base_rtctime = VRTC_BROKEN_TIME;
 	vrtc_time_update(vrtc, epoch, sbinuptime());
 	secs_to_rtc(epoch, vrtc, 0);
 	VRTC_UNLOCK(vrtc);
 
 	return (vrtc);
 }
 
 /*
  * Tear down a virtual RTC: drain its callout first, then free the memory.
  */
 void
 vrtc_cleanup(struct vrtc *vrtc)
 {
 	callout_drain(&vrtc->callout);
 	free(vrtc, M_VRTC);
 }
Index: head/sys/amd64/vmm/vmm.c
===================================================================
--- head/sys/amd64/vmm/vmm.c	(revision 357973)
+++ head/sys/amd64/vmm/vmm.c	(revision 357974)
@@ -1,2731 +1,2732 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <machine/md_var.h>
 #include <x86/psl.h>
 #include <x86/apicreg.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
 
 #include "vmm_ioport.h"
 #include "vmm_ktr.h"
 #include "vmm_host.h"
 #include "vmm_mem.h"
 #include "vmm_util.h"
 #include "vatpic.h"
 #include "vatpit.h"
 #include "vhpet.h"
 #include "vioapic.h"
 #include "vlapic.h"
 #include "vpmtmr.h"
 #include "vrtc.h"
 #include "vmm_stat.h"
 #include "vmm_lapic.h"
 
 #include "io/ppt.h"
 #include "io/iommu.h"
 
 struct vlapic;
 
 /*
  * Initialization:
  * (a) allocated when vcpu is created
  * (i) initialized when vcpu is created and when it is reinitialized
  * (o) initialized the first time the vcpu is created
  * (x) initialized before use
  */
 /*
  * Per-vcpu state.  One of these is embedded in 'struct vm' for every
  * possible vcpu.  The (a)/(i)/(o)/(x) tags follow the legend above.
  */
 struct vcpu {
 	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
 	enum vcpu_state	state;		/* (o) vcpu state */
 	int		hostcpu;	/* (o) vcpu's host cpu */
 	int		reqidle;	/* (i) request vcpu to idle */
 	struct vlapic	*vlapic;	/* (i) APIC device model */
 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
 	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
 	int		nmi_pending;	/* (i) NMI pending */
 	int		extint_pending;	/* (i) INTR pending */
 	int	exception_pending;	/* (i) exception pending */
 	int	exc_vector;		/* (x) exception collateral */
 	int	exc_errcode_valid;	/* NOTE(review): presumably (x), like exc_vector */
 	uint32_t exc_errcode;		/* NOTE(review): presumably (x), like exc_vector */
 	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
 	void		*stats;		/* (a,i) statistics */
 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
 	uint64_t	nextrip;	/* (x) next instruction to execute */
 };
 
 /*
  * Helpers for the per-vcpu mutex 'mtx'.  It is initialized as a spin mutex
  * (MTX_SPIN) and is taken and released with the spin lock primitives.
  */
 #define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
 #define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
 
 /*
  * A guest memory segment: a VM object of 'len' bytes.  'object' is NULL
  * while the slot is unused.
  */
 struct mem_seg {
 	size_t	len;			/* length in bytes, page aligned */
 	bool	sysmem;			/* true for system memory */
 	struct vm_object *object;	/* backing VM object */
 };
 #define	VM_MAX_MEMSEGS	3
 
 /*
  * A mapping of part of a memory segment into the guest physical address
  * space.  A slot with 'len' == 0 is free.
  */
 struct mem_map {
 	vm_paddr_t	gpa;		/* guest physical start address */
 	size_t		len;		/* length of the mapping in bytes */
 	vm_ooffset_t	segoff;		/* offset into the segment */
 	int		segid;		/* index into 'mem_segs[]' */
 	int		prot;		/* VM_PROT_* protection bits */
 	int		flags;		/* VM_MEMMAP_F_* flags */
 };
 #define	VM_MAX_MEMMAPS	4
 
 /*
  * Initialization:
  * (o) initialized the first time the VM is created
  * (i) initialized when VM is created and when it is reinitialized
  * (x) initialized before use
  */
 /*
  * Per-virtual-machine state: the device models, vcpu sets, rendezvous
  * bookkeeping, guest memory layout and the vcpu array itself.
  */
 struct vm {
 	void		*cookie;		/* (i) cpu-specific data */
 	void		*iommu;			/* (x) iommu-specific data */
 	struct vhpet	*vhpet;			/* (i) virtual HPET */
 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
 	struct vatpic	*vatpic;		/* (i) virtual atpic */
 	struct vatpit	*vatpit;		/* (i) virtual atpit */
 	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
 	struct vrtc	*vrtc;			/* (o) virtual RTC */
 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
 	int		suspend;		/* (i) stop VM execution */
 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
 	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
 	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
 	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
 	vm_rendezvous_func_t rendezvous_func;
 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 	struct vmspace	*vmspace;		/* (o) guest's address space */
 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
 	/* The following describe the vm cpu topology */
 	uint16_t	sockets;		/* (o) num of sockets */
 	uint16_t	cores;			/* (o) num of cores/socket */
 	uint16_t	threads;		/* (o) num of threads/core */
 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
 };
 
 /*
  * Set to 1 once vmm_init() succeeds at module load; vm_create() returns
  * ENXIO while it is 0.
  */
 static int vmm_initialized;
 
 /*
  * Backend operations vector, selected in vmm_init().  Each wrapper below
  * calls the matching backend method, or evaluates to a fallback value
  * (0, NULL or ENXIO) while 'ops' is still NULL.
  */
 static struct vmm_ops *ops;
 #define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
 #define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
 #define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
 
 #define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
 #define	VMRUN(vmi, vcpu, rip, pmap, evinfo) \
 	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
 #define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
 #define	VMSPACE_ALLOC(min, max) \
 	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
 #define	VMSPACE_FREE(vmspace) \
 	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
 #define	VMGETREG(vmi, vcpu, num, retval)		\
 	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
 #define	VMSETREG(vmi, vcpu, num, val)		\
 	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
 #define	VMGETDESC(vmi, vcpu, num, desc)		\
 	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
 #define	VMSETDESC(vmi, vcpu, num, desc)		\
 	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
 #define	VMGETCAP(vmi, vcpu, num, retval)	\
 	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
 #define	VMSETCAP(vmi, vcpu, num, val)		\
 	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
 #define	VLAPIC_INIT(vmi, vcpu)			\
 	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
 #define	VLAPIC_CLEANUP(vmi, vlapic)		\
 	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
 
 /* Set (start) or clear (stop) the CR0_TS bit in %cr0. */
 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
 #define	fpu_stop_emulating()	clts()
 
 SDT_PROVIDER_DEFINE(vmm);
 
 static MALLOC_DEFINE(M_VM, "vm", "vm");
 
 /* statistics */
 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 
-SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    NULL);
 
 /*
  * Halt the guest if all vcpus are executing a HLT instruction with
  * interrupts disabled.
  */
 static int halt_detection_enabled = 1;
 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
     &halt_detection_enabled, 0,
     "Halt VM if all vcpus execute HLT with interrupts disabled");
 
 /*
  * IPI vector allocated in vmm_init(); IPI_AST is used when the allocation
  * fails.
  */
 static int vmm_ipinum;
 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
     "IPI vector used for vcpu notifications");
 
 /* Value reported to callers of vcpu_trace_exceptions(). */
 static int trace_guest_exceptions;
 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
     &trace_guest_exceptions, 0,
     "Trap into hypervisor on all guest exceptions and reflect them back");
 
 static void vm_free_memmap(struct vm *vm, int ident);
 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
 
 #ifdef KTR
 /*
  * Map a vcpu state to a short name for trace output.  States without a
  * name of their own map to "unknown".
  */
 static const char *
 vcpu_state2str(enum vcpu_state state)
 {
 	const char *label;
 
 	if (state == VCPU_IDLE)
 		label = "idle";
 	else if (state == VCPU_FROZEN)
 		label = "frozen";
 	else if (state == VCPU_RUNNING)
 		label = "running";
 	else if (state == VCPU_SLEEPING)
 		label = "sleeping";
 	else
 		label = "unknown";
 
 	return (label);
 }
 #endif
 
 /*
  * Release the resources of vcpu 'i'.  The vlapic is always cleaned up; the
  * statistics buffer and fpu save area are freed only when 'destroy' is set.
  */
 static void
 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 {
 	struct vcpu *vcpu;
 
 	vcpu = &vm->vcpu[i];
 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
 	if (!destroy)
 		return;
 
 	vmm_stat_free(vcpu->stats);
 	fpu_save_area_free(vcpu->guestfpu);
 }
 
 /*
  * Initialize vcpu 'vcpu_id'.  With 'create' set, the one-time resources
  * (lock, fpu save area, statistics buffer) are set up first.  The remaining
  * state is reset on every call, so this also serves VM reinitialization.
  */
 static void
 vcpu_init(struct vm *vm, int vcpu_id, bool create)
 {
 	struct vcpu *vc;
 
 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
 
 	vc = &vm->vcpu[vcpu_id];
 
 	if (create) {
 		KASSERT(!vcpu_lock_initialized(vc), ("vcpu %d already "
 		    "initialized", vcpu_id));
 		vcpu_lock_init(vc);
 		vc->state = VCPU_IDLE;
 		vc->hostcpu = NOCPU;
 		vc->guestfpu = fpu_save_area_alloc();
 		vc->stats = vmm_stat_alloc();
 	}
 
 	/* State that is reset on every (re)initialization. */
 	vc->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
 	vc->reqidle = 0;
 	vc->exitintinfo = 0;
 	vc->nmi_pending = 0;
 	vc->extint_pending = 0;
 	vc->exception_pending = 0;
 	vc->guest_xcr0 = XFEATURE_ENABLED_X87;
 	fpu_save_area_reset(vc->guestfpu);
 	vmm_stat_init(vc->stats);
 }
 
 /*
  * Return the global 'trace_guest_exceptions' setting; 'vm' and 'vcpuid'
  * are not consulted.
  */
 int
 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
 {
 	return (trace_guest_exceptions);
 }
 
 /*
  * Return a pointer to the exit information of vcpu 'cpuid'.  Panics if
  * 'cpuid' is out of range.
  */
 struct vm_exit *
 vm_exitinfo(struct vm *vm, int cpuid)
 {
 
 	if (cpuid < 0 || cpuid >= vm->maxcpus)
 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
 
 	return (&vm->vcpu[cpuid].exitinfo);
 }
 
 /*
  * Resume hook installed in 'vmm_resume_p'; forwards to the backend.
  */
 static void
 vmm_resume(void)
 {
 
 	VMM_RESUME();
 }
 
 /*
  * One-time module initialization: set up host state, pick the IPI vector,
  * initialize the memory layer, select the Intel or AMD backend and run its
  * init method.  Returns 0 on success, ENXIO when neither backend applies,
  * or the error from vmm_mem_init() or the backend.
  */
 static int
 vmm_init(void)
 {
 	int rc;
 
 	vmm_host_state_init();
 
 	/* Fall back to IPI_AST when no dedicated vector can be allocated. */
 	vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
 	    &IDTVEC(justreturn));
 	if (vmm_ipinum < 0)
 		vmm_ipinum = IPI_AST;
 
 	rc = vmm_mem_init();
 	if (rc != 0)
 		return (rc);
 
 	if (vmm_is_intel()) {
 		ops = &vmm_ops_intel;
 	} else {
 		if (!vmm_is_svm())
 			return (ENXIO);
 		ops = &vmm_ops_amd;
 	}
 
 	vmm_resume_p = vmm_resume;
 
 	return (VMM_INIT(vmm_ipinum));
 }
 
 /*
  * Module event handler.  MOD_LOAD initializes the device layer and the
  * hypervisor.  MOD_UNLOAD tears them down, provided vmmdev_cleanup()
  * succeeds.  All other events succeed without doing anything.
  */
 static int
 vmm_handler(module_t mod, int what, void *arg)
 {
 	int error;
 
 	error = 0;
 	switch (what) {
 	case MOD_LOAD:
 		vmmdev_init();
 		error = vmm_init();
 		if (error == 0)
 			vmm_initialized = 1;
 		break;
 	case MOD_UNLOAD:
 		error = vmmdev_cleanup();
 		if (error != 0)
 			break;
 		vmm_resume_p = NULL;
 		iommu_cleanup();
 		if (vmm_ipinum != IPI_AST)
 			lapic_ipi_free(vmm_ipinum);
 		error = VMM_CLEANUP();
 		/*
 		 * If the backend cleanup failed, make sure no new VMs can
 		 * be created.
 		 */
 		if (error != 0)
 			vmm_initialized = 0;
 		break;
 	default:
 		break;
 	}
 	return (error);
 }
 
 /* Module description handed to DECLARE_MODULE() below. */
 static moduledata_t vmm_kmod = {
 	"vmm",		/* module name */
 	vmm_handler,	/* module event handler */
 	NULL		/* NOTE(review): presumably the handler's 'arg' -- confirm */
 };
 
 /*
  * vmm initialization has the following dependencies:
  *
  * - VT-x initialization requires smp_rendezvous() and therefore must happen
  *   after SMP is fully functional (after SI_SUB_SMP).
  */
 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
 MODULE_VERSION(vmm, 1);
 
 /*
  * (Re)initialize the per-VM state: backend cookie, device models, cpu sets
  * and every vcpu.  The virtual RTC is created only when 'create' is set.
  */
 static void
 vm_init(struct vm *vm, bool create)
 {
 	int vcpuid;
 
 	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
 	vm->iommu = NULL;
 
 	/* Device models. */
 	vm->vioapic = vioapic_init(vm);
 	vm->vhpet = vhpet_init(vm);
 	vm->vatpic = vatpic_init(vm);
 	vm->vatpit = vatpit_init(vm);
 	vm->vpmtmr = vpmtmr_init(vm);
 	if (create)
 		vm->vrtc = vrtc_init(vm);
 
 	CPU_ZERO(&vm->active_cpus);
 	CPU_ZERO(&vm->debug_cpus);
 
 	vm->suspend = 0;
 	CPU_ZERO(&vm->suspended_cpus);
 
 	for (vcpuid = 0; vcpuid < vm->maxcpus; vcpuid++)
 		vcpu_init(vm, vcpuid, create);
 }
 
 /*
  * The default CPU topology is a single thread per package.
  */
 u_int cores_per_package = 1;
 u_int threads_per_core = 1;
 
 /*
  * Create a virtual machine called 'name' and return it through 'retvm'.
  *
  * Returns ENXIO if the module was not initialized, EINVAL if the name is
  * NULL or too long, ENOMEM if no address space could be allocated, and
  * 0 on success.
  */
 int
 vm_create(const char *name, struct vm **retvm)
 {
 	struct vmspace *space;
 	struct vm *newvm;
 
 	/* Refuse to create a VM unless vmm.ko initialized successfully. */
 	if (!vmm_initialized)
 		return (ENXIO);
 
 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
 		return (EINVAL);
 
 	space = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
 	if (space == NULL)
 		return (ENOMEM);
 
 	newvm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 	strcpy(newvm->name, name);
 	newvm->vmspace = space;
 	mtx_init(&newvm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
 
 	newvm->sockets = 1;
 	newvm->cores = cores_per_package;	/* XXX backwards compatibility */
 	newvm->threads = threads_per_core;	/* XXX backwards compatibility */
 	newvm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
 
 	vm_init(newvm, true);
 
 	*retvm = newvm;
 	return (0);
 }
 
 /*
  * Copy the VM's cpu topology into the four output parameters, none of
  * which may be NULL.
  */
 void
 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
     uint16_t *threads, uint16_t *maxcpus)
 {
 
 	*maxcpus = vm->maxcpus;
 	*threads = vm->threads;
 	*cores = vm->cores;
 	*sockets = vm->sockets;
 }
 
 /*
  * Return the 'maxcpus' value recorded for 'vm'.
  */
 uint16_t
 vm_get_maxcpus(struct vm *vm)
 {
 
 	return (vm->maxcpus);
 }
 
 /*
  * Set the cpu topology of 'vm'.
  *
  * 'maxcpus' must be 0 (changing it is not supported yet) and the product
  * sockets * cores * threads must not exceed the VM's vcpu limit.
  * Returns EINVAL when either check fails, 0 otherwise.
  */
 int
 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
     uint16_t threads, uint16_t maxcpus)
 {
 	uint64_t ncpus;
 
 	if (maxcpus != 0)
 		return (EINVAL);	/* XXX remove when supported */
 
 	/*
 	 * Compute the product in 64 bits.  The uint16_t operands would
 	 * otherwise be promoted to int, where the multiplication can
 	 * overflow and wrap to a small value that passes the limit check.
 	 */
 	ncpus = (uint64_t)sockets * cores * threads;
 	if (ncpus > vm->maxcpus)
 		return (EINVAL);
 	/* XXX need to check sockets * cores * threads == vCPU, how? */
 	vm->sockets = sockets;
 	vm->cores = cores;
 	vm->threads = threads;
 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
 	return(0);
 }
 
 /*
  * Tear down VM state.  With 'destroy' set everything is released, including
  * the RTC, all memory mappings and segments, and the address space.
  * Without it (VM reset) the RTC is only reset and system memory mappings
  * are kept.
  */
 static void
 vm_cleanup(struct vm *vm, bool destroy)
 {
 	int idx;
 
 	ppt_unassign_all(vm);
 
 	if (vm->iommu != NULL)
 		iommu_destroy_domain(vm->iommu);
 
 	if (!destroy)
 		vrtc_reset(vm->vrtc);
 	else
 		vrtc_cleanup(vm->vrtc);
 	vpmtmr_cleanup(vm->vpmtmr);
 	vatpit_cleanup(vm->vatpit);
 	vhpet_cleanup(vm->vhpet);
 	vatpic_cleanup(vm->vatpic);
 	vioapic_cleanup(vm->vioapic);
 
 	for (idx = 0; idx < vm->maxcpus; idx++)
 		vcpu_cleanup(vm, idx, destroy);
 
 	VMCLEANUP(vm->cookie);
 
 	/*
 	 * System memory mappings stay the same across a VM reset, so they
 	 * are removed only when the VM is destroyed.
 	 *
 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
 	 * so those mappings are removed on a reset as well.
 	 */
 	for (idx = 0; idx < VM_MAX_MEMMAPS; idx++) {
 		if (destroy || !sysmem_mapping(vm, &vm->mem_maps[idx]))
 			vm_free_memmap(vm, idx);
 	}
 
 	if (!destroy)
 		return;
 
 	for (idx = 0; idx < VM_MAX_MEMSEGS; idx++)
 		vm_free_memseg(vm, idx);
 
 	VMSPACE_FREE(vm->vmspace);
 	vm->vmspace = NULL;
 }
 
 /*
  * Destroy 'vm': release everything it owns, then free the structure.
  */
 void
 vm_destroy(struct vm *vm)
 {
 
 	vm_cleanup(vm, true);
 	free(vm, M_VM);
 }
 
 /*
  * Reset 'vm' to its initial state.  This is only allowed when the set of
  * suspended vcpus equals the set of active vcpus; otherwise EBUSY is
  * returned.
  */
 int
 vm_reinit(struct vm *vm)
 {
 
 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0)
 		return (EBUSY);
 
 	vm_cleanup(vm, false);
 	vm_init(vm, false);
 	return (0);
 }
 
 /*
  * Return the name of 'vm'.  The string is owned by the vm structure.
  */
 const char *
 vm_name(struct vm *vm)
 {
 
 	return (vm->name);
 }
 
 /*
  * Map 'len' bytes of host physical memory at 'hpa' into the guest at 'gpa'.
  * Returns ENOMEM if vmm_mmio_alloc() fails, 0 otherwise.
  */
 int
 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 {
 	vm_object_t obj;
 
 	obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa);
 	return (obj == NULL ? ENOMEM : 0);
 }
 
 /*
  * Remove the mmio mapping of 'len' bytes at guest address 'gpa'.
  * Always returns 0.
  */
 int
 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 {
 	vmm_mmio_free(vm->vmspace, gpa, len);
 
 	return (0);
 }
 
 /*
  * Return 'true' if 'gpa' is allocated in the guest address space.
  *
  * This function is called in the context of a running vcpu which acts as
  * an implicit lock on 'vm->mem_maps[]'.
  */
 bool
 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
 {
 	struct mem_map *mm;
 
 #ifdef INVARIANTS
 	int hostcpu, state;
 	state = vcpu_get_state(vm, vcpuid, &hostcpu);
 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
 #endif
 
 	/* Check whether 'gpa' falls inside a sysmem or devmem mapping. */
 	for (mm = &vm->mem_maps[0]; mm < &vm->mem_maps[VM_MAX_MEMMAPS];
 	    mm++) {
 		if (mm->len == 0)
 			continue;
 		if (gpa >= mm->gpa && gpa < mm->gpa + mm->len)
 			return (true);
 	}
 
 	/* Otherwise it may still be pci passthru mmio. */
 	return (ppt_is_mmio(vm, gpa) ? true : false);
 }
 
 /*
  * Allocate memory segment 'ident' with 'len' bytes.
  *
  * Returns EINVAL for a bad identifier, for a zero or non page-aligned
  * length, or if the segment exists with different parameters; EEXIST if
  * it already exists with the same length and type; ENOMEM if the VM object
  * cannot be allocated; 0 on success.
  */
 int
 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
 {
 	struct mem_seg *seg;
 	vm_object_t obj;
 
 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 	if (len == 0 || (len & PAGE_MASK))
 		return (EINVAL);
 
 	seg = &vm->mem_segs[ident];
 	if (seg->object != NULL) {
 		/* Already allocated: tell an exact match from a conflict. */
 		return ((seg->len == len && seg->sysmem == sysmem) ?
 		    EEXIST : EINVAL);
 	}
 
 	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
 	if (obj == NULL)
 		return (ENOMEM);
 
 	seg->object = obj;
 	seg->sysmem = sysmem;
 	seg->len = len;
 	return (0);
 }
 
 /*
  * Report the attributes of memory segment 'ident'.  Each output pointer is
  * optional and skipped when NULL.  Returns EINVAL for a bad identifier and
  * 0 otherwise.
  */
 int
 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
     vm_object_t *objptr)
 {
 	const struct mem_seg *seg;
 
 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 
 	seg = &vm->mem_segs[ident];
 	if (len != NULL)
 		*len = seg->len;
 	if (sysmem != NULL)
 		*sysmem = seg->sysmem;
 	if (objptr != NULL)
 		*objptr = seg->object;
 
 	return (0);
 }
 
 /*
  * Release memory segment 'ident': drop the reference on its VM object and
  * zero the slot.  An unallocated segment is left alone.
  */
 void
 vm_free_memseg(struct vm *vm, int ident)
 {
 	struct mem_seg *seg;
 
 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
 	    ("%s: invalid memseg ident %d", __func__, ident));
 
 	seg = &vm->mem_segs[ident];
 	if (seg->object == NULL)
 		return;
 
 	vm_object_deallocate(seg->object);
 	bzero(seg, sizeof(struct mem_seg));
 }
 
 /*
  * Map 'len' bytes of segment 'segid', starting at offset 'first', into the
  * guest physical address space at 'gpa'.
  *
  * Returns EINVAL for invalid arguments, ENOSPC when all mapping slots are
  * in use, EFAULT when the mapping cannot be established, ENOMEM or EFAULT
  * when wiring fails, and 0 on success.
  */
 int
 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
     size_t len, int prot, int flags)
 {
 	struct mem_seg *seg;
 	struct mem_map *slot;
 	vm_ooffset_t last;
 	int i, rv;
 
 	/* Argument validation. */
 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
 		return (EINVAL);
 	if (flags & ~VM_MEMMAP_F_WIRED)
 		return (EINVAL);
 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 
 	seg = &vm->mem_segs[segid];
 	if (seg->object == NULL)
 		return (EINVAL);
 
 	last = first + len;
 	if (first < 0 || first >= last || last > seg->len)
 		return (EINVAL);
 	if ((gpa | first | last) & PAGE_MASK)
 		return (EINVAL);
 
 	/* Find the first unused mapping slot. */
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		if (vm->mem_maps[i].len == 0)
 			break;
 	}
 	if (i == VM_MAX_MEMMAPS)
 		return (ENOSPC);
 	slot = &vm->mem_maps[i];
 
 	rv = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
 	if (rv != KERN_SUCCESS)
 		return (EFAULT);
 
 	vm_object_reference(seg->object);
 
 	if (flags & VM_MEMMAP_F_WIRED) {
 		rv = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 		if (rv != KERN_SUCCESS) {
 			/* Undo the mapping before reporting the failure. */
 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
 			return (rv == KERN_RESOURCE_SHORTAGE ? ENOMEM :
 			    EFAULT);
 		}
 	}
 
 	/* Record the new mapping in the slot. */
 	slot->gpa = gpa;
 	slot->len = len;
 	slot->segoff = first;
 	slot->segid = segid;
 	slot->prot = prot;
 	slot->flags = flags;
 	return (0);
 }
 
 /*
  * Find the mapping with the lowest guest physical address that is at or
  * above '*gpa'.  On success '*gpa' is updated, every non-NULL output
  * parameter is filled in and 0 is returned.  ENOENT is returned when no
  * such mapping exists.
  */
 int
 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
 {
 	struct mem_map *cand, *best;
 	int i;
 
 	best = NULL;
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		cand = &vm->mem_maps[i];
 		if (cand->len != 0 && cand->gpa >= *gpa &&
 		    (best == NULL || cand->gpa < best->gpa))
 			best = cand;
 	}
 
 	if (best == NULL)
 		return (ENOENT);
 
 	*gpa = best->gpa;
 	if (segid != NULL)
 		*segid = best->segid;
 	if (segoff != NULL)
 		*segoff = best->segoff;
 	if (len != NULL)
 		*len = best->len;
 	if (prot != NULL)
 		*prot = best->prot;
 	if (flags != NULL)
 		*flags = best->flags;
 	return (0);
 }
 
 /*
  * Remove mapping 'ident' from the guest address space and clear its slot.
  * An unused slot (length zero) is left alone.
  */
 static void
 vm_free_memmap(struct vm *vm, int ident)
 {
 	struct mem_map *mm;
 	int error;
 
 	mm = &vm->mem_maps[ident];
 	if (mm->len == 0)
 		return;
 
 	error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
 	    mm->gpa + mm->len);
 	KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
 	    __func__, error));
 	bzero(mm, sizeof(struct mem_map));
 }
 
 /*
  * Return true if 'mm' is an in-use mapping whose segment is system memory.
  */
 static __inline bool
 sysmem_mapping(struct vm *vm, struct mem_map *mm)
 {
 
 	return (mm->len != 0 && vm->mem_segs[mm->segid].sysmem);
 }
 
 /*
  * Return the highest end address (gpa + len) over all system memory
  * mappings, or 0 if there are none.
  */
 vm_paddr_t
 vmm_sysmem_maxaddr(struct vm *vm)
 {
 	struct mem_map *mm;
 	vm_paddr_t top, end;
 	int i;
 
 	top = 0;
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (!sysmem_mapping(vm, mm))
 			continue;
 		end = mm->gpa + mm->len;
 		if (top < end)
 			top = end;
 	}
 	return (top);
 }
 
 /*
  * Move the system-memory mappings of 'vm' between the host IOMMU domain
  * and the VM's own IOMMU domain (vm->iommu), one page at a time.
  *
  * map == true:  every wired sysmem mapping is tagged VM_MEMMAP_F_IOMMU and
  *               each of its pages is mapped gpa -> hpa in vm->iommu and
  *               removed from the host domain.
  * map == false: every mapping tagged VM_MEMMAP_F_IOMMU has the tag cleared
  *               and each of its pages is removed from vm->iommu and mapped
  *               hpa -> hpa in the host domain.
  */
 static void
 vm_iommu_modify(struct vm *vm, bool map)
 {
 	int i, sz;
 	vm_paddr_t gpa, hpa;
 	struct mem_map *mm;
 	void *vp, *cookie, *host_domain;
 
 	sz = PAGE_SIZE;
 	host_domain = iommu_host_domain();
 
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (!sysmem_mapping(vm, mm))
 			continue;
 
 		if (map) {
 			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
 			    ("iommu map found invalid memmap %#lx/%#lx/%#x",
 			    mm->gpa, mm->len, mm->flags));
 			/* Mappings that are not wired are skipped. */
 			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
 				continue;
 			mm->flags |= VM_MEMMAP_F_IOMMU;
 		} else {
 			/* Only undo mappings previously tagged by the map pass. */
 			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
 				continue;
 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
 			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
 			    ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
 			    mm->gpa, mm->len, mm->flags));
 		}
 
 		gpa = mm->gpa;
 		while (gpa < mm->gpa + mm->len) {
 			/*
 			 * Hold the page just long enough to learn its host
 			 * address through the direct map.
 			 */
 			vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
 					 &cookie);
 			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
 			    vm_name(vm), gpa));
 
 			vm_gpa_release(cookie);
 
 			hpa = DMAP_TO_PHYS((uintptr_t)vp);
 			if (map) {
 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
 				iommu_remove_mapping(host_domain, hpa, sz);
 			} else {
 				iommu_remove_mapping(vm->iommu, gpa, sz);
 				iommu_create_mapping(host_domain, hpa, hpa, sz);
 			}
 
 			gpa += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * Invalidate the cached translations associated with the domain
 	 * from which pages were removed.
 	 */
 	if (map)
 		iommu_invalidate_tlb(host_domain);
 	else
 		iommu_invalidate_tlb(vm->iommu);
 }
 
 /* Convenience wrappers that select the direction of vm_iommu_modify(). */
 #define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), false)
 #define	vm_iommu_map(vm)	vm_iommu_modify((vm), true)
 
 /*
  * Detach the passthrough device at bus/slot/func from 'vm'. When the VM
  * has no passthrough devices left afterwards its IOMMU mappings are undone.
  * Returns 0 on success or the error from ppt_unassign_device().
  */
 int
 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 	int rc;
 
 	rc = ppt_unassign_device(vm, bus, slot, func);
 	if (rc != 0)
 		return (rc);
 
 	/* That was the last passthrough device. */
 	if (ppt_assigned_devices(vm) == 0)
 		vm_iommu_unmap(vm);
 
 	return (0);
 }
 
 /*
  * Attach the passthrough device at bus/slot/func to 'vm'.
  *
  * When the VM has no passthrough devices yet, an IOMMU domain covering the
  * VM's system memory is created and populated first; ENXIO is returned if
  * the domain cannot be created. Otherwise the result of
  * ppt_assign_device() is returned.
  */
 int
 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 
 	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
 	if (ppt_assigned_devices(vm) == 0) {
 		KASSERT(vm->iommu == NULL,
 		    ("vm_assign_pptdev: iommu must be NULL"));
 		vm->iommu = iommu_create_domain(vmm_sysmem_maxaddr(vm));
 		if (vm->iommu == NULL)
 			return (ENXIO);
 		vm_iommu_map(vm);
 	}
 
 	return (ppt_assign_device(vm, bus, slot, func));
 }
 
 /*
  * Hold the page backing guest physical address 'gpa' and return a direct-map
  * pointer to it (including the offset within the page).
  *
  * The range [gpa, gpa + len) must not cross a page boundary, otherwise the
  * function panics. Only addresses inside a system-memory mapping are
  * eligible. On success '*cookie' receives the held page, to be handed to
  * vm_gpa_release(); on failure both '*cookie' and the return value are NULL.
  */
 void *
 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
 	    void **cookie)
 {
 	struct mem_map *mm;
 	vm_page_t m;
 	int count, i, pageoff;
 #ifdef INVARIANTS
 	int state;
 
 	/*
 	 * All vcpus are frozen by ioctls that modify the memory map
 	 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
 	 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
 	 */
 	KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
 	    __func__, vcpuid));
 	for (i = 0; i < vm->maxcpus; i++) {
 		/* A vcpuid of -1 means "check every vcpu". */
 		if (vcpuid == -1 || vcpuid == i) {
 			state = vcpu_get_state(vm, i, NULL);
 			KASSERT(state == VCPU_FROZEN,
 			    ("%s: invalid vcpu state %d", __func__, state));
 		}
 	}
 #endif
 
 	pageoff = gpa & PAGE_MASK;
 	if (len > PAGE_SIZE - pageoff)
 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
 
 	count = 0;
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (!sysmem_mapping(vm, mm) || gpa < mm->gpa ||
 		    gpa >= mm->gpa + mm->len)
 			continue;
 
 		/* First containing mapping wins; fault in and hold the page. */
 		count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
 		    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
 		break;
 	}
 
 	if (count != 1) {
 		*cookie = NULL;
 		return (NULL);
 	}
 
 	*cookie = m;
 	return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
 }
 
 void
 vm_gpa_release(void *cookie)
 {
 	vm_page_t m = cookie;
 
 	vm_page_unwire(m, PQ_ACTIVE);
 }
 
 /*
  * Read guest register 'reg' of vcpu 'vcpu' into '*retval'.
  *
  * Returns EINVAL if the vcpu id or the register id is out of range
  * (including negative register ids); otherwise returns whatever the
  * backend VMGETREG() operation returns.
  */
 int
 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
 {
 
 	if (vcpu < 0 || vcpu >= vm->maxcpus)
 		return (EINVAL);
 
 	/* 'reg' is a signed int, so reject both ends of the range. */
 	if (reg < 0 || reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	return (VMGETREG(vm->cookie, vcpu, reg, retval));
 }
 
 /*
  * Write 'val' to guest register 'reg' of vcpu 'vcpuid'.
  *
  * Returns EINVAL if the vcpu id or the register id is out of range
  * (including negative register ids), or the error from the backend
  * VMSETREG() operation. When %rip is written successfully the vcpu's
  * 'nextrip' is updated to the same value.
  */
 int
 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
 {
 	struct vcpu *vcpu;
 	int error;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	/* 'reg' is a signed int, so reject both ends of the range. */
 	if (reg < 0 || reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	error = VMSETREG(vm->cookie, vcpuid, reg, val);
 	if (error || reg != VM_REG_GUEST_RIP)
 		return (error);
 
 	/* Set 'nextrip' to match the value of %rip */
 	VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu->nextrip = val;
 	return (0);
 }
 
 /* Return true if 'reg' names the guest IDTR or GDTR. */
 static bool
 is_descriptor_table(int reg)
 {
 
 	return (reg == VM_REG_GUEST_IDTR || reg == VM_REG_GUEST_GDTR);
 }
 
 /*
  * Return true if 'reg' names one of the guest segment registers
  * (ES, CS, SS, DS, FS, GS, TR or LDTR).
  */
 static bool
 is_segment_register(int reg)
 {
 
 	return (reg == VM_REG_GUEST_ES || reg == VM_REG_GUEST_CS ||
 	    reg == VM_REG_GUEST_SS || reg == VM_REG_GUEST_DS ||
 	    reg == VM_REG_GUEST_FS || reg == VM_REG_GUEST_GS ||
 	    reg == VM_REG_GUEST_TR || reg == VM_REG_GUEST_LDTR);
 }
 
 /*
  * Fetch the descriptor of segment register or descriptor table 'reg' of
  * vcpu 'vcpu' into '*desc'. Returns EINVAL for an out-of-range vcpu or a
  * register that is neither a segment register nor a descriptor table.
  */
 int
 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 		struct seg_desc *desc)
 {
 
 	if (vcpu < 0 || vcpu >= vm->maxcpus)
 		return (EINVAL);
 
 	if (!(is_segment_register(reg) || is_descriptor_table(reg)))
 		return (EINVAL);
 
 	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
 }
 
 /*
  * Load '*desc' into segment register or descriptor table 'reg' of vcpu
  * 'vcpu'. Returns EINVAL for an out-of-range vcpu or a register that is
  * neither a segment register nor a descriptor table.
  */
 int
 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 		struct seg_desc *desc)
 {
 
 	if (vcpu < 0 || vcpu >= vm->maxcpus)
 		return (EINVAL);
 
 	if (!(is_segment_register(reg) || is_descriptor_table(reg)))
 		return (EINVAL);
 
 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
 }
 
 /*
  * Load the guest's FPU state (and XCR0, when the host has XSAVE enabled)
  * for 'vcpu', then re-enable FPU emulation so host FPU use traps.
  */
 static void
 restore_guest_fpustate(struct vcpu *vcpu)
 {
 
 	/* flush host state to the pcb */
 	fpuexit(curthread);
 
 	/* restore guest FPU state */
 	fpu_stop_emulating();
 	fpurestore(vcpu->guestfpu);
 
 	/* restore guest XCR0 if XSAVE is enabled in the host */
 	if (rcr4() & CR4_XSAVE)
 		load_xcr(0, vcpu->guest_xcr0);
 
 	/*
 	 * The FPU is now "dirty" with the guest's state so turn on emulation
 	 * to trap any access to the FPU by the host.
 	 */
 	fpu_start_emulating();
 }
 
 /*
  * Save the guest's FPU state for 'vcpu' and, when the host has XSAVE
  * enabled, stash the guest XCR0 and reload the host's value. Panics if
  * CR0.TS is clear on entry, i.e. FPU emulation is not enabled.
  */
 static void
 save_guest_fpustate(struct vcpu *vcpu)
 {
 
 	if ((rcr0() & CR0_TS) == 0)
 		panic("fpu emulation not enabled in host!");
 
 	/* save guest XCR0 and restore host XCR0 */
 	if (rcr4() & CR4_XSAVE) {
 		vcpu->guest_xcr0 = rxcr(0);
 		load_xcr(0, vmm_get_host_xcr0());
 	}
 
 	/* save guest FPU state */
 	fpu_stop_emulating();
 	fpusave(vcpu->guestfpu);
 	fpu_start_emulating();
 }
 
 /* Statistic bumped by vm_handle_hlt() with the ticks spent sleeping. */
 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
 
 /*
  * Transition vcpu 'vcpuid' to 'newstate'. The vcpu lock must be held.
  *
  * When 'from_idle' is true the caller sleeps (requesting idle each time
  * round) until the vcpu reaches VCPU_IDLE before attempting the
  * transition. Returns 0 on success or EBUSY if the transition is not one
  * of the allowed ones listed below. Sleepers on '&vcpu->state' are woken
  * when the new state is VCPU_IDLE.
  */
 static int
 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
     bool from_idle)
 {
 	struct vcpu *vcpu;
 	int error;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu_assert_locked(vcpu);
 
 	/*
 	 * State transitions from the vmmdev_ioctl() must always begin from
 	 * the VCPU_IDLE state. This guarantees that there is only a single
 	 * ioctl() operating on a vcpu at any point.
 	 */
 	if (from_idle) {
 		while (vcpu->state != VCPU_IDLE) {
 			vcpu->reqidle = 1;
 			vcpu_notify_event_locked(vcpu, false);
 			VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
 			    "idle requested", vcpu_state2str(vcpu->state));
 			/* Re-check at least once every 'hz' ticks. */
 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
 		}
 	} else {
 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
 		    "vcpu idle state"));
 	}
 
 	/* 'hostcpu' is only meaningful while the vcpu is running. */
 	if (vcpu->state == VCPU_RUNNING) {
 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
 	} else {
 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
 		    "vcpu that is not running", vcpu->hostcpu));
 	}
 
 	/*
 	 * The following state transitions are allowed:
 	 * IDLE -> FROZEN -> IDLE
 	 * FROZEN -> RUNNING -> FROZEN
 	 * FROZEN -> SLEEPING -> FROZEN
 	 */
 	switch (vcpu->state) {
 	case VCPU_IDLE:
 	case VCPU_RUNNING:
 	case VCPU_SLEEPING:
 		error = (newstate != VCPU_FROZEN);
 		break;
 	case VCPU_FROZEN:
 		error = (newstate == VCPU_FROZEN);
 		break;
 	default:
 		error = 1;
 		break;
 	}
 
 	if (error)
 		return (EBUSY);
 
 	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
 	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
 
 	vcpu->state = newstate;
 	if (newstate == VCPU_RUNNING)
 		vcpu->hostcpu = curcpu;
 	else
 		vcpu->hostcpu = NOCPU;
 
 	/* Let threads waiting in the 'from_idle' loop above retry. */
 	if (newstate == VCPU_IDLE)
 		wakeup(&vcpu->state);
 
 	return (0);
 }
 
 static void
 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
 		panic("Error %d setting state to %d\n", error, newstate);
 }
 
 static void
 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
 		panic("Error %d setting state to %d", error, newstate);
 }
 
 /*
  * Emit a trace record for a rendezvous event: through VCPU_CTR0() when
  * 'vcpuid' is non-negative, otherwise through VM_CTR0(). 'fmt' is passed
  * through untouched.
  */
 #define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
 	do {								\
 		if ((vcpuid) >= 0)					\
 			VCPU_CTR0((vm), (vcpuid), fmt);			\
 		else							\
 			VM_CTR0((vm), fmt);				\
 	} while (0)
 
 /*
  * Take part in (or wait out) an in-progress rendezvous.
  *
  * 'vcpuid' is either a valid vcpu id or -1. A valid vcpu that is part of
  * 'rendezvous_req_cpus' and has not yet run the rendezvous function runs
  * it once. The caller then sleeps until every requested vcpu is done.
  *
  * Returns 0 once no rendezvous is pending, or the error from
  * thread_check_susp() if the thread has to bail out while waiting; in
  * that case the rendezvous mutex is not held on return.
  */
 static int
 vm_handle_rendezvous(struct vm *vm, int vcpuid)
 {
 	struct thread *td;
 	int error;
 
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
 	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
 
 	error = 0;
 	td = curthread;
 	mtx_lock(&vm->rendezvous_mtx);
 	while (vm->rendezvous_func != NULL) {
 		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
 		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
 
 		if (vcpuid != -1 &&
 		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
 		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
 			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
 			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
 			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
 		}
 		if (CPU_CMP(&vm->rendezvous_req_cpus,
 		    &vm->rendezvous_done_cpus) == 0) {
 			/*
 			 * 'vcpuid' may be -1 here, so use the tracing macro
 			 * that copes with a caller that is not a vcpu.
 			 */
 			RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous completed");
 			vm->rendezvous_func = NULL;
 			wakeup(&vm->rendezvous_func);
 			break;
 		}
 		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
 		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
 		    "vmrndv", hz);
 		if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
 			/* Drop the mutex around the suspension check. */
 			mtx_unlock(&vm->rendezvous_mtx);
 			error = thread_check_susp(td, true);
 			if (error != 0)
 				return (error);
 			mtx_lock(&vm->rendezvous_mtx);
 		}
 	}
 	mtx_unlock(&vm->rendezvous_mtx);
 	return (0);
 }
 
 /*
  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
  *
  * 'intr_disabled' tells whether the guest executed HLT with interrupts
  * disabled. If every active vcpu ends up halted that way the VM is
  * suspended with VM_SUSPEND_HALT.
  *
  * Returns 0 normally, or the error from thread_check_susp() if the thread
  * must bail out while waiting. On every return path this vcpu is no
  * longer recorded in 'vm->halted_cpus'.
  */
 static int
 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
 {
 	struct vcpu *vcpu;
 	const char *wmesg;
 	struct thread *td;
 	int error, t, vcpu_halted, vm_halted;
 
 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
 
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu_halted = 0;
 	vm_halted = 0;
 	error = 0;
 	td = curthread;
 
 	vcpu_lock(vcpu);
 	while (1) {
 		/*
 		 * Do a final check for pending NMI or interrupts before
 		 * really putting this thread to sleep. Also check for
 		 * software events that would cause this vcpu to wakeup.
 		 *
 		 * These interrupts/events could have happened after the
 		 * vcpu returned from VMRUN() and before it acquired the
 		 * vcpu lock above.
 		 */
 		if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
 			break;
 		if (vm_nmi_pending(vm, vcpuid))
 			break;
 		if (!intr_disabled) {
 			if (vm_extint_pending(vm, vcpuid) ||
 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
 				break;
 			}
 		}
 
 		/* Don't go to sleep if the vcpu thread needs to yield */
 		if (vcpu_should_yield(vm, vcpuid))
 			break;
 
 		if (vcpu_debugged(vm, vcpuid))
 			break;
 
 		/*
 		 * Some Linux guests implement "halt" by having all vcpus
 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
 		 * track of the vcpus that have entered this state. When all
 		 * vcpus enter the halted state the virtual machine is halted.
 		 */
 		if (intr_disabled) {
 			wmesg = "vmhalt";
 			VCPU_CTR0(vm, vcpuid, "Halted");
 			if (!vcpu_halted && halt_detection_enabled) {
 				vcpu_halted = 1;
 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
 			}
 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
 				vm_halted = 1;
 				break;
 			}
 		} else {
 			wmesg = "vmidle";
 		}
 
 		t = ticks;
 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
 		/*
 		 * XXX msleep_spin() cannot be interrupted by signals so
 		 * wake up periodically to check pending signals.
 		 */
 		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
 		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
 		if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
 			vcpu_unlock(vcpu);
 			error = thread_check_susp(td, false);
 			if (error != 0) {
 				/*
 				 * Leaving early: withdraw this vcpu from
 				 * 'halted_cpus' so that halt detection and
 				 * the assertion on the next entry do not
 				 * see a stale bit.
 				 */
 				if (vcpu_halted) {
 					CPU_CLR_ATOMIC(vcpuid,
 					    &vm->halted_cpus);
 				}
 				return (error);
 			}
 			vcpu_lock(vcpu);
 		}
 	}
 
 	if (vcpu_halted)
 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
 
 	vcpu_unlock(vcpu);
 
 	if (vm_halted)
 		vm_suspend(vm, VM_SUSPEND_HALT);
 
 	return (0);
 }
 
 /*
  * Resolve a nested-paging fault recorded in the vcpu's exit information.
  *
  * Read and write faults are first offered to
  * pmap_emulate_accessed_dirty(); if that does not handle the fault (or the
  * fault is an execute fault) the address is faulted in with vm_fault().
  * Returns 0 on success and EFAULT if vm_fault() fails.
  */
 static int
 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vm_exit *vme;
 	int ftype, rv;
 
 	vme = &vm->vcpu[vcpuid].exitinfo;
 
 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
 	    __func__, vme->inst_length));
 
 	ftype = vme->u.paging.fault_type;
 	KASSERT(ftype == VM_PROT_READ ||
 	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
 	    ("vm_handle_paging: invalid fault_type %d", ftype));
 
 	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
 		    vme->u.paging.gpa, ftype);
 		if (rv == 0) {
 			/* Handled by accessed/dirty bit emulation alone. */
 			VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
 			    ftype == VM_PROT_READ ? "accessed" : "dirty",
 			    vme->u.paging.gpa);
 			return (0);
 		}
 	}
 
 	rv = vm_fault(&vm->vmspace->vm_map, vme->u.paging.gpa, ftype,
 	    VM_FAULT_NORMAL, NULL);
 
 	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
 	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
 
 	return (rv == KERN_SUCCESS ? 0 : EFAULT);
 }
 
 static int
 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vie *vie;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
 	uint64_t gla, gpa, cs_base;
 	struct vm_guest_paging *paging;
 	mem_region_read_t mread;
 	mem_region_write_t mwrite;
 	enum vm_cpu_mode cpu_mode;
 	int cs_d, error, fault;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
 	    __func__, vme->inst_length));
 
 	gla = vme->u.inst_emul.gla;
 	gpa = vme->u.inst_emul.gpa;
 	cs_base = vme->u.inst_emul.cs_base;
 	cs_d = vme->u.inst_emul.cs_d;
 	vie = &vme->u.inst_emul.vie;
 	paging = &vme->u.inst_emul.paging;
 	cpu_mode = paging->cpu_mode;
 
 	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
 
 	/* Fetch, decode and emulate the faulting instruction */
 	if (vie->num_valid == 0) {
 		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
 		    cs_base, VIE_INST_SIZE, vie, &fault);
 	} else {
 		/*
 		 * The instruction bytes have already been copied into 'vie'
 		 */
 		error = fault = 0;
 	}
 	if (error || fault)
 		return (error);
 
 	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
 		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
 		    vme->rip + cs_base);
 		*retu = true;	    /* dump instruction bytes in userspace */
 		return (0);
 	}
 
 	/*
 	 * Update 'nextrip' based on the length of the emulated instruction.
 	 */
 	vme->inst_length = vie->num_processed;
 	vcpu->nextrip += vie->num_processed;
 	VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
 	    "decoding", vcpu->nextrip);
  
 	/* return to userland unless this is an in-kernel emulated device */
 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
 		mread = lapic_mmio_read;
 		mwrite = lapic_mmio_write;
 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
 		mread = vioapic_mmio_read;
 		mwrite = vioapic_mmio_write;
 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
 		mread = vhpet_mmio_read;
 		mwrite = vhpet_mmio_write;
 	} else {
 		*retu = true;
 		return (0);
 	}
 
 	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
 	    mread, mwrite, retu);
 
 	return (error);
 }
 
 /*
  * Handle a VM_EXITCODE_SUSPENDED exit for vcpu 'vcpuid'.
  *
  * The vcpu records itself in 'suspended_cpus' and then waits until that
  * set equals 'active_cpus', servicing any rendezvous that shows up in the
  * meantime. Afterwards every suspended vcpu is notified and '*retu' is set
  * so that control returns to userspace. Returns 0, or the error produced
  * by thread_check_susp() / vm_handle_rendezvous() while waiting.
  */
 static int
 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
 {
 	int error, i;
 	struct vcpu *vcpu;
 	struct thread *td;
 
 	error = 0;
 	vcpu = &vm->vcpu[vcpuid];
 	td = curthread;
 
 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
 
 	/*
 	 * Wait until all 'active_cpus' have suspended themselves.
 	 *
 	 * Since a VM may be suspended at any time including when one or
 	 * more vcpus are doing a rendezvous we need to call the rendezvous
 	 * handler while we are waiting to prevent a deadlock.
 	 */
 	vcpu_lock(vcpu);
 	while (error == 0) {
 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
 			break;
 		}
 
 		if (vm->rendezvous_func == NULL) {
 			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
 			vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
 			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
 			vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
 			if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
 				/* Drop the vcpu lock around the check. */
 				vcpu_unlock(vcpu);
 				error = thread_check_susp(td, false);
 				vcpu_lock(vcpu);
 			}
 		} else {
 			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
 			/* The rendezvous handler runs without the vcpu lock. */
 			vcpu_unlock(vcpu);
 			error = vm_handle_rendezvous(vm, vcpuid);
 			vcpu_lock(vcpu);
 		}
 	}
 	vcpu_unlock(vcpu);
 
 	/*
 	 * Wakeup the other sleeping vcpus and return to userspace.
 	 */
 	for (i = 0; i < vm->maxcpus; i++) {
 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
 			vcpu_notify_event(vm, i, false);
 		}
 	}
 
 	*retu = true;
 	return (error);
 }
 
 /*
  * Acknowledge a pending idle request on vcpu 'vcpuid': clear 'reqidle'
  * under the vcpu lock and ask for a return to userspace via '*retu'.
  * Always returns 0.
  */
 static int
 vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vcpu *vcpu;
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
 	vcpu->reqidle = 0;
 	vcpu_unlock(vcpu);
 
 	*retu = true;
 	return (0);
 }
 
 /*
  * Mark the virtual machine as suspended for reason 'how' and notify every
  * active vcpu.
  *
  * Returns EINVAL if 'how' is outside (VM_SUSPEND_NONE, VM_SUSPEND_LAST),
  * EALREADY if a suspend reason was already recorded, and 0 otherwise.
  */
 int
 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 {
 	int vcpuid;
 
 	if (!(how > VM_SUSPEND_NONE && how < VM_SUSPEND_LAST))
 		return (EINVAL);
 
 	/* Only the first caller gets to record its reason. */
 	if (!atomic_cmpset_int(&vm->suspend, 0, how)) {
 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
 		    vm->suspend, how);
 		return (EALREADY);
 	}
 
 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
 
 	/*
 	 * Notify all active vcpus that they are now suspended.
 	 */
 	for (vcpuid = 0; vcpuid < vm->maxcpus; vcpuid++) {
 		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 			continue;
 		vcpu_notify_event(vm, vcpuid, false);
 	}
 
 	return (0);
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' to report a
  * VM_EXITCODE_SUSPENDED exit at 'rip', carrying the VM's suspend reason.
  */
 void
 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_SUSPENDED;
 	vme->u.suspended.how = vm->suspend;
 	vme->rip = rip;
 	vme->inst_length = 0;
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' to report a
  * VM_EXITCODE_DEBUG exit at 'rip'.
  */
 void
 vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_DEBUG;
 	vme->rip = rip;
 	vme->inst_length = 0;
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' to report a
  * VM_EXITCODE_RENDEZVOUS exit at 'rip' and count it in VMEXIT_RENDEZVOUS.
  * A rendezvous must be in progress.
  */
 void
 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_RENDEZVOUS;
 	vme->rip = rip;
 	vme->inst_length = 0;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' to report a
  * VM_EXITCODE_REQIDLE exit at 'rip' and count it in VMEXIT_REQIDLE.
  */
 void
 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_REQIDLE;
 	vme->rip = rip;
 	vme->inst_length = 0;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' to report a
  * VM_EXITCODE_BOGUS exit at 'rip' and count it in VMEXIT_ASTPENDING.
  */
 void
 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_BOGUS;
 	vme->rip = rip;
 	vme->inst_length = 0;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
 }
 
 /*
  * Run the vcpu named by 'vmrun->cpuid' until an exit has to be handled in
  * userspace or an error occurs.
  *
  * Returns EINVAL if the vcpu id is out of range, the vcpu is not active,
  * or the vcpu is already suspended. Otherwise the guest is entered
  * repeatedly; each exit is dispatched to an in-kernel handler and the loop
  * restarts unless the handler reported an error or asked (via 'retu') to
  * return to userspace. The final exit information is copied into
  * 'vmrun->vm_exit' and the error (0 on success) is returned.
  */
 int
 vm_run(struct vm *vm, struct vm_run *vmrun)
 {
 	struct vm_eventinfo evinfo;
 	int error, vcpuid;
 	struct vcpu *vcpu;
 	struct pcb *pcb;
 	uint64_t tscval;
 	struct vm_exit *vme;
 	bool retu, intr_disabled;
 	pmap_t pmap;
 
 	vcpuid = vmrun->cpuid;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 		return (EINVAL);
 
 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
 		return (EINVAL);
 
 	pmap = vmspace_pmap(vm->vmspace);
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 	/* Pointers to the event words passed along to VMRUN(). */
 	evinfo.rptr = &vm->rendezvous_func;
 	evinfo.sptr = &vm->suspend;
 	evinfo.iptr = &vcpu->reqidle;
 restart:
 	critical_enter();
 
 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 	    ("vm_run: absurd pm_active"));
 
 	tscval = rdtsc();
 
 	pcb = PCPU_GET(curpcb);
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 
 	restore_guest_fpustate(vcpu);
 
 	/* The vcpu is RUNNING only for the duration of VMRUN(). */
 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
 	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
 	save_guest_fpustate(vcpu);
 
 	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
 
 	critical_exit();
 
 	if (error == 0) {
 		retu = false;
 		/* Default resume point: just past the exiting instruction. */
 		vcpu->nextrip = vme->rip + vme->inst_length;
 		switch (vme->exitcode) {
 		case VM_EXITCODE_REQIDLE:
 			error = vm_handle_reqidle(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_SUSPENDED:
 			error = vm_handle_suspend(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_IOAPIC_EOI:
 			vioapic_process_eoi(vm, vcpuid,
 			    vme->u.ioapic_eoi.vector);
 			break;
 		case VM_EXITCODE_RENDEZVOUS:
 			error = vm_handle_rendezvous(vm, vcpuid);
 			break;
 		case VM_EXITCODE_HLT:
 			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
 			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
 			break;
 		case VM_EXITCODE_PAGING:
 			error = vm_handle_paging(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INST_EMUL:
 			error = vm_handle_inst_emul(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INOUT:
 		case VM_EXITCODE_INOUT_STR:
 			error = vm_handle_inout(vm, vcpuid, vme, &retu);
 			break;
 		case VM_EXITCODE_MONITOR:
 		case VM_EXITCODE_MWAIT:
 		case VM_EXITCODE_VMINSN:
 			/* These instructions are answered with #UD. */
 			vm_inject_ud(vm, vcpuid);
 			break;
 		default:
 			retu = true;	/* handled in userland */
 			break;
 		}
 	}
 
 	/* Handled entirely in the kernel: re-enter the guest. */
 	if (error == 0 && retu == false)
 		goto restart;
 
 	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
 
 	/* copy the exit information */
 	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
 	return (error);
 }
 
 /*
  * Arrange for vcpu 'vcpuid' of the VM passed in 'arg' to re-execute its
  * current instruction. Returns EINVAL for an out-of-range vcpu id and 0
  * otherwise; panics if the vcpu is neither running nor frozen.
  */
 int
 vm_restart_instruction(void *arg, int vcpuid)
 {
 	struct vm *vm = arg;
 	struct vcpu *vcpu;
 	uint64_t rip;
 	int error;
 	enum vcpu_state state;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 	state = vcpu_get_state(vm, vcpuid, NULL);
 
 	switch (state) {
 	case VCPU_RUNNING:
 		/*
 		 * When a vcpu is "running" the next instruction is determined
 		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
 		 * Thus setting 'inst_length' to zero will cause the current
 		 * instruction to be restarted.
 		 */
 		vcpu->exitinfo.inst_length = 0;
 		VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
 		    "setting inst_length to zero", vcpu->exitinfo.rip);
 		break;
 	case VCPU_FROZEN:
 		/*
 		 * When a vcpu is "frozen" it is outside the critical section
 		 * around VMRUN() and 'nextrip' points to the next instruction.
 		 * Thus instruction restart is achieved by setting 'nextrip'
 		 * to the vcpu's %rip.
 		 */
 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
 		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
 		VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
 		    "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
 		vcpu->nextrip = rip;
 		break;
 	default:
 		panic("%s: invalid state %d", __func__, state);
 	}
 
 	return (0);
 }
 
 /*
  * Record 'info' as the exit-time interrupt information of vcpu 'vcpuid'.
  *
  * An 'info' without VM_INTINFO_VALID is stored as 0. A valid 'info' is
  * rejected with EINVAL if it is an NMI with a vector other than IDT_NMI,
  * a hardware exception with a vector of 32 or above, or has reserved bits
  * set. EINVAL is also returned for an out-of-range vcpu id.
  */
 int
 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
 {
 	int type, vector;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if ((info & VM_INTINFO_VALID) == 0) {
 		info = 0;
 	} else {
 		type = info & VM_INTINFO_TYPE;
 		vector = info & 0xff;
 		if ((type == VM_INTINFO_NMI && vector != IDT_NMI) ||
 		    (type == VM_INTINFO_HWEXCEPTION && vector >= 32) ||
 		    (info & VM_INTINFO_RSVD) != 0)
 			return (EINVAL);
 	}
 
 	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
 	vm->vcpu[vcpuid].exitintinfo = info;
 	return (0);
 }
 
 /*
  * Classes an event is sorted into by exception_class(); nested_fault()
  * combines the classes of two events to decide whether they become a
  * double fault.
  */
 enum exc_class {
 	EXC_BENIGN,
 	EXC_CONTRIBUTORY,
 	EXC_PAGEFAULT
 };
 
 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
 
 /*
  * Classify the event described by 'info' (which must be valid) as benign,
  * contributory or page-fault class.
  */
 static enum exc_class
 exception_class(uint64_t info)
 {
 	int kind, vec;
 
 	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
 	kind = info & VM_INTINFO_TYPE;
 	vec = info & 0xff;
 
 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
 	if (kind == VM_INTINFO_HWINTR || kind == VM_INTINFO_SWINTR ||
 	    kind == VM_INTINFO_NMI)
 		return (EXC_BENIGN);
 
 	/*
 	 * Anything else is treated as a hardware exception.
 	 *
 	 * SVM and VT-x use identical type values to represent NMI,
 	 * hardware interrupt and software interrupt.
 	 *
 	 * SVM uses type '3' for all exceptions. VT-x uses type '3'
 	 * for exceptions except #BP and #OF. #BP and #OF use a type
 	 * value of '5' or '6'. Therefore we don't check for explicit
 	 * values of 'type' to classify 'intinfo' into a hardware
 	 * exception.
 	 */
 	if (vec == IDT_PF || vec == IDT_VE)
 		return (EXC_PAGEFAULT);
 
 	if (vec == IDT_DE || vec == IDT_TS || vec == IDT_NP ||
 	    vec == IDT_SS || vec == IDT_GP)
 		return (EXC_CONTRIBUTORY);
 
 	return (EXC_BENIGN);
 }
 
 static int
 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
     uint64_t *retinfo)
 {
 	enum exc_class exc1, exc2;
 	int type1, vector1;
 
 	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
 	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
 
 	/*
 	 * If an exception occurs while attempting to call the double-fault
 	 * handler the processor enters shutdown mode (aka triple fault).
 	 */
 	type1 = info1 & VM_INTINFO_TYPE;
 	vector1 = info1 & 0xff;
 	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
 		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
 		    info1, info2);
 		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
 		*retinfo = 0;
 		return (0);
 	}
 
 	/*
 	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
 	 */
 	exc1 = exception_class(info1);
 	exc2 = exception_class(info2);
 	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
 	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
 		/* Convert nested fault into a double fault. */
 		*retinfo = IDT_DF;
 		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 		*retinfo |= VM_INTINFO_DEL_ERRCODE;
 	} else {
 		/* Handle exceptions serially */
 		*retinfo = info2;
 	}
 	return (1);
 }
 
 /*
  * Encode the exception pending on 'vcpu' as an interrupt-information
  * word: vector, valid and hardware-exception bits, plus the error code
  * in the upper 32 bits when one is to be delivered. Returns 0 if no
  * exception is pending.
  */
 static uint64_t
 vcpu_exception_intinfo(struct vcpu *vcpu)
 {
 	uint64_t info;
 
 	if (!vcpu->exception_pending)
 		return (0);
 
 	info = vcpu->exc_vector & 0xff;
 	info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 	if (vcpu->exc_errcode_valid) {
 		info |= VM_INTINFO_DEL_ERRCODE;
 		info |= (uint64_t)vcpu->exc_errcode << 32;
 	}
 	return (info);
 }
 
 /*
  * Compute the event to inject on the next entry of vcpu 'vcpuid'.
  *
  * Both the saved exit-time interrupt information and any pending exception
  * are consumed. If both are valid they are merged by nested_fault();
  * if only one is valid it is used as is. Returns non-zero with the event in
  * '*retinfo' when there is something to inject, and 0 otherwise (in which
  * case '*retinfo' is written only by nested_fault()).
  */
 int
 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
 {
 	struct vcpu *vcpu;
 	uint64_t info1, info2;
 	bool have1, have2;
 	int valid;
 
 	KASSERT(vcpuid >= 0 &&
 	    vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	/* Consume the event that was being delivered at the last exit. */
 	info1 = vcpu->exitintinfo;
 	vcpu->exitintinfo = 0;
 
 	/* Consume the pending exception, if any. */
 	info2 = 0;
 	if (vcpu->exception_pending) {
 		info2 = vcpu_exception_intinfo(vcpu);
 		vcpu->exception_pending = 0;
 		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
 		    vcpu->exc_vector, info2);
 	}
 
 	have1 = (info1 & VM_INTINFO_VALID) != 0;
 	have2 = (info2 & VM_INTINFO_VALID) != 0;
 
 	if (have1 && have2) {
 		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
 	} else if (have1 || have2) {
 		*retinfo = have1 ? info1 : info2;
 		valid = 1;
 	} else {
 		valid = 0;
 	}
 
 	if (valid) {
 		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
 		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
 	}
 
 	return (valid);
 }
 
 /*
  * Report, without consuming them, the saved exit-time interrupt
  * information ('*info1') and the encoded pending exception ('*info2') of
  * vcpu 'vcpuid'. Returns EINVAL for an out-of-range vcpu id, else 0.
  */
 int
 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
 {
 	struct vcpu *target;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	target = &vm->vcpu[vcpuid];
 	*info1 = target->exitintinfo;
 	*info2 = vcpu_exception_intinfo(target);
 	return (0);
 }
 
 int
 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
     uint32_t errcode, int restart_instruction)
 {
 	struct vcpu *vcpu;
 	uint64_t regval;
 	int error;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if (vector < 0 || vector >= 32)
 		return (EINVAL);
 
 	/*
 	 * A double fault exception should never be injected directly into
 	 * the guest. It is a derived exception that results from specific
 	 * combinations of nested faults.
 	 */
 	if (vector == IDT_DF)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->exception_pending) {
 		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
 		    "pending exception %d", vector, vcpu->exc_vector);
 		return (EBUSY);
 	}
 
 	if (errcode_valid) {
 		/*
 		 * Exceptions don't deliver an error code in real mode.
 		 */
 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
 		KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
 		if (!(regval & CR0_PE))
 			errcode_valid = 0;
 	}
 
 	/*
 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
 	 *
 	 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
 	 * one instruction or incurs an exception.
 	 */
 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
 	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
 	    __func__, error));
 
 	if (restart_instruction)
 		vm_restart_instruction(vm, vcpuid);
 
 	vcpu->exception_pending = 1;
 	vcpu->exc_vector = vector;
 	vcpu->exc_errcode = errcode;
 	vcpu->exc_errcode_valid = errcode_valid;
 	VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
 	return (0);
 }
 
 /*
  * Inject exception 'vector' into vcpu 'vcpuid' of the VM passed in
  * 'vmarg', always asking for the current instruction to be restarted.
  * Failure of the injection is asserted against.
  */
 void
 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
     int errcode)
 {
 	struct vm *vm = vmarg;
 	int error;
 
 	error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
 	    errcode, 1);
 	KASSERT(error == 0, ("vm_inject_exception error %d", error));
 }
 
 /*
  * Inject a page fault into vcpu 'vcpuid' of the VM passed in 'vmarg':
  * load 'cr2' into the guest's CR2 and then raise IDT_PF with
  * 'error_code'.
  */
 void
 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
 {
 	struct vm *vm = vmarg;
 	int error;
 
 	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
 	    error_code, cr2);
 
 	/* The faulting address must be visible before the exception is. */
 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
 	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
 
 	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
 }
 
 /* Statistic bumped by vm_nmi_clear() each time a pending NMI is cleared. */
 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
 
 /*
  * Mark an NMI as pending on vcpu 'vcpuid' and notify the vcpu.
  * Returns EINVAL for an out-of-range vcpu id, else 0.
  */
 int
 vm_inject_nmi(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].nmi_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 	return (0);
 }
 
 /*
  * Return the 'nmi_pending' flag of 'vcpuid'.  Panics if 'vcpuid' is out of
  * range.
  */
 int
 vm_nmi_pending(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 
 	return (vm->vcpu[vcpuid].nmi_pending);
 }
 
 /*
  * Consume the pending NMI of 'vcpuid': clear 'nmi_pending' and bump the
  * VCPU_NMI_COUNT statistic.  Panics if 'vcpuid' is out of range or if no NMI
  * was pending.
  */
 void
 vm_nmi_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	/* The panic names this function (it used to say vm_nmi_pending). */
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->nmi_pending == 0)
 		panic("vm_nmi_clear: inconsistent nmi_pending state");
 
 	vcpu->nmi_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
 }
 
 /* Incremented by vm_extint_clear() each time a pending ExtINT is consumed. */
 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
 
 /*
  * Mark an ExtINT as pending on 'vcpuid' and notify the vcpu of the new event.
  * Returns EINVAL if 'vcpuid' is out of range, 0 otherwise.
  */
 int
 vm_inject_extint(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].extint_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 	return (0);
 }
 
 /*
  * Return the 'extint_pending' flag of 'vcpuid'.  Panics if 'vcpuid' is out of
  * range.
  */
 int
 vm_extint_pending(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
 
 	return (vm->vcpu[vcpuid].extint_pending);
 }
 
 /*
  * Consume the pending ExtINT of 'vcpuid': clear 'extint_pending' and bump the
  * VCPU_EXTINT_COUNT statistic.  Panics if 'vcpuid' is out of range or if no
  * ExtINT was pending.
  */
 void
 vm_extint_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	/* The panic names this function (it used to say vm_extint_pending). */
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->extint_pending == 0)
 		panic("vm_extint_clear: inconsistent extint_pending state");
 
 	vcpu->extint_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
 }
 
 /*
  * Read capability 'type' of 'vcpu' into '*retval' via VMGETCAP().  Returns
  * EINVAL if either the vcpu id or the capability type is out of range,
  * otherwise whatever VMGETCAP() returns.
  */
 int
 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
 {
 	if (vcpu < 0 || vcpu >= vm->maxcpus ||
 	    type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (VMGETCAP(vm->cookie, vcpu, type, retval));
 }
 
 /*
  * Set capability 'type' of 'vcpu' to 'val' via VMSETCAP().  Returns EINVAL if
  * either the vcpu id or the capability type is out of range, otherwise
  * whatever VMSETCAP() returns.
  */
 int
 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
 {
 	if (vcpu < 0 || vcpu >= vm->maxcpus ||
 	    type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (VMSETCAP(vm->cookie, vcpu, type, val));
 }
 
 struct vlapic *
 vm_lapic(struct vm *vm, int cpu)
 {
 	return (vm->vcpu[cpu].vlapic);
 }
 
 /* Return the 'vioapic' pointer stored in 'vm'. */
 struct vioapic *
 vm_ioapic(struct vm *vm)
 {
 	struct vioapic *vioapic;
 
 	vioapic = vm->vioapic;
 	return (vioapic);
 }
 
 /* Return the 'vhpet' pointer stored in 'vm'. */
 struct vhpet *
 vm_hpet(struct vm *vm)
 {
 	struct vhpet *vhpet;
 
 	vhpet = vm->vhpet;
 	return (vhpet);
 }
 
 /*
  * Report whether the PCI device 'bus'/'slot'/'func' is listed as a passthru
  * device in the kernel environment.
  *
  * The variables "pptdevs", "pptdevs2" and "pptdevs3" are scanned in that
  * order.  Each value is a space-separated list of "bus/slot/func" triples in
  * decimal.  Returns true on the first exact match and false if no variable
  * lists the device (or none of them is set).
  */
 bool
 vmm_is_pptdev(int bus, int slot, int func)
 {
 	int b, f, i, n, s;
 	char *val, *cp, *cp2;
 	bool found;
 
 	/*
 	 * XXX
 	 * The length of an environment variable is limited to 128 bytes which
 	 * puts an upper limit on the number of passthru devices that may be
 	 * specified using a single environment variable.
 	 *
 	 * Work around this by scanning multiple environment variable
 	 * names instead of a single one - yuck!
 	 */
 	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
 
 	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
 	found = false;
 	for (i = 0; names[i] != NULL && !found; i++) {
 		/* 'val' keeps the start of the buffer so it can be freed. */
 		cp = val = kern_getenv(names[i]);
 		while (cp != NULL && *cp != '\0') {
 			/* Temporarily cut the string at the next separator. */
 			if ((cp2 = strchr(cp, ' ')) != NULL)
 				*cp2 = '\0';
 
 			/* Tokens that do not parse as three integers are skipped. */
 			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
 			if (n == 3 && bus == b && slot == s && func == f) {
 				/*
 				 * The separator is not restored on this path; the
 				 * buffer is released right after the loop.
 				 */
 				found = true;
 				break;
 			}
 		
 			/* Restore the separator and step past it. */
 			if (cp2 != NULL)
 				*cp2++ = ' ';
 
 			/* cp2 is NULL after the last token, which ends the loop. */
 			cp = cp2;
 		}
 		freeenv(val);
 	}
 	return (found);
 }
 
 /* Return the opaque 'iommu' pointer stored in 'vm'. */
 void *
 vm_iommu_domain(struct vm *vm)
 {
 	void *domain;
 
 	domain = vm->iommu;
 	return (domain);
 }
 
 /*
  * Change the state of 'vcpuid' to 'newstate' with the vcpu lock held.  The
  * transition itself is performed by vcpu_set_state_locked(), whose return
  * value is passed back to the caller.  Panics if 'vcpuid' is out of range.
  */
 int
 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
     bool from_idle)
 {
 	int error;
 	struct vcpu *vcpu;
 
 	/* The panic names this function (it used to say vm_set_run_state). */
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
 	vcpu_unlock(vcpu);
 
 	return (error);
 }
 
 /*
  * Return the current state of 'vcpuid'.  If 'hostcpu' is not NULL it also
  * receives the vcpu's 'hostcpu' field; both values are read under the vcpu
  * lock so they are mutually consistent.  Panics if 'vcpuid' is out of range.
  */
 enum vcpu_state
 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
 {
 	struct vcpu *vcpu;
 	enum vcpu_state state;
 
 	/* The panic names this function (it used to say vm_get_run_state). */
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	state = vcpu->state;
 	if (hostcpu != NULL)
 		*hostcpu = vcpu->hostcpu;
 	vcpu_unlock(vcpu);
 
 	return (state);
 }
 
 /*
  * Add 'vcpuid' to the VM's set of active cpus.  Returns EINVAL if 'vcpuid' is
  * out of range, EBUSY if it is already active, 0 on success.
  */
 int
 vm_activate_cpu(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if (!CPU_ISSET(vcpuid, &vm->active_cpus)) {
 		VCPU_CTR0(vm, vcpuid, "activated");
 		CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
 		return (0);
 	}
 
 	return (EBUSY);
 }
 
 /*
  * Put 'vcpuid' into the VM's debug cpu set and notify it.  A 'vcpuid' of -1
  * selects every currently active vcpu.  Returns EINVAL if 'vcpuid' is out of
  * range or names a vcpu that is not active, 0 otherwise.
  */
 int
 vm_suspend_cpu(struct vm *vm, int vcpuid)
 {
 	int cpu;
 
 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if (vcpuid != -1) {
 		/* Single vcpu: only an active vcpu may be suspended. */
 		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 			return (EINVAL);
 
 		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
 		vcpu_notify_event(vm, vcpuid, false);
 		return (0);
 	}
 
 	/* All vcpus: the debug set becomes a copy of the active set. */
 	vm->debug_cpus = vm->active_cpus;
 	for (cpu = 0; cpu < vm->maxcpus; cpu++) {
 		if (CPU_ISSET(cpu, &vm->active_cpus))
 			vcpu_notify_event(vm, cpu, false);
 	}
 	return (0);
 }
 
 /*
  * Remove 'vcpuid' from the VM's debug cpu set.  A 'vcpuid' of -1 empties the
  * whole set.  Returns EINVAL if 'vcpuid' is out of range or names a vcpu that
  * is not in the debug set, 0 otherwise.
  */
 int
 vm_resume_cpu(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if (vcpuid == -1) {
 		CPU_ZERO(&vm->debug_cpus);
 		return (0);
 	}
 
 	if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
 		return (EINVAL);
 
 	CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
 	return (0);
 }
 
 /*
  * Return non-zero if 'vcpuid' is a member of the VM's debug cpu set.
  * 'vcpuid' is not range-checked here.
  */
 int
 vcpu_debugged(struct vm *vm, int vcpuid)
 {
 	int debugged;
 
 	debugged = CPU_ISSET(vcpuid, &vm->debug_cpus);
 	return (debugged);
 }
 
 /* Return a by-value copy of the VM's active cpu set. */
 cpuset_t
 vm_active_cpus(struct vm *vm)
 {
 	cpuset_t set;
 
 	set = vm->active_cpus;
 	return (set);
 }
 
 /* Return a by-value copy of the VM's debug cpu set. */
 cpuset_t
 vm_debug_cpus(struct vm *vm)
 {
 	cpuset_t set;
 
 	set = vm->debug_cpus;
 	return (set);
 }
 
 /* Return a by-value copy of the VM's suspended cpu set. */
 cpuset_t
 vm_suspended_cpus(struct vm *vm)
 {
 	cpuset_t set;
 
 	set = vm->suspended_cpus;
 	return (set);
 }
 
 void *
 vcpu_stats(struct vm *vm, int vcpuid)
 {
 
 	return (vm->vcpu[vcpuid].stats);
 }
 
 int
 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
 {
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	*state = vm->vcpu[vcpuid].x2apic_state;
 
 	return (0);
 }
 
 /*
  * Record 'state' as the x2apic state of 'vcpuid' and pass it on to the vlapic
  * via vlapic_set_x2apic_state().  Returns EINVAL if 'vcpuid' is out of range
  * or 'state' is not below X2APIC_STATE_LAST, 0 otherwise.
  */
 int
 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
 {
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus ||
 	    state >= X2APIC_STATE_LAST)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].x2apic_state = state;
 	vlapic_set_x2apic_state(vm, vcpuid, state);
 
 	return (0);
 }
 
 /*
  * This function is called to ensure that a vcpu "sees" a pending event
  * as soon as possible:
  * - If the vcpu thread is sleeping then it is woken up.
  * - If the vcpu is running on a different host_cpu then an IPI will be directed
  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
  *
  * 'lapic_intr' selects how a running vcpu on another host cpu is kicked:
  * through vlapic_post_intr() when true, through a plain ipi_cpu() otherwise.
  * As the name suggests, callers in this file invoke it with the vcpu lock
  * held (see vcpu_notify_event()).
  */
 static void
 vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
 {
 	int hostcpu;
 
 	hostcpu = vcpu->hostcpu;
 	if (vcpu->state == VCPU_RUNNING) {
 		/* A running vcpu must be bound to a valid host cpu. */
 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
 		if (hostcpu != curcpu) {
 			if (lapic_intr) {
 				vlapic_post_intr(vcpu->vlapic, hostcpu,
 				    vmm_ipinum);
 			} else {
 				ipi_cpu(hostcpu, vmm_ipinum);
 			}
 		} else {
 			/*
 			 * If the 'vcpu' is running on 'curcpu' then it must
 			 * be sending a notification to itself (e.g. SELF_IPI).
 			 * The pending event will be picked up when the vcpu
 			 * transitions back to guest context.
 			 */
 		}
 	} else {
 		/* A vcpu that is not running must not claim a host cpu. */
 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
 		    "with hostcpu %d", vcpu->state, hostcpu));
 		/* Only a sleeping vcpu needs a wakeup; other states do nothing. */
 		if (vcpu->state == VCPU_SLEEPING)
 			wakeup_one(vcpu);
 	}
 }
 
 /*
  * Locked wrapper around vcpu_notify_event_locked(): take the lock of
  * 'vcpuid', deliver the notification, and drop the lock.  'vcpuid' is not
  * range-checked here.
  */
 void
 vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
 {
 	struct vcpu *target;
 
 	target = &vm->vcpu[vcpuid];
 
 	vcpu_lock(target);
 	vcpu_notify_event_locked(target, lapic_intr);
 	vcpu_unlock(target);
 }
 
 /* Return the 'vmspace' pointer stored in 'vm'. */
 struct vmspace *
 vm_get_vmspace(struct vm *vm)
 {
 	struct vmspace *vmspace;
 
 	vmspace = vm->vmspace;
 	return (vmspace);
 }
 
 /*
  * Translate a local APIC id into a vcpu id.
  *
  * XXX the apic id is assumed to be numerically identical to the vcpu id, so
  * 'apicid' is returned unchanged and 'vm' is not consulted.
  */
 int
 vm_apicid2vcpuid(struct vm *vm, int apicid)
 {
 	int vcpuid;
 
 	vcpuid = apicid;
 	return (vcpuid);
 }
 
 /*
  * Start a rendezvous in which 'func(arg)' is to be run for the vcpus in
  * 'dest', then take part in it on behalf of 'vcpuid' through
  * vm_handle_rendezvous().  'vcpuid' may be -1 (accepted by the assertion
  * below; presumably for callers that are not a vcpu - confirm with callers).
  *
  * Only one rendezvous can be outstanding per VM.  If another one is already
  * in progress the caller first services it and then retries.
  *
  * Returns the value of vm_handle_rendezvous(): a non-zero value from
  * servicing an earlier rendezvous is returned immediately, without starting
  * the new one.
  */
 int
 vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg)
 {
 	int error, i;
 
 	/*
 	 * Enforce that this function is called without any locks
 	 */
 	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
 	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
 
 restart:
 	mtx_lock(&vm->rendezvous_mtx);
 	if (vm->rendezvous_func != NULL) {
 		/*
 		 * If a rendezvous is already in progress then we need to
 		 * call the rendezvous handler in case this 'vcpuid' is one
 		 * of the targets of the rendezvous.
 		 */
 		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
 		/* The handler is called with the mutex dropped. */
 		mtx_unlock(&vm->rendezvous_mtx);
 		error = vm_handle_rendezvous(vm, vcpuid);
 		if (error != 0)
 			return (error);
 		goto restart;
 	}
 	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
 	    "rendezvous is still in progress"));
 
 	/*
 	 * Publish the new request while still holding the mutex.  A non-NULL
 	 * 'rendezvous_func' is what marks a rendezvous as in progress above.
 	 */
 	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
 	vm->rendezvous_req_cpus = dest;
 	CPU_ZERO(&vm->rendezvous_done_cpus);
 	vm->rendezvous_arg = arg;
 	vm->rendezvous_func = func;
 	mtx_unlock(&vm->rendezvous_mtx);
 
 	/*
 	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
 	 * vcpus so they handle the rendezvous as soon as possible.
 	 */
 	for (i = 0; i < vm->maxcpus; i++) {
 		if (CPU_ISSET(i, &dest))
 			vcpu_notify_event(vm, i, false);
 	}
 
 	return (vm_handle_rendezvous(vm, vcpuid));
 }
 
 /* Return the 'vatpic' pointer stored in 'vm'. */
 struct vatpic *
 vm_atpic(struct vm *vm)
 {
 	struct vatpic *vatpic;
 
 	vatpic = vm->vatpic;
 	return (vatpic);
 }
 
 /* Return the 'vatpit' pointer stored in 'vm'. */
 struct vatpit *
 vm_atpit(struct vm *vm)
 {
 	struct vatpit *vatpit;
 
 	vatpit = vm->vatpit;
 	return (vatpit);
 }
 
 /* Return the 'vpmtmr' pointer stored in 'vm'. */
 struct vpmtmr *
 vm_pmtmr(struct vm *vm)
 {
 	struct vpmtmr *vpmtmr;
 
 	vpmtmr = vm->vpmtmr;
 	return (vpmtmr);
 }
 
 /* Return the 'vrtc' pointer stored in 'vm'. */
 struct vrtc *
 vm_rtc(struct vm *vm)
 {
 	struct vrtc *vrtc;
 
 	vrtc = vm->vrtc;
 	return (vrtc);
 }
 
 /*
  * Map a segment encoding (0..5) to the matching guest segment register name,
  * in the order ES, CS, SS, DS, FS, GS.  An out-of-range 'seg' trips the
  * assertion.
  */
 enum vm_reg_name
 vm_segment_name(int seg)
 {
 	static enum vm_reg_name seg2reg[] = {
 		VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS,
 		VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS
 	};
 
 	KASSERT(seg >= 0 && seg < nitems(seg2reg),
 	    ("%s: invalid segment encoding %d", __func__, seg));
 
 	return (seg2reg[seg]);
 }
 
 void
 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     int num_copyinfo)
 {
 	int idx;
 
 	for (idx = 0; idx < num_copyinfo; idx++) {
 		if (copyinfo[idx].cookie != NULL)
 			vm_gpa_release(copyinfo[idx].cookie);
 	}
 	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
 }
 
 /*
  * Prepare 'copyinfo' for copying 'len' bytes to or from the guest linear
  * address 'gla'.
  *
  * The range is first split into per-page pieces, translating each piece with
  * vm_gla2gpa(); the resulting guest physical pages are then held with
  * vm_gpa_hold() so that vm_copyin()/vm_copyout() can use their host
  * addresses.
  *
  * Returns:
  *   - the vm_gla2gpa() error, or 0 with '*fault' set, if a translation did
  *     not complete (nothing is held in that case);
  *   - EFAULT if a page could not be held (everything is torn down again);
  *   - 0 with '*fault' cleared on success.
  * The caller must supply enough 'copyinfo' entries; this is only asserted.
  */
 int
 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
     int num_copyinfo, int *fault)
 {
 	struct vm_copyinfo *ci;
 	uint64_t gpa;
 	size_t chunk, left, pgoff;
 	void *cookie, *hva;
 	int count, error, i;
 
 	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
 
 	/* Pass 1: translate, one piece per guest page touched. */
 	count = 0;
 	for (left = len; left > 0; left -= chunk) {
 		KASSERT(count < num_copyinfo, ("insufficient vm_copyinfo"));
 		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
 		if (error || *fault)
 			return (error);
 		pgoff = gpa & PAGE_MASK;
 		chunk = min(left, PAGE_SIZE - pgoff);
 		ci = &copyinfo[count++];
 		ci->gpa = gpa;
 		ci->len = chunk;
 		gla += chunk;
 	}
 
 	/* Pass 2: hold every page; give everything back on the first failure. */
 	for (i = 0; i < count; i++) {
 		ci = &copyinfo[i];
 		hva = vm_gpa_hold(vm, vcpuid, ci->gpa, ci->len, prot,
 		    &cookie);
 		if (hva == NULL) {
 			vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
 			return (EFAULT);
 		}
 		ci->hva = hva;
 		ci->cookie = cookie;
 	}
 
 	*fault = 0;
 	return (0);
 }
 
 void
 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
     size_t len)
 {
 	char *dst;
 	int idx;
 	
 	dst = kaddr;
 	idx = 0;
 	while (len > 0) {
 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
 		len -= copyinfo[idx].len;
 		dst += copyinfo[idx].len;
 		idx++;
 	}
 }
 
 void
 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
     struct vm_copyinfo *copyinfo, size_t len)
 {
 	const char *src;
 	int idx;
 
 	src = kaddr;
 	idx = 0;
 	while (len > 0) {
 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
 		len -= copyinfo[idx].len;
 		src += copyinfo[idx].len;
 		idx++;
 	}
 }
 
 /*
  * Return the amount of in-use and wired memory for the VM. Since
  * these are global stats, only return the values for vCPU 0.
  */
 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
 VMM_STAT_DECLARE(VMM_MEM_WIRED);
 
 /*
  * Refresh VMM_MEM_RESIDENT with the resident size of the VM's vmspace in
  * bytes.  The value is VM-wide, so it is only stored for vcpu 0; calls for
  * any other vcpu do nothing.  'stat' is unused.
  */
 static void
 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 
 	if (vcpu != 0)
 		return;
 
 	vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
 	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
 }
 
 /*
  * Refresh VMM_MEM_WIRED with the wired size of the VM's pmap in bytes.  The
  * value is VM-wide, so it is only stored for vcpu 0; calls for any other
  * vcpu do nothing.  'stat' is unused.
  */
 static void
 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 
 	if (vcpu != 0)
 		return;
 
 	vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
 	    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
 }
 
 /* Bind each memory stat to the function above that refreshes its value. */
 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
Index: head/sys/amd64/vmm/vmm_dev.c
===================================================================
--- head/sys/amd64/vmm/vmm_dev.c	(revision 357973)
+++ head/sys/amd64/vmm/vmm_dev.c	(revision 357974)
@@ -1,1165 +1,1167 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/jail.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
 #include <sys/sysctl.h>
 #include <sys/libkern.h>
 #include <sys/ioccom.h>
 #include <sys/mman.h>
 #include <sys/uio.h>
 #include <sys/proc.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
 #include <machine/vmparam.h>
 #include <machine/vmm.h>
 #include <machine/vmm_instruction_emul.h>
 #include <machine/vmm_dev.h>
 
 #include "vmm_lapic.h"
 #include "vmm_stat.h"
 #include "vmm_mem.h"
 #include "io/ppt.h"
 #include "io/vatpic.h"
 #include "io/vioapic.h"
 #include "io/vhpet.h"
 #include "io/vrtc.h"
 
 /*
  * Per-segment state for a named (non-system) memory segment of a VM.
  * Instances are kept on the 'devmem' list of the owning vmmdev_softc.
  */
 struct devmem_softc {
 	int	segid;			/* memory segment id; lookup key in get_memseg() */
 	char	*name;			/* segment name, reported by get_memseg() */
 	struct cdev *cdev;		/* presumably the devmem device node - confirm */
 	struct vmmdev_softc *sc;	/* presumably the owning VM softc - confirm */
 	SLIST_ENTRY(devmem_softc) link;	/* linkage on vmmdev_softc.devmem */
 };
 
 /*
  * Per-VM state of the vmm device; vmmdev_lookup2() finds it through the
  * cdev's si_drv1 pointer.
  */
 struct vmmdev_softc {
 	struct vm	*vm;		/* vm instance cookie */
 	struct cdev	*cdev;
 	SLIST_ENTRY(vmmdev_softc) link;	/* linkage on the file-scope 'head' list */
 	SLIST_HEAD(, devmem_softc) devmem;	/* named memory segments of this VM */
 	int		flags;		/* VSC_* bits */
 };
 /* NOTE(review): presumably set while the softc is on 'head' - confirm. */
 #define	VSC_LINKED		0x01
 
 /* List of all vmmdev softcs; searched by name in vmmdev_lookup(). */
 static SLIST_HEAD(, vmmdev_softc) head;
 
 /* Prison permission bit(s) a jailed caller needs; see vmm_priv_check(). */
 static unsigned pr_allow_flag;
 /* Per the disabled assertion in vmmdev_lookup(), meant to protect 'head'. */
 static struct mtx vmmdev_mtx;
 
 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
 
 SYSCTL_DECL(_hw_vmm);
 
 static int vmm_priv_check(struct ucred *ucred);
 static int devmem_create_cdev(const char *vmname, int id, char *devmem);
 static void devmem_destroy(void *arg);
 
 /*
  * Decide whether the credential 'ucred' may use the vmm device.  Callers that
  * are not jailed are always allowed; jailed callers need 'pr_allow_flag' set
  * in their prison's 'pr_allow' mask.  Returns 0 if allowed, EPERM otherwise.
  */
 static int
 vmm_priv_check(struct ucred *ucred)
 {
 
 	if (!jailed(ucred))
 		return (0);
 
 	if ((ucred->cr_prison->pr_allow & pr_allow_flag) != 0)
 		return (0);
 
 	return (EPERM);
 }
 
 /*
  * "Lock" a single vcpu by moving it to VCPU_FROZEN.  Returns EINVAL if 'vcpu'
  * is out of range, otherwise the result of vcpu_set_state().
  */
 static int
 vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
 {
 
 	if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
 		return (EINVAL);
 
 	return (vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true));
 }
 
 static void
 vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
 {
 	enum vcpu_state state;
 
 	state = vcpu_get_state(sc->vm, vcpu, NULL);
 	if (state != VCPU_FROZEN) {
 		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
 		    vcpu, state);
 	}
 
 	vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
 }
 
 /*
  * Freeze every vcpu of the VM, in ascending id order.  If any vcpu cannot be
  * frozen, the ones frozen so far are released again and the error is
  * returned; on success all vcpus are left frozen and 0 is returned.
  */
 static int
 vcpu_lock_all(struct vmmdev_softc *sc)
 {
 	int error, vcpu;
 	uint16_t maxcpus;
 
 	/*
 	 * Start from success so that 'error' is well defined even if the loop
 	 * body never runs (maxcpus == 0).
 	 */
 	error = 0;
 	maxcpus = vm_get_maxcpus(sc->vm);
 	for (vcpu = 0; vcpu < maxcpus; vcpu++) {
 		error = vcpu_lock_one(sc, vcpu);
 		if (error)
 			break;
 	}
 
 	if (error) {
 		/* 'vcpu' is the one that failed; unwind those before it. */
 		while (--vcpu >= 0)
 			vcpu_unlock_one(sc, vcpu);
 	}
 
 	return (error);
 }
 
 /*
  * Release every vcpu of the VM, in ascending id order.  Each vcpu must be
  * frozen (vcpu_unlock_one() panics otherwise).
  */
 static void
 vcpu_unlock_all(struct vmmdev_softc *sc)
 {
 	uint16_t ncpus;
 	int id;
 
 	ncpus = vm_get_maxcpus(sc->vm);
 	id = 0;
 	while (id < ncpus) {
 		vcpu_unlock_one(sc, id);
 		id++;
 	}
 }
 
 /*
  * Find the softc of the VM called 'name' on the global list.  Returns NULL if
  * there is no such VM.
  */
 static struct vmmdev_softc *
 vmmdev_lookup(const char *name)
 {
 	struct vmmdev_softc *sc;
 
 #ifdef notyet	/* XXX kernel is not compiled with invariants */
 	mtx_assert(&vmmdev_mtx, MA_OWNED);
 #endif
 
 	SLIST_FOREACH(sc, &head, link) {
 		if (strcmp(name, vm_name(sc->vm)) == 0)
 			return (sc);
 	}
 
 	return (NULL);
 }
 
 /* Return the softc stored in the device's 'si_drv1' pointer (may be NULL). */
 static struct vmmdev_softc *
 vmmdev_lookup2(struct cdev *cdev)
 {
 	struct vmmdev_softc *sc;
 
 	sc = cdev->si_drv1;
 	return (sc);
 }
 
 /*
  * read(2)/write(2) handler for the vmm device: transfers data between the
  * caller's buffers ('uio') and guest physical memory, using 'uio_offset' as
  * the guest physical address.  The transfer is done in pieces that never
  * cross a page boundary.
  *
  * Returns 0 on success, the vmm_priv_check() error if the caller may not use
  * the device, ENXIO if the device has no softc, the vcpu_lock_one() error if
  * the last vcpu cannot be frozen, EFAULT for an address that cannot be held
  * (except reads below the system memory limit, see below), or a uiomove()
  * error.
  */
 static int
 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
 {
 	int error, off, c, prot;
 	vm_paddr_t gpa, maxaddr;
 	void *hpa, *cookie;
 	struct vmmdev_softc *sc;
 	uint16_t lastcpu;
 
 	error = vmm_priv_check(curthread->td_ucred);
 	if (error)
 		return (error);
 
 	sc = vmmdev_lookup2(cdev);
 	if (sc == NULL)
 		return (ENXIO);
 
 	/*
 	 * Get a read lock on the guest memory map by freezing any vcpu.
 	 * The vcpu with the highest id is the one used here.
 	 */
 	lastcpu = vm_get_maxcpus(sc->vm) - 1;
 	error = vcpu_lock_one(sc, lastcpu);
 	if (error)
 		return (error);
 
 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
 	while (uio->uio_resid > 0 && error == 0) {
 		/* Clamp this piece to the end of the current page. */
 		gpa = uio->uio_offset;
 		off = gpa & PAGE_MASK;
 		c = min(uio->uio_resid, PAGE_SIZE - off);
 
 		/*
 		 * The VM has a hole in its physical memory map. If we want to
 		 * use 'dd' to inspect memory beyond the hole we need to
 		 * provide bogus data for memory that lies in the hole.
 		 *
 		 * Since this device does not support lseek(2), dd(1) will
 		 * read(2) blocks of data to simulate the lseek(2).
 		 */
 		hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c,
 		    prot, &cookie);
 		if (hpa == NULL) {
 			/* Reads of a hole return zeroes; anything else fails. */
 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
 				error = uiomove(__DECONST(void *, zero_region),
 				    c, uio);
 			else
 				error = EFAULT;
 		} else {
 			/* The page is held only for the duration of the copy. */
 			error = uiomove(hpa, c, uio);
 			vm_gpa_release(cookie);
 		}
 	}
 	vcpu_unlock_one(sc, lastcpu);
 	return (error);
 }
 
 /* vm_memseg.name must have room for VM_MAX_SUFFIXLEN characters plus a NUL. */
 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
 
 static int
 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
 {
 	struct devmem_softc *dsc;
 	int error;
 	bool sysmem;
 
 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
 	if (error || mseg->len == 0)
 		return (error);
 
 	if (!sysmem) {
 		SLIST_FOREACH(dsc, &sc->devmem, link) {
 			if (dsc->segid == mseg->segid)
 				break;
 		}
 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
 		    __func__, mseg->segid));
 		error = copystr(dsc->name, mseg->name, sizeof(mseg->name),
 		    NULL);
 	} else {
 		bzero(mseg->name, sizeof(mseg->name));
 	}
 
 	return (error);
 }
 
 static int
 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
 {
 	char *name;
 	int error;
 	bool sysmem;
 
 	error = 0;
 	name = NULL;
 	sysmem = true;
 
 	/*
 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
 	 * by stripped off when devfs processes the full string.
 	 */
 	if (VM_MEMSEG_NAME(mseg)) {
 		sysmem = false;
 		name = malloc(sizeof(mseg->name), M_VMMDEV, M_WAITOK);
 		error = copystr(mseg->name, name, sizeof(mseg->name), NULL);
 		if (error)
 			goto done;
 	}
 
 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
 	if (error)
 		goto done;
 
 	if (VM_MEMSEG_NAME(mseg)) {
 		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
 		if (error)
 			vm_free_memseg(sc->vm, mseg->segid);
 		else
 			name = NULL;	/* freed when 'cdev' is destroyed */
 	}
 done:
 	free(name, M_VMMDEV);
 	return (error);
 }
 
 /*
  * Read 'count' registers of 'vcpu': for each i, register 'regnum[i]' is
  * stored in 'regval[i]'.  Stops at the first failure and returns its error;
  * returns 0 if every read succeeded (or 'count' is 0).
  */
 static int
 vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
     uint64_t *regval)
 {
 	int error, i;
 
 	error = 0;
 	i = 0;
 	while (i < count && error == 0) {
 		error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
 		i++;
 	}
 	return (error);
 }
 
 /*
  * Write 'count' registers of 'vcpu': for each i, register 'regnum[i]' is set
  * to 'regval[i]'.  Stops at the first failure and returns its error, leaving
  * the registers written so far modified; returns 0 if every write succeeded
  * (or 'count' is 0).
  */
 static int
 vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
     uint64_t *regval)
 {
 	int error, i;
 
 	error = 0;
 	i = 0;
 	while (i < count && error == 0) {
 		error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
 		i++;
 	}
 	return (error);
 }
 
 static int
 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	     struct thread *td)
 {
 	int error, vcpu, state_changed, size;
 	cpuset_t *cpuset;
 	struct vmmdev_softc *sc;
 	struct vm_register *vmreg;
 	struct vm_seg_desc *vmsegdesc;
 	struct vm_register_set *vmregset;
 	struct vm_run *vmrun;
 	struct vm_exception *vmexc;
 	struct vm_lapic_irq *vmirq;
 	struct vm_lapic_msi *vmmsi;
 	struct vm_ioapic_irq *ioapic_irq;
 	struct vm_isa_irq *isa_irq;
 	struct vm_isa_irq_trigger *isa_irq_trigger;
 	struct vm_capability *vmcap;
 	struct vm_pptdev *pptdev;
 	struct vm_pptdev_mmio *pptmmio;
 	struct vm_pptdev_msi *pptmsi;
 	struct vm_pptdev_msix *pptmsix;
 	struct vm_nmi *vmnmi;
 	struct vm_stats *vmstats;
 	struct vm_stat_desc *statdesc;
 	struct vm_x2apic *x2apic;
 	struct vm_gpa_pte *gpapte;
 	struct vm_suspend *vmsuspend;
 	struct vm_gla2gpa *gg;
 	struct vm_activate_cpu *vac;
 	struct vm_cpuset *vm_cpuset;
 	struct vm_intinfo *vmii;
 	struct vm_rtc_time *rtctime;
 	struct vm_rtc_data *rtcdata;
 	struct vm_memmap *mm;
 	struct vm_cpu_topology *topology;
 	uint64_t *regvals;
 	int *regnums;
 
 	error = vmm_priv_check(curthread->td_ucred);
 	if (error)
 		return (error);
 
 	sc = vmmdev_lookup2(cdev);
 	if (sc == NULL)
 		return (ENXIO);
 
 	vcpu = -1;
 	state_changed = 0;
 
 	/*
 	 * Some VMM ioctls can operate only on vcpus that are not running.
 	 */
 	switch (cmd) {
 	case VM_RUN:
 	case VM_GET_REGISTER:
 	case VM_SET_REGISTER:
 	case VM_GET_SEGMENT_DESCRIPTOR:
 	case VM_SET_SEGMENT_DESCRIPTOR:
 	case VM_GET_REGISTER_SET:
 	case VM_SET_REGISTER_SET:
 	case VM_INJECT_EXCEPTION:
 	case VM_GET_CAPABILITY:
 	case VM_SET_CAPABILITY:
 	case VM_PPTDEV_MSI:
 	case VM_PPTDEV_MSIX:
 	case VM_SET_X2APIC_STATE:
 	case VM_GLA2GPA:
 	case VM_GLA2GPA_NOFAULT:
 	case VM_ACTIVATE_CPU:
 	case VM_SET_INTINFO:
 	case VM_GET_INTINFO:
 	case VM_RESTART_INSTRUCTION:
 		/*
 		 * XXX fragile, handle with care
 		 * Assumes that the first field of the ioctl data is the vcpu.
 		 */
 		vcpu = *(int *)data;
 		error = vcpu_lock_one(sc, vcpu);
 		if (error)
 			goto done;
 		state_changed = 1;
 		break;
 
 	case VM_MAP_PPTDEV_MMIO:
 	case VM_BIND_PPTDEV:
 	case VM_UNBIND_PPTDEV:
 	case VM_ALLOC_MEMSEG:
 	case VM_MMAP_MEMSEG:
 	case VM_REINIT:
 		/*
 		 * ioctls that operate on the entire virtual machine must
 		 * prevent all vcpus from running.
 		 */
 		error = vcpu_lock_all(sc);
 		if (error)
 			goto done;
 		state_changed = 2;
 		break;
 
 	case VM_GET_MEMSEG:
 	case VM_MMAP_GETNEXT:
 		/*
 		 * Lock a vcpu to make sure that the memory map cannot be
 		 * modified while it is being inspected.
 		 */
 		vcpu = vm_get_maxcpus(sc->vm) - 1;
 		error = vcpu_lock_one(sc, vcpu);
 		if (error)
 			goto done;
 		state_changed = 1;
 		break;
 
 	default:
 		break;
 	}
 
 	switch(cmd) {
 	case VM_RUN:
 		vmrun = (struct vm_run *)data;
 		error = vm_run(sc->vm, vmrun);
 		break;
 	case VM_SUSPEND:
 		vmsuspend = (struct vm_suspend *)data;
 		error = vm_suspend(sc->vm, vmsuspend->how);
 		break;
 	case VM_REINIT:
 		error = vm_reinit(sc->vm);
 		break;
 	case VM_STAT_DESC: {
 		statdesc = (struct vm_stat_desc *)data;
 		error = vmm_stat_desc_copy(statdesc->index,
 					statdesc->desc, sizeof(statdesc->desc));
 		break;
 	}
 	case VM_STATS: {
 		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
 		vmstats = (struct vm_stats *)data;
 		getmicrotime(&vmstats->tv);
 		error = vmm_stat_copy(sc->vm, vmstats->cpuid,
 				      &vmstats->num_entries, vmstats->statbuf);
 		break;
 	}
 	case VM_PPTDEV_MSI:
 		pptmsi = (struct vm_pptdev_msi *)data;
 		error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
 				      pptmsi->bus, pptmsi->slot, pptmsi->func,
 				      pptmsi->addr, pptmsi->msg,
 				      pptmsi->numvec);
 		break;
 	case VM_PPTDEV_MSIX:
 		pptmsix = (struct vm_pptdev_msix *)data;
 		error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
 				       pptmsix->bus, pptmsix->slot, 
 				       pptmsix->func, pptmsix->idx,
 				       pptmsix->addr, pptmsix->msg,
 				       pptmsix->vector_control);
 		break;
 	case VM_MAP_PPTDEV_MMIO:
 		pptmmio = (struct vm_pptdev_mmio *)data;
 		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
 				     pptmmio->func, pptmmio->gpa, pptmmio->len,
 				     pptmmio->hpa);
 		break;
 	case VM_BIND_PPTDEV:
 		pptdev = (struct vm_pptdev *)data;
 		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
 					 pptdev->func);
 		break;
 	case VM_UNBIND_PPTDEV:
 		pptdev = (struct vm_pptdev *)data;
 		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
 					   pptdev->func);
 		break;
 	case VM_INJECT_EXCEPTION:
 		vmexc = (struct vm_exception *)data;
 		error = vm_inject_exception(sc->vm, vmexc->cpuid,
 		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
 		    vmexc->restart_instruction);
 		break;
 	case VM_INJECT_NMI:
 		vmnmi = (struct vm_nmi *)data;
 		error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
 		break;
 	case VM_LAPIC_IRQ:
 		vmirq = (struct vm_lapic_irq *)data;
 		error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
 		break;
 	case VM_LAPIC_LOCAL_IRQ:
 		vmirq = (struct vm_lapic_irq *)data;
 		error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
 		    vmirq->vector);
 		break;
 	case VM_LAPIC_MSI:
 		vmmsi = (struct vm_lapic_msi *)data;
 		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
 		break;
 	case VM_IOAPIC_ASSERT_IRQ:
 		ioapic_irq = (struct vm_ioapic_irq *)data;
 		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
 		break;
 	case VM_IOAPIC_DEASSERT_IRQ:
 		ioapic_irq = (struct vm_ioapic_irq *)data;
 		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
 		break;
 	case VM_IOAPIC_PULSE_IRQ:
 		ioapic_irq = (struct vm_ioapic_irq *)data;
 		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
 		break;
 	case VM_IOAPIC_PINCOUNT:
 		*(int *)data = vioapic_pincount(sc->vm);
 		break;
 	case VM_ISA_ASSERT_IRQ:
 		isa_irq = (struct vm_isa_irq *)data;
 		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
 		if (error == 0 && isa_irq->ioapic_irq != -1)
 			error = vioapic_assert_irq(sc->vm,
 			    isa_irq->ioapic_irq);
 		break;
 	case VM_ISA_DEASSERT_IRQ:
 		isa_irq = (struct vm_isa_irq *)data;
 		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
 		if (error == 0 && isa_irq->ioapic_irq != -1)
 			error = vioapic_deassert_irq(sc->vm,
 			    isa_irq->ioapic_irq);
 		break;
 	case VM_ISA_PULSE_IRQ:
 		isa_irq = (struct vm_isa_irq *)data;
 		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
 		if (error == 0 && isa_irq->ioapic_irq != -1)
 			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
 		break;
 	case VM_ISA_SET_IRQ_TRIGGER:
 		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
 		error = vatpic_set_irq_trigger(sc->vm,
 		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
 		break;
 	case VM_MMAP_GETNEXT:
 		mm = (struct vm_memmap *)data;
 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
 		break;
 	case VM_MMAP_MEMSEG:
 		mm = (struct vm_memmap *)data;
 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
 		    mm->len, mm->prot, mm->flags);
 		break;
 	case VM_ALLOC_MEMSEG:
 		error = alloc_memseg(sc, (struct vm_memseg *)data);
 		break;
 	case VM_GET_MEMSEG:
 		error = get_memseg(sc, (struct vm_memseg *)data);
 		break;
 	case VM_GET_REGISTER:
 		vmreg = (struct vm_register *)data;
 		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
 					&vmreg->regval);
 		break;
 	case VM_SET_REGISTER:
 		vmreg = (struct vm_register *)data;
 		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
 					vmreg->regval);
 		break;
 	case VM_SET_SEGMENT_DESCRIPTOR:
 		vmsegdesc = (struct vm_seg_desc *)data;
 		error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
 					vmsegdesc->regnum,
 					&vmsegdesc->desc);
 		break;
 	case VM_GET_SEGMENT_DESCRIPTOR:
 		vmsegdesc = (struct vm_seg_desc *)data;
 		error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
 					vmsegdesc->regnum,
 					&vmsegdesc->desc);
 		break;
 	case VM_GET_REGISTER_SET:
 		vmregset = (struct vm_register_set *)data;
 		if (vmregset->count > VM_REG_LAST) {
 			error = EINVAL;
 			break;
 		}
 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
 		    M_WAITOK);
 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
 		    M_WAITOK);
 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
 		    vmregset->count);
 		if (error == 0)
 			error = vm_get_register_set(sc->vm, vmregset->cpuid,
 			    vmregset->count, regnums, regvals);
 		if (error == 0)
 			error = copyout(regvals, vmregset->regvals,
 			    sizeof(regvals[0]) * vmregset->count);
 		free(regvals, M_VMMDEV);
 		free(regnums, M_VMMDEV);
 		break;
 	case VM_SET_REGISTER_SET:
 		vmregset = (struct vm_register_set *)data;
 		if (vmregset->count > VM_REG_LAST) {
 			error = EINVAL;
 			break;
 		}
 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
 		    M_WAITOK);
 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
 		    M_WAITOK);
 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
 		    vmregset->count);
 		if (error == 0)
 			error = copyin(vmregset->regvals, regvals,
 			    sizeof(regvals[0]) * vmregset->count);
 		if (error == 0)
 			error = vm_set_register_set(sc->vm, vmregset->cpuid,
 			    vmregset->count, regnums, regvals);
 		free(regvals, M_VMMDEV);
 		free(regnums, M_VMMDEV);
 		break;
 	case VM_GET_CAPABILITY:
 		vmcap = (struct vm_capability *)data;
 		error = vm_get_capability(sc->vm, vmcap->cpuid,
 					  vmcap->captype,
 					  &vmcap->capval);
 		break;
 	case VM_SET_CAPABILITY:
 		vmcap = (struct vm_capability *)data;
 		error = vm_set_capability(sc->vm, vmcap->cpuid,
 					  vmcap->captype,
 					  vmcap->capval);
 		break;
 	case VM_SET_X2APIC_STATE:
 		x2apic = (struct vm_x2apic *)data;
 		error = vm_set_x2apic_state(sc->vm,
 					    x2apic->cpuid, x2apic->state);
 		break;
 	case VM_GET_X2APIC_STATE:
 		x2apic = (struct vm_x2apic *)data;
 		error = vm_get_x2apic_state(sc->vm,
 					    x2apic->cpuid, &x2apic->state);
 		break;
 	case VM_GET_GPA_PMAP:
 		gpapte = (struct vm_gpa_pte *)data;
 		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
 				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
 		error = 0;
 		break;
 	case VM_GET_HPET_CAPABILITIES:
 		error = vhpet_getcap((struct vm_hpet_cap *)data);
 		break;
 	case VM_GLA2GPA: {
 		CTASSERT(PROT_READ == VM_PROT_READ);
 		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
 		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
 		gg = (struct vm_gla2gpa *)data;
 		error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
 		    gg->prot, &gg->gpa, &gg->fault);
 		KASSERT(error == 0 || error == EFAULT,
 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
 		break;
 	}
 	case VM_GLA2GPA_NOFAULT:
 		gg = (struct vm_gla2gpa *)data;
 		error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
 		    gg->gla, gg->prot, &gg->gpa, &gg->fault);
 		KASSERT(error == 0 || error == EFAULT,
 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
 		break;
 	case VM_ACTIVATE_CPU:
 		vac = (struct vm_activate_cpu *)data;
 		error = vm_activate_cpu(sc->vm, vac->vcpuid);
 		break;
 	case VM_GET_CPUS:
 		error = 0;
 		vm_cpuset = (struct vm_cpuset *)data;
 		size = vm_cpuset->cpusetsize;
 		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
 			error = ERANGE;
 			break;
 		}
 		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
 			*cpuset = vm_active_cpus(sc->vm);
 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
 			*cpuset = vm_suspended_cpus(sc->vm);
 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
 			*cpuset = vm_debug_cpus(sc->vm);
 		else
 			error = EINVAL;
 		if (error == 0)
 			error = copyout(cpuset, vm_cpuset->cpus, size);
 		free(cpuset, M_TEMP);
 		break;
 	case VM_SUSPEND_CPU:
 		vac = (struct vm_activate_cpu *)data;
 		error = vm_suspend_cpu(sc->vm, vac->vcpuid);
 		break;
 	case VM_RESUME_CPU:
 		vac = (struct vm_activate_cpu *)data;
 		error = vm_resume_cpu(sc->vm, vac->vcpuid);
 		break;
 	case VM_SET_INTINFO:
 		vmii = (struct vm_intinfo *)data;
 		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
 		break;
 	case VM_GET_INTINFO:
 		vmii = (struct vm_intinfo *)data;
 		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
 		    &vmii->info2);
 		break;
 	case VM_RTC_WRITE:
 		rtcdata = (struct vm_rtc_data *)data;
 		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
 		    rtcdata->value);
 		break;
 	case VM_RTC_READ:
 		rtcdata = (struct vm_rtc_data *)data;
 		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
 		    &rtcdata->value);
 		break;
 	case VM_RTC_SETTIME:
 		rtctime = (struct vm_rtc_time *)data;
 		error = vrtc_set_time(sc->vm, rtctime->secs);
 		break;
 	case VM_RTC_GETTIME:
 		error = 0;
 		rtctime = (struct vm_rtc_time *)data;
 		rtctime->secs = vrtc_get_time(sc->vm);
 		break;
 	case VM_RESTART_INSTRUCTION:
 		error = vm_restart_instruction(sc->vm, vcpu);
 		break;
 	case VM_SET_TOPOLOGY:
 		topology = (struct vm_cpu_topology *)data;
 		error = vm_set_topology(sc->vm, topology->sockets,
 		    topology->cores, topology->threads, topology->maxcpus);
 		break;
 	case VM_GET_TOPOLOGY:
 		topology = (struct vm_cpu_topology *)data;
 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
 		    &topology->threads, &topology->maxcpus);
 		error = 0;
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 
 	if (state_changed == 1)
 		vcpu_unlock_one(sc, vcpu);
 	else if (state_changed == 2)
 		vcpu_unlock_all(sc);
 
 done:
 	/*
 	 * Make sure that no handler returns a kernel-internal
 	 * error value to userspace.
 	 */
 	KASSERT(error == ERESTART || error >= 0,
 	    ("vmmdev_ioctl: invalid error return %d", error));
 	return (error);
 }
 
 /*
  * d_mmap_single handler for the VM device node.
  *
  * Translates the requested [*offset, *offset + mapsize) range, which is
  * interpreted as a guest physical address range, into the backing VM object
  * of the system memory segment that fully contains it.  On success *objp
  * holds a new reference to that object and *offset is rewritten to the
  * offset within the segment.  Executable mappings, empty or wrapping ranges,
  * and ranges backed by a non-system-memory segment yield EINVAL.
  */
 static int
 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
     struct vm_object **objp, int nprot)
 {
 	struct vmmdev_softc *sc;
 	vm_ooffset_t first, last, segoff;
 	vm_paddr_t gpa;
 	size_t len;
 	uint16_t lastcpu;
 	int error, segid;
 	bool sysmem;
 
 	error = vmm_priv_check(curthread->td_ucred);
 	if (error != 0)
 		return (error);
 
 	first = *offset;
 	last = first + mapsize;
 	if ((nprot & PROT_EXEC) != 0 || first < 0 || first >= last)
 		return (EINVAL);
 
 	/* No softc yet means the virtual machine is still being created. */
 	sc = vmmdev_lookup2(cdev);
 	if (sc == NULL)
 		return (EINVAL);
 
 	/*
 	 * Freezing any one vcpu acts as a read lock on the guest memory map;
 	 * the last vcpu is used here.
 	 */
 	lastcpu = vm_get_maxcpus(sc->vm) - 1;
 	error = vcpu_lock_one(sc, lastcpu);
 	if (error != 0)
 		return (error);
 
 	/* Walk the memory map until a mapping covers the whole request. */
 	for (gpa = 0;; gpa += len) {
 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
 		    NULL, NULL);
 		if (error != 0)
 			goto unlock;
 		if (first >= gpa && last <= gpa + len)
 			break;
 	}
 
 	error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
 	KASSERT(error == 0 && *objp != NULL,
 	    ("%s: invalid memory segment %d", __func__, segid));
 	if (!sysmem) {
 		error = EINVAL;
 	} else {
 		vm_object_reference(*objp);
 		*offset = segoff + (first - gpa);
 	}
 
 unlock:
 	vcpu_unlock_one(sc, lastcpu);
 	return (error);
 }
 
 /*
  * Final teardown of a VM softc.  'arg' is the struct vmmdev_softc to release.
  *
  * All vcpus are frozen first, then the devmem bookkeeping, the VM cdev (if
  * one is still attached), the VM itself and finally the softc are released.
  * The softc is unlinked from the global list only if it was ever linked.
  */
 static void
 vmmdev_destroy(void *arg)
 {
 	struct vmmdev_softc *sc;
 	struct devmem_softc *dsc;
 	int error;
 
 	sc = arg;
 
 	error = vcpu_lock_all(sc);
 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
 
 	/* Drain the devmem list; each entry's cdev must already be gone. */
 	for (;;) {
 		dsc = SLIST_FIRST(&sc->devmem);
 		if (dsc == NULL)
 			break;
 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
 		SLIST_REMOVE_HEAD(&sc->devmem, link);
 		free(dsc->name, M_VMMDEV);
 		free(dsc, M_VMMDEV);
 	}
 
 	if (sc->cdev != NULL) {
 		destroy_dev(sc->cdev);
 	}
 
 	if (sc->vm != NULL) {
 		vm_destroy(sc->vm);
 	}
 
 	if ((sc->flags & VSC_LINKED) != 0) {
 		mtx_lock(&vmmdev_mtx);
 		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
 		mtx_unlock(&vmmdev_mtx);
 	}
 
 	free(sc, M_VMMDEV);
 }
 
 /*
  * Sysctl handler that destroys the virtual machine whose name is written to
  * the OID.
  *
  * The caller must pass vmm_priv_check().  A read of the OID (no new value)
  * returns after sysctl_handle_string() without destroying anything.  A name
  * that does not resolve to a softc, or that resolves to a softc whose 'cdev'
  * is NULL, yields EINVAL.  Otherwise the devmem cdevs and the VM cdev are
  * scheduled for destruction and 0 is returned; the actual teardown runs
  * later from the destroy callbacks.
  */
 static int
 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
 {
 	struct devmem_softc *dsc;
 	struct vmmdev_softc *sc;
 	struct cdev *cdev;
 	char *buf;
 	int error, buflen;
 
 	error = vmm_priv_check(req->td->td_ucred);
 	if (error)
 		return (error);
 
 	/* Value reported on read; replaced by the user-supplied name on write. */
 	buflen = VM_MAX_NAMELEN + 1;
 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
 	strlcpy(buf, "beavis", buflen);
 	error = sysctl_handle_string(oidp, buf, buflen, req);
 	if (error != 0 || req->newptr == NULL)
 		goto out;
 
 	mtx_lock(&vmmdev_mtx);
 	sc = vmmdev_lookup(buf);
 	if (sc == NULL || sc->cdev == NULL) {
 		mtx_unlock(&vmmdev_mtx);
 		error = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
 	 * goes down to 0 so we should not do it again in the callback.
 	 *
 	 * Setting 'sc->cdev' to NULL is also used to indicate that the VM
 	 * is scheduled for destruction.
 	 */
 	cdev = sc->cdev;
 	sc->cdev = NULL;		
 	mtx_unlock(&vmmdev_mtx);
 
 	/*
 	 * Schedule all cdevs to be destroyed:
 	 *
 	 * - any new operations on the 'cdev' will return an error (ENXIO).
 	 *
 	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
 	 *   be destroyed and the callback will be invoked in a taskqueue
 	 *   context.
 	 *
 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
 	 */
 	SLIST_FOREACH(dsc, &sc->devmem, link) {
 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
 		destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
 	}
 	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
 	error = 0;
 
 out:
 	/* Single exit path: the name buffer is always released here. */
 	free(buf, M_VMMDEV);
 	return (error);
 }
 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
-	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
-	    NULL, 0, sysctl_vmm_destroy, "A", NULL);
+    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+    NULL, 0, sysctl_vmm_destroy, "A",
+    NULL);
 
 /*
  * Character device switch for the per-VM "vmm/<name>" nodes created by
  * sysctl_vmm_create().  Reads and writes share a single handler.
  */
 static struct cdevsw vmmdevsw = {
 	.d_version	= D_VERSION,
 	.d_name		= "vmmdev",
 	.d_read		= vmmdev_rw,
 	.d_write	= vmmdev_rw,
 	.d_ioctl	= vmmdev_ioctl,
 	.d_mmap_single	= vmmdev_mmap_single,
 };
 
 static int
 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
 {
 	struct vm *vm;
 	struct cdev *cdev;
 	struct vmmdev_softc *sc, *sc2;
 	char *buf;
 	int error, buflen;
 
 	error = vmm_priv_check(req->td->td_ucred);
 	if (error)
 		return (error);
 
 	buflen = VM_MAX_NAMELEN + 1;
 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
 	strlcpy(buf, "beavis", buflen);
 	error = sysctl_handle_string(oidp, buf, buflen, req);
 	if (error != 0 || req->newptr == NULL)
 		goto out;
 
 	mtx_lock(&vmmdev_mtx);
 	sc = vmmdev_lookup(buf);
 	mtx_unlock(&vmmdev_mtx);
 	if (sc != NULL) {
 		error = EEXIST;
 		goto out;
 	}
 
 	error = vm_create(buf, &vm);
 	if (error != 0)
 		goto out;
 
 	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
 	sc->vm = vm;
 	SLIST_INIT(&sc->devmem);
 
 	/*
 	 * Lookup the name again just in case somebody sneaked in when we
 	 * dropped the lock.
 	 */
 	mtx_lock(&vmmdev_mtx);
 	sc2 = vmmdev_lookup(buf);
 	if (sc2 == NULL) {
 		SLIST_INSERT_HEAD(&head, sc, link);
 		sc->flags |= VSC_LINKED;
 	}
 	mtx_unlock(&vmmdev_mtx);
 
 	if (sc2 != NULL) {
 		vmmdev_destroy(sc);
 		error = EEXIST;
 		goto out;
 	}
 
 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
 			   UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
 	if (error != 0) {
 		vmmdev_destroy(sc);
 		goto out;
 	}
 
 	mtx_lock(&vmmdev_mtx);
 	sc->cdev = cdev;
 	sc->cdev->si_drv1 = sc;
 	mtx_unlock(&vmmdev_mtx);
 
 out:
 	free(buf, M_VMMDEV);
 	return (error);
 }
 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
-	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON,
-	    NULL, 0, sysctl_vmm_create, "A", NULL);
+    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+    NULL, 0, sysctl_vmm_create, "A",
+    NULL);
 
 /*
  * One-time initialization for the vmm device layer: sets up the mutex
  * 'vmmdev_mtx' and registers the "vmm" jail allow flag, remembering the
  * returned flag value in 'pr_allow_flag'.
  */
 void
 vmmdev_init(void)
 {
 	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
 	    "Allow use of vmm in a jail.");
 }
 
 /*
  * Report whether the vmm device layer can be torn down: returns 0 when no
  * VM softc is registered on the global list and EBUSY otherwise.
  */
 int
 vmmdev_cleanup(void)
 {
 
 	return (SLIST_EMPTY(&head) ? 0 : EBUSY);
 }
 
 /*
  * d_mmap_single handler for a devmem node.
  *
  * Hands back the VM object backing the devmem segment, with a new
  * reference, provided the requested range [*offset, *offset + len) ends
  * within the segment.  Returns ENXIO while 'si_drv1' has not been set yet,
  * and EINVAL for executable mappings, empty or wrapping ranges, and ranges
  * that extend past the segment.
  */
 static int
 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
     struct vm_object **objp, int nprot)
 {
 	struct devmem_softc *dsc;
 	vm_ooffset_t first, last;
 	size_t seglen;
 	uint16_t lastcpu;
 	int error;
 	bool sysmem;
 
 	/* The cdev exists before 'si_drv1' is published; refuse until then. */
 	dsc = cdev->si_drv1;
 	if (dsc == NULL)
 		return (ENXIO);
 
 	first = *offset;
 	last = *offset + len;
 	if ((nprot & PROT_EXEC) != 0 || first < 0 || first >= last)
 		return (EINVAL);
 
 	/* Freeze the last vcpu while the segment is looked up. */
 	lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
 	error = vcpu_lock_one(dsc->sc, lastcpu);
 	if (error != 0)
 		return (error);
 
 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
 	KASSERT(error == 0 && !sysmem && *objp != NULL,
 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
 
 	vcpu_unlock_one(dsc->sc, lastcpu);
 
 	if (seglen < last)
 		return (EINVAL);
 
 	vm_object_reference(*objp);
 	return (0);
 }
 
 /*
  * Character device switch for the "vmm.io/<vm>.<name>" devmem nodes; only
  * mmap is provided.
  */
 static struct cdevsw devmemsw = {
 	.d_version	= D_VERSION,
 	.d_name		= "devmem",
 	.d_mmap_single	= devmem_mmap_single,
 };
 
 /*
  * Create the "vmm.io/<vmname>.<devname>" device node for memory segment
  * 'segid' and attach its bookkeeping to the owning VM's devmem list.
  *
  * Returns 0 on success, the make_dev_p() error if the node cannot be
  * created, or ENODEV if the VM's softc currently has no cdev.
  *
  * The 'devname' pointer itself is stored in the new entry (it is not
  * copied), and vmmdev_destroy() later releases it with free(..., M_VMMDEV).
  * NOTE(review): this presumably means the caller must pass an
  * M_VMMDEV-allocated string and still owns it on failure -- confirm against
  * the caller.
  */
 static int
 devmem_create_cdev(const char *vmname, int segid, char *devname)
 {
 	struct devmem_softc *dsc;
 	struct vmmdev_softc *sc;
 	struct cdev *cdev;
 	int error;
 
 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
 	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
 	if (error)
 		return (error);
 
 	/* Allocate before taking the mutex; M_WAITOK may sleep. */
 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
 
 	mtx_lock(&vmmdev_mtx);
 	sc = vmmdev_lookup(vmname);
 	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
 	if (sc->cdev == NULL) {
 		/* virtual machine is being created or destroyed */
 		mtx_unlock(&vmmdev_mtx);
 		free(dsc, M_VMMDEV);
 		destroy_dev_sched_cb(cdev, NULL, 0);
 		return (ENODEV);
 	}
 
 	dsc->segid = segid;
 	dsc->name = devname;
 	dsc->cdev = cdev;
 	dsc->sc = sc;
 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
 	mtx_unlock(&vmmdev_mtx);
 
 	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
 	cdev->si_drv1 = dsc;
 	return (0);
 }
 
 /*
  * Destroy callback registered for each devmem cdev by sysctl_vmm_destroy().
  * 'arg' is the struct devmem_softc of the cdev being destroyed.
  *
  * Only the back pointers are cleared here; the entry itself stays on the
  * VM's devmem list and is released by vmmdev_destroy(), which asserts that
  * 'cdev' has already been cleared.
  */
 static void
 devmem_destroy(void *arg)
 {
 	struct devmem_softc *dsc = arg;
 
 	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
 	dsc->cdev = NULL;
 	dsc->sc = NULL;
 }
Index: head/sys/amd64/vmm/x86.c
===================================================================
--- head/sys/amd64/vmm/x86.c	(revision 357973)
+++ head/sys/amd64/vmm/x86.c	(revision 357974)
@@ -1,618 +1,619 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <machine/clock.h>
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/segments.h>
 #include <machine/specialreg.h>
 
 #include <machine/vmm.h>
 
 #include "vmm_host.h"
 #include "vmm_ktr.h"
 #include "vmm_util.h"
 #include "x86.h"
 
 SYSCTL_DECL(_hw_vmm);
-static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
+static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+    NULL);
 
 /*
  * Highest leaf in the 0x40000000 range handled by x86_emulate_cpuid();
  * requests above it are clamped to this value, and it is also the value
  * returned in %eax for leaf 0x40000000.
  */
 #define	CPUID_VM_HIGH		0x40000000
 
 /*
  * 12-byte identification string returned in %ebx, %ecx and %edx (four
  * bytes each) for leaf 0x40000000.  Exactly fills the array, so there is
  * no terminating NUL.
  */
 static const char bhyve_id[12] = "bhyve bhyve ";
 
 /* Incremented each time x86_emulate_cpuid() takes its default-leaf path. */
 static uint64_t bhyve_xcpuids;
 SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
     "Number of times an unknown cpuid leaf was accessed");
 
 /* Compatibility tunables, only built for older __FreeBSD_version values. */
 #if __FreeBSD_version < 1200060	/* Remove after 11 EOL helps MFCing */
 extern u_int threads_per_core;
 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
     &threads_per_core, 0, NULL);
 
 extern u_int cores_per_package;
 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
     &cores_per_package, 0, NULL);
 #endif
 
 /*
  * When zero, x86_emulate_cpuid() reports all-zero topology data for every
  * sub-leaf of CPUID leaf 0xB on Intel hosts.  Defaults to 1.
  */
 static int cpuid_leaf_b = 1;
 SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
     &cpuid_leaf_b, 0, NULL);
 
 /*
  * Return ceil(log2(x)), i.e. the log2 of 'x' rounded up to the next power
  * of two.  Returns -1 if the argument is zero.
  *
  * For x >= 1 this equals fls(x - 1).  Computing it that way avoids
  * shifting 'x' left, which would wrap for a non-power-of-two 'x' above
  * 2^31 and produce a wrong result.
  */
 static __inline int
 log2(u_int x)
 {
 
 	return (x == 0 ? -1 : fls(x - 1));
 }
 
 /*
  * Emulate the CPUID instruction for vcpu 'vcpu_id' of 'vm'.
  *
  * On entry *eax holds the requested leaf and *ecx the sub-leaf.  On return
  * *eax, *ebx, *ecx and *edx hold the values to present to the guest.  Some
  * leaves are passed through from the host, some are passed through with
  * feature bits masked, and the topology-related leaves are synthesized
  * from vm_get_topology().  Leaves that are not handled explicitly are
  * passed through and counted in 'bhyve_xcpuids'.
  *
  * Always returns 1.
  */
 int
 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
 {
 	const struct xsave_limits *limits;
 	uint64_t cr4;
 	int error, enable_invpcid, level, width, x2apic_id;
 	unsigned int func, regs[4], logical_cpus;
 	enum x2apic_state x2apic_state;
 	uint16_t cores, maxcpus, sockets, threads;
 
 	VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);
 
 	/*
 	 * Requests for invalid CPUID levels should map to the highest
 	 * available level instead.
 	 */
 	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
 		if (*eax > cpu_exthigh)
 			*eax = cpu_exthigh;
 	} else if (*eax >= 0x40000000) {
 		if (*eax > CPUID_VM_HIGH)
 			*eax = CPUID_VM_HIGH;
 	} else if (*eax > cpu_high) {
 		*eax = cpu_high;
 	}
 
 	func = *eax;
 
 	/*
 	 * In general the approach used for CPU topology is to
 	 * advertise a flat topology where all CPUs are packages with
 	 * no multi-core or SMT.
 	 */
 	switch (func) {
 		/*
 		 * Pass these through to the guest
 		 */
 		case CPUID_0000_0000:
 		case CPUID_0000_0002:
 		case CPUID_0000_0003:
 		case CPUID_8000_0000:
 		case CPUID_8000_0002:
 		case CPUID_8000_0003:
 		case CPUID_8000_0004:
 		case CPUID_8000_0006:
 			cpuid_count(*eax, *ecx, regs);
 			break;
 		case CPUID_8000_0008:
 			cpuid_count(*eax, *ecx, regs);
 			if (vmm_is_svm()) {
 				/*
 				 * As on Intel (0000_0007:0, EDX), mask out
 				 * unsupported or unsafe AMD extended features
 				 * (8000_0008 EBX).
 				 */
 				regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
 				    AMDFEID_XSAVEERPTR);
 
 				vm_get_topology(vm, &sockets, &cores, &threads,
 				    &maxcpus);
 				/*
 				 * Here, width is ApicIdCoreIdSize, present on
 				 * at least Family 15h and newer.  It
 				 * represents the "number of bits in the
 				 * initial apicid that indicate thread id
 				 * within a package."
 				 *
 				 * Our topo_probe_amd() uses it for
 				 * pkg_id_shift and other OSes may rely on it.
 				 */
 				width = MIN(0xF, log2(threads * cores));
 				if (width < 0x4)
 					width = 0;
 				logical_cpus = MIN(0xFF, threads * cores - 1);
 				regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus;
 			}
 			break;
 
 		case CPUID_8000_0001:
 			cpuid_count(*eax, *ecx, regs);
 
 			/*
 			 * Hide SVM from guest.
 			 */
 			regs[2] &= ~AMDID2_SVM;
 
 			/*
 			 * Don't advertise extended performance counter MSRs
 			 * to the guest.
 			 */
 			regs[2] &= ~AMDID2_PCXC;
 			regs[2] &= ~AMDID2_PNXC;
 			regs[2] &= ~AMDID2_PTSCEL2I;
 
 			/*
 			 * Don't advertise Instruction Based Sampling feature.
 			 */
 			regs[2] &= ~AMDID2_IBS;
 
 			/* NodeID MSR not available */
 			regs[2] &= ~AMDID2_NODE_ID;
 
 			/* Don't advertise the OS visible workaround feature */
 			regs[2] &= ~AMDID2_OSVW;
 
 			/* Hide mwaitx/monitorx capability from the guest */
 			regs[2] &= ~AMDID2_MWAITX;
 
 			/*
 			 * Hide rdtscp/ia32_tsc_aux until we know how
 			 * to deal with them.
 			 */
 			regs[3] &= ~AMDID_RDTSCP;
 			break;
 
 		case CPUID_8000_0007:
 			/*
 			 * AMD uses this leaf to advertise the processor's
 			 * power monitoring and RAS capabilities. These
 			 * features are hardware-specific and exposing
 			 * them to a guest doesn't make a lot of sense.
 			 *
 			 * Intel uses this leaf only to advertise the
 			 * "Invariant TSC" feature with all other bits
 			 * being reserved (set to zero).
 			 */
 			regs[0] = 0;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 
 			/*
 			 * "Invariant TSC" can be advertised to the guest if:
 			 * - host TSC frequency is invariant
 			 * - host TSCs are synchronized across physical cpus
 			 *
 			 * XXX This still falls short because the vcpu
 			 * can observe the TSC moving backwards as it
 			 * migrates across physical cpus. But at least
 			 * it should discourage the guest from using the
 			 * TSC to keep track of time.
 			 */
 			if (tsc_is_invariant && smp_tsc)
 				regs[3] |= AMDPM_TSC_INVARIANT;
 			break;
 
 		case CPUID_8000_001D:
 			/* AMD Cache topology, like 0000_0004 for Intel. */
 			if (!vmm_is_svm())
 				goto default_leaf;
 
 			/*
 			 * Similar to Intel, generate a fictitious cache
 			 * topology for the guest with L3 shared by the
 			 * package, and L1 and L2 local to a core.
 			 */
 			vm_get_topology(vm, &sockets, &cores, &threads,
 			    &maxcpus);
 			switch (*ecx) {
 			case 0:
 				logical_cpus = threads;
 				level = 1;
 				func = 1;	/* data cache */
 				break;
 			case 1:
 				logical_cpus = threads;
 				level = 2;
 				func = 3;	/* unified cache */
 				break;
 			case 2:
 				logical_cpus = threads * cores;
 				level = 3;
 				func = 3;	/* unified cache */
 				break;
 			default:
 				logical_cpus = 0;
 				level = 0;
 				func = 0;
 				break;
 			}
 
 			if (level == 0) {
 				/*
 				 * No cache is described at this sub-leaf.
 				 * Return an all-zero descriptor rather than
 				 * computing "0 - 1" sharing CPUs, which would
 				 * wrap and put non-zero bits in %eax.
 				 */
 				regs[0] = 0;
 				regs[1] = 0;
 			} else {
 				logical_cpus = MIN(0xfff, logical_cpus - 1);
 				regs[0] = (logical_cpus << 14) | (1 << 8) |
 				    (level << 5) | func;
 				regs[1] = CACHE_LINE_SIZE - 1;
 			}
 			regs[2] = 0;
 			regs[3] = 0;
 			break;
 
 		case CPUID_8000_001E:
 			/*
 			 * AMD Family 16h+ and Hygon Family 18h additional
 			 * identifiers.
 			 */
 			if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16)
 				goto default_leaf;
 
 			vm_get_topology(vm, &sockets, &cores, &threads,
 			    &maxcpus);
 			regs[0] = vcpu_id;
 			threads = MIN(0xFF, threads - 1);
 			regs[1] = (threads << 8) |
 			    (vcpu_id >> log2(threads + 1));
 			/*
 			 * XXX Bhyve topology cannot yet represent >1 node per
 			 * processor.
 			 */
 			regs[2] = 0;
 			regs[3] = 0;
 			break;
 
 		case CPUID_0000_0001:
 			do_cpuid(1, regs);
 
 			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
 			if (error) {
 				panic("x86_emulate_cpuid: error %d "
 				      "fetching x2apic state", error);
 			}
 
 			/*
 			 * Override the APIC ID only in ebx
 			 */
 			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
 			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
 
 			/*
 			 * Don't expose VMX, SpeedStep (EST), TM2 or SMX
 			 * capability.
 			 * Advertise x2APIC capability and Hypervisor guest.
 			 */
 			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
 			regs[2] &= ~(CPUID2_SMX);
 
 			regs[2] |= CPUID2_HV;
 
 			if (x2apic_state != X2APIC_DISABLED)
 				regs[2] |= CPUID2_X2APIC;
 			else
 				regs[2] &= ~CPUID2_X2APIC;
 
 			/*
 			 * Only advertise CPUID2_XSAVE in the guest if
 			 * the host is using XSAVE.
 			 */
 			if (!(regs[2] & CPUID2_OSXSAVE))
 				regs[2] &= ~CPUID2_XSAVE;
 
 			/*
 			 * If CPUID2_XSAVE is being advertised and the
 			 * guest has set CR4_XSAVE, set
 			 * CPUID2_OSXSAVE.
 			 */
 			regs[2] &= ~CPUID2_OSXSAVE;
 			if (regs[2] & CPUID2_XSAVE) {
 				error = vm_get_register(vm, vcpu_id,
 				    VM_REG_GUEST_CR4, &cr4);
 				if (error)
 					panic("x86_emulate_cpuid: error %d "
 					      "fetching %%cr4", error);
 				if (cr4 & CR4_XSAVE)
 					regs[2] |= CPUID2_OSXSAVE;
 			}
 
 			/*
 			 * Hide monitor/mwait until we know how to deal with
 			 * these instructions.
 			 */
 			regs[2] &= ~CPUID2_MON;
 
 			/*
 			 * Hide the performance and debug features.
 			 */
 			regs[2] &= ~CPUID2_PDCM;
 
 			/*
 			 * No TSC deadline support in the APIC yet
 			 */
 			regs[2] &= ~CPUID2_TSCDLT;
 
 			/*
 			 * Hide thermal monitoring
 			 */
 			regs[3] &= ~(CPUID_ACPI | CPUID_TM);
 
 			/*
 			 * Hide the debug store capability.
 			 */
 			regs[3] &= ~CPUID_DS;
 
 			/*
 			 * Advertise the Machine Check and MTRR capability.
 			 *
 			 * Some guest OSes (e.g. Windows) will not boot if
 			 * these features are absent.
 			 */
 			regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);
 
 			vm_get_topology(vm, &sockets, &cores, &threads,
 			    &maxcpus);
 			logical_cpus = threads * cores;
 			regs[1] &= ~CPUID_HTT_CORES;
 			regs[1] |= (logical_cpus & 0xff) << 16;
 			regs[3] |= CPUID_HTT;
 			break;
 
 		case CPUID_0000_0004:
 			cpuid_count(*eax, *ecx, regs);
 
 			if (regs[0] || regs[1] || regs[2] || regs[3]) {
 				vm_get_topology(vm, &sockets, &cores, &threads,
 				    &maxcpus);
 				regs[0] &= 0x3ff;
 				regs[0] |= (cores - 1) << 26;
 				/*
 				 * Cache topology:
 				 * - L1 and L2 are shared only by the logical
 				 *   processors in a single core.
 				 * - L3 and above are shared by all logical
 				 *   processors in the package.
 				 */
 				logical_cpus = threads;
 				level = (regs[0] >> 5) & 0x7;
 				if (level >= 3)
 					logical_cpus *= cores;
 				regs[0] |= (logical_cpus - 1) << 14;
 			}
 			break;
 
 		case CPUID_0000_0007:
 			regs[0] = 0;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 
 			/* leaf 0 */
 			if (*ecx == 0) {
 				cpuid_count(*eax, *ecx, regs);
 
 				/* Only leaf 0 is supported */
 				regs[0] = 0;
 
 				/*
 				 * Expose known-safe features.
 				 */
 				regs[1] &= (CPUID_STDEXT_FSGSBASE |
 				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
 				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
 				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
 				    CPUID_STDEXT_AVX512F |
 				    CPUID_STDEXT_RDSEED |
 				    CPUID_STDEXT_AVX512PF |
 				    CPUID_STDEXT_AVX512ER |
 				    CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA);
 				regs[2] = 0;
 				regs[3] &= CPUID_STDEXT3_MD_CLEAR;
 
 				/* Advertise INVPCID if it is enabled. */
 				error = vm_get_capability(vm, vcpu_id,
 				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
 				if (error == 0 && enable_invpcid)
 					regs[1] |= CPUID_STDEXT_INVPCID;
 			}
 			break;
 
 		case CPUID_0000_0006:
 			regs[0] = CPUTPM1_ARAT;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 			break;
 
 		case CPUID_0000_000A:
 			/*
 			 * Handle the access, but report 0 for
 			 * all options
 			 */
 			regs[0] = 0;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 			break;
 
 		case CPUID_0000_000B:
 			/*
 			 * Intel processor topology enumeration
 			 */
 			if (vmm_is_intel()) {
 				vm_get_topology(vm, &sockets, &cores, &threads,
 				    &maxcpus);
 				if (*ecx == 0) {
 					logical_cpus = threads;
 					width = log2(logical_cpus);
 					level = CPUID_TYPE_SMT;
 					x2apic_id = vcpu_id;
 				}
 
 				if (*ecx == 1) {
 					logical_cpus = threads * cores;
 					width = log2(logical_cpus);
 					level = CPUID_TYPE_CORE;
 					x2apic_id = vcpu_id;
 				}
 
 				if (!cpuid_leaf_b || *ecx >= 2) {
 					width = 0;
 					logical_cpus = 0;
 					level = 0;
 					x2apic_id = 0;
 				}
 
 				regs[0] = width & 0x1f;
 				regs[1] = logical_cpus & 0xffff;
 				regs[2] = (level << 8) | (*ecx & 0xff);
 				regs[3] = x2apic_id;
 			} else {
 				regs[0] = 0;
 				regs[1] = 0;
 				regs[2] = 0;
 				regs[3] = 0;
 			}
 			break;
 
 		case CPUID_0000_000D:
 			limits = vmm_get_xsave_limits();
 			if (!limits->xsave_enabled) {
 				regs[0] = 0;
 				regs[1] = 0;
 				regs[2] = 0;
 				regs[3] = 0;
 				break;
 			}
 
 			cpuid_count(*eax, *ecx, regs);
 			switch (*ecx) {
 			case 0:
 				/*
 				 * Only permit the guest to use bits
 				 * that are active in the host in
 				 * %xcr0.  Also, claim that the
 				 * maximum save area size is
 				 * equivalent to the host's current
 				 * save area size.  Since this runs
 				 * "inside" of vmrun(), it runs with
 				 * the guest's xcr0, so the current
 				 * save area size is correct as-is.
 				 */
 				regs[0] &= limits->xcr0_allowed;
 				regs[2] = limits->xsave_max_size;
 				regs[3] &= (limits->xcr0_allowed >> 32);
 				break;
 			case 1:
 				/* Only permit XSAVEOPT. */
 				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
 				regs[1] = 0;
 				regs[2] = 0;
 				regs[3] = 0;
 				break;
 			default:
 				/*
 				 * If the leaf is for a permitted feature,
 				 * pass through as-is, otherwise return
 				 * all zeroes.
 				 *
 				 * The sub-leaf number comes from the guest,
 				 * so bound it before using it as a shift
 				 * count: shifting a 64-bit value by 64 or
 				 * more is undefined.  Such sub-leaves cannot
 				 * name a permitted feature bit.
 				 */
 				if (*ecx >= 64 ||
 				    !(limits->xcr0_allowed & (1ul << *ecx))) {
 					regs[0] = 0;
 					regs[1] = 0;
 					regs[2] = 0;
 					regs[3] = 0;
 				}
 				break;
 			}
 			break;
 
 		case 0x40000000:
 			regs[0] = CPUID_VM_HIGH;
 			bcopy(bhyve_id, &regs[1], 4);
 			bcopy(bhyve_id + 4, &regs[2], 4);
 			bcopy(bhyve_id + 8, &regs[3], 4);
 			break;
 
 		default:
 default_leaf:
 			/*
 			 * The leaf value has already been clamped so
 			 * simply pass this through, keeping count of
 			 * how many unhandled leaf values have been seen.
 			 */
 			atomic_add_long(&bhyve_xcpuids, 1);
 			cpuid_count(*eax, *ecx, regs);
 			break;
 	}
 
 	*eax = regs[0];
 	*ebx = regs[1];
 	*ecx = regs[2];
 	*edx = regs[3];
 
 	return (1);
 }
 
 /*
  * Report whether CPUID capability 'cap' is available.  The 'vm' and
  * 'vcpuid' arguments are not consulted: the answer is taken directly from
  * the host's 'amd_feature' / 'amd_feature2' words.  An unknown 'cap' value
  * panics.
  */
 bool
 vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
 {
 	bool rv;
 
 	KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
 	    __func__, cap));
 
 	/* For now each capability simply mirrors the host feature bit. */
 	switch (cap) {
 	case VCC_NO_EXECUTE:
 		rv = (amd_feature & AMDID_NX) != 0;
 		break;
 	case VCC_FFXSR:
 		rv = (amd_feature & AMDID_FFXSR) != 0;
 		break;
 	case VCC_TCE:
 		rv = (amd_feature2 & AMDID2_TCE) != 0;
 		break;
 	default:
 		rv = false;
 		panic("%s: unknown vm_cpu_capability %d", __func__, cap);
 	}
 	return (rv);
 }