No OneTemporary
Actions

Size

321 KB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: head/lib/libvmmapi/vmmapi.c
	===================================================================
	--- head/lib/libvmmapi/vmmapi.c (revision 332156)
	+++ head/lib/libvmmapi/vmmapi.c (revision 332157)
	@@ -1,1519 +1,1551 @@
	/*-
	* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	*
	* Copyright (c) 2011 NetApp, Inc.
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* $FreeBSD$
	*/

	#include <sys/cdefs.h>
	__FBSDID("$FreeBSD$");

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/_iovec.h>
	#include <sys/cpuset.h>

	#include <x86/segments.h>
	#include <machine/specialreg.h>

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <assert.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>

	#include <libutil.h>

	#include <machine/vmm.h>
	#include <machine/vmm_dev.h>

	#include "vmmapi.h"

	/* Byte-count helpers; the UL operand makes each result an unsigned long. */
	#define MB (1024 * 1024UL)
	#define GB (1024 * 1024 * 1024UL)

	/*
	 * Size of the guard region before and after the virtual address space
	 * mapping the guest physical memory. This must be a multiple of the
	 * superpage size for performance reasons.
	 */
	#define VM_MMAP_GUARD_SIZE (4 * MB)

	/* Protection sets passed to mmap(2) / the memseg mapping ioctl. */
	#define PROT_RW (PROT_READ | PROT_WRITE)
	#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC)

	/*
	 * Per-VM handle allocated by vm_open() and released by vm_destroy().
	 */
	struct vmctx {
	int fd; /* descriptor from vm_device_open(); -1 until it succeeds */
	uint32_t lowmem_limit; /* largest 'lowmem' size; vm_open() sets 3GB */
	int memflags; /* VM_MEM_F_* bits consulted when mapping memory */
	size_t lowmem; /* bytes of guest memory mapped starting at gpa 0 */
	size_t highmem; /* bytes of guest memory mapped starting at gpa 4GB */
	char *baseaddr; /* host address corresponding to gpa 0 */
	char *name; /* VM name; vm_open() stores it right after the struct */
	};

	/*
	 * Create / destroy the VM named by the string 'x' by writing the name to
	 * the hw.vmm.create / hw.vmm.destroy sysctl. Each macro expands to the
	 * sysctlbyname() call, so its value is that call's return value.
	 * Note that 'x' is evaluated twice.
	 */
	#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
	#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))

	/*
	 * Open the device node /dev/vmm/<name> for reading and writing.
	 *
	 * Returns the open descriptor, or -1 with errno set when the path buffer
	 * cannot be allocated or open(2) fails.
	 */
	static int
	vm_device_open(const char *name)
	{
		char *vmfile;
		size_t len;
		int fd;

		len = strlen("/dev/vmm/") + strlen(name) + 1;
		vmfile = malloc(len);
		if (vmfile == NULL) {
			/* Fail cleanly; an assert(3) disappears under NDEBUG. */
			errno = ENOMEM;
			return (-1);
		}
		snprintf(vmfile, len, "/dev/vmm/%s", name);

		/* Open the device file */
		fd = open(vmfile, O_RDWR, 0);

		free(vmfile);
		return (fd);
	}

	/*
	 * Create a VM called 'name' through the CREATE() sysctl wrapper and
	 * return whatever that wrapper evaluates to.
	 */
	int
	vm_create(const char *name)
	{
		int error;

		error = CREATE((char *)name);
		return (error);
	}

	/*
	 * Allocate a context for the VM called 'name' and open its device node.
	 *
	 * Returns the new context, or NULL if the allocation or the device open
	 * fails. On a device-open failure the context is released through
	 * vm_destroy(), which also issues the DESTROY() sysctl for 'name'.
	 */
	struct vmctx *
	vm_open(const char *name)
	{
		struct vmctx *vm;

		/* The name lives in the same allocation, right after the struct. */
		vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
		if (vm == NULL)
			return (NULL);

		vm->fd = -1;
		vm->memflags = 0;
		vm->lowmem_limit = 3 * GB;
		/* No guest memory is set up yet; give the fields defined values. */
		vm->lowmem = 0;
		vm->highmem = 0;
		vm->baseaddr = NULL;
		vm->name = (char *)(vm + 1);
		strcpy(vm->name, name);

		if ((vm->fd = vm_device_open(vm->name)) < 0)
			goto err;

		return (vm);
	err:
		vm_destroy(vm);
		return (NULL);
	}

	/*
	 * Tear down a context: close its descriptor when one is open, issue the
	 * DESTROY() sysctl for the VM name, then free the context itself.
	 * 'vm' must not be NULL.
	 */
	void
	vm_destroy(struct vmctx *vm)
	{
		assert(vm != NULL);

		if (!(vm->fd < 0))
			close(vm->fd);

		DESTROY(vm->name);
		free(vm);
	}

	/*
	 * Parse a memory size given on the command line into '*ret_memsize'.
	 *
	 * A string that is entirely a number accepted by strtoul(3) is taken
	 * as is (scaled to megabytes when smaller than MB, see below); anything
	 * else is handed to expand_number(3). Returns 0 on success, otherwise
	 * the expand_number() result.
	 */
	int
	vm_parse_memsize(const char *optarg, size_t *ret_memsize)
	{
		char *endptr;
		size_t optval;
		int error;

		optval = strtoul(optarg, &endptr, 0);
		if (*optarg != '\0' && *endptr == '\0') {
			/*
			 * For the sake of backward compatibility if the memory size
			 * specified on the command line is less than a megabyte then
			 * it is interpreted as being in units of MB.
			 */
			if (optval < MB)
				optval *= MB;
			*ret_memsize = optval;
			error = 0;
		} else
			error = expand_number(optarg, ret_memsize);

		return (error);
	}

	/* Return the context's current 'lowmem' size limit. */
	uint32_t
	vm_get_lowmem_limit(struct vmctx *ctx)
	{
		uint32_t limit;

		limit = ctx->lowmem_limit;
		return (limit);
	}

	/* Replace the context's 'lowmem' size limit with 'limit'. */
	void
	vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit)
	{
		uint32_t *slot;

		slot = &ctx->lowmem_limit;
		*slot = limit;
	}

	/* Replace the context's memory flags with 'flags'. */
	void
	vm_set_memflags(struct vmctx *ctx, int flags)
	{
		int *slot;

		slot = &ctx->memflags;
		*slot = flags;
	}

	/* Return the context's current memory flags. */
	int
	vm_get_memflags(struct vmctx *ctx)
	{
		int flags;

		flags = ctx->memflags;
		return (flags);
	}

	/*
	 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
	 *
	 * Returns 0 when the mapping is created or an identical one already
	 * starts at 'gpa', -1 with errno EEXIST when a different mapping starts
	 * at 'gpa', otherwise the result of the VM_MMAP_MEMSEG ioctl.
	 */
	int
	vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
	    size_t len, int prot)
	{
		struct vm_memmap memmap;
		int error, flags;

		memmap.gpa = gpa;
		memmap.segid = segid;
		memmap.segoff = off;
		memmap.len = len;
		memmap.prot = prot;
		memmap.flags = 0;

		if (ctx->memflags & VM_MEM_F_WIRED)
			memmap.flags |= VM_MEMMAP_F_WIRED;

		/*
		 * If this mapping already exists then don't create it again. This
		 * is the common case for SYSMEM mappings created by bhyveload(8).
		 * The lookup overwrites gpa/segid/off/len/prot, which is why the
		 * request was captured in 'memmap' first.
		 */
		error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
		if (error == 0 && gpa == memmap.gpa) {
			if (segid != memmap.segid || off != memmap.segoff ||
			    prot != memmap.prot || flags != memmap.flags) {
				errno = EEXIST;
				return (-1);
			} else {
				return (0);
			}
		}

		error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
		return (error);
	}

	/*
	 * Query the VM_MMAP_GETNEXT ioctl with '*gpa' as the starting address.
	 *
	 * On success every output parameter, including '*gpa', is overwritten
	 * with the mapping the ioctl reported; on failure they are left alone.
	 * Returns the ioctl result.
	 */
	int
	vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
	    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
	{
		struct vm_memmap memmap;
		int error;

		bzero(&memmap, sizeof(struct vm_memmap));
		memmap.gpa = *gpa;
		error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
		if (error == 0) {
			*gpa = memmap.gpa;
			*segid = memmap.segid;
			*segoff = memmap.segoff;
			*len = memmap.len;
			*prot = memmap.prot;
			*flags = memmap.flags;
		}
		return (error);
	}

	/*
	 * Return 0 if the segments are identical and non-zero otherwise.
	 *
	 * This is slightly complicated by the fact that only device memory segments
	 * are named: two segments match when their lengths are equal and their
	 * names are either both NULL or both non-NULL and equal strings.
	 */
	static int
	cmpseg(size_t len, const char *str, size_t len2, const char *str2)
	{

		if (len == len2) {
			if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
				return (0);
		}
		return (-1);
	}

	static int
	vm_alloc_memseg(struct vmctx ctx, int segid, size_t len, const char name)
	{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	* If the memory segment has already been created then just return.
	* This is the usual case for the SYSMEM segment created by userspace
	* loaders like bhyveload(8).
	*/
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	sizeof(memseg.name));
	if (error)
	return (error);

	if (memseg.len != 0) {
	if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
	errno = EINVAL;
	return (-1);
	} else {
	return (0);
	}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
	n = strlcpy(memseg.name, name, sizeof(memseg.name));
	if (n >= sizeof(memseg.name)) {
	errno = ENAMETOOLONG;
	return (-1);
	}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
	}

	int
	vm_get_memseg(struct vmctx ctx, int segid, size_t lenp, char *namebuf,
	size_t bufsize)
	{
	struct vm_memseg memseg;
	size_t n;
	int error;

	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
	*lenp = memseg.len;
	n = strlcpy(namebuf, memseg.name, bufsize);
	if (n >= bufsize) {
	errno = ENAMETOOLONG;
	error = -1;
	}
	}
	return (error);
	}

	/*
	 * Map 'len' bytes of the VM_SYSMEM segment at guest address 'gpa' and
	 * mirror that range in the host at 'base + gpa' with a MAP_FIXED mmap
	 * of the VM descriptor.
	 *
	 * Returns 0 on success, the vm_mmap_memseg() error, or -1 if mmap fails.
	 */
	static int
	setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
	{
		char *ptr;
		int error, flags;

		/* Map 'len' bytes starting at 'gpa' in the guest address space */
		error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
		if (error)
			return (error);

		flags = MAP_SHARED | MAP_FIXED;
		if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
			flags |= MAP_NOCORE;

		/* mmap into the process address space on the host */
		ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
		if (ptr == MAP_FAILED)
			return (-1);

		return (0);
	}

	/*
	 * Allocate 'memsize' bytes of guest memory and map it into the host.
	 *
	 * Only VM_MMAP_ALL is accepted for 'vms' (asserted). On success the
	 * context's lowmem/highmem sizes and base address are set and 0 is
	 * returned; otherwise the first failing step's error is returned.
	 * NOTE(review): once the reservation mmap has succeeded, later failure
	 * paths return without unmapping it.
	 */
	int
	vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
	{
		size_t objsize, len;
		vm_paddr_t gpa;
		char *baseaddr, *ptr;
		int error, flags;

		assert(vms == VM_MMAP_ALL);

		/*
		 * If 'memsize' cannot fit entirely in the 'lowmem' segment then
		 * create another 'highmem' segment above 4GB for the remainder.
		 */
		if (memsize > ctx->lowmem_limit) {
			ctx->lowmem = ctx->lowmem_limit;
			ctx->highmem = memsize - ctx->lowmem_limit;
			objsize = 4*GB + ctx->highmem;
		} else {
			ctx->lowmem = memsize;
			ctx->highmem = 0;
			objsize = ctx->lowmem;
		}

		error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
		if (error)
			return (error);

		/*
		 * Stake out a contiguous region covering the guest physical memory
		 * and the adjoining guard regions.
		 */
		len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
		flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
		ptr = mmap(NULL, len, PROT_NONE, flags, -1, 0);
		if (ptr == MAP_FAILED)
			return (-1);

		/* Guest physical address 0 sits just past the leading guard. */
		baseaddr = ptr + VM_MMAP_GUARD_SIZE;
		if (ctx->highmem > 0) {
			gpa = 4*GB;
			len = ctx->highmem;
			error = setup_memory_segment(ctx, gpa, len, baseaddr);
			if (error)
				return (error);
		}

		if (ctx->lowmem > 0) {
			gpa = 0;
			len = ctx->lowmem;
			error = setup_memory_segment(ctx, gpa, len, baseaddr);
			if (error)
				return (error);
		}

		ctx->baseaddr = baseaddr;

		return (0);
	}

	/*
	 * Translate guest physical range [gaddr, gaddr+len) to a host pointer.
	 *
	 * A non-NULL pointer is returned only when the range lies entirely inside
	 * the lowmem region or entirely inside the highmem region.
	 *
	 * In particular NULL is returned if [gaddr, gaddr+len) falls in the guest
	 * MMIO region. The instruction emulation code depends on this behavior.
	 */
	void *
	vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
	{
		const size_t lowsz = ctx->lowmem;
		const size_t highsz = ctx->highmem;

		/* Range within [0, lowmem)? */
		if (lowsz > 0 && gaddr < lowsz && len <= lowsz &&
		    gaddr + len <= lowsz)
			return (ctx->baseaddr + gaddr);

		/* Range within [4GB, 4GB + highmem)? */
		if (highsz > 0 && gaddr >= 4*GB && gaddr < 4*GB + highsz &&
		    len <= highsz && gaddr + len <= 4*GB + highsz)
			return (ctx->baseaddr + gaddr);

		return (NULL);
	}

	/* Return the size in bytes recorded for the lowmem region. */
	size_t
	vm_get_lowmem_size(struct vmctx *ctx)
	{
		size_t size;

		size = ctx->lowmem;
		return (size);
	}

	/* Return the size in bytes recorded for the highmem region. */
	size_t
	vm_get_highmem_size(struct vmctx *ctx)
	{
		size_t size;

		size = ctx->highmem;
		return (size);
	}

	void *
	vm_create_devmem(struct vmctx ctx, int segid, const char name, size_t len)
	{
	char pathname[MAXPATHLEN];
	size_t len2;
	char base, ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL \|\| strlen(name) == 0) {
	errno = EINVAL;
	goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
	goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
	goto done;

	/*
	* Stake out a contiguous region covering the device memory and the
	* adjoining guard regions.
	*/
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	flags = MAP_PRIVATE \| MAP_ANON \| MAP_NOCORE \| MAP_ALIGNED_SUPER;
	base = mmap(NULL, len2, PROT_NONE, flags, -1, 0);
	if (base == MAP_FAILED)
	goto done;

	flags = MAP_SHARED \| MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
	flags \|= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
	done:
	if (fd >= 0)
	close(fd);
	return (ptr);
	}

	int
	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
	uint64_t base, uint32_t limit, uint32_t access)
	{
	int error;
	struct vm_seg_desc vmsegdesc;

	bzero(&vmsegdesc, sizeof(vmsegdesc));
	vmsegdesc.cpuid = vcpu;
	vmsegdesc.regnum = reg;
	vmsegdesc.desc.base = base;
	vmsegdesc.desc.limit = limit;
	vmsegdesc.desc.access = access;

	error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc);
	return (error);
	}

	/*
	 * Read segment descriptor 'reg' of 'vcpu' with the
	 * VM_GET_SEGMENT_DESCRIPTOR ioctl.
	 *
	 * '*base', '*limit' and '*access' are written only when the ioctl
	 * succeeds. Returns the ioctl result.
	 */
	int
	vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
	    uint64_t *base, uint32_t *limit, uint32_t *access)
	{
		int error;
		struct vm_seg_desc vmsegdesc;

		bzero(&vmsegdesc, sizeof(vmsegdesc));
		vmsegdesc.cpuid = vcpu;
		vmsegdesc.regnum = reg;

		error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc);
		if (error == 0) {
			*base = vmsegdesc.desc.base;
			*limit = vmsegdesc.desc.limit;
			*access = vmsegdesc.desc.access;
		}
		return (error);
	}

	int
	vm_get_seg_desc(struct vmctx ctx, int vcpu, int reg, struct seg_desc seg_desc)
	{
	int error;

	error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit,
	&seg_desc->access);
	return (error);
	}

	int
	vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
	{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.cpuid = vcpu;
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg);
	return (error);
	}

	int
	vm_get_register(struct vmctx ctx, int vcpu, int reg, uint64_t ret_val)
	{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.cpuid = vcpu;
	vmreg.regnum = reg;

	error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
	}

	/*
	 * Set 'count' registers of 'vcpu' in one VM_SET_REGISTER_SET ioctl.
	 * 'regnums' and 'regvals' are passed to the ioctl by pointer.
	 * Returns the ioctl result.
	 */
	int
	vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
	    const int *regnums, uint64_t *regvals)
	{
		int error;
		struct vm_register_set vmregset;

		bzero(&vmregset, sizeof(vmregset));
		vmregset.cpuid = vcpu;
		vmregset.count = count;
		vmregset.regnums = regnums;
		vmregset.regvals = regvals;

		error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset);
		return (error);
	}

	/*
	 * Read 'count' registers of 'vcpu' in one VM_GET_REGISTER_SET ioctl.
	 * 'regnums' and 'regvals' are passed to the ioctl by pointer.
	 * Returns the ioctl result.
	 */
	int
	vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
	    const int *regnums, uint64_t *regvals)
	{
		int error;
		struct vm_register_set vmregset;

		bzero(&vmregset, sizeof(vmregset));
		vmregset.cpuid = vcpu;
		vmregset.count = count;
		vmregset.regnums = regnums;
		vmregset.regvals = regvals;

		error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset);
		return (error);
	}

	int
	vm_run(struct vmctx ctx, int vcpu, struct vm_exit vmexit)
	{
	int error;
	struct vm_run vmrun;

	bzero(&vmrun, sizeof(vmrun));
	vmrun.cpuid = vcpu;

	error = ioctl(ctx->fd, VM_RUN, &vmrun);
	bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
	return (error);
	}

	int
	vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
	{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
	}

	int
	vm_reinit(struct vmctx *ctx)
	{

	return (ioctl(ctx->fd, VM_REINIT, 0));
	}

	int
	vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid,
	uint32_t errcode, int restart_instruction)
	{
	struct vm_exception exc;

	exc.cpuid = vcpu;
	exc.vector = vector;
	exc.error_code = errcode;
	exc.error_code_valid = errcode_valid;
	exc.restart_instruction = restart_instruction;

	return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc));
	}

	/*
	 * Convert an APIC id to a vcpu number. 'ctx' is not consulted.
	 */
	int
	vm_apicid2vcpu(struct vmctx *ctx, int apicid)
	{
		int vcpu;

		/*
		 * The apic id associated with the 'vcpu' has the same numerical value
		 * as the 'vcpu' itself.
		 */
		vcpu = apicid;
		return (vcpu);
	}

	int
	vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
	{
	struct vm_lapic_irq vmirq;

	bzero(&vmirq, sizeof(vmirq));
	vmirq.cpuid = vcpu;
	vmirq.vector = vector;

	return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
	}

	int
	vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector)
	{
	struct vm_lapic_irq vmirq;

	bzero(&vmirq, sizeof(vmirq));
	vmirq.cpuid = vcpu;
	vmirq.vector = vector;

	return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq));
	}

	int
	vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg)
	{
	struct vm_lapic_msi vmmsi;

	bzero(&vmmsi, sizeof(vmmsi));
	vmmsi.addr = addr;
	vmmsi.msg = msg;

	return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi));
	}

	int
	vm_ioapic_assert_irq(struct vmctx *ctx, int irq)
	{
	struct vm_ioapic_irq ioapic_irq;

	bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
	ioapic_irq.irq = irq;

	return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq));
	}

	int
	vm_ioapic_deassert_irq(struct vmctx *ctx, int irq)
	{
	struct vm_ioapic_irq ioapic_irq;

	bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
	ioapic_irq.irq = irq;

	return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq));
	}

	int
	vm_ioapic_pulse_irq(struct vmctx *ctx, int irq)
	{
	struct vm_ioapic_irq ioapic_irq;

	bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
	ioapic_irq.irq = irq;

	return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq));
	}

	int
	vm_ioapic_pincount(struct vmctx ctx, int pincount)
	{

	return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount));
	}

	int
	vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
	{
	struct vm_isa_irq isa_irq;

	bzero(&isa_irq, sizeof(struct vm_isa_irq));
	isa_irq.atpic_irq = atpic_irq;
	isa_irq.ioapic_irq = ioapic_irq;

	return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq));
	}

	int
	vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
	{
	struct vm_isa_irq isa_irq;

	bzero(&isa_irq, sizeof(struct vm_isa_irq));
	isa_irq.atpic_irq = atpic_irq;
	isa_irq.ioapic_irq = ioapic_irq;

	return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq));
	}

	int
	vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
	{
	struct vm_isa_irq isa_irq;

	bzero(&isa_irq, sizeof(struct vm_isa_irq));
	isa_irq.atpic_irq = atpic_irq;
	isa_irq.ioapic_irq = ioapic_irq;

	return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq));
	}

	int
	vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
	enum vm_intr_trigger trigger)
	{
	struct vm_isa_irq_trigger isa_irq_trigger;

	bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger));
	isa_irq_trigger.atpic_irq = atpic_irq;
	isa_irq_trigger.trigger = trigger;

	return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger));
	}

	int
	vm_inject_nmi(struct vmctx *ctx, int vcpu)
	{
	struct vm_nmi vmnmi;

	bzero(&vmnmi, sizeof(vmnmi));
	vmnmi.cpuid = vcpu;

	return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
	}

	/*
	 * Table pairing capability names with their VM_CAP_* type, searched by
	 * vm_capability_name2type() and vm_capability_type2name(). The final
	 * all-zero entry has a NULL name, which is what ends those searches.
	 */
	static struct {
	const char *name;
	int type;
	} capstrmap[] = {
	{ "hlt_exit", VM_CAP_HALT_EXIT },
	{ "mtrap_exit", VM_CAP_MTRAP_EXIT },
	{ "pause_exit", VM_CAP_PAUSE_EXIT },
	{ "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST },
	{ "enable_invpcid", VM_CAP_ENABLE_INVPCID },
	{ 0 }
	};

	/*
	 * Look up the capability type for 'capname' in capstrmap.
	 * Returns the type, or -1 when 'capname' is NULL or not in the table.
	 */
	int
	vm_capability_name2type(const char *capname)
	{
		int idx;

		if (capname == NULL)
			return (-1);

		idx = 0;
		while (capstrmap[idx].name != NULL) {
			if (strcmp(capstrmap[idx].name, capname) == 0)
				return (capstrmap[idx].type);
			idx++;
		}

		return (-1);
	}

	/*
	 * Look up the name of capability 'type' in capstrmap.
	 * Returns the table's string, or NULL when the type is not listed.
	 */
	const char *
	vm_capability_type2name(int type)
	{
		int idx;

		idx = 0;
		while (capstrmap[idx].name != NULL) {
			if (capstrmap[idx].type == type)
				return (capstrmap[idx].name);
			idx++;
		}

		return (NULL);
	}

	int
	vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
	int *retval)
	{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.cpuid = vcpu;
	vmcap.captype = cap;

	error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
	}

	int
	vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
	{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.cpuid = vcpu;
	vmcap.captype = cap;
	vmcap.capval = val;

	return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
	}

	int
	vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
	{
	struct vm_pptdev pptdev;

	bzero(&pptdev, sizeof(pptdev));
	pptdev.bus = bus;
	pptdev.slot = slot;
	pptdev.func = func;

	return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
	}

	int
	vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
	{
	struct vm_pptdev pptdev;

	bzero(&pptdev, sizeof(pptdev));
	pptdev.bus = bus;
	pptdev.slot = slot;
	pptdev.func = func;

	return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
	}

	/*
	 * Issue VM_MAP_PPTDEV_MMIO for the device at bus/slot/func, passing the
	 * guest address 'gpa', length 'len' and host address 'hpa'; returns the
	 * ioctl result.
	 */
	int
	vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
	    vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
	{
		struct vm_pptdev_mmio req;
		int error;

		memset(&req, 0, sizeof(req));
		req.bus = bus;
		req.slot = slot;
		req.func = func;
		req.gpa = gpa;
		req.len = len;
		req.hpa = hpa;

		error = ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &req);
		return (error);
	}

	int
	vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
	uint64_t addr, uint64_t msg, int numvec)
	{
	struct vm_pptdev_msi pptmsi;

	bzero(&pptmsi, sizeof(pptmsi));
	pptmsi.vcpu = vcpu;
	pptmsi.bus = bus;
	pptmsi.slot = slot;
	pptmsi.func = func;
	pptmsi.msg = msg;
	pptmsi.addr = addr;
	pptmsi.numvec = numvec;

	return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
	}

	int
	vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
	int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
	{
	struct vm_pptdev_msix pptmsix;

	bzero(&pptmsix, sizeof(pptmsix));
	pptmsix.vcpu = vcpu;
	pptmsix.bus = bus;
	pptmsix.slot = slot;
	pptmsix.func = func;
	pptmsix.idx = idx;
	pptmsix.msg = msg;
	pptmsix.addr = addr;
	pptmsix.vector_control = vector_control;

	return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix);
	}

	uint64_t *
	vm_get_stats(struct vmctx ctx, int vcpu, struct timeval ret_tv,
	int *ret_entries)
	{
	int error;

	static struct vm_stats vmstats;

	vmstats.cpuid = vcpu;

	error = ioctl(ctx->fd, VM_STATS, &vmstats);
	if (error == 0) {
	if (ret_entries)
	*ret_entries = vmstats.num_entries;
	if (ret_tv)
	*ret_tv = vmstats.tv;
	return (vmstats.statbuf);
	} else
	return (NULL);
	}

	/*
	 * Fetch the description of statistic 'index' with the VM_STAT_DESC
	 * ioctl. Returns the description string, or NULL if the ioctl fails.
	 * The string lives in a function-static buffer, so the next call
	 * overwrites it.
	 */
	const char *
	vm_get_stat_desc(struct vmctx *ctx, int index)
	{
		static struct vm_stat_desc statdesc;

		statdesc.index = index;
		if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) != 0)
			return (NULL);

		return (statdesc.desc);
	}

	int
	vm_get_x2apic_state(struct vmctx ctx, int vcpu, enum x2apic_state state)
	{
	int error;
	struct vm_x2apic x2apic;

	bzero(&x2apic, sizeof(x2apic));
	x2apic.cpuid = vcpu;

	error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic);
	*state = x2apic.state;
	return (error);
	}

	int
	vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state)
	{
	int error;
	struct vm_x2apic x2apic;

	bzero(&x2apic, sizeof(x2apic));
	x2apic.cpuid = vcpu;
	x2apic.state = state;

	error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic);

	return (error);
	}

	/*
	* From Intel Vol 3a:
	* Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
	*/
	int
	vcpu_reset(struct vmctx *vmctx, int vcpu)
	{
	int error;
	uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
	uint32_t desc_access, desc_limit;
	uint16_t sel;

	zero = 0;

	rflags = 0x2;
	error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
	if (error)
	goto done;

	rip = 0xfff0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
	goto done;

	cr0 = CR0_NE;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
	goto done;

	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
	goto done;

	cr4 = 0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
	goto done;

	/*
	* CS: present, r/w, accessed, 16-bit, byte granularity, usable
	*/
	desc_base = 0xffff0000;
	desc_limit = 0xffff;
	desc_access = 0x0093;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
	desc_base, desc_limit, desc_access);
	if (error)
	goto done;

	sel = 0xf000;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
	goto done;

	/*
	* SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
	*/
	desc_base = 0;
	desc_limit = 0xffff;
	desc_access = 0x0093;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
	desc_base, desc_limit, desc_access);
	if (error)
	goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
	desc_base, desc_limit, desc_access);
	if (error)
	goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
	desc_base, desc_limit, desc_access);
	if (error)
	goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
	desc_base, desc_limit, desc_access);
	if (error)
	goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
	desc_base, desc_limit, desc_access);
	if (error)
	goto done;

	sel = 0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
	goto done;

	/* General purpose registers */
	rdx = 0xf00;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
	goto done;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
	goto done;

	/* GDTR, IDTR */
	desc_base = 0;
	desc_limit = 0xffff;
	desc_access = 0;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
	desc_base, desc_limit, desc_access);
	if (error != 0)
	goto done;

	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
	desc_base, desc_limit, desc_access);
	if (error != 0)
	goto done;

	/* TR */
	desc_base = 0;
	desc_limit = 0xffff;
	desc_access = 0x0000008b;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
	if (error)
	goto done;

	sel = 0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
	goto done;

	/* LDTR */
	desc_base = 0;
	desc_limit = 0xffff;
	desc_access = 0x00000082;
	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
	desc_limit, desc_access);
	if (error)
	goto done;

	sel = 0;
	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
	goto done;

	/* XXX cr2, debug registers */

	error = 0;
	done:
	return (error);
	}

	int
	vm_get_gpa_pmap(struct vmctx ctx, uint64_t gpa, uint64_t pte, int *num)
	{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
	*num = gpapte.ptenum;
	for (i = 0; i < gpapte.ptenum; i++)
	pte[i] = gpapte.pte[i];
	}

	return (error);
	}

	int
	vm_get_hpet_capabilities(struct vmctx ctx, uint32_t capabilities)
	{
	int error;
	struct vm_hpet_cap cap;

	bzero(&cap, sizeof(struct vm_hpet_cap));
	error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap);
	if (capabilities != NULL)
	*capabilities = cap.capabilities;
	return (error);
	}

	int
	vm_gla2gpa(struct vmctx ctx, int vcpu, struct vm_guest_paging paging,
	uint64_t gla, int prot, uint64_t gpa, int fault)
	{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.vcpuid = vcpu;
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = ioctl(ctx->fd, VM_GLA2GPA, &gg);
	if (error == 0) {
	*fault = gg.fault;
	*gpa = gg.gpa;
	}
	return (error);
	}

	int
	vm_gla2gpa_nofault(struct vmctx ctx, int vcpu, struct vm_guest_paging paging,
	uint64_t gla, int prot, uint64_t gpa, int fault)
	{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.vcpuid = vcpu;
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
	*fault = gg.fault;
	*gpa = gg.gpa;
	}
	return (error);
	}

	/*
	 * Fallback min() used only when no 'min' macro is already defined.
	 * Each argument may be evaluated twice, so avoid side effects in them.
	 */
	#ifndef min
	#define min(a,b) (((a) < (b)) ? (a) : (b))
	#endif

	/*
	 * Build an iovec describing the host memory behind the guest linear
	 * range [gla, gla+len), one entry per page-sized piece.
	 *
	 * All 'iovcnt' entries are cleared first. Returns 0 with '*fault' == 0
	 * on success, 0 with '*fault' set when a translation reported a fault,
	 * the vm_gla2gpa() error, or EFAULT when a piece is not backed by guest
	 * memory. Running out of iovec entries trips an assertion.
	 */
	int
	vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
	    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
	    int *fault)
	{
		void *va;
		uint64_t gpa;
		int error, i, n, off;

		for (i = 0; i < iovcnt; i++) {
			iov[i].iov_base = 0;
			iov[i].iov_len = 0;
		}

		while (len) {
			assert(iovcnt > 0);
			error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault);
			if (error || *fault)
				return (error);

			/* Never let a single piece cross a page boundary. */
			off = gpa & PAGE_MASK;
			n = min(len, PAGE_SIZE - off);

			va = vm_map_gpa(ctx, gpa, n);
			if (va == NULL)
				return (EFAULT);

			iov->iov_base = va;
			iov->iov_len = n;
			iov++;
			iovcnt--;

			gla += n;
			len -= n;
		}
		return (0);
	}

	/*
	 * Counterpart of vm_copy_setup(). Currently a no-op: none of the
	 * arguments are used.
	 */
	void
	vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt)
	{

		return;
	}

	/*
	 * Gather 'len' bytes from the buffers described by 'iov' into 'vp'.
	 *
	 * Entries are consumed in order, taking at most iov_len bytes from
	 * each. An entry with a zero length while bytes remain trips an
	 * assertion. 'ctx' and 'vcpu' are not used.
	 */
	void
	vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len)
	{
		const char *src;
		char *dst;
		size_t n;

		dst = vp;
		while (len) {
			assert(iov->iov_len);
			n = (len < iov->iov_len) ? len : iov->iov_len;
			src = iov->iov_base;
			memmove(dst, src, n);

			iov++;
			dst += n;
			len -= n;
		}
	}

	/*
	 * Scatter 'len' bytes from 'vp' into the buffers described by 'iov'.
	 *
	 * Entries are filled in order, placing at most iov_len bytes in each.
	 * An entry with a zero length while bytes remain trips an assertion.
	 * 'ctx' and 'vcpu' are not used.
	 */
	void
	vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov,
	    size_t len)
	{
		const char *src;
		char *dst;
		size_t n;

		src = vp;
		while (len) {
			assert(iov->iov_len);
			n = (len < iov->iov_len) ? len : iov->iov_len;
			dst = iov->iov_base;
			memmove(dst, src, n);

			iov++;
			src += n;
			len -= n;
		}
	}

	/*
	 * Fetch the vcpu set selected by 'which' into '*cpus' with the
	 * VM_GET_CPUS ioctl. The request advertises sizeof(cpuset_t) as the
	 * buffer size. Returns the ioctl result.
	 */
	static int
	vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
	{
		struct vm_cpuset vm_cpuset;
		int error;

		bzero(&vm_cpuset, sizeof(struct vm_cpuset));
		vm_cpuset.which = which;
		vm_cpuset.cpusetsize = sizeof(cpuset_t);
		vm_cpuset.cpus = cpus;

		error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
		return (error);
	}

	/*
	 * Fetch the VM_ACTIVE_CPUS set into '*cpus'; returns the vm_get_cpus()
	 * result.
	 */
	int
	vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
	{

		return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
	}

	/*
	 * Fetch the VM_SUSPENDED_CPUS set into '*cpus'; returns the
	 * vm_get_cpus() result.
	 */
	int
	vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
	{

		return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
	}

	/*
	 * Fetch the VM_DEBUG_CPUS set by delegating to vm_get_cpus() and
	 * returning its result. (Lines marked '+' are additions in this diff.)
	 * NOTE(review): no '*' is visible on 'ctx' or 'cpus' in this rendering,
	 * although vm_get_cpus() dereferences its ctx argument - confirm the
	 * declarators against the real source.
	 */
	int
	+vm_debug_cpus(struct vmctx ctx, cpuset_t cpus)
	+{
	+
	+ return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
	+}
	+
	/*
	 * Issue the VM_ACTIVATE_CPU ioctl for 'vcpu' using a zeroed
	 * struct vm_activate_cpu request; returns the ioctl result.
	 */
	+int
	vm_activate_cpu(struct vmctx *ctx, int vcpu)
	{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = vcpu;
	error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);
	return (error);
	}

	/*
	 * Issue the VM_SUSPEND_CPU ioctl for 'vcpu'; returns the ioctl result.
	 * The request reuses struct vm_activate_cpu, as vm_activate_cpu() does.
	 */
	int
	+vm_suspend_cpu(struct vmctx *ctx, int vcpu)
	+{
	+ struct vm_activate_cpu ac;
	+ int error;
	+
	+ bzero(&ac, sizeof(struct vm_activate_cpu));
	+ ac.vcpuid = vcpu;
	+ error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	+ return (error);
	+}
	+
	/*
	 * Issue the VM_RESUME_CPU ioctl for 'vcpu'; returns the ioctl result.
	 * The request reuses struct vm_activate_cpu, as vm_activate_cpu() does.
	 */
	+int
	+vm_resume_cpu(struct vmctx *ctx, int vcpu)
	+{
	+ struct vm_activate_cpu ac;
	+ int error;
	+
	+ bzero(&ac, sizeof(struct vm_activate_cpu));
	+ ac.vcpuid = vcpu;
	+ error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	+ return (error);
	+}
	+
	/*
	 * Read the two interrupt-info words of 'vcpu' with the VM_GET_INTINFO
	 * ioctl. The outputs are written only when the ioctl succeeds. Returns
	 * the ioctl result.
	 * NOTE(review): the body stores through 'info1' and uses 'ctx->fd', but
	 * no '*' is visible on 'ctx' or 'info1' in this rendering - confirm the
	 * declarators against the real source.
	 */
	+int
	vm_get_intinfo(struct vmctx ctx, int vcpu, uint64_t info1, uint64_t *info2)
	{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.vcpuid = vcpu;
	error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii);
	if (error == 0) {
	*info1 = vmii.info1;
	*info2 = vmii.info2;
	}
	return (error);
	}

	int
	vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
	{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.vcpuid = vcpu;
	vmii.info1 = info1;
	error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
	return (error);
	}

	int
	vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value)
	{
	struct vm_rtc_data rtcdata;
	int error;

	bzero(&rtcdata, sizeof(struct vm_rtc_data));
	rtcdata.offset = offset;
	rtcdata.value = value;
	error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata);
	return (error);
	}

	int
	vm_rtc_read(struct vmctx ctx, int offset, uint8_t retval)
	{
	struct vm_rtc_data rtcdata;
	int error;

	bzero(&rtcdata, sizeof(struct vm_rtc_data));
	rtcdata.offset = offset;
	error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata);
	if (error == 0)
	*retval = rtcdata.value;
	return (error);
	}

	int
	vm_rtc_settime(struct vmctx *ctx, time_t secs)
	{
	struct vm_rtc_time rtctime;
	int error;

	bzero(&rtctime, sizeof(struct vm_rtc_time));
	rtctime.secs = secs;
	error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime);
	return (error);
	}

	int
	vm_rtc_gettime(struct vmctx ctx, time_t secs)
	{
	struct vm_rtc_time rtctime;
	int error;

	bzero(&rtctime, sizeof(struct vm_rtc_time));
	error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime);
	if (error == 0)
	*secs = rtctime.secs;
	return (error);
	}

	/*
	 * Issue the VM_RESTART_INSTRUCTION ioctl for 'vcpu'.
	 *
	 * 'arg' is declared 'void *' but is used as a 'struct vmctx *'; the
	 * ioctl argument is the address of the local copy of 'vcpu'.
	 *
	 * Returns the value returned by ioctl(2), unchanged.
	 */
	int
	vm_restart_instruction(void *arg, int vcpu)
	{
	struct vmctx *ctx = arg;

	return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu));
	}

	/*
	 * Return the file descriptor stored in ctx->fd, the same descriptor the
	 * other wrappers in this file pass to ioctl(2).
	 */
	int
	vm_get_device_fd(struct vmctx *ctx)
	{

	return (ctx->fd);
	}

	/*
	 * Expose the table of vmm ioctl commands listed in vm_ioctl_cmds[].
	 *
	 * The function has two modes selected by 'len':
	 *  - len == NULL: return a malloc()ed copy of the table, or NULL if the
	 *    allocation fails.  Presumably the caller is expected to free() it;
	 *    confirm against callers.
	 *  - len != NULL: store the number of table entries in '*len' and
	 *    return NULL.
	 *
	 * NOTE(review): VM_MMAP_MEMSEG appears twice in the table below.
	 */
	const cap_ioctl_t *
	vm_get_ioctls(size_t *len)
	{
	cap_ioctl_t *cmds;
	/* keep in sync with machine/vmm_dev.h */
	static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT,
	VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG,
	VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER,
	VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR,
	VM_SET_REGISTER_SET, VM_GET_REGISTER_SET,
	VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ,
	VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ,
	VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ,
	VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER,
	VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV,
	VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI,
	VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC,
	VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE,
	VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA,
	VM_GLA2GPA_NOFAULT,
	- VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SET_INTINFO, VM_GET_INTINFO,
	+ VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU,
	+ VM_SET_INTINFO, VM_GET_INTINFO,
	VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME,
	VM_RESTART_INSTRUCTION };

	/* No length pointer supplied: hand back a heap copy of the table. */
	if (len == NULL) {
	cmds = malloc(sizeof(vm_ioctl_cmds));
	if (cmds == NULL)
	return (NULL);
	bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds));
	return (cmds);
	}

	/* Length pointer supplied: report the entry count only. */
	*len = nitems(vm_ioctl_cmds);
	return (NULL);
	}

	Index: head/lib/libvmmapi/vmmapi.h
	===================================================================
	--- head/lib/libvmmapi/vmmapi.h (revision 332156)
	+++ head/lib/libvmmapi/vmmapi.h (revision 332157)
	@@ -1,231 +1,234 @@
	/*-
	* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	*
	* Copyright (c) 2011 NetApp, Inc.
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* $FreeBSD$
	*/

	#ifndef _VMMAPI_H_
	#define _VMMAPI_H_

	#include <sys/param.h>
	#include <sys/cpuset.h>

	/*
	* API version for out-of-tree consumers like grub-bhyve for making compile
	* time decisions.
	*/
	#define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */

	struct iovec;
	struct vmctx;
	enum x2apic_state;

	/*
	* Different styles of mapping the memory assigned to a VM into the address
	* space of the controlling process.
	*/
	enum vm_mmap_style {
	VM_MMAP_NONE, /* no mapping */
	VM_MMAP_ALL, /* fully and statically mapped */
	VM_MMAP_SPARSE, /* mappings created on-demand */
	};

	/*
	* 'flags' value passed to 'vm_set_memflags()'.
	*/
	#define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */
	#define VM_MEM_F_WIRED 0x02 /* guest memory is wired */

	/*
	* Identifiers for memory segments:
	* - vm_setup_memory() uses VM_SYSMEM for the system memory segment.
	* - the remaining identifiers can be used to create devmem segments.
	*/
	enum {
	VM_SYSMEM,
	VM_BOOTROM,
	VM_FRAMEBUFFER,
	};

	/*
	* Get the length and name of the memory segment identified by 'segid'.
	* Note that system memory segments are identified with a nul name.
	*
	* Returns 0 on success and non-zero otherwise.
	*/
	int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name,
	    size_t namesiz);

	/*
	* Iterate over the guest address space. This function finds an address range
	* that starts at an address >= *gpa.
	*
	* Returns 0 if the next address range was found and non-zero otherwise.
	*/
	int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
	    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
	/*
	* Create a device memory segment identified by 'segid'.
	*
	* Returns a pointer to the memory segment on success and MAP_FAILED otherwise.
	*/
	void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name,
	    size_t len);

	/*
	* Map the memory segment identified by 'segid' into the guest address space
	* at [gpa,gpa+len) with protection 'prot'.
	*/
	int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid,
	vm_ooffset_t segoff, size_t len, int prot);

	int vm_create(const char *name);
	int vm_get_device_fd(struct vmctx *ctx);
	struct vmctx *vm_open(const char *name);
	void vm_destroy(struct vmctx *ctx);
	int vm_parse_memsize(const char *optarg, size_t *memsize);
	int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
	void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
	int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
	int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
	    uint64_t gla, int prot, uint64_t *gpa, int *fault);
	int vm_gla2gpa_nofault(struct vmctx *, int vcpuid,
	    struct vm_guest_paging *paging, uint64_t gla, int prot,
	    uint64_t *gpa, int *fault);
	uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
	void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
	void vm_set_memflags(struct vmctx *ctx, int flags);
	int vm_get_memflags(struct vmctx *ctx);
	size_t vm_get_lowmem_size(struct vmctx *ctx);
	size_t vm_get_highmem_size(struct vmctx *ctx);
	int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
	uint64_t base, uint32_t limit, uint32_t access);
	int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
	    uint64_t *base, uint32_t *limit, uint32_t *access);
	int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
	    struct seg_desc *seg_desc);
	int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
	int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
	int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
	    const int *regnums, uint64_t *regvals);
	int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
	    const int *regnums, uint64_t *regvals);
	int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit);
	int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
	int vm_reinit(struct vmctx *ctx);
	int vm_apicid2vcpu(struct vmctx *ctx, int apicid);
	int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector,
	int errcode_valid, uint32_t errcode, int restart_instruction);
	int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
	int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector);
	int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg);
	int vm_ioapic_assert_irq(struct vmctx *ctx, int irq);
	int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq);
	int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq);
	int vm_ioapic_pincount(struct vmctx *ctx, int *pincount);
	int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
	int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
	int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
	int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
	enum vm_intr_trigger trigger);
	int vm_inject_nmi(struct vmctx *ctx, int vcpu);
	int vm_capability_name2type(const char *capname);
	const char *vm_capability_type2name(int type);
	int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
	int *retval);
	int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
	int val);
	int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
	int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
	int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
	vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
	int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot,
	int func, uint64_t addr, uint64_t msg, int numvec);
	int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
	int func, int idx, uint64_t addr, uint64_t msg,
	uint32_t vector_control);

	int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2);
	int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo);

	const cap_ioctl_t *vm_get_ioctls(size_t *len);

	/*
	* Return a pointer to the statistics buffer. Note that this is not MT-safe.
	*/
	uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
	    int *ret_entries);
	const char *vm_get_stat_desc(struct vmctx *ctx, int index);

	int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s);
	int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);

	int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);

	/*
	* Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
	* The 'iovcnt' should be big enough to accommodate all GPA segments.
	*
	* retval fault Interpretation
	* 0 0 Success
	* 0 1 An exception was injected into the guest
	* EFAULT N/A Error
	*/
	int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
	    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
	    int *fault);
	void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
	    void *host_dst, size_t len);
	void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
	    struct iovec *guest_iov, size_t len);
	void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov,
	    int iovcnt);

	/* RTC */
	int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value);
	int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval);
	int vm_rtc_settime(struct vmctx *ctx, time_t secs);
	int vm_rtc_gettime(struct vmctx *ctx, time_t *secs);

	/* Reset vcpu register state */
	int vcpu_reset(struct vmctx *ctx, int vcpu);

	int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus);
	int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus);
	+int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus);
	int vm_activate_cpu(struct vmctx *ctx, int vcpu);
	+int vm_suspend_cpu(struct vmctx *ctx, int vcpu);
	+int vm_resume_cpu(struct vmctx *ctx, int vcpu);

	/*
	* FreeBSD specific APIs
	*/
	int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
	uint64_t rip, uint64_t cr3, uint64_t gdtbase,
	uint64_t rsp);
	int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu,
	uint32_t eip, uint32_t gdtbase,
	uint32_t esp);
	void vm_setup_freebsd_gdt(uint64_t *gdtr);
	#endif /* _VMMAPI_H_ */
	Index: head/sys/amd64/include/vmm.h
	===================================================================
	--- head/sys/amd64/include/vmm.h (revision 332156)
	+++ head/sys/amd64/include/vmm.h (revision 332157)
	@@ -1,683 +1,690 @@
	/*-
	* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	*
	* Copyright (c) 2011 NetApp, Inc.
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* $FreeBSD$
	*/

	#ifndef _VMM_H_
	#define _VMM_H_

	#include <x86/segments.h>

	enum vm_suspend_how {
	VM_SUSPEND_NONE,
	VM_SUSPEND_RESET,
	VM_SUSPEND_POWEROFF,
	VM_SUSPEND_HALT,
	VM_SUSPEND_TRIPLEFAULT,
	VM_SUSPEND_LAST
	};

	/*
	* Identifiers for architecturally defined registers.
	*/
	enum vm_reg_name {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15,
	VM_REG_GUEST_CR0,
	VM_REG_GUEST_CR3,
	VM_REG_GUEST_CR4,
	VM_REG_GUEST_DR7,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RIP,
	VM_REG_GUEST_RFLAGS,
	VM_REG_GUEST_ES,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_SS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	VM_REG_GUEST_LDTR,
	VM_REG_GUEST_TR,
	VM_REG_GUEST_IDTR,
	VM_REG_GUEST_GDTR,
	VM_REG_GUEST_EFER,
	VM_REG_GUEST_CR2,
	VM_REG_GUEST_PDPTE0,
	VM_REG_GUEST_PDPTE1,
	VM_REG_GUEST_PDPTE2,
	VM_REG_GUEST_PDPTE3,
	VM_REG_GUEST_INTR_SHADOW,
	VM_REG_GUEST_DR0,
	VM_REG_GUEST_DR1,
	VM_REG_GUEST_DR2,
	VM_REG_GUEST_DR3,
	VM_REG_GUEST_DR6,
	VM_REG_LAST
	};

	enum x2apic_state {
	X2APIC_DISABLED,
	X2APIC_ENABLED,
	X2APIC_STATE_LAST
	};

	#define VM_INTINFO_VECTOR(info) ((info) & 0xff)
	#define VM_INTINFO_DEL_ERRCODE 0x800
	#define VM_INTINFO_RSVD 0x7ffff000
	#define VM_INTINFO_VALID 0x80000000
	#define VM_INTINFO_TYPE 0x700
	#define VM_INTINFO_HWINTR (0 << 8)
	#define VM_INTINFO_NMI (2 << 8)
	#define VM_INTINFO_HWEXCEPTION (3 << 8)
	#define VM_INTINFO_SWINTR (4 << 8)

	#ifdef _KERNEL

	#define VM_MAX_NAMELEN 32

	struct vm;
	struct vm_exception;
	struct seg_desc;
	struct vm_exit;
	struct vm_run;
	struct vhpet;
	struct vioapic;
	struct vlapic;
	struct vmspace;
	struct vm_object;
	struct vm_guest_paging;
	struct pmap;

	/*
	 * Pointers to per-event cookies, read through by the vcpu_*() inline
	 * helpers in this header.
	 */
	struct vm_eventinfo {
		void *rptr;	/* rendezvous cookie */
		int *sptr;	/* suspend cookie */
		int *iptr;	/* reqidle cookie */
	};

	/*
	 * Function-pointer types for the entry points collected in
	 * 'struct vmm_ops'.  The 'vmi' argument is an opaque 'void *'.
	 */
	typedef int (*vmm_init_func_t)(int ipinum);
	typedef int (*vmm_cleanup_func_t)(void);
	typedef void (*vmm_resume_func_t)(void);
	typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
	typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
	    struct pmap *pmap, struct vm_eventinfo *info);
	typedef void (*vmi_cleanup_func_t)(void *vmi);
	typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
	    uint64_t *retval);
	typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
	    uint64_t val);
	typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
	    struct seg_desc *desc);
	typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
	    struct seg_desc *desc);
	typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
	typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
	typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
	typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
	typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
	typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);

	/*
	 * Table of entry points.  Each member uses one of the vmm_*_func_t /
	 * vmi_* callback types declared above; two instances, vmm_ops_intel and
	 * vmm_ops_amd, are declared extern right after this structure.
	 */
	struct vmm_ops {
	vmm_init_func_t init; /* module wide initialization */
	vmm_cleanup_func_t cleanup;
	vmm_resume_func_t resume;

	vmi_init_func_t vminit; /* vm-specific initialization */
	vmi_run_func_t vmrun;
	vmi_cleanup_func_t vmcleanup;
	vmi_get_register_t vmgetreg;
	vmi_set_register_t vmsetreg;
	vmi_get_desc_t vmgetdesc;
	vmi_set_desc_t vmsetdesc;
	vmi_get_cap_t vmgetcap;
	vmi_set_cap_t vmsetcap;
	vmi_vmspace_alloc vmspace_alloc;
	vmi_vmspace_free vmspace_free;
	vmi_vlapic_init vlapic_init;
	vmi_vlapic_cleanup vlapic_cleanup;
	};

	extern struct vmm_ops vmm_ops_intel;
	extern struct vmm_ops vmm_ops_amd;

	int vm_create(const char *name, struct vm **retvm);
	void vm_destroy(struct vm *vm);
	int vm_reinit(struct vm *vm);
	const char *vm_name(struct vm *vm);

	/*
	* APIs that modify the guest memory map require all vcpus to be frozen.
	*/
	int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
	size_t len, int prot, int flags);
	int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem);
	void vm_free_memseg(struct vm *vm, int ident);
	int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
	int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
	int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
	int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);

	/*
	* APIs that inspect the guest memory map require only a single vcpu to
	* be frozen. This acts like a read lock on the guest memory map since any
	* modification requires all vcpus to be frozen.
	*/
	int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
	    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
	int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
	    struct vm_object **objptr);
	void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len,
	    int prot, void **cookie);
	void vm_gpa_release(void *cookie);
	bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa);

	int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
	int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
	int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
	    struct seg_desc *ret_desc);
	int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
	    struct seg_desc *desc);
	int vm_run(struct vm *vm, struct vm_run *vmrun);
	int vm_suspend(struct vm *vm, enum vm_suspend_how how);
	int vm_inject_nmi(struct vm *vm, int vcpu);
	int vm_nmi_pending(struct vm *vm, int vcpuid);
	void vm_nmi_clear(struct vm *vm, int vcpuid);
	int vm_inject_extint(struct vm *vm, int vcpu);
	int vm_extint_pending(struct vm *vm, int vcpuid);
	void vm_extint_clear(struct vm *vm, int vcpuid);
	struct vlapic *vm_lapic(struct vm *vm, int cpu);
	struct vioapic *vm_ioapic(struct vm *vm);
	struct vhpet *vm_hpet(struct vm *vm);
	int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
	int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
	int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
	int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
	int vm_apicid2vcpuid(struct vm *vm, int apicid);
	int vm_activate_cpu(struct vm *vm, int vcpu);
	+int vm_suspend_cpu(struct vm *vm, int vcpu);
	+int vm_resume_cpu(struct vm *vm, int vcpu);
	struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
	void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
	+void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip);
	void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
	void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
	void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);

	#ifdef _SYS__CPUSET_H_
	/*
	* Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
	* The rendezvous 'func(arg)' is not allowed to do anything that will
	* cause the thread to be put to sleep.
	*
	* If the rendezvous is being initiated from a vcpu context then the
	* 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
	*
	* The caller cannot hold any locks when initiating the rendezvous.
	*
	* The implementation of this API may cause vcpus other than those specified
	* by 'dest' to be stalled. The caller should not rely on any vcpus making
	* forward progress when the rendezvous is in progress.
	*/
	typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
	void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
	    vm_rendezvous_func_t func, void *arg);
	cpuset_t vm_active_cpus(struct vm *vm);
	+cpuset_t vm_debug_cpus(struct vm *vm);
	cpuset_t vm_suspended_cpus(struct vm *vm);
	#endif /* _SYS__CPUSET_H_ */

	/*
	 * Return non-zero when the pointer-sized word that info->rptr points
	 * to is non-zero, and 0 otherwise.
	 *
	 * The cookie is dereferenced, matching vcpu_suspended() and
	 * vcpu_reqidle(), which read through their cookies as well.
	 */
	static __inline int
	vcpu_rendezvous_pending(struct vm_eventinfo *info)
	{

		return (*((uintptr_t *)(info->rptr)) != 0);
	}

	/*
	 * Return the int currently stored at info->sptr (the suspend cookie).
	 */
	static __inline int
	vcpu_suspended(struct vm_eventinfo *info)
	{

	return (*info->sptr);
	}

	/*
	 * Return the int currently stored at info->iptr (the reqidle cookie).
	 */
	static __inline int
	vcpu_reqidle(struct vm_eventinfo *info)
	{

	return (*info->iptr);
	}

	+int vcpu_debugged(struct vm *vm, int vcpuid);
	+
	/*
	* Return 1 if device indicated by bus/slot/func is supposed to be a
	* pci passthrough device.
	*
	* Return 0 otherwise.
	*/
	int vmm_is_pptdev(int bus, int slot, int func);

	void *vm_iommu_domain(struct vm *vm);

	enum vcpu_state {
	VCPU_IDLE,
	VCPU_FROZEN,
	VCPU_RUNNING,
	VCPU_SLEEPING,
	};

	int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
	bool from_idle);
	enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);

	/*
	 * Return non-zero when vcpu_get_state() reports VCPU_RUNNING for 'vcpu'.
	 * 'hostcpu' is passed through to vcpu_get_state() untouched.
	 */
	static int __inline
	vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
	{
		return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
	}

	#ifdef _SYS_PROC_H_
	/*
	 * Return 1 when the current thread has TDF_ASTPENDING or
	 * TDF_NEEDRESCHED set in td_flags, or has td_owepreempt set; return 0
	 * otherwise.  Neither 'vm' nor 'vcpu' is consulted.
	 */
	static int __inline
	vcpu_should_yield(struct vm *vm, int vcpu)
	{

		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED))
			return (1);
		else if (curthread->td_owepreempt)
			return (1);
		else
			return (0);
	}
	#endif

	void *vcpu_stats(struct vm *vm, int vcpu);
	void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
	struct vmspace *vm_get_vmspace(struct vm *vm);
	struct vatpic *vm_atpic(struct vm *vm);
	struct vatpit *vm_atpit(struct vm *vm);
	struct vpmtmr *vm_pmtmr(struct vm *vm);
	struct vrtc *vm_rtc(struct vm *vm);

	/*
	* Inject exception 'vector' into the guest vcpu. This function returns 0 on
	* success and non-zero on failure.
	*
	* Wrapper functions like 'vm_inject_gp()' should be preferred to calling
	* this function directly because they enforce the trap-like or fault-like
	* behavior of an exception.
	*
	* This function should only be called in the context of the thread that is
	* executing this vcpu.
	*/
	int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid,
	uint32_t errcode, int restart_instruction);

	/*
	* This function is called after a VM-exit that occurred during exception or
	* interrupt delivery through the IDT. The format of 'intinfo' is described
	* in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
	*
	* If a VM-exit handler completes the event delivery successfully then it
	* should call vm_exit_intinfo() to extinguish the pending event. For e.g.,
	* if the task switch emulation is triggered via a task gate then it should
	* call this function with 'intinfo=0' to indicate that the external event
	* is not pending anymore.
	*
	* Return value is 0 on success and non-zero on failure.
	*/
	int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);

	/*
	* This function is called before every VM-entry to retrieve a pending
	* event that should be injected into the guest. This function combines
	* nested events into a double or triple fault.
	*
	* Returns 0 if there are no events that need to be injected into the guest
	* and non-zero otherwise.
	*/
	int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);

	int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);

	enum vm_reg_name vm_segment_name(int seg_encoding);

	struct vm_copyinfo {
	uint64_t gpa;
	size_t len;
	void *hva;
	void *cookie;
	};

	/*
	* Set up 'copyinfo[]' to copy to/from guest linear address space starting
	* at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
	* a copyin or PROT_WRITE for a copyout.
	*
	* retval is_fault Interpretation
	* 0 0 Success
	* 0 1 An exception was injected into the guest
	* EFAULT N/A Unrecoverable error
	*
	* The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
	* the return value is 0. The 'copyinfo[]' resources should be freed by calling
	* 'vm_copy_teardown()' after the copy is done.
	*/
	int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
	    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
	    int num_copyinfo, int *is_fault);
	void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
	    int num_copyinfo);
	void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
	    void *kaddr, size_t len);
	void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
	    struct vm_copyinfo *copyinfo, size_t len);

	int vcpu_trace_exceptions(struct vm *vm, int vcpuid);
	#endif /* KERNEL */

	#define VM_MAXCPU 16 /* maximum virtual cpus */

	/*
	* Identifiers for optional vmm capabilities
	*/
	enum vm_cap_type {
	VM_CAP_HALT_EXIT,
	VM_CAP_MTRAP_EXIT,
	VM_CAP_PAUSE_EXIT,
	VM_CAP_UNRESTRICTED_GUEST,
	VM_CAP_ENABLE_INVPCID,
	VM_CAP_MAX
	};

	enum vm_intr_trigger {
	EDGE_TRIGGER,
	LEVEL_TRIGGER
	};

	/*
	* The 'access' field has the format specified in Table 21-2 of the Intel
	* Architecture Manual vol 3b.
	*
	* XXX The contents of the 'access' field are architecturally defined except
	* bit 16 - Segment Unusable.
	*/
	struct seg_desc {
	uint64_t base;
	uint32_t limit;
	uint32_t access;
	};
	#define SEG_DESC_TYPE(access) ((access) & 0x001f)
	#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3)
	#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0)
	#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0)
	#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0)
	#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0)

	enum vm_cpu_mode {
	CPU_MODE_REAL,
	CPU_MODE_PROTECTED,
	CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
	CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
	};

	enum vm_paging_mode {
	PAGING_MODE_FLAT,
	PAGING_MODE_32,
	PAGING_MODE_PAE,
	PAGING_MODE_64,
	};

	struct vm_guest_paging {
	uint64_t cr3;
	int cpl;
	enum vm_cpu_mode cpu_mode;
	enum vm_paging_mode paging_mode;
	};

	/*
	* The data structures 'vie' and 'vie_op' are meant to be opaque to the
	* consumers of instruction decoding. The only reason why their contents
	* need to be exposed is because they are part of the 'vm_exit' structure.
	*/
	struct vie_op {
	uint8_t op_byte; /* actual opcode byte */
	uint8_t op_type; /* type of operation (e.g. MOV) */
	uint16_t op_flags;
	};

	#define VIE_INST_SIZE 15
	struct vie {
	uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
	uint8_t num_valid; /* size of the instruction */
	uint8_t num_processed;

	uint8_t addrsize:4, opsize:4; /* address and operand sizes */
	uint8_t rex_w:1, /* REX prefix */
	rex_r:1,
	rex_x:1,
	rex_b:1,
	rex_present:1,
	repz_present:1, /* REP/REPE/REPZ prefix */
	repnz_present:1, /* REPNE/REPNZ prefix */
	opsize_override:1, /* Operand size override */
	addrsize_override:1, /* Address size override */
	segment_override:1; /* Segment override */

	uint8_t mod:2, /* ModRM byte */
	reg:4,
	rm:4;

	uint8_t ss:2, /* SIB byte */
	index:4,
	base:4;

	uint8_t disp_bytes;
	uint8_t imm_bytes;

	uint8_t scale;
	int base_register; /* VM_REG_GUEST_xyz */
	int index_register; /* VM_REG_GUEST_xyz */
	int segment_register; /* VM_REG_GUEST_xyz */

	int64_t displacement; /* optional addr displacement */
	int64_t immediate; /* optional immediate operand */

	uint8_t decoded; /* set to 1 if successfully decoded */

	struct vie_op op; /* opcode description */
	};

	enum vm_exitcode {
	VM_EXITCODE_INOUT,
	VM_EXITCODE_VMX,
	VM_EXITCODE_BOGUS,
	VM_EXITCODE_RDMSR,
	VM_EXITCODE_WRMSR,
	VM_EXITCODE_HLT,
	VM_EXITCODE_MTRAP,
	VM_EXITCODE_PAUSE,
	VM_EXITCODE_PAGING,
	VM_EXITCODE_INST_EMUL,
	VM_EXITCODE_SPINUP_AP,
	VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */
	VM_EXITCODE_RENDEZVOUS,
	VM_EXITCODE_IOAPIC_EOI,
	VM_EXITCODE_SUSPENDED,
	VM_EXITCODE_INOUT_STR,
	VM_EXITCODE_TASK_SWITCH,
	VM_EXITCODE_MONITOR,
	VM_EXITCODE_MWAIT,
	VM_EXITCODE_SVM,
	VM_EXITCODE_REQIDLE,
	+ VM_EXITCODE_DEBUG,
	VM_EXITCODE_MAX
	};

	struct vm_inout {
	uint16_t bytes:3; /* 1 or 2 or 4 */
	uint16_t in:1;
	uint16_t string:1;
	uint16_t rep:1;
	uint16_t port;
	uint32_t eax; /* valid for out */
	};

	struct vm_inout_str {
	struct vm_inout inout; /* must be the first element */
	struct vm_guest_paging paging;
	uint64_t rflags;
	uint64_t cr0;
	uint64_t index;
	uint64_t count; /* rep=1 (%rcx), rep=0 (1) */
	int addrsize;
	enum vm_reg_name seg_name;
	struct seg_desc seg_desc;
	};

	enum task_switch_reason {
	TSR_CALL,
	TSR_IRET,
	TSR_JMP,
	TSR_IDT_GATE, /* task gate in IDT */
	};

	struct vm_task_switch {
	uint16_t tsssel; /* new TSS selector */
	int ext; /* task switch due to external event */
	uint32_t errcode;
	int errcode_valid; /* push 'errcode' on the new stack */
	enum task_switch_reason reason;
	struct vm_guest_paging paging;
	};

	struct vm_exit {
	enum vm_exitcode exitcode;
	int inst_length; /* 0 means unknown */
	uint64_t rip;
	union {
	struct vm_inout inout;
	struct vm_inout_str inout_str;
	struct {
	uint64_t gpa;
	int fault_type;
	} paging;
	struct {
	uint64_t gpa;
	uint64_t gla;
	uint64_t cs_base;
	int cs_d; /* CS.D */
	struct vm_guest_paging paging;
	struct vie vie;
	} inst_emul;
	/*
	* VMX specific payload. Used when there is no "better"
	* exitcode to represent the VM-exit.
	*/
	struct {
	int status; /* vmx inst status */
	/*
	* 'exit_reason' and 'exit_qualification' are valid
	* only if 'status' is zero.
	*/
	uint32_t exit_reason;
	uint64_t exit_qualification;
	/*
	* 'inst_error' and 'inst_type' are valid
	* only if 'status' is non-zero.
	*/
	int inst_type;
	int inst_error;
	} vmx;
	/*
	* SVM specific payload.
	*/
	struct {
	uint64_t exitcode;
	uint64_t exitinfo1;
	uint64_t exitinfo2;
	} svm;
	struct {
	uint32_t code; /* ecx value */
	uint64_t wval;
	} msr;
	struct {
	int vcpu;
	uint64_t rip;
	} spinup_ap;
	struct {
	uint64_t rflags;
	uint64_t intr_status;
	} hlt;
	struct {
	int vector;
	} ioapic_eoi;
	struct {
	enum vm_suspend_how how;
	} suspended;
	struct vm_task_switch task_switch;
	} u;
	};

	/* APIs to inject faults into the guest */
	void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
	int errcode);

	/*
	 * Inject vector IDT_UD into 'vcpuid' through vm_inject_fault(), with
	 * errcode_valid = 0 and errcode = 0.
	 */
	static __inline void
	vm_inject_ud(void *vm, int vcpuid)
	{
	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
	}

	/*
	 * Inject vector IDT_GP into 'vcpuid' through vm_inject_fault(), with
	 * errcode_valid = 1 and a fixed errcode of 0.
	 */
	static __inline void
	vm_inject_gp(void *vm, int vcpuid)
	{
	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
	}

	/*
	 * Inject vector IDT_AC into 'vcpuid' through vm_inject_fault(), with
	 * errcode_valid = 1 and the caller-supplied 'errcode'.
	 */
	static __inline void
	vm_inject_ac(void *vm, int vcpuid, int errcode)
	{
	vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
	}

	/*
	 * Inject vector IDT_SS into 'vcpuid' through vm_inject_fault(), with
	 * errcode_valid = 1 and the caller-supplied 'errcode'.
	 */
	static __inline void
	vm_inject_ss(void *vm, int vcpuid, int errcode)
	{
	vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
	}

	void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);

	int vm_restart_instruction(void *vm, int vcpuid);

	#endif /* _VMM_H_ */
	Index: head/sys/amd64/include/vmm_dev.h
	===================================================================
	--- head/sys/amd64/include/vmm_dev.h (revision 332156)
	+++ head/sys/amd64/include/vmm_dev.h (revision 332157)
	@@ -1,403 +1,410 @@
	/*-
	* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	*
	* Copyright (c) 2011 NetApp, Inc.
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* $FreeBSD$
	*/

	#ifndef _VMM_DEV_H_
	#define _VMM_DEV_H_

	#ifdef _KERNEL
	void vmmdev_init(void);
	int vmmdev_cleanup(void);
	#endif

	/*
	 * Argument structure for the VM_MMAP_MEMSEG and VM_MMAP_GETNEXT ioctls.
	 */
	struct vm_memmap {
	vm_paddr_t gpa;
	int segid; /* memory segment */
	vm_ooffset_t segoff; /* offset into memory segment */
	size_t len; /* mmap length */
	int prot; /* RWX */
	int flags;
	};
	/* NOTE(review): presumably bit values for 'flags' above -- confirm. */
	#define VM_MEMMAP_F_WIRED 0x01
	#define VM_MEMMAP_F_IOMMU 0x02

	/*
	 * VM_MEMSEG_NAME() yields the segment's name, or NULL when the name is
	 * the empty string.
	 */
	#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL)
	/* Argument structure for the VM_ALLOC_MEMSEG and VM_GET_MEMSEG ioctls. */
	struct vm_memseg {
	int segid;
	size_t len;
	char name[SPECNAMELEN + 1];
	};

	/* Argument structure for the VM_SET_REGISTER and VM_GET_REGISTER ioctls. */
	struct vm_register {
	int cpuid;
	int regnum; /* enum vm_reg_name */
	uint64_t regval;
	};

	/*
	 * Argument structure for the VM_SET_SEGMENT_DESCRIPTOR and
	 * VM_GET_SEGMENT_DESCRIPTOR ioctls.
	 */
	struct vm_seg_desc { /* data or code segment */
	int cpuid;
	int regnum; /* enum vm_reg_name */
	struct seg_desc desc;
	};

	/*
	 * Argument structure for the VM_SET_REGISTER_SET and VM_GET_REGISTER_SET
	 * ioctls.  'regnums' and 'regvals' point at caller-supplied arrays.
	 * NOTE(review): 'count' is presumably the number of entries in each
	 * array -- confirm against the ioctl handler.
	 */
	struct vm_register_set {
		int cpuid;
		unsigned int count;
		const int *regnums; /* enum vm_reg_name */
		uint64_t *regvals;
	};

	/* Argument structure for the VM_RUN ioctl. */
	struct vm_run {
	int cpuid;
	struct vm_exit vm_exit;
	};

	/* Argument structure for the VM_INJECT_EXCEPTION ioctl. */
	struct vm_exception {
	int cpuid;
	int vector;
	uint32_t error_code;
	int error_code_valid;
	int restart_instruction;
	};

	/* Argument structure for the VM_LAPIC_MSI ioctl. */
	struct vm_lapic_msi {
	uint64_t msg;
	uint64_t addr;
	};

	/* Argument structure for the VM_LAPIC_IRQ and VM_LAPIC_LOCAL_IRQ ioctls. */
	struct vm_lapic_irq {
	int cpuid;
	int vector;
	};

	/*
	 * Argument structure for the VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ
	 * and VM_IOAPIC_PULSE_IRQ ioctls.
	 */
	struct vm_ioapic_irq {
	int irq;
	};

	/*
	 * Argument structure for the VM_ISA_ASSERT_IRQ, VM_ISA_DEASSERT_IRQ and
	 * VM_ISA_PULSE_IRQ ioctls.
	 */
	struct vm_isa_irq {
	int atpic_irq;
	int ioapic_irq;
	};

	/* Argument structure for the VM_ISA_SET_IRQ_TRIGGER ioctl. */
	struct vm_isa_irq_trigger {
	int atpic_irq;
	enum vm_intr_trigger trigger;
	};

	/* Argument structure for the VM_SET_CAPABILITY and VM_GET_CAPABILITY ioctls. */
	struct vm_capability {
	int cpuid;
	enum vm_cap_type captype;
	int capval;
	int allcpus;
	};

	/* Argument structure for the VM_BIND_PPTDEV and VM_UNBIND_PPTDEV ioctls. */
	struct vm_pptdev {
	int bus;
	int slot;
	int func;
	};

	/* Argument structure for the VM_MAP_PPTDEV_MMIO ioctl. */
	struct vm_pptdev_mmio {
	int bus;
	int slot;
	int func;
	vm_paddr_t gpa;
	vm_paddr_t hpa;
	size_t len;
	};

	/* Argument structure for the VM_PPTDEV_MSI ioctl. */
	struct vm_pptdev_msi {
	int vcpu;
	int bus;
	int slot;
	int func;
	int numvec; /* 0 means disabled */
	uint64_t msg;
	uint64_t addr;
	};

	/* Argument structure for the VM_PPTDEV_MSIX ioctl. */
	struct vm_pptdev_msix {
	int vcpu;
	int bus;
	int slot;
	int func;
	int idx;
	uint64_t msg;
	uint32_t vector_control;
	uint64_t addr;
	};

	/* Argument structure for the VM_INJECT_NMI ioctl. */
	struct vm_nmi {
	int cpuid;
	};

	/* Capacity of the 'statbuf' array in struct vm_stats. */
	#define MAX_VM_STATS 64
	/* Argument structure for the VM_STATS ioctl. */
	struct vm_stats {
	int cpuid; /* in */
	int num_entries; /* out */
	struct timeval tv;
	uint64_t statbuf[MAX_VM_STATS];
	};

	/* Argument structure for the VM_STAT_DESC ioctl. */
	struct vm_stat_desc {
	int index; /* in */
	char desc[128]; /* out */
	};

	/* Argument structure for the VM_SET_X2APIC_STATE and VM_GET_X2APIC_STATE ioctls. */
	struct vm_x2apic {
	int cpuid;
	enum x2apic_state state;
	};

	/* Argument structure for the VM_GET_GPA_PMAP ioctl. */
	struct vm_gpa_pte {
	uint64_t gpa; /* in */
	uint64_t pte[4]; /* out */
	int ptenum;
	};

	/* Argument structure for the VM_GET_HPET_CAPABILITIES ioctl. */
	struct vm_hpet_cap {
	uint32_t capabilities; /* lower 32 bits of HPET capabilities */
	};

	/* Argument structure for the VM_SUSPEND ioctl. */
	struct vm_suspend {
	enum vm_suspend_how how;
	};

	/*
	 * Argument structure for the VM_GLA2GPA and VM_GLA2GPA_NOFAULT ioctls.
	 * Members from 'vcpuid' through 'paging' are inputs; 'fault' and 'gpa'
	 * are outputs.
	 */
	struct vm_gla2gpa {
	int vcpuid; /* inputs */
	int prot; /* PROT_READ or PROT_WRITE */
	uint64_t gla;
	struct vm_guest_paging paging;
	int fault; /* outputs */
	uint64_t gpa;
	};

	/*
	 * Argument structure for the VM_ACTIVATE_CPU, VM_SUSPEND_CPU and
	 * VM_RESUME_CPU ioctls.
	 */
	struct vm_activate_cpu {
	int vcpuid;
	};

	/* Argument structure for the VM_GET_CPUS ioctl. */
	struct vm_cpuset {
	int which;
	int cpusetsize;
	cpuset_t *cpus;
	};
	/* NOTE(review): presumably the selectors accepted in 'which' -- confirm. */
	#define VM_ACTIVE_CPUS 0
	#define VM_SUSPENDED_CPUS 1
	+#define VM_DEBUG_CPUS 2

	/* Argument structure for the VM_SET_INTINFO and VM_GET_INTINFO ioctls. */
	struct vm_intinfo {
	int vcpuid;
	uint64_t info1;
	uint64_t info2;
	};

	/* Argument structure for the VM_RTC_SETTIME and VM_RTC_GETTIME ioctls. */
	struct vm_rtc_time {
	time_t secs;
	};

	/* Argument structure for the VM_RTC_WRITE and VM_RTC_READ ioctls. */
	struct vm_rtc_data {
	int offset;
	uint8_t value;
	};

	/*
	 * Command numbers for the vmm device ioctls.  Each IOCNUM_* value is
	 * combined with the 'v' group character by the VM_* macros that follow
	 * this enum.  The numbers are grouped by functional area with gaps
	 * between the groups.
	 */
	enum {
	/* general routines */
	IOCNUM_ABIVERS = 0,
	IOCNUM_RUN = 1,
	IOCNUM_SET_CAPABILITY = 2,
	IOCNUM_GET_CAPABILITY = 3,
	IOCNUM_SUSPEND = 4,
	IOCNUM_REINIT = 5,

	/* memory apis */
	IOCNUM_MAP_MEMORY = 10, /* deprecated */
	IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */
	IOCNUM_GET_GPA_PMAP = 12,
	IOCNUM_GLA2GPA = 13,
	IOCNUM_ALLOC_MEMSEG = 14,
	IOCNUM_GET_MEMSEG = 15,
	IOCNUM_MMAP_MEMSEG = 16,
	IOCNUM_MMAP_GETNEXT = 17,
	IOCNUM_GLA2GPA_NOFAULT = 18,

	/* register/state accessors */
	IOCNUM_SET_REGISTER = 20,
	IOCNUM_GET_REGISTER = 21,
	IOCNUM_SET_SEGMENT_DESCRIPTOR = 22,
	IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
	IOCNUM_SET_REGISTER_SET = 24,
	IOCNUM_GET_REGISTER_SET = 25,

	/* interrupt injection */
	IOCNUM_GET_INTINFO = 28,
	IOCNUM_SET_INTINFO = 29,
	IOCNUM_INJECT_EXCEPTION = 30,
	IOCNUM_LAPIC_IRQ = 31,
	IOCNUM_INJECT_NMI = 32,
	IOCNUM_IOAPIC_ASSERT_IRQ = 33,
	IOCNUM_IOAPIC_DEASSERT_IRQ = 34,
	IOCNUM_IOAPIC_PULSE_IRQ = 35,
	IOCNUM_LAPIC_MSI = 36,
	IOCNUM_LAPIC_LOCAL_IRQ = 37,
	IOCNUM_IOAPIC_PINCOUNT = 38,
	IOCNUM_RESTART_INSTRUCTION = 39,

	/* PCI pass-thru */
	IOCNUM_BIND_PPTDEV = 40,
	IOCNUM_UNBIND_PPTDEV = 41,
	IOCNUM_MAP_PPTDEV_MMIO = 42,
	IOCNUM_PPTDEV_MSI = 43,
	IOCNUM_PPTDEV_MSIX = 44,

	/* statistics */
	IOCNUM_VM_STATS = 50,
	IOCNUM_VM_STAT_DESC = 51,

	/* kernel device state */
	IOCNUM_SET_X2APIC_STATE = 60,
	IOCNUM_GET_X2APIC_STATE = 61,
	IOCNUM_GET_HPET_CAPABILITIES = 62,

	/* legacy interrupt injection */
	IOCNUM_ISA_ASSERT_IRQ = 80,
	IOCNUM_ISA_DEASSERT_IRQ = 81,
	IOCNUM_ISA_PULSE_IRQ = 82,
	IOCNUM_ISA_SET_IRQ_TRIGGER = 83,

	/* vm_cpuset */
	IOCNUM_ACTIVATE_CPU = 90,
	IOCNUM_GET_CPUSET = 91,
	+ IOCNUM_SUSPEND_CPU = 92,
	+ IOCNUM_RESUME_CPU = 93,

	/* RTC */
	IOCNUM_RTC_READ = 100,
	IOCNUM_RTC_WRITE = 101,
	IOCNUM_RTC_SETTIME = 102,
	IOCNUM_RTC_GETTIME = 103,
	};

	/*
	 * ioctl request codes for the vmm device.  Every request uses the 'v'
	 * group character, an IOCNUM_* command number and the argument type
	 * named in its definition (VM_REINIT takes no argument).
	 */
	#define VM_RUN \
	_IOWR('v', IOCNUM_RUN, struct vm_run)
	#define VM_SUSPEND \
	_IOW('v', IOCNUM_SUSPEND, struct vm_suspend)
	#define VM_REINIT \
	_IO('v', IOCNUM_REINIT)
	#define VM_ALLOC_MEMSEG \
	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg)
	#define VM_GET_MEMSEG \
	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg)
	#define VM_MMAP_MEMSEG \
	_IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap)
	#define VM_MMAP_GETNEXT \
	_IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap)
	#define VM_SET_REGISTER \
	_IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
	#define VM_GET_REGISTER \
	_IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
	#define VM_SET_SEGMENT_DESCRIPTOR \
	_IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
	#define VM_GET_SEGMENT_DESCRIPTOR \
	_IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
	#define VM_SET_REGISTER_SET \
	_IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set)
	#define VM_GET_REGISTER_SET \
	_IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set)
	#define VM_INJECT_EXCEPTION \
	_IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception)
	#define VM_LAPIC_IRQ \
	_IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
	#define VM_LAPIC_LOCAL_IRQ \
	_IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq)
	#define VM_LAPIC_MSI \
	_IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi)
	#define VM_IOAPIC_ASSERT_IRQ \
	_IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq)
	#define VM_IOAPIC_DEASSERT_IRQ \
	_IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq)
	#define VM_IOAPIC_PULSE_IRQ \
	_IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq)
	#define VM_IOAPIC_PINCOUNT \
	_IOR('v', IOCNUM_IOAPIC_PINCOUNT, int)
	#define VM_ISA_ASSERT_IRQ \
	_IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq)
	#define VM_ISA_DEASSERT_IRQ \
	_IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq)
	#define VM_ISA_PULSE_IRQ \
	_IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq)
	#define VM_ISA_SET_IRQ_TRIGGER \
	_IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger)
	#define VM_SET_CAPABILITY \
	_IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
	#define VM_GET_CAPABILITY \
	_IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
	#define VM_BIND_PPTDEV \
	_IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
	#define VM_UNBIND_PPTDEV \
	_IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
	#define VM_MAP_PPTDEV_MMIO \
	_IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
	#define VM_PPTDEV_MSI \
	_IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
	#define VM_PPTDEV_MSIX \
	_IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
	#define VM_INJECT_NMI \
	_IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
	#define VM_STATS \
	_IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
	#define VM_STAT_DESC \
	_IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
	#define VM_SET_X2APIC_STATE \
	_IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
	#define VM_GET_X2APIC_STATE \
	_IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
	#define VM_GET_HPET_CAPABILITIES \
	_IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap)
	#define VM_GET_GPA_PMAP \
	_IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
	#define VM_GLA2GPA \
	_IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa)
	#define VM_GLA2GPA_NOFAULT \
	_IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa)
	#define VM_ACTIVATE_CPU \
	_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
	#define VM_GET_CPUS \
	_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
	+#define VM_SUSPEND_CPU \
	+ _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu)
	+#define VM_RESUME_CPU \
	+ _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu)
	#define VM_SET_INTINFO \
	_IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo)
	#define VM_GET_INTINFO \
	_IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)
	#define VM_RTC_WRITE \
	_IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data)
	#define VM_RTC_READ \
	_IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data)
	#define VM_RTC_SETTIME \
	_IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time)
	#define VM_RTC_GETTIME \
	_IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time)
	#define VM_RESTART_INSTRUCTION \
	_IOW('v', IOCNUM_RESTART_INSTRUCTION, int)
	#endif
	Index: head/sys/amd64/vmm/amd/svm.c
	===================================================================
	--- head/sys/amd64/vmm/amd/svm.c (revision 332156)
	+++ head/sys/amd64/vmm/amd/svm.c (revision 332157)
	@@ -1,2278 +1,2284 @@
	/*-
	* Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice unmodified, this list of conditions, and the following
	* disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include <sys/cdefs.h>
	__FBSDID("$FreeBSD$");

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/smp.h>
	#include <sys/kernel.h>
	#include <sys/malloc.h>
	#include <sys/pcpu.h>
	#include <sys/proc.h>
	#include <sys/sysctl.h>

	#include <vm/vm.h>
	#include <vm/pmap.h>

	#include <machine/cpufunc.h>
	#include <machine/psl.h>
	#include <machine/md_var.h>
	#include <machine/specialreg.h>
	#include <machine/smp.h>
	#include <machine/vmm.h>
	#include <machine/vmm_dev.h>
	#include <machine/vmm_instruction_emul.h>

	#include "vmm_lapic.h"
	#include "vmm_stat.h"
	#include "vmm_ktr.h"
	#include "vmm_ioport.h"
	#include "vatpic.h"
	#include "vlapic.h"
	#include "vlapic_priv.h"

	#include "x86.h"
	#include "vmcb.h"
	#include "svm.h"
	#include "svm_softc.h"
	#include "svm_msr.h"
	#include "npt.h"

	/* Create the 'svm' sysctl node underneath the existing _hw_vmm node. */
	SYSCTL_DECL(_hw_vmm);
	SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);

	/*
	* SVM CPUID function 0x8000_000A, edx bit decoding.
	*
	* check_svm_features() ANDs these bits (returned in regs[3]) into
	* 'svm_feature'.
	*/
	#define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */
	#define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */
	#define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */
	#define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */
	#define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */
	#define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */
	#define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */
	#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */
	#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */
	#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */
	#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */

	/*
	 * Default set of VMCB_CACHE_* bits.  It is the initial value of
	 * 'vmcb_clean' and the mask svm_init() applies to that tunable.
	 */
	#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \
		VMCB_CACHE_IOPM | \
		VMCB_CACHE_I | \
		VMCB_CACHE_TPR | \
		VMCB_CACHE_CR2 | \
		VMCB_CACHE_CR | \
		VMCB_CACHE_DR | \
		VMCB_CACHE_DT | \
		VMCB_CACHE_SEG | \
		VMCB_CACHE_NP)

	/*
	* Tunable exported under the svm sysctl node; svm_init() masks it with
	* VMCB_CACHE_DEFAULT.
	*/
	static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
	SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
	0, NULL);

	static MALLOC_DEFINE(M_SVM, "svm", "svm");
	static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");

	/* Per-CPU context area. */
	extern struct pcpu __pcpu[];

	/*
	* Starts as all-ones and is ANDed with the CPUID-reported bits in
	* check_svm_features().
	*/
	static uint32_t svm_feature = ~0U; /* AMD SVM features. */
	SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0,
	"SVM features advertised by CPUID.8000000AH:EDX");

	/*
	* When non-zero, svm_handle_inst_emul() does not use the instruction
	* bytes from the VMCB.
	*/
	static int disable_npf_assist;
	SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
	&disable_npf_assist, 0, NULL);

	/* Maximum ASIDs supported by the processor */
	static uint32_t nasid;
	SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0,
	"Number of ASIDs supported by this processor");

	/* Current ASID generation for each host cpu */
	static struct asid asid[MAXCPU];

	/*
	* SVM host state saved area of size 4KB for each core.
	*/
	static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

	static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
	static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
	static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");

	/* Forward declaration; the definition is not in this part of the file. */
	static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);

	/*
	 * Non-zero when AMD_CPUID_SVM_FLUSH_BY_ASID is set in 'svm_feature'.
	 */
	static __inline int
	flush_by_asid(void)
	{
		int present;

		present = svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID;
		return (present);
	}

	/*
	 * Non-zero when AMD_CPUID_SVM_DECODE_ASSIST is set in 'svm_feature'.
	 */
	static __inline int
	decode_assist(void)
	{
		int present;

		present = svm_feature & AMD_CPUID_SVM_DECODE_ASSIST;
		return (present);
	}

	/*
	 * Clear EFER_SVM in MSR_EFER on the calling CPU.  The argument is
	 * unused; svm_cleanup() passes this function to smp_rendezvous().
	 */
	static void
	svm_disable(void *arg __unused)
	{

		/* Read-modify-write: keep every EFER bit except EFER_SVM. */
		wrmsr(MSR_EFER, rdmsr(MSR_EFER) & ~EFER_SVM);
	}

	/*
	 * Turn SVM off on all CPUs by running svm_disable() through
	 * smp_rendezvous().  Always returns 0.
	 */
	static int
	svm_cleanup(void)
	{
		const int status = 0;

		smp_rendezvous(NULL, svm_disable, NULL, NULL);
		return (status);
	}

	/*
	 * Verify that all the features required by bhyve are available.
	 *
	 * Reads CPUID leaf 0x8000000A, narrows 'svm_feature' to the bits the
	 * processor reports in regs[3] and clamps 'nasid' to regs[1].
	 *
	 * Returns 0 on success, or ENXIO when Nested Paging or NRIP Save is
	 * missing.
	 */
	static int
	check_svm_features(void)
	{
		u_int regs[4];

		/* CPUID Fn8000_000A is for SVM */
		do_cpuid(0x8000000A, regs);
		svm_feature &= regs[3];

		/*
		 * The number of ASIDs can be configured to be less than what is
		 * supported by the hardware but not more.
		 */
		if (nasid == 0 || nasid > regs[1])
			nasid = regs[1];
		KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));

		/* bhyve requires the Nested Paging feature */
		if (!(svm_feature & AMD_CPUID_SVM_NP)) {
			printf("SVM: Nested Paging feature not available.\n");
			return (ENXIO);
		}

		/* bhyve requires the NRIP Save feature */
		if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) {
			printf("SVM: NRIP Save feature not available.\n");
			return (ENXIO);
		}

		return (0);
	}

	/*
	 * Enable SVM on the calling CPU: set EFER_SVM in MSR_EFER and program
	 * MSR_VM_HSAVE_PA with the physical address of this CPU's 'hsave' page.
	 * The argument is unused.
	 */
	static void
	svm_enable(void *arg __unused)
	{
		uint64_t efer;

		efer = rdmsr(MSR_EFER);
		efer |= EFER_SVM;
		wrmsr(MSR_EFER, efer);

		wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
	}

	/*
	 * Return 1 if SVM can be used on this processor and 0 otherwise.  A
	 * message is printed for each reason it is unusable.
	 */
	static int
	svm_available(void)
	{

		/* Section 15.4 Enabling SVM from APM2. */
		if (!(amd_feature2 & AMDID2_SVM)) {
			printf("SVM: not available.\n");
			return (0);
		}

		/* VM_CR is only read once the feature bit has been confirmed. */
		if (rdmsr(MSR_VM_CR) & VM_CR_SVMDIS) {
			printf("SVM: disabled by BIOS.\n");
			return (0);
		}

		return (1);
	}

	/*
	 * Module-level SVM initialization.
	 *
	 * Returns ENXIO when svm_available() fails, the error reported by
	 * check_svm_features() when that fails, and 0 otherwise.  'ipinum' is
	 * passed through to svm_npt_init().
	 */
	static int
	svm_init(int ipinum)
	{
		int rc, i;

		if (!svm_available())
			return (ENXIO);

		rc = check_svm_features();
		if (rc != 0)
			return (rc);

		vmcb_clean &= VMCB_CACHE_DEFAULT;

		/*
		 * Start every host cpu at its "highest" valid ASID so that the
		 * next allocation rolls over both 'gen' and 'num' and begins the
		 * sequence at {1,1}.
		 */
		for (i = 0; i < MAXCPU; i++) {
			asid[i].num = nasid - 1;
			asid[i].gen = ~0UL;
		}

		svm_msr_init();
		svm_npt_init(ipinum);

		/* Enable SVM on all CPUs */
		smp_rendezvous(NULL, svm_enable, NULL, NULL);

		return (0);
	}

	/*
	 * Re-run svm_enable() on the calling CPU.
	 */
	static void
	svm_restore(void)
	{
		void *unused_arg = NULL;

		svm_enable(unused_arg);
	}

	/* Pentium compatible MSRs */
	#define MSR_PENTIUM_START 0
	#define MSR_PENTIUM_END 0x1FFF
	/* AMD 6th generation and Intel compatible MSRs */
	#define MSR_AMD6TH_START 0xC0000000UL
	#define MSR_AMD6TH_END 0xC0001FFFUL
	/* AMD 7th and 8th generation compatible MSRs */
	#define MSR_AMD7TH_START 0xC0010000UL
	#define MSR_AMD7TH_END 0xC0011FFFUL

	/*
	 * Get the index and bit position for a MSR in permission bitmap.
	 * Two bits are used for each MSR: lower bit for read and higher bit for write.
	 *
	 * The three MSR ranges above are laid out back to back in the bitmap, so
	 * each byte covers four MSRs.  On success 0 is returned, '*index' is
	 * the byte offset and '*bit' is the position (0, 2, 4 or 6) of the read
	 * bit within that byte.  For an MSR outside all three ranges EINVAL is
	 * returned and '*index' is left at -1.
	 */
	static int
	svm_msr_index(uint64_t msr, int *index, int *bit)
	{
		uint32_t base, off;

		*index = -1;
		*bit = (msr % 4) * 2;
		base = 0;

		if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
			*index = msr / 4;
			return (0);
		}

		/* Skip past the MSRs covered by the previous range. */
		base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
		if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
			off = (msr - MSR_AMD6TH_START);
			*index = (off + base) / 4;
			return (0);
		}

		base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
		if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
			off = (msr - MSR_AMD7TH_START);
			*index = (off + base) / 4;
			return (0);
		}

		return (EINVAL);
	}

	/*
	 * Allow vcpu to read and/or write the 'msr' without trapping into the
	 * hypervisor, by clearing the matching bits in 'perm_bitmap'.
	 */
	static void
	svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
	{
		unsigned long clear_mask;
		int error, idx, shift;

		error = svm_msr_index(msr, &idx, &shift);
		KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr));
		KASSERT(idx >= 0 && idx < SVM_MSR_BITMAP_SIZE,
		    ("%s: invalid index %d for msr %#lx", __func__, idx, msr));
		KASSERT(shift >= 0 && shift <= 6, ("%s: invalid bit position %d "
		    "msr %#lx", __func__, shift, msr));

		/* The read bit is the lower of the pair, the write bit the upper. */
		clear_mask = 0;
		if (read)
			clear_mask |= 1UL << shift;
		if (write)
			clear_mask |= 2UL << shift;

		if (clear_mask != 0)
			perm_bitmap[idx] &= ~clear_mask;
	}

	/*
	 * Permit both reads and writes of 'msr' in 'perm_bitmap'.
	 */
	static void
	svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
	{
		const bool allow_read = true, allow_write = true;

		svm_msr_perm(perm_bitmap, msr, allow_read, allow_write);
	}

	/*
	 * Permit reads of 'msr' in 'perm_bitmap'; the write bit is not touched.
	 */
	static void
	svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
	{
		const bool allow_read = true, allow_write = false;

		svm_msr_perm(perm_bitmap, msr, allow_read, allow_write);
	}

	/*
	 * Return 1 when any bit of 'bitmask' is set in intercept word 'idx' of
	 * the vcpu's VMCB control area, and 0 otherwise.
	 */
	static __inline int
	svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
	{
		struct vmcb_ctrl *vmcb_ctrl;

		KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));

		vmcb_ctrl = svm_get_vmcb_ctrl(sc, vcpu);
		if ((vmcb_ctrl->intercept[idx] & bitmask) != 0)
			return (1);
		return (0);
	}

	/*
	 * Set ('enabled' non-zero) or clear the 'bitmask' bits in intercept
	 * word 'idx' of the vcpu's VMCB control area.  When the word actually
	 * changes, VMCB_CACHE_I is marked dirty and the change is traced.
	 */
	static __inline void
	svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
	    int enabled)
	{
		struct vmcb_ctrl *ctrl;
		uint32_t oldval;

		KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));

		ctrl = svm_get_vmcb_ctrl(sc, vcpu);
		oldval = ctrl->intercept[idx];

		if (enabled)
			ctrl->intercept[idx] |= bitmask;
		else
			ctrl->intercept[idx] &= ~bitmask;

		if (ctrl->intercept[idx] != oldval) {
			svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
			VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
			    "from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
		}
	}

	/*
	 * Clear the 'bitmask' bits in intercept word 'off' for this vcpu.
	 */
	static __inline void
	svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
	{
		const int enabled = 0;

		svm_set_intercept(sc, vcpu, off, bitmask, enabled);
	}

	/*
	 * Set the 'bitmask' bits in intercept word 'off' for this vcpu.
	 */
	static __inline void
	svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
	{
		const int enabled = 1;

		svm_set_intercept(sc, vcpu, off, bitmask, enabled);
	}

	/*
	 * Initialize the VMCB of 'vcpu'.
	 *
	 * 'iopm_base_pa' and 'msrpm_base_pa' are stored in the control area as
	 * the I/O and MSR permission map addresses, and 'np_pml4' is stored as
	 * the nested page table root.  The function then programs the
	 * intercepts and sets EFER, PAT, DR6 and DR7 in the state area.
	 */
	static void
	vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
	    uint64_t msrpm_base_pa, uint64_t np_pml4)
	{
		struct vmcb_ctrl *ctrl;
		struct vmcb_state *state;
		uint32_t mask;
		int n;

		ctrl = svm_get_vmcb_ctrl(sc, vcpu);
		state = svm_get_vmcb_state(sc, vcpu);

		ctrl->iopm_base_pa = iopm_base_pa;
		ctrl->msrpm_base_pa = msrpm_base_pa;

		/* Enable nested paging */
		ctrl->np_enable = 1;
		ctrl->n_cr3 = np_pml4;

		/*
		 * Intercept accesses to the control registers that are not shadowed
		 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
		 */
		for (n = 0; n < 16; n++) {
			/* Bit n and bit n+16 of the CR intercept word, together. */
			mask = (BIT(n) << 16) | BIT(n);
			if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
				svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
			else
				svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
		}


		/*
		 * Intercept everything when tracing guest exceptions otherwise
		 * just intercept machine check exception.
		 */
		if (vcpu_trace_exceptions(sc->vm, vcpu)) {
			for (n = 0; n < 32; n++) {
				/*
				 * Skip unimplemented vectors in the exception bitmap.
				 */
				if (n == 2 || n == 9) {
					continue;
				}
				svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
			}
		} else {
			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
		}

		/* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_FERR_FREEZE);

		svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
		svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);

		/*
		 * From section "Canonicalization and Consistency Checks" in APMv2
		 * the VMRUN intercept bit must be set to pass the consistency check.
		 */
		svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);

		/*
		 * The ASID will be set to a non-zero value just before VMRUN.
		 */
		ctrl->asid = 0;

		/*
		 * Section 15.21.1, Interrupt Masking in EFLAGS
		 * Section 15.21.2, Virtualizing APIC.TPR
		 *
		 * This must be set for %rflag and %cr8 isolation of guest and host.
		 */
		ctrl->v_intr_masking = 1;

		/* Enable Last Branch Record aka LBR for debugging */
		ctrl->lbr_virt_en = 1;
		state->dbgctl = BIT(0);

		/* EFER_SVM must always be set when the guest is executing */
		state->efer = EFER_SVM;

		/* Set up the PAT to power-on state */
		state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) |
		    PAT_VALUE(1, PAT_WRITE_THROUGH) |
		    PAT_VALUE(2, PAT_UNCACHED) |
		    PAT_VALUE(3, PAT_UNCACHEABLE) |
		    PAT_VALUE(4, PAT_WRITE_BACK) |
		    PAT_VALUE(5, PAT_WRITE_THROUGH) |
		    PAT_VALUE(6, PAT_UNCACHED) |
		    PAT_VALUE(7, PAT_UNCACHEABLE);

		/* Set up DR6/7 to power-on state */
		state->dr6 = 0xffff0ff0;
		state->dr7 = 0x400;
	}

	/*
	 * Initialize a virtual machine.
	 *
	 * Allocates the zeroed, page-aligned softc plus the MSR and I/O
	 * permission bitmaps, sets both bitmaps to intercept everything, opens
	 * up the MSRs listed below, and initializes the VMCB of every vcpu.
	 * Panics if the softc is misaligned or a bitmap allocation returns
	 * NULL.  Returns the new softc.
	 */
	static void *
	svm_vminit(struct vm *vm, pmap_t pmap)
	{
		struct svm_softc *svm_sc;
		struct svm_vcpu *vcpu;
		vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
		int i;

		svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO);
		if (((uintptr_t)svm_sc & PAGE_MASK) != 0)
			panic("malloc of svm_softc not aligned on page boundary");

		svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM,
		    M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
		if (svm_sc->msr_bitmap == NULL)
			panic("contigmalloc of SVM MSR bitmap failed");
		svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM,
		    M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
		if (svm_sc->iopm_bitmap == NULL)
			panic("contigmalloc of SVM IO bitmap failed");

		svm_sc->vm = vm;
		svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);

		/*
		 * Intercept read and write accesses to all MSRs.
		 */
		memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);

		/*
		 * Access to the following MSRs is redirected to the VMCB when the
		 * guest is executing. Therefore it is safe to allow the guest to
		 * read/write these MSRs directly without hypervisor involvement.
		 */
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);

		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
		svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);

		svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);

		/*
		 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
		 */
		svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);

		/* Intercept access to all I/O ports. */
		memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);

		iopm_pa = vtophys(svm_sc->iopm_bitmap);
		msrpm_pa = vtophys(svm_sc->msr_bitmap);
		pml4_pa = svm_sc->nptp;
		for (i = 0; i < VM_MAXCPU; i++) {
			vcpu = svm_get_vcpu(svm_sc, i);
			vcpu->nextrip = ~0;
			vcpu->lastcpu = NOCPU;
			vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
			vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
			svm_msr_guest_init(svm_sc, i);
		}
		return (svm_sc);
	}

	/*
	 * Fill in 'vme' for a generic SVM VM-exit: the exit code becomes
	 * VM_EXITCODE_SVM and the raw code/info values go into the svm payload.
	 */
	static void
	vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
	{

		vme->u.svm.exitinfo2 = info2;
		vme->u.svm.exitinfo1 = info1;
		vme->u.svm.exitcode = code;
		vme->exitcode = VM_EXITCODE_SVM;
	}

	/*
	 * Return the guest's current privilege level as recorded in the VMCB
	 * state area.
	 */
	static int
	svm_cpl(struct vmcb_state *state)
	{
		int cpl;

		/*
		 * APMv2 is explicit on this point:
		 * "Retrieve the CPL from the CPL field in the VMCB, not
		 * from any segment DPL"
		 */
		cpl = state->cpl;
		return (cpl);
	}

	static enum vm_cpu_mode
	svm_vcpu_mode(struct vmcb *vmcb)
	{
	struct vmcb_segment seg;
	struct vmcb_state *state;
	int error;

	state = &vmcb->state;

	if (state->efer & EFER_LMA) {
	error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
	KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__,
	error));

	/*
	* Section 4.8.1 for APM2, check if Code Segment has
	* Long attribute set in descriptor.
	*/
	if (seg.attrib & VMCB_CS_ATTRIB_L)
	return (CPU_MODE_64BIT);
	else
	return (CPU_MODE_COMPATIBILITY);
	} else if (state->cr0 & CR0_PE) {
	return (CPU_MODE_PROTECTED);
	} else {
	return (CPU_MODE_REAL);
	}
	}

	/*
	 * Derive the guest paging mode from CR0.PG, CR4.PAE and EFER.LME.
	 */
	static enum vm_paging_mode
	svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
	{

		if (cr0 & CR0_PG) {
			if (cr4 & CR4_PAE) {
				return ((efer & EFER_LME) ? PAGING_MODE_64 :
				    PAGING_MODE_PAE);
			}
			return (PAGING_MODE_32);
		}
		return (PAGING_MODE_FLAT);
	}

	/*
	 * ins/outs utility routines
	 */

	/*
	 * Return the guest index register for a string I/O instruction: %rdi
	 * when 'in' is non-zero, %rsi otherwise.
	 */
	static uint64_t
	svm_inout_str_index(struct svm_regctx *regs, int in)
	{

		if (in)
			return (regs->sctx_rdi);
		return (regs->sctx_rsi);
	}

	/*
	 * Return the iteration count for a string I/O instruction: the guest
	 * %rcx when 'rep' is non-zero, otherwise 1.
	 */
	static uint64_t
	svm_inout_str_count(struct svm_regctx *regs, int rep)
	{

		if (rep)
			return (regs->sctx_rcx);
		return (1);
	}

	/*
	 * Fill in the segment name and descriptor of 'vis' for a string I/O
	 * exit.  For 'in' the segment is fixed to ES; otherwise it is decoded
	 * from bits 12:10 of 'info1'.
	 */
	static void
	svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
	    int in, struct vm_inout_str *vis)
	{
		int error, segnum;

		if (!in) {
			/* The segment field has standard encoding */
			segnum = (info1 >> 10) & 0x7;
			vis->seg_name = vm_segment_name(segnum);
		} else {
			vis->seg_name = VM_REG_GUEST_ES;
		}

		error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
		KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
	}

	/*
	 * Decode the address size, in bytes, from bits 9:7 of 'info1'.
	 * Encodings other than 1, 2 and 4 cause a panic.
	 */
	static int
	svm_inout_str_addrsize(uint64_t info1)
	{
		const uint32_t encoding = (info1 >> 7) & 0x7;

		if (encoding == 1)
			return (2); /* 16 bit */
		if (encoding == 2)
			return (4); /* 32 bit */
		if (encoding == 4)
			return (8); /* 64 bit */

		panic("%s: invalid size encoding %d", __func__, encoding);
	}

	static void
	svm_paging_info(struct vmcb vmcb, struct vm_guest_paging paging)
	{
	struct vmcb_state *state;

	state = &vmcb->state;
	paging->cr3 = state->cr3;
	paging->cpl = svm_cpl(state);
	paging->cpu_mode = svm_vcpu_mode(vmcb);
	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
	state->efer);
	}

	#define UNHANDLED 0

	/*
	* Handle guest I/O intercept.
	*/
	static int
	svm_handle_io(struct svm_softc svm_sc, int vcpu, struct vm_exit vmexit)
	{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	struct svm_regctx *regs;
	struct vm_inout_str *vis;
	uint64_t info1;
	int inout_string;

	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	regs = svm_get_guest_regctx(svm_sc, vcpu);

	info1 = ctrl->exitinfo1;
	inout_string = info1 & BIT(2) ? 1 : 0;

	/*
	* The effective segment number in EXITINFO1[12:10] is populated
	* only if the processor has the DecodeAssist capability.
	*
	* XXX this is not specified explicitly in APMv2 but can be verified
	* empirically.
	*/
	if (inout_string && !decode_assist())
	return (UNHANDLED);

	vmexit->exitcode = VM_EXITCODE_INOUT;
	vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0;
	vmexit->u.inout.string = inout_string;
	vmexit->u.inout.rep = (info1 & BIT(3)) ? 1 : 0;
	vmexit->u.inout.bytes = (info1 >> 4) & 0x7;
	vmexit->u.inout.port = (uint16_t)(info1 >> 16);
	vmexit->u.inout.eax = (uint32_t)(state->rax);

	if (inout_string) {
	vmexit->exitcode = VM_EXITCODE_INOUT_STR;
	vis = &vmexit->u.inout_str;
	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
	vis->rflags = state->rflags;
	vis->cr0 = state->cr0;
	vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
	vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
	vis->addrsize = svm_inout_str_addrsize(info1);
	svm_inout_str_seginfo(svm_sc, vcpu, info1,
	vmexit->u.inout.in, vis);
	}

	return (UNHANDLED);
	}

	/*
	 * Map a nested page fault's EXITINFO1 to a VM_PROT_* access type.  The
	 * write bit is tested first, then the instruction-fetch bit; anything
	 * else is reported as a read.
	 */
	static int
	npf_fault_type(uint64_t exitinfo1)
	{
		int ftype;

		ftype = VM_PROT_READ;
		if (exitinfo1 & VMCB_NPF_INFO1_W)
			ftype = VM_PROT_WRITE;
		else if (exitinfo1 & VMCB_NPF_INFO1_ID)
			ftype = VM_PROT_EXECUTE;

		return (ftype);
	}

	/*
	 * Return true only when VMCB_NPF_INFO1_ID and VMCB_NPF_INFO1_GPT are
	 * both clear and VMCB_NPF_INFO1_GPA is set in 'exitinfo1'.
	 */
	static bool
	svm_npf_emul_fault(uint64_t exitinfo1)
	{
		const uint64_t disqualifying = VMCB_NPF_INFO1_ID | VMCB_NPF_INFO1_GPT;

		if ((exitinfo1 & disqualifying) != 0)
			return (false);

		return ((exitinfo1 & VMCB_NPF_INFO1_GPA) != 0);
	}

	/*
	 * Prepare '*vmexit' as a VM_EXITCODE_INST_EMUL exit for the access to
	 * guest physical address 'gpa'.
	 *
	 * Records the paging state and the CS base and D bit appropriate for
	 * the guest's CPU mode, then initializes the instruction decoder with
	 * the instruction bytes from the VMCB when decode assist is usable.
	 */
	static void
	svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
	{
		struct vm_guest_paging *paging;
		struct vmcb_segment seg;
		struct vmcb_ctrl *ctrl;
		char *inst_bytes;
		int error, inst_len;

		ctrl = &vmcb->ctrl;
		paging = &vmexit->u.inst_emul.paging;

		vmexit->exitcode = VM_EXITCODE_INST_EMUL;
		vmexit->u.inst_emul.gpa = gpa;
		vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
		svm_paging_info(vmcb, paging);

		error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
		KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));

		switch(paging->cpu_mode) {
		case CPU_MODE_REAL:
			vmexit->u.inst_emul.cs_base = seg.base;
			vmexit->u.inst_emul.cs_d = 0;
			break;
		case CPU_MODE_PROTECTED:
		case CPU_MODE_COMPATIBILITY:
			vmexit->u.inst_emul.cs_base = seg.base;

			/*
			 * Section 4.8.1 of APM2, Default Operand Size or D bit.
			 */
			vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
			    1 : 0;
			break;
		default:
			vmexit->u.inst_emul.cs_base = 0;
			vmexit->u.inst_emul.cs_d = 0;
			break;
		}

		/*
		 * Copy the instruction bytes into 'vie' if available.
		 */
		if (decode_assist() && !disable_npf_assist) {
			inst_len = ctrl->inst_len;
			inst_bytes = ctrl->inst_bytes;
		} else {
			inst_len = 0;
			inst_bytes = NULL;
		}
		vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
	}

	#ifdef KTR
	/*
	 * Return a short trace-friendly name for a VMCB event injection type.
	 * Panics on a type that is not one of the four known values.
	 */
	static const char *
	intrtype_to_str(int intr_type)
	{
		const char *name;

		if (intr_type == VMCB_EVENTINJ_TYPE_INTR)
			name = "hwintr";
		else if (intr_type == VMCB_EVENTINJ_TYPE_NMI)
			name = "nmi";
		else if (intr_type == VMCB_EVENTINJ_TYPE_INTn)
			name = "swintr";
		else if (intr_type == VMCB_EVENTINJ_TYPE_EXCEPTION)
			name = "exception";
		else
			panic("%s: unknown intr_type %d", __func__, intr_type);

		return (name);
	}
	#endif

	/*
	* Inject an event to vcpu as described in section 15.20, "Event injection".
	*/
	static void
	svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
	uint32_t error, bool ec_valid)
	{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
	("%s: event already pending %#lx", __func__, ctrl->eventinj));

	KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
	__func__, vector));

	switch (intr_type) {
	case VMCB_EVENTINJ_TYPE_INTR:
	case VMCB_EVENTINJ_TYPE_NMI:
	case VMCB_EVENTINJ_TYPE_INTn:
	break;
	case VMCB_EVENTINJ_TYPE_EXCEPTION:
	if (vector >= 0 && vector <= 31 && vector != 2)
	break;
	/* FALLTHROUGH */
	default:
	panic("%s: invalid intr_type/vector: %d/%d", __func__,
	intr_type, vector);
	}
	ctrl->eventinj = vector \| (intr_type << 8) \| VMCB_EVENTINJ_VALID;
	if (ec_valid) {
	ctrl->eventinj \|= VMCB_EVENTINJ_EC_VALID;
	ctrl->eventinj \|= (uint64_t)error << 32;
	VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
	intrtype_to_str(intr_type), vector, error);
	} else {
	VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
	intrtype_to_str(intr_type), vector);
	}
	}

	/*
	 * Propagate the V_TPR value found in the VMCB control area to the
	 * emulated local APIC's %cr8, and assert that no virtual interrupt
	 * vector is programmed.
	 */
	static void
	svm_update_virqinfo(struct svm_softc *sc, int vcpu)
	{
		struct vlapic *lapic = vm_lapic(sc->vm, vcpu);
		struct vmcb_ctrl *vmcb_ctrl = svm_get_vmcb_ctrl(sc, vcpu);

		/* Update %cr8 in the emulated vlapic */
		vlapic_set_cr8(lapic, vmcb_ctrl->v_tpr);

		/* Virtual interrupt injection is not used. */
		KASSERT(vmcb_ctrl->v_intr_vector == 0,
		    ("%s: invalid v_intr_vector %d", __func__,
		    vmcb_ctrl->v_intr_vector));
	}

	/*
	 * Record the event that was being delivered when the #VMEXIT happened.
	 *
	 * From APMv2, Section "Intercepts during IDT interrupt delivery": if
	 * EXITINTINFO is valid it describes that event. It is handed to
	 * vm_exit_intinfo() and counted in the VCPU_EXITINTINFO statistic.
	 * Nothing is done when EXITINTINFO is not valid.
	 */
	static void
	svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
	{
		struct vmcb_ctrl *ctrl;
		uint64_t pending;

		ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
		pending = ctrl->exitintinfo;

		if (VMCB_EXITINTINFO_VALID(pending)) {
			VCPU_CTR2(svm_sc->vm, vcpu,
			    "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
			    pending, VMCB_EXITINTINFO_VECTOR(pending));
			vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
			vm_exit_intinfo(svm_sc->vm, vcpu, pending);
		}
	}

	#ifdef INVARIANTS
	/*
	 * Return the state of the VINTR intercept for 'vcpu' as reported by
	 * svm_get_intercept(). Only built with INVARIANTS; used by assertions.
	 */
	static __inline int
	vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
	{
		int enabled;

		enabled = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_VINTR);
		return (enabled);
	}
	#endif

	/*
	 * Arm interrupt-window exiting: set V_IRQ and V_IGN_TPR with a zero
	 * virtual vector, mark the TPR portion of the VMCB dirty and enable the
	 * VINTR intercept. If the window is already armed only the invariants
	 * are checked.
	 */
	static __inline void
	enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
	{
		struct vmcb_ctrl *vmcb_ctrl = svm_get_vmcb_ctrl(sc, vcpu);

		if (vmcb_ctrl->v_irq && vmcb_ctrl->v_intr_vector == 0) {
			/* Already armed. */
			KASSERT(vmcb_ctrl->v_ign_tpr,
			    ("%s: invalid v_ign_tpr", __func__));
			KASSERT(vintr_intercept_enabled(sc, vcpu),
			    ("%s: vintr intercept should be enabled", __func__));
		} else {
			VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
			vmcb_ctrl->v_irq = 1;
			vmcb_ctrl->v_ign_tpr = 1;
			vmcb_ctrl->v_intr_vector = 0;
			svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
			svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
			    VMCB_INTCPT_VINTR);
		}
	}

	/*
	 * Disarm interrupt-window exiting: clear V_IRQ and the virtual vector,
	 * mark the TPR portion of the VMCB dirty and disable the VINTR intercept.
	 * If the window is already disarmed only the invariant is checked.
	 */
	static __inline void
	disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
	{
		struct vmcb_ctrl *vmcb_ctrl = svm_get_vmcb_ctrl(sc, vcpu);

		if (!vmcb_ctrl->v_irq && vmcb_ctrl->v_intr_vector == 0) {
			/* Already disarmed. */
			KASSERT(!vintr_intercept_enabled(sc, vcpu),
			    ("%s: vintr intercept should be disabled", __func__));
		} else {
			VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
			vmcb_ctrl->v_irq = 0;
			vmcb_ctrl->v_intr_vector = 0;
			svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
			svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
			    VMCB_INTCPT_VINTR);
		}
	}

	/*
	 * Set the VMCB interrupt shadow to 1 when 'val' is non-zero and to 0
	 * otherwise. The field is written (and traced) only when it actually
	 * changes. Always returns 0.
	 */
	static int
	svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
	{
		struct vmcb_ctrl *vmcb_ctrl = svm_get_vmcb_ctrl(sc, vcpu);
		int current = vmcb_ctrl->intr_shadow;
		int wanted = (val != 0) ? 1 : 0;

		if (current == wanted)
			return (0);

		vmcb_ctrl->intr_shadow = wanted;
		VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", wanted);
		return (0);
	}

	/*
	 * Read the VMCB interrupt shadow of 'vcpu' into '*val'.
	 * Always returns 0.
	 */
	static int
	svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
	{
		struct vmcb_ctrl *ctrl;

		ctrl = svm_get_vmcb_ctrl(sc, vcpu);
		*val = ctrl->intr_shadow;
		return (0);
	}

	/*
	 * Once an NMI is injected it blocks delivery of further NMIs until the
	 * handler executes an IRET. The IRET intercept is enabled when an NMI is
	 * injected, so the state of that intercept tells whether the vcpu is
	 * still handling an NMI.
	 *
	 * Returns the IRET intercept state reported by svm_get_intercept().
	 */
	static int
	nmi_blocked(struct svm_softc *sc, int vcpu)
	{
		return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_IRET));
	}

	/*
	 * Start virtual NMI blocking by enabling the IRET intercept.
	 * Asserts that blocking is not already in effect.
	 */
	static void
	enable_nmi_blocking(struct svm_softc *sc, int vcpu)
	{
		KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked"));

		VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
		svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_IRET);
	}

	/*
	 * End virtual NMI blocking by disabling the IRET intercept, then set the
	 * interrupt shadow. Asserts that blocking is currently in effect.
	 */
	static void
	clear_nmi_blocking(struct svm_softc *sc, int vcpu)
	{
		int rc;

		KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
		VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");

		/*
		 * With the IRET intercept cleared the vcpu will attempt to execute
		 * the "iret" when it runs next. It is nevertheless possible to
		 * inject another NMI into the vcpu before the "iret" has actually
		 * executed.
		 *
		 * For example, if the "iret" encounters a #NPF when accessing the
		 * stack it will trap back into the hypervisor, and a pending NMI
		 * would then be injected into the guest.
		 *
		 * XXX this needs to be fixed
		 */
		svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_IRET);

		/*
		 * Setting 'intr_shadow' keeps an NMI from being injected on the
		 * immediate VMRUN.
		 */
		rc = svm_modify_intr_shadow(sc, vcpu, 1);
		KASSERT(!rc, ("%s: error %d setting intr_shadow", __func__, rc));
	}

	#define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL

	static int
	svm_write_efer(struct svm_softc sc, int vcpu, uint64_t newval, bool retu)
	{
	struct vm_exit *vme;
	struct vmcb_state *state;
	uint64_t changed, lma, oldval;
	int error;

	state = svm_get_vmcb_state(sc, vcpu);

	oldval = state->efer;
	VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval);

	newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */
	changed = oldval ^ newval;

	if (newval & EFER_MBZ_BITS)
	goto gpf;

	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
	if (changed & EFER_LME) {
	if (state->cr0 & CR0_PG)
	goto gpf;
	}

	/* EFER.LMA = EFER.LME & CR0.PG */
	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
	lma = EFER_LMA;
	else
	lma = 0;

	if ((newval & EFER_LMA) != lma)
	goto gpf;

	if (newval & EFER_NXE) {
	if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
	goto gpf;
	}

	/*
	* XXX bhyve does not enforce segment limits in 64-bit mode. Until
	* this is fixed flag guest attempt to set EFER_LMSLE as an error.
	*/
	if (newval & EFER_LMSLE) {
	vme = vm_exitinfo(sc->vm, vcpu);
	vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
	*retu = true;
	return (0);
	}

	if (newval & EFER_FFXSR) {
	if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
	goto gpf;
	}

	if (newval & EFER_TCE) {
	if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
	goto gpf;
	}

	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
	KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
	return (0);
	gpf:
	vm_inject_gp(sc->vm, vcpu);
	return (0);
	}

	/*
	 * Dispatch a guest WRMSR of 'val' to MSR 'num': local APIC MSRs go to
	 * lapic_wrmsr(), EFER goes to svm_write_efer(), and everything else goes
	 * to svm_wrmsr(). Returns whatever the selected handler returns.
	 */
	static int
	emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
	    bool *retu)
	{
		if (lapic_msr(num))
			return (lapic_wrmsr(sc->vm, vcpu, num, val, retu));

		if (num == MSR_EFER)
			return (svm_write_efer(sc, vcpu, val, retu));

		return (svm_wrmsr(sc, vcpu, num, val, retu));
	}

	/*
	 * Emulate a guest RDMSR of MSR 'num'.
	 *
	 * Local APIC MSRs are read through lapic_rdmsr(), all others through
	 * svm_rdmsr(). On success (return value 0) the low 32 bits of the result
	 * are stored in the guest %rax and the high 32 bits in the guest %rdx.
	 * 'retu' is passed through to the handler.
	 *
	 * Returns the error code from the handler.
	 */
	static int
	emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu)
	{
		struct vmcb_state *state;
		struct svm_regctx *ctx;
		uint64_t result;
		int error;

		if (lapic_msr(num))
			error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu);
		else
			error = svm_rdmsr(sc, vcpu, num, &result, retu);

		if (error == 0) {
			state = svm_get_vmcb_state(sc, vcpu);
			ctx = svm_get_guest_regctx(sc, vcpu);
			state->rax = result & 0xffffffff;
			ctx->sctx_rdx = result >> 32;
		}

		return (error);
	}

	#ifdef KTR
	/*
	 * Map an SVM exit code to a short name for trace output.
	 *
	 * Codes without a dedicated name are formatted in hex into a static
	 * buffer, so the returned pointer for such codes is overwritten by the
	 * next call that takes the same path.
	 */
	static const char *
	exit_reason_to_str(uint64_t reason)
	{
		static const struct {
			uint64_t	code;
			const char	*name;
		} known[] = {
			{ VMCB_EXIT_INVALID,	"invalvmcb" },
			{ VMCB_EXIT_SHUTDOWN,	"shutdown" },
			{ VMCB_EXIT_NPF,	"nptfault" },
			{ VMCB_EXIT_PAUSE,	"pause" },
			{ VMCB_EXIT_HLT,	"hlt" },
			{ VMCB_EXIT_CPUID,	"cpuid" },
			{ VMCB_EXIT_IO,		"inout" },
			{ VMCB_EXIT_MC,		"mchk" },
			{ VMCB_EXIT_INTR,	"extintr" },
			{ VMCB_EXIT_NMI,	"nmi" },
			{ VMCB_EXIT_VINTR,	"vintr" },
			{ VMCB_EXIT_MSR,	"msr" },
			{ VMCB_EXIT_IRET,	"iret" },
			{ VMCB_EXIT_MONITOR,	"monitor" },
			{ VMCB_EXIT_MWAIT,	"mwait" },
		};
		static char reasonbuf[32];
		u_int i;

		for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
			if (known[i].code == reason)
				return (known[i].name);
		}

		snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
		return (reasonbuf);
	}
	#endif /* KTR */

	/*
	 * From section "State Saved on Exit" in APMv2: nRIP is saved for all
	 * #VMEXITs that are due to instruction intercepts as well as MSR and IOIO
	 * intercepts and exceptions caused by INT3, INTO and BOUND instructions.
	 *
	 * Return 1 if the nRIP is valid for 'exitcode' and 0 otherwise.
	 */
	static int
	nrip_valid(uint64_t exitcode)
	{
		/* 0x00-0x3F: reads and writes of CR0-CR15 and DR0-DR15 */
		if (exitcode <= 0x3F)
			return (1);

		/* 0x43 INT3, 0x44 INTO, 0x45 BOUND */
		if (exitcode >= 0x43 && exitcode <= 0x45)
			return (1);

		/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
		if (exitcode >= 0x65 && exitcode <= 0x7C)
			return (1);

		/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
		if (exitcode >= 0x80 && exitcode <= 0x8D)
			return (1);

		return (0);
	}

	static int
	svm_vmexit(struct svm_softc svm_sc, int vcpu, struct vm_exit vmexit)
	{
	struct vmcb *vmcb;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct svm_regctx *ctx;
	uint64_t code, info1, info2, val;
	uint32_t eax, ecx, edx;
	int error, errcode_valid, handled, idtvec, reflect;
	bool retu;

	ctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb = svm_get_vmcb(svm_sc, vcpu);
	state = &vmcb->state;
	ctrl = &vmcb->ctrl;

	handled = 0;
	code = ctrl->exitcode;
	info1 = ctrl->exitinfo1;
	info2 = ctrl->exitinfo2;

	vmexit->exitcode = VM_EXITCODE_BOGUS;
	vmexit->rip = state->rip;
	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;

	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);

	/*
	* #VMEXIT(INVALID) needs to be handled early because the VMCB is
	* in an inconsistent state and can trigger assertions that would
	* never happen otherwise.
	*/
	if (code == VMCB_EXIT_INVALID) {
	vm_exit_svm(vmexit, code, info1, info2);
	return (0);
	}

	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
	"injection valid bit is set %#lx", __func__, ctrl->eventinj));

	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
	("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
	vmexit->inst_length, code, info1, info2));

	svm_update_virqinfo(svm_sc, vcpu);
	svm_save_intinfo(svm_sc, vcpu);

	switch (code) {
	case VMCB_EXIT_IRET:
	/*
	* Restart execution at "iret" but with the intercept cleared.
	*/
	vmexit->inst_length = 0;
	clear_nmi_blocking(svm_sc, vcpu);
	handled = 1;
	break;
	case VMCB_EXIT_VINTR: /* interrupt window exiting */
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
	handled = 1;
	break;
	case VMCB_EXIT_INTR: /* external interrupt */
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
	handled = 1;
	break;
	case VMCB_EXIT_NMI: /* external NMI */
	handled = 1;
	break;
	case 0x40 ... 0x5F:
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
	reflect = 1;
	idtvec = code - 0x40;
	switch (idtvec) {
	case IDT_MC:
	/*
	* Call the machine check handler by hand. Also don't
	* reflect the machine check back into the guest.
	*/
	reflect = 0;
	VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler");
	__asm __volatile("int $18");
	break;
	case IDT_PF:
	error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
	info2);
	KASSERT(error == 0, ("%s: error %d updating cr2",
	__func__, error));
	/* fallthru */
	case IDT_NP:
	case IDT_SS:
	case IDT_GP:
	case IDT_AC:
	case IDT_TS:
	errcode_valid = 1;
	break;

	case IDT_DF:
	errcode_valid = 1;
	info1 = 0;
	break;

	case IDT_BP:
	case IDT_OF:
	case IDT_BR:
	/*
	* The 'nrip' field is populated for INT3, INTO and
	* BOUND exceptions and this also implies that
	* 'inst_length' is non-zero.
	*
	* Reset 'inst_length' to zero so the guest %rip at
	* event injection is identical to what it was when
	* the exception originally happened.
	*/
	VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d "
	"to zero before injecting exception %d",
	vmexit->inst_length, idtvec);
	vmexit->inst_length = 0;
	/* fallthru */
	default:
	errcode_valid = 0;
	info1 = 0;
	break;
	}
	KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
	"when reflecting exception %d into guest",
	vmexit->inst_length, idtvec));

	if (reflect) {
	/* Reflect the exception back into the guest */
	VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
	"%d/%#x into the guest", idtvec, (int)info1);
	error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
	errcode_valid, info1, 0);
	KASSERT(error == 0, ("%s: vm_inject_exception error %d",
	__func__, error));
	}
	handled = 1;
	break;
	case VMCB_EXIT_MSR: /* MSR access. */
	eax = state->rax;
	ecx = ctx->sctx_rcx;
	edx = ctx->sctx_rdx;
	retu = false;

	if (info1) {
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
	val = (uint64_t)edx << 32 \| eax;
	VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
	ecx, val);
	if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) {
	vmexit->exitcode = VM_EXITCODE_WRMSR;
	vmexit->u.msr.code = ecx;
	vmexit->u.msr.wval = val;
	} else if (!retu) {
	handled = 1;
	} else {
	KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
	("emulate_wrmsr retu with bogus exitcode"));
	}
	} else {
	VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
	if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) {
	vmexit->exitcode = VM_EXITCODE_RDMSR;
	vmexit->u.msr.code = ecx;
	} else if (!retu) {
	handled = 1;
	} else {
	KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
	("emulate_rdmsr retu with bogus exitcode"));
	}
	}
	break;
	case VMCB_EXIT_IO:
	handled = svm_handle_io(svm_sc, vcpu, vmexit);
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
	break;
	case VMCB_EXIT_CPUID:
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
	handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
	(uint32_t *)&state->rax,
	(uint32_t *)&ctx->sctx_rbx,
	(uint32_t *)&ctx->sctx_rcx,
	(uint32_t *)&ctx->sctx_rdx);
	break;
	case VMCB_EXIT_HLT:
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
	vmexit->exitcode = VM_EXITCODE_HLT;
	vmexit->u.hlt.rflags = state->rflags;
	break;
	case VMCB_EXIT_PAUSE:
	vmexit->exitcode = VM_EXITCODE_PAUSE;
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
	break;
	case VMCB_EXIT_NPF:
	/* EXITINFO2 contains the faulting guest physical address */
	if (info1 & VMCB_NPF_INFO1_RSV) {
	VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
	"reserved bits set: info1(%#lx) info2(%#lx)",
	info1, info2);
	} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
	vmexit->exitcode = VM_EXITCODE_PAGING;
	vmexit->u.paging.gpa = info2;
	vmexit->u.paging.fault_type = npf_fault_type(info1);
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
	VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
	"on gpa %#lx/%#lx at rip %#lx",
	info2, info1, state->rip);
	} else if (svm_npf_emul_fault(info1)) {
	svm_handle_inst_emul(vmcb, info2, vmexit);
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
	VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
	"for gpa %#lx/%#lx at rip %#lx",
	info2, info1, state->rip);
	}
	break;
	case VMCB_EXIT_MONITOR:
	vmexit->exitcode = VM_EXITCODE_MONITOR;
	break;
	case VMCB_EXIT_MWAIT:
	vmexit->exitcode = VM_EXITCODE_MWAIT;
	break;
	default:
	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
	break;
	}

	VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d",
	handled ? "handled" : "unhandled", exit_reason_to_str(code),
	vmexit->rip, vmexit->inst_length);

	if (handled) {
	vmexit->rip += vmexit->inst_length;
	vmexit->inst_length = 0;
	state->rip = vmexit->rip;
	} else {
	if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
	/*
	* If this VM exit was not claimed by anybody then
	* treat it as a generic SVM exit.
	*/
	vm_exit_svm(vmexit, code, info1, info2);
	} else {
	/*
	* The exitcode and collateral have been populated.
	* The VM exit will be processed further in userland.
	*/
	}
	}
	return (handled);
	}

	/*
	 * If vm_entry_intinfo() reports a pending entry event for 'vcpu', inject
	 * it through svm_eventinject() and count it in the VCPU_INTINFO_INJECTED
	 * statistic. Does nothing when no event is reported.
	 */
	static void
	svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
	{
		uint64_t info;

		if (vm_entry_intinfo(svm_sc->vm, vcpu, &info)) {
			KASSERT(VMCB_EXITINTINFO_VALID(info),
			    ("%s: entry intinfo is not valid: %#lx", __func__, info));

			svm_eventinject(svm_sc, vcpu,
			    VMCB_EXITINTINFO_TYPE(info),
			    VMCB_EXITINTINFO_VECTOR(info),
			    VMCB_EXITINTINFO_EC(info),
			    VMCB_EXITINTINFO_EC_VALID(info));
			vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
			VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx",
			    info);
		}
	}

	/*
	* Inject event to virtual cpu.
	*/
	static void
	svm_inj_interrupts(struct svm_softc sc, int vcpu, struct vlapic vlapic)
	{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	struct svm_vcpu *vcpustate;
	uint8_t v_tpr;
	int vector, need_intr_window;
	int extint_pending;

	state = svm_get_vmcb_state(sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	vcpustate = svm_get_vcpu(sc, vcpu);

	need_intr_window = 0;

	if (vcpustate->nextrip != state->rip) {
	ctrl->intr_shadow = 0;
	VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking "
	"cleared due to rip change: %#lx/%#lx",
	vcpustate->nextrip, state->rip);
	}

	/*
	* Inject pending events or exceptions for this vcpu.
	*
	* An event might be pending because the previous #VMEXIT happened
	* during event delivery (i.e. ctrl->exitintinfo).
	*
	* An event might also be pending because an exception was injected
	* by the hypervisor (e.g. #PF during instruction emulation).
	*/
	svm_inj_intinfo(sc, vcpu);

	/* NMI event has priority over interrupts. */
	if (vm_nmi_pending(sc->vm, vcpu)) {
	if (nmi_blocked(sc, vcpu)) {
	/*
	* Can't inject another NMI if the guest has not
	* yet executed an "iret" after the last NMI.
	*/
	VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
	"to NMI-blocking");
	} else if (ctrl->intr_shadow) {
	/*
	* Can't inject an NMI if the vcpu is in an intr_shadow.
	*/
	VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to "
	"interrupt shadow");
	need_intr_window = 1;
	goto done;
	} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
	/*
	* If there is already an exception/interrupt pending
	* then defer the NMI until after that.
	*/
	VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
	"eventinj %#lx", ctrl->eventinj);

	/*
	* Use self-IPI to trigger a VM-exit as soon as
	* possible after the event injection is completed.
	*
	* This works only if the external interrupt exiting
	* is at a lower priority than the event injection.
	*
	* Although not explicitly specified in APMv2 the
	* relative priorities were verified empirically.
	*/
	ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */
	} else {
	vm_nmi_clear(sc->vm, vcpu);

	/* Inject NMI, vector number is not used */
	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
	IDT_NMI, 0, false);

	/* virtual NMI blocking is now in effect */
	enable_nmi_blocking(sc, vcpu);

	VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
	}
	}

	extint_pending = vm_extint_pending(sc->vm, vcpu);
	if (!extint_pending) {
	if (!vlapic_pending_intr(vlapic, &vector))
	goto done;
	KASSERT(vector >= 16 && vector <= 255,
	("invalid vector %d from local APIC", vector));
	} else {
	/* Ask the legacy pic for a vector to inject */
	vatpic_pending_intr(sc->vm, &vector);
	KASSERT(vector >= 0 && vector <= 255,
	("invalid vector %d from INTR", vector));
	}

	/*
	* If the guest has disabled interrupts or is in an interrupt shadow
	* then we cannot inject the pending interrupt.
	*/
	if ((state->rflags & PSL_I) == 0) {
	VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
	"rflags %#lx", vector, state->rflags);
	need_intr_window = 1;
	goto done;
	}

	if (ctrl->intr_shadow) {
	VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
	"interrupt shadow", vector);
	need_intr_window = 1;
	goto done;
	}

	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
	VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
	"eventinj %#lx", vector, ctrl->eventinj);
	need_intr_window = 1;
	goto done;
	}

	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);

	if (!extint_pending) {
	vlapic_intr_accepted(vlapic, vector);
	} else {
	vm_extint_clear(sc->vm, vcpu);
	vatpic_intr_accepted(sc->vm, vector);
	}

	/*
	* Force a VM-exit as soon as the vcpu is ready to accept another
	* interrupt. This is done because the PIC might have another vector
	* that it wants to inject. Also, if the APIC has a pending interrupt
	* that was preempted by the ExtInt then it allows us to inject the
	* APIC vector as soon as possible.
	*/
	need_intr_window = 1;
	done:
	/*
	* The guest can modify the TPR by writing to %CR8. In guest mode
	* the processor reflects this write to V_TPR without hypervisor
	* intervention.
	*
	* The guest can also modify the TPR by writing to it via the memory
	* mapped APIC page. In this case, the write will be emulated by the
	* hypervisor. For this reason V_TPR must be updated before every
	* VMRUN.
	*/
	v_tpr = vlapic_get_cr8(vlapic);
	KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
	if (ctrl->v_tpr != v_tpr) {
	VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x",
	ctrl->v_tpr, v_tpr);
	ctrl->v_tpr = v_tpr;
	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
	}

	if (need_intr_window) {
	/*
	* We use V_IRQ in conjunction with the VINTR intercept to
	* trap into the hypervisor as soon as a virtual interrupt
	* can be delivered.
	*
	* Since injected events are not subject to intercept checks
	* we need to ensure that the V_IRQ is not actually going to
	* be delivered on VM entry. The KASSERT below enforces this.
	*/
	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 \|\|
	(state->rflags & PSL_I) == 0 \|\| ctrl->intr_shadow,
	("Bogus intr_window_exiting: eventinj (%#lx), "
	"intr_shadow (%u), rflags (%#lx)",
	ctrl->eventinj, ctrl->intr_shadow, state->rflags));
	enable_intr_window_exiting(sc, vcpu);
	} else {
	disable_intr_window_exiting(sc, vcpu);
	}
	}

	/*
	 * Reload the host task register.
	 *
	 * The TSS descriptor was in use prior to launching the guest, so it has
	 * been marked busy. 'ltr' requires an available descriptor, so the type
	 * is first changed back to "64-bit available TSS".
	 */
	static __inline void
	restore_host_tss(void)
	{
		struct system_segment_descriptor *desc = PCPU_GET(tss);

		desc->sd_type = SDT_SYSTSS;
		ltr(GSEL(GPROC0_SEL, SEL_KPL));
	}

	/*
	* Make sure 'vcpuid' enters the guest on host cpu 'thiscpu' with an ASID
	* whose TLB entries are still valid. Depending on the ASID and pmap
	* generation numbers this either does nothing, requests a TLB flush via
	* 'tlb_ctrl' in the VMCB control area, or allocates a new ASID.
	*
	* The current pmap generation is cached in 'vcpustate->eptgen' on return.
	*/
	static void
	check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
	{
	struct svm_vcpu *vcpustate;
	struct vmcb_ctrl *ctrl;
	long eptgen;
	bool alloc_asid;

	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
	"active on cpu %u", __func__, thiscpu));

	vcpustate = svm_get_vcpu(sc, vcpuid);
	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);

	/*
	* The TLB entries associated with the vcpu's ASID are not valid
	* if either of the following conditions is true:
	*
	* 1. The vcpu's ASID generation is different than the host cpu's
	* ASID generation. This happens when the vcpu migrates to a new
	* host cpu. It can also happen when the number of vcpus executing
	* on a host cpu is greater than the number of ASIDs available.
	*
	* 2. The pmap generation number is different than the value cached in
	* the 'vcpustate'. This happens when the host invalidates pages
	* belonging to the guest.
	*
	* asidgen eptgen Action
	* mismatch mismatch
	* 0 0 (a)
	* 0 1 (b1) or (b2)
	* 1 0 (c)
	* 1 1 (d)
	*
	* (a) There is no mismatch in eptgen or ASID generation and therefore
	* no further action is needed.
	*
	* (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
	* retained and the TLB entries associated with this ASID
	* are flushed by VMRUN.
	*
	* (b2) If the cpu does not support FlushByAsid then a new ASID is
	* allocated.
	*
	* (c) A new ASID is allocated.
	*
	* (d) A new ASID is allocated.
	*/

	alloc_asid = false;
	eptgen = pmap->pm_eptgen;
	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;

	if (vcpustate->asid.gen != asid[thiscpu].gen) {
	alloc_asid = true; /* (c) and (d) */
	} else if (vcpustate->eptgen != eptgen) {
	if (flush_by_asid())
	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */
	else
	alloc_asid = true; /* (b2) */
	} else {
	/*
	* This is the common case (a).
	*/
	KASSERT(!alloc_asid, ("ASID allocation not necessary"));
	KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
	("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
	}

	if (alloc_asid) {
	/*
	* Take the next ASID number on this host cpu. When the numbers
	* below 'nasid' are used up, restart at 1 and move to the next
	* generation; generation 0 is skipped when the counter wraps.
	*/
	if (++asid[thiscpu].num >= nasid) {
	asid[thiscpu].num = 1;
	if (++asid[thiscpu].gen == 0)
	asid[thiscpu].gen = 1;
	/*
	* If this cpu does not support "flush-by-asid"
	* then flush the entire TLB on a generation
	* bump. Subsequent ASID allocation in this
	* generation can be done without a TLB flush.
	*/
	if (!flush_by_asid())
	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
	}
	vcpustate->asid.gen = asid[thiscpu].gen;
	vcpustate->asid.num = asid[thiscpu].num;

	ctrl->asid = vcpustate->asid.num;
	svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
	/*
	* If this cpu supports "flush-by-asid" then the TLB
	* was not flushed after the generation bump. The TLB
	* is flushed selectively after every new ASID allocation.
	*/
	if (flush_by_asid())
	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
	}
	/* Remember the pmap generation this vcpu is now in sync with. */
	vcpustate->eptgen = eptgen;

	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
	KASSERT(ctrl->asid == vcpustate->asid.num,
	("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
	}

	/*
	* Execute the "clgi" instruction. svm_vmrun() calls this to disable
	* global interrupts before it loads guest state.
	*/
	static __inline void
	disable_gintr(void)
	{

	__asm __volatile("clgi");
	}

	/*
	* Execute the "stgi" instruction. svm_vmrun() calls this to re-enable
	* global interrupts after a #VMEXIT or when it bails out before VMRUN.
	*/
	static __inline void
	enable_gintr(void)
	{

	__asm __volatile("stgi");
	}

	/*
	* Switch the debug registers from host to guest values before entering
	* the guest. Host DR0-DR3, DR6, DR7 and DEBUGCTL are saved in 'gctx' and
	* guest DR0-DR3 are loaded from 'gctx'.
	*/
	static __inline void
	svm_dr_enter_guest(struct svm_regctx *gctx)
	{

	/* Save host control debug registers. */
	gctx->host_dr7 = rdr7();
	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);

	/*
	* Disable debugging in DR7 and DEBUGCTL to avoid triggering
	* exceptions in the host based on the guest DRx values. The
	* guest DR6, DR7, and DEBUGCTL are saved/restored in the
	* VMCB.
	*/
	load_dr7(0);
	wrmsr(MSR_DEBUGCTLMSR, 0);

	/* Save host debug registers. */
	gctx->host_dr0 = rdr0();
	gctx->host_dr1 = rdr1();
	gctx->host_dr2 = rdr2();
	gctx->host_dr3 = rdr3();
	gctx->host_dr6 = rdr6();

	/* Restore guest debug registers. */
	load_dr0(gctx->sctx_dr0);
	load_dr1(gctx->sctx_dr1);
	load_dr2(gctx->sctx_dr2);
	load_dr3(gctx->sctx_dr3);
	}

	/*
	* Switch the debug registers from guest back to host values after
	* leaving the guest. Guest DR0-DR3 are saved in 'gctx' and the host
	* values saved by svm_dr_enter_guest() are reloaded.
	*/
	static __inline void
	svm_dr_leave_guest(struct svm_regctx *gctx)
	{

	/* Save guest debug registers. */
	gctx->sctx_dr0 = rdr0();
	gctx->sctx_dr1 = rdr1();
	gctx->sctx_dr2 = rdr2();
	gctx->sctx_dr3 = rdr3();

	/*
	* Restore host debug registers. Restore DR7 and DEBUGCTL
	* last.
	*/
	load_dr0(gctx->host_dr0);
	load_dr1(gctx->host_dr1);
	load_dr2(gctx->host_dr2);
	load_dr3(gctx->host_dr3);
	load_dr6(gctx->host_dr6);
	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
	load_dr7(gctx->host_dr7);
	}

	/*
	* Start vcpu with specified RIP.
	*
	* Runs the guest in a loop. Each iteration, with global interrupts
	* disabled, first checks for a pending suspend, rendezvous, reqidle,
	* yield or debug request; any of these records the matching exit via
	* vm_exit_*() and ends the loop. Otherwise events are injected, the ASID
	* is validated, the guest is entered through svm_launch() and the
	* resulting #VMEXIT is passed to svm_vmexit(). The loop continues for as
	* long as svm_vmexit() reports the exit as handled.
	*
	* 'arg' is the struct svm_softc of the VM. Always returns 0.
	*/
	static int
	svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
	struct vm_eventinfo *evinfo)
	{
	struct svm_regctx *gctx;
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpustate;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct vm_exit *vmexit;
	struct vlapic *vlapic;
	struct vm *vm;
	uint64_t vmcb_pa;
	int handled;

	svm_sc = arg;
	vm = svm_sc->vm;

	vcpustate = svm_get_vcpu(svm_sc, vcpu);
	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	vmexit = vm_exitinfo(vm, vcpu);
	vlapic = vm_lapic(vm, vcpu);

	gctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;

	/* The vcpu last ran on a different host cpu. */
	if (vcpustate->lastcpu != curcpu) {
	/*
	* Force new ASID allocation by invalidating the generation.
	*/
	vcpustate->asid.gen = 0;

	/*
	* Invalidate the VMCB state cache by marking all fields dirty.
	*/
	svm_set_dirty(svm_sc, vcpu, 0xffffffff);

	/*
	* XXX
	* Setting 'vcpustate->lastcpu' here is bit premature because
	* we may return from this function without actually executing
	* the VMRUN instruction. This could happen if a rendezvous
	* or an AST is pending on the first time through the loop.
	*
	* This works for now but any new side-effects of vcpu
	* migration should take this case into account.
	*/
	vcpustate->lastcpu = curcpu;
	vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
	}

	svm_msr_guest_enter(svm_sc, vcpu);

	/* Update Guest RIP */
	state->rip = rip;

	do {
	/*
	* Disable global interrupts to guarantee atomicity during
	* loading of guest state. This includes not only the state
	* loaded by the "vmrun" instruction but also software state
	* maintained by the hypervisor: suspended and rendezvous
	* state, NPT generation number, vlapic interrupts etc.
	*/
	disable_gintr();

	if (vcpu_suspended(evinfo)) {
	enable_gintr();
	vm_exit_suspended(vm, vcpu, state->rip);
	break;
	}

	if (vcpu_rendezvous_pending(evinfo)) {
	enable_gintr();
	vm_exit_rendezvous(vm, vcpu, state->rip);
	break;
	}

	if (vcpu_reqidle(evinfo)) {
	enable_gintr();
	vm_exit_reqidle(vm, vcpu, state->rip);
	break;
	}

	/* We are asked to give the cpu by scheduler. */
	if (vcpu_should_yield(vm, vcpu)) {
	enable_gintr();
	vm_exit_astpending(vm, vcpu, state->rip);
	break;
	}

	+ if (vcpu_debugged(vm, vcpu)) {
	+ enable_gintr();
	+ vm_exit_debug(vm, vcpu, state->rip);
	+ break;
	+ }
	+
	svm_inj_interrupts(svm_sc, vcpu, vlapic);

	/* Activate the nested pmap on 'curcpu' */
	CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active);

	/*
	* Check the pmap generation and the ASID generation to
	* ensure that the vcpu does not use stale TLB mappings.
	*/
	check_asid(svm_sc, vcpu, pmap, curcpu);

	/*
	* Clear the clean bit of every VMCB field marked dirty since the
	* last entry, then reset the dirty set.
	*/
	ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
	vcpustate->dirty = 0;
	VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);

	/* Launch Virtual Machine. */
	VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
	svm_dr_enter_guest(gctx);
	svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]);
	svm_dr_leave_guest(gctx);

	CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);

	/*
	* The host GDTR and IDTR is saved by VMRUN and restored
	* automatically on #VMEXIT. However, the host TSS needs
	* to be restored explicitly.
	*/
	restore_host_tss();

	/* #VMEXIT disables interrupts so re-enable them here. */
	enable_gintr();

	/* Update 'nextrip' */
	vcpustate->nextrip = state->rip;

	/* Handle #VMEXIT and if required return to user space. */
	handled = svm_vmexit(svm_sc, vcpu, vmexit);
	} while (handled);

	svm_msr_guest_exit(svm_sc, vcpu);

	return (0);
	}

	/*
	 * Release the per-VM SVM state: the I/O permission bitmap, the MSR
	 * permission bitmap and the softc itself. 'arg' is the struct svm_softc.
	 */
	static void
	svm_vmcleanup(void *arg)
	{
		struct svm_softc *svm_sc;

		svm_sc = arg;

		contigfree(svm_sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM);
		contigfree(svm_sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM);
		free(svm_sc, M_SVM);
	}

	/*
	 * Return a pointer to the slot in 'regctx' that holds guest register
	 * 'reg', or NULL when 'reg' is not one of the registers kept in the
	 * software register context.
	 */
	static register_t *
	swctx_regptr(struct svm_regctx *regctx, int reg)
	{
		register_t *slot = NULL;

		switch (reg) {
		case VM_REG_GUEST_RBX:
			slot = &regctx->sctx_rbx;
			break;
		case VM_REG_GUEST_RCX:
			slot = &regctx->sctx_rcx;
			break;
		case VM_REG_GUEST_RDX:
			slot = &regctx->sctx_rdx;
			break;
		case VM_REG_GUEST_RDI:
			slot = &regctx->sctx_rdi;
			break;
		case VM_REG_GUEST_RSI:
			slot = &regctx->sctx_rsi;
			break;
		case VM_REG_GUEST_RBP:
			slot = &regctx->sctx_rbp;
			break;
		case VM_REG_GUEST_R8:
			slot = &regctx->sctx_r8;
			break;
		case VM_REG_GUEST_R9:
			slot = &regctx->sctx_r9;
			break;
		case VM_REG_GUEST_R10:
			slot = &regctx->sctx_r10;
			break;
		case VM_REG_GUEST_R11:
			slot = &regctx->sctx_r11;
			break;
		case VM_REG_GUEST_R12:
			slot = &regctx->sctx_r12;
			break;
		case VM_REG_GUEST_R13:
			slot = &regctx->sctx_r13;
			break;
		case VM_REG_GUEST_R14:
			slot = &regctx->sctx_r14;
			break;
		case VM_REG_GUEST_R15:
			slot = &regctx->sctx_r15;
			break;
		case VM_REG_GUEST_DR0:
			slot = &regctx->sctx_dr0;
			break;
		case VM_REG_GUEST_DR1:
			slot = &regctx->sctx_dr1;
			break;
		case VM_REG_GUEST_DR2:
			slot = &regctx->sctx_dr2;
			break;
		case VM_REG_GUEST_DR3:
			slot = &regctx->sctx_dr3;
			break;
		default:
			break;
		}

		return (slot);
	}

	/*
	 * Read guest register 'ident' of 'vcpu' and store it in '*val'.
	 *
	 * Sources are tried in order: the interrupt-shadow pseudo register,
	 * then vmcb_read(), then the software-saved register context.
	 *
	 * Returns 0 on success, the result of svm_get_intr_shadow() for
	 * VM_REG_GUEST_INTR_SHADOW, or EINVAL if no source knows 'ident'.
	 */
	static int
	svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
	{
		struct svm_softc *svm_sc;
		register_t *reg;

		svm_sc = arg;

		if (ident == VM_REG_GUEST_INTR_SHADOW) {
			return (svm_get_intr_shadow(svm_sc, vcpu, val));
		}

		if (vmcb_read(svm_sc, vcpu, ident, val) == 0) {
			return (0);
		}

		reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);

		if (reg != NULL) {
			/* Copy the saved value out; do not hand back the slot itself. */
			*val = *reg;
			return (0);
		}

		VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident);
		return (EINVAL);
	}

	/*
	 * Write 'val' into guest register 'ident' of 'vcpu'.
	 *
	 * Targets are tried in order: the interrupt-shadow pseudo register,
	 * then vmcb_write(), then the software-saved register context.
	 *
	 * Returns 0 on success, the result of svm_modify_intr_shadow() for
	 * VM_REG_GUEST_INTR_SHADOW, or EINVAL if no target knows 'ident'.
	 */
	static int
	svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
	{
		struct svm_softc *svm_sc = arg;
		register_t *slot;

		if (ident == VM_REG_GUEST_INTR_SHADOW)
			return (svm_modify_intr_shadow(svm_sc, vcpu, val));

		if (vmcb_write(svm_sc, vcpu, ident, val) == 0)
			return (0);

		slot = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
		if (slot == NULL) {
			/*
			 * XXX deal with CR3 and invalidate TLB entries tagged
			 * with the vcpu's ASID. This needs to be treated
			 * differently depending on whether 'running' is
			 * true/false.
			 */
			VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident);
			return (EINVAL);
		}

		*slot = val;
		return (0);
	}

	/*
	 * Enable (val != 0) or disable (val == 0) capability 'type' on 'vcpu'.
	 *
	 * HLT and PAUSE exiting are implemented by toggling the matching
	 * intercept.  Returns 0 on success, EINVAL for an attempt to disable
	 * unrestricted guest execution, and ENOENT for unknown capabilities.
	 */
	static int
	svm_setcap(void *arg, int vcpu, int type, int val)
	{
		struct svm_softc *svm_sc = arg;

		switch (type) {
		case VM_CAP_HALT_EXIT:
			svm_set_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
			    VMCB_INTCPT_HLT, val);
			return (0);
		case VM_CAP_PAUSE_EXIT:
			svm_set_intercept(svm_sc, vcpu, VMCB_CTRL1_INTCPT,
			    VMCB_INTCPT_PAUSE, val);
			return (0);
		case VM_CAP_UNRESTRICTED_GUEST:
			/* Unrestricted guest execution cannot be disabled in SVM */
			return (val == 0 ? EINVAL : 0);
		default:
			return (ENOENT);
		}
	}

	/*
	 * Report the current setting of capability 'type' on 'vcpu' in '*retval'.
	 *
	 * HLT and PAUSE exiting reflect the state of the matching intercept as
	 * returned by svm_get_intercept().  Returns 0 on success and ENOENT for
	 * unknown capabilities, in which case '*retval' is left untouched.
	 */
	static int
	svm_getcap(void *arg, int vcpu, int type, int *retval)
	{
		struct svm_softc *sc;
		int error;

		sc = arg;
		error = 0;

		switch (type) {
		case VM_CAP_HALT_EXIT:
			*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
			    VMCB_INTCPT_HLT);
			break;
		case VM_CAP_PAUSE_EXIT:
			*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
			    VMCB_INTCPT_PAUSE);
			break;
		case VM_CAP_UNRESTRICTED_GUEST:
			*retval = 1;	/* unrestricted guest is always enabled */
			break;
		default:
			error = ENOENT;
			break;
		}
		return (error);
	}

	/*
	 * Allocate and initialize the virtual local APIC for vcpu 'vcpuid'.
	 *
	 * The vlapic is zero-filled, bound to the VM and to the vcpu's entry in
	 * the softc's 'apic_page' array, and then handed to vlapic_init().
	 * The allocation uses M_WAITOK.
	 */
	static struct vlapic *
	svm_vlapic_init(void *arg, int vcpuid)
	{
		struct svm_softc *svm_sc;
		struct vlapic *vlapic;

		svm_sc = arg;
		vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
		vlapic->vm = svm_sc->vm;
		vlapic->vcpuid = vcpuid;
		vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];

		vlapic_init(vlapic);

		return (vlapic);
	}

	/*
	 * Undo svm_vlapic_init(): run the generic vlapic teardown and free the
	 * structure.  'arg' is not used.
	 */
	static void
	svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
	{

		vlapic_cleanup(vlapic);
		free(vlapic, M_SVM_VLAPIC);
	}

	/*
	 * Operations table for the AMD SVM backend.
	 *
	 * The initializer is positional: the order of the entries below must
	 * match the member order of 'struct vmm_ops', which is declared outside
	 * this file.  Do not reorder or insert entries without checking that
	 * declaration.
	 */
	struct vmm_ops vmm_ops_amd = {
	svm_init,
	svm_cleanup,
	svm_restore,
	svm_vminit,
	svm_vmrun,
	svm_vmcleanup,
	svm_getreg,
	svm_setreg,
	vmcb_getdesc,
	vmcb_setdesc,
	svm_getcap,
	svm_setcap,
	svm_npt_alloc,
	svm_npt_free,
	svm_vlapic_init,
	svm_vlapic_cleanup
	};
	Index: head/sys/amd64/vmm/intel/vmx.c
	===================================================================
	--- head/sys/amd64/vmm/intel/vmx.c (revision 332156)
	+++ head/sys/amd64/vmm/intel/vmx.c (revision 332157)
	@@ -1,3553 +1,3559 @@
	/*-
	* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	*
	* Copyright (c) 2011 NetApp, Inc.
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* $FreeBSD$
	*/

	#include <sys/cdefs.h>
	__FBSDID("$FreeBSD$");

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/smp.h>
	#include <sys/kernel.h>
	#include <sys/malloc.h>
	#include <sys/pcpu.h>
	#include <sys/proc.h>
	#include <sys/sysctl.h>

	#include <vm/vm.h>
	#include <vm/pmap.h>

	#include <machine/psl.h>
	#include <machine/cpufunc.h>
	#include <machine/md_var.h>
	#include <machine/segments.h>
	#include <machine/smp.h>
	#include <machine/specialreg.h>
	#include <machine/vmparam.h>

	#include <machine/vmm.h>
	#include <machine/vmm_dev.h>
	#include <machine/vmm_instruction_emul.h>
	#include "vmm_lapic.h"
	#include "vmm_host.h"
	#include "vmm_ioport.h"
	#include "vmm_ktr.h"
	#include "vmm_stat.h"
	#include "vatpic.h"
	#include "vlapic.h"
	#include "vlapic_priv.h"

	#include "ept.h"
	#include "vmx_cpufunc.h"
	#include "vmx.h"
	#include "vmx_msr.h"
	#include "x86.h"
	#include "vmx_controls.h"

	/*
	 * Required settings for the VMX control fields.
	 *
	 * For each control field the *_ONE_SETTING macro lists the bits that
	 * must be 1 and the *_ZERO_SETTING macro the bits that must be 0.
	 * vmx_init() passes each pair to vmx_set_ctlreg() and fails if the
	 * processor cannot honor it.
	 */
	#define	PINBASED_CTLS_ONE_SETTING					\
		(PINBASED_EXTINT_EXITING	|				\
		 PINBASED_NMI_EXITING		|				\
		 PINBASED_VIRTUAL_NMI)
	#define	PINBASED_CTLS_ZERO_SETTING	0

	/* Window-exiting bits; vmx_init() clears these so they are set on demand. */
	#define	PROCBASED_CTLS_WINDOW_SETTING					\
		(PROCBASED_INT_WINDOW_EXITING	|				\
		 PROCBASED_NMI_WINDOW_EXITING)

	#define	PROCBASED_CTLS_ONE_SETTING					\
		(PROCBASED_SECONDARY_CONTROLS	|				\
		 PROCBASED_MWAIT_EXITING	|				\
		 PROCBASED_MONITOR_EXITING	|				\
		 PROCBASED_IO_EXITING		|				\
		 PROCBASED_MSR_BITMAPS		|				\
		 PROCBASED_CTLS_WINDOW_SETTING	|				\
		 PROCBASED_CR8_LOAD_EXITING	|				\
		 PROCBASED_CR8_STORE_EXITING)
	#define	PROCBASED_CTLS_ZERO_SETTING					\
		(PROCBASED_CR3_LOAD_EXITING	|				\
		 PROCBASED_CR3_STORE_EXITING	|				\
		 PROCBASED_IO_BITMAPS)

	#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
	#define	PROCBASED_CTLS2_ZERO_SETTING	0

	#define	VM_EXIT_CTLS_ONE_SETTING					\
		(VM_EXIT_SAVE_DEBUG_CONTROLS	|				\
		 VM_EXIT_HOST_LMA		|				\
		 VM_EXIT_SAVE_EFER		|				\
		 VM_EXIT_LOAD_EFER		|				\
		 VM_EXIT_ACKNOWLEDGE_INTERRUPT)

	#define	VM_EXIT_CTLS_ZERO_SETTING	0

	#define	VM_ENTRY_CTLS_ONE_SETTING					\
		(VM_ENTRY_LOAD_DEBUG_CONTROLS	|				\
		 VM_ENTRY_LOAD_EFER)

	#define	VM_ENTRY_CTLS_ZERO_SETTING					\
		(VM_ENTRY_INTO_SMM		|				\
		 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

	#define	HANDLED		1
	#define	UNHANDLED	0

	/*
	 * Malloc types used by this file.  M_VMX backs the per-VM 'struct vmx'
	 * allocated in vmx_vminit().
	 */
	static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
	static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");

	/* Sysctl node under which the knobs and counters below are published. */
	SYSCTL_DECL(_hw_vmm);
	SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);

	/*
	 * Per-host-cpu state, indexed by 'curcpu': a flag set by vmx_enable()
	 * once vmxon() succeeds, and the page-aligned region passed to vmxon().
	 */
	int vmxon_enabled[MAXCPU];
	static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

	/*
	 * Control field values computed once by vmx_init() and written into
	 * every vcpu's VMCS by vmx_vminit().
	 */
	static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
	static uint32_t exit_ctls, entry_ctls;

	/*
	 * CR0 bits forced to 1 and to 0 by vmx_fix_cr0().  vmx_init() derives
	 * them from the CR0 FIXED0/FIXED1 MSRs and then adjusts them.
	 */
	static uint64_t cr0_ones_mask, cr0_zeros_mask;
	SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
	&cr0_ones_mask, 0, NULL);
	SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
	&cr0_zeros_mask, 0, NULL);

	/* Same as above for CR4; applied by vmx_fix_cr4(). */
	static uint64_t cr4_ones_mask, cr4_zeros_mask;
	SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
	&cr4_ones_mask, 0, NULL);
	SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
	&cr4_zeros_mask, 0, NULL);

	/* Set to 1 at the very end of a successful vmx_init(). */
	static int vmx_initialized;
	SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
	&vmx_initialized, 0, "Intel VMX initialized");

	/*
	* Optional capabilities
	*
	* Each flag below is probed in vmx_init() and exported read-only.
	*/
	static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);

	static int cap_halt_exit;
	SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
	"HLT triggers a VM-exit");

	static int cap_pause_exit;
	SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
	0, "PAUSE triggers a VM-exit");

	static int cap_unrestricted_guest;
	SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
	&cap_unrestricted_guest, 0, "Unrestricted guests");

	static int cap_monitor_trap;
	SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
	&cap_monitor_trap, 0, "Monitor trap flag");

	static int cap_invpcid;
	SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
	0, "Guests are allowed to use INVPCID");

	/* May be overridden by the "hw.vmm.vmx.use_apic_vid" tunable. */
	static int virtual_interrupt_delivery;
	SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
	&virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");

	/* May be overridden by the "hw.vmm.vmx.use_apic_pir" tunable. */
	static int posted_interrupts;
	SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
	&posted_interrupts, 0, "APICv posted interrupt support");

	/*
	 * Host vector obtained from lapic_ipi_alloc() for posted interrupts;
	 * stays negative while no vector has been allocated.
	 */
	static int pirvec = -1;
	SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
	&pirvec, 0, "APICv posted interrupt vector");

	/*
	 * Unit number allocator that hands out VPIDs (see vpid_init()), and a
	 * count of vpid_alloc() calls that had to fall back to shared VPIDs.
	 */
	static struct unrhdr *vpid_unr;
	static u_int vpid_alloc_failed;
	SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
	&vpid_alloc_failed, 0, NULL);

	/*
	* Use the last page below 4GB as the APIC access address. This address is
	* occupied by the boot firmware so it is guaranteed that it will not conflict
	* with a page in system memory.
	*/
	#define APIC_ACCESS_ADDRESS 0xFFFFF000

	/* Forward declarations. */
	static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
	static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
	static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
	static void vmx_inject_pir(struct vlapic *vlapic);

	#ifdef KTR
	/*
	 * Translate a VM-exit reason code into a short name for trace output.
	 *
	 * Codes without a name are formatted as a decimal number into a static
	 * buffer, so that particular result is overwritten by the next call
	 * that takes the same path.
	 */
	static const char *
	exit_reason_to_str(int reason)
	{
		static const struct {
			int		code;
			const char	*name;
		} names[] = {
			{ EXIT_REASON_EXCEPTION,	"exception" },
			{ EXIT_REASON_EXT_INTR,		"extint" },
			{ EXIT_REASON_TRIPLE_FAULT,	"triplefault" },
			{ EXIT_REASON_INIT,		"init" },
			{ EXIT_REASON_SIPI,		"sipi" },
			{ EXIT_REASON_IO_SMI,		"iosmi" },
			{ EXIT_REASON_SMI,		"smi" },
			{ EXIT_REASON_INTR_WINDOW,	"intrwindow" },
			{ EXIT_REASON_NMI_WINDOW,	"nmiwindow" },
			{ EXIT_REASON_TASK_SWITCH,	"taskswitch" },
			{ EXIT_REASON_CPUID,		"cpuid" },
			{ EXIT_REASON_GETSEC,		"getsec" },
			{ EXIT_REASON_HLT,		"hlt" },
			{ EXIT_REASON_INVD,		"invd" },
			{ EXIT_REASON_INVLPG,		"invlpg" },
			{ EXIT_REASON_RDPMC,		"rdpmc" },
			{ EXIT_REASON_RDTSC,		"rdtsc" },
			{ EXIT_REASON_RSM,		"rsm" },
			{ EXIT_REASON_VMCALL,		"vmcall" },
			{ EXIT_REASON_VMCLEAR,		"vmclear" },
			{ EXIT_REASON_VMLAUNCH,		"vmlaunch" },
			{ EXIT_REASON_VMPTRLD,		"vmptrld" },
			{ EXIT_REASON_VMPTRST,		"vmptrst" },
			{ EXIT_REASON_VMREAD,		"vmread" },
			{ EXIT_REASON_VMRESUME,		"vmresume" },
			{ EXIT_REASON_VMWRITE,		"vmwrite" },
			{ EXIT_REASON_VMXOFF,		"vmxoff" },
			{ EXIT_REASON_VMXON,		"vmxon" },
			{ EXIT_REASON_CR_ACCESS,	"craccess" },
			{ EXIT_REASON_DR_ACCESS,	"draccess" },
			{ EXIT_REASON_INOUT,		"inout" },
			{ EXIT_REASON_RDMSR,		"rdmsr" },
			{ EXIT_REASON_WRMSR,		"wrmsr" },
			{ EXIT_REASON_INVAL_VMCS,	"invalvmcs" },
			{ EXIT_REASON_INVAL_MSR,	"invalmsr" },
			{ EXIT_REASON_MWAIT,		"mwait" },
			{ EXIT_REASON_MTF,		"mtf" },
			{ EXIT_REASON_MONITOR,		"monitor" },
			{ EXIT_REASON_PAUSE,		"pause" },
			{ EXIT_REASON_MCE_DURING_ENTRY,	"mce-during-entry" },
			{ EXIT_REASON_TPR,		"tpr" },
			{ EXIT_REASON_APIC_ACCESS,	"apic-access" },
			{ EXIT_REASON_GDTR_IDTR,	"gdtridtr" },
			{ EXIT_REASON_LDTR_TR,		"ldtrtr" },
			{ EXIT_REASON_EPT_FAULT,	"eptfault" },
			{ EXIT_REASON_EPT_MISCONFIG,	"eptmisconfig" },
			{ EXIT_REASON_INVEPT,		"invept" },
			{ EXIT_REASON_RDTSCP,		"rdtscp" },
			{ EXIT_REASON_VMX_PREEMPT,	"vmxpreempt" },
			{ EXIT_REASON_INVVPID,		"invvpid" },
			{ EXIT_REASON_WBINVD,		"wbinvd" },
			{ EXIT_REASON_XSETBV,		"xsetbv" },
			{ EXIT_REASON_APIC_WRITE,	"apic-write" },
		};
		static char reasonbuf[32];
		unsigned int i;

		for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
			if (names[i].code == reason)
				return (names[i].name);
		}

		/* No symbolic name: fall back to the numeric code. */
		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
		return (reasonbuf);
	}
	#endif /* KTR */

	static int
	vmx_allow_x2apic_msrs(struct vmx *vmx)
	{
	int i, error;

	error = 0;

	/*
	* Allow readonly access to the following x2APIC MSRs from the guest.
	*/
	error += guest_msr_ro(vmx, MSR_APIC_ID);
	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
	error += guest_msr_ro(vmx, MSR_APIC_LDR);
	error += guest_msr_ro(vmx, MSR_APIC_SVR);

	for (i = 0; i < 8; i++)
	error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);

	for (i = 0; i < 8; i++)
	error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);

	for (i = 0; i < 8; i++)
	error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);

	error += guest_msr_ro(vmx, MSR_APIC_ESR);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_ICR);

	/*
	* Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
	*
	* These registers get special treatment described in the section
	* "Virtualizing MSR-Based APIC Accesses".
	*/
	error += guest_msr_rw(vmx, MSR_APIC_TPR);
	error += guest_msr_rw(vmx, MSR_APIC_EOI);
	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);

	return (error);
	}

	/*
	 * Return 'cr0' with every bit in cr0_ones_mask forced to 1 and every
	 * bit in cr0_zeros_mask forced to 0.
	 */
	u_long
	vmx_fix_cr0(u_long cr0)
	{

		return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
	}

	/*
	 * Return 'cr4' with every bit in cr4_ones_mask forced to 1 and every
	 * bit in cr4_zeros_mask forced to 0.
	 */
	u_long
	vmx_fix_cr4(u_long cr4)
	{

		return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
	}

	/*
	 * Return 'vpid' to the unit number allocator.
	 *
	 * Panics if 'vpid' is outside [0, 0xffff].  Values up to and including
	 * VM_MAXCPU never came from the allocator and are silently ignored.
	 */
	static void
	vpid_free(int vpid)
	{
		if (vpid < 0 || vpid > 0xffff)
			panic("vpid_free: invalid vpid %d", vpid);

		/*
		 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
		 * the unit number allocator.
		 */

		if (vpid > VM_MAXCPU)
			free_unr(vpid_unr, vpid);
	}

	static void
	vpid_alloc(uint16_t *vpid, int num)
	{
	int i, x;

	if (num <= 0 \|\| num > VM_MAXCPU)
	panic("invalid number of vpids requested: %d", num);

	/*
	* If the "enable vpid" execution control is not enabled then the
	* VPID is required to be 0 for all vcpus.
	*/
	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
	for (i = 0; i < num; i++)
	vpid[i] = 0;
	return;
	}

	/*
	* Allocate a unique VPID for each vcpu from the unit number allocator.
	*/
	for (i = 0; i < num; i++) {
	x = alloc_unr(vpid_unr);
	if (x == -1)
	break;
	else
	vpid[i] = x;
	}

	if (i < num) {
	atomic_add_int(&vpid_alloc_failed, 1);

	/*
	* If the unit number allocator does not have enough unique
	* VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
	*
	* These VPIDs are not be unique across VMs but this does not
	* affect correctness because the combined mappings are also
	* tagged with the EP4TA which is unique for each VM.
	*
	* It is still sub-optimal because the invvpid will invalidate
	* combined mappings for a particular VPID across all EP4TAs.
	*/
	while (i-- > 0)
	vpid_free(vpid[i]);

	for (i = 0; i < num; i++)
	vpid[i] = i + 1;
	}
	}

	/*
	 * Create the unit number allocator that hands out unique VPIDs.
	 *
	 * The VPID space is carved up as follows:
	 * - VPID 0 is required when the "enable VPID" execution control is
	 *   disabled.
	 * - VPIDs [1,VM_MAXCPU] form the "overflow namespace" used when the
	 *   allocator cannot satisfy a request with unique VPIDs.
	 * - Everything from VM_MAXCPU + 1 up to 0xffff is managed by the
	 *   allocator created here.
	 */
	static void
	vpid_init(void)
	{

		vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
	}

	/*
	 * Leave VMX operation on the current host cpu.  vmx_cleanup() runs this
	 * on every cpu via smp_rendezvous().
	 */
	static void
	vmx_disable(void *arg __unused)
	{
		struct invvpid_desc all_vpids = { 0 };
		struct invept_desc all_epts = { 0 };

		if (vmxon_enabled[curcpu]) {
			/*
			 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
			 *
			 * Neither VMXON nor VMXOFF is required to invalidate
			 * the TLB caching structures, so flush them explicitly
			 * here.  That keeps cached translations from surviving
			 * into a later, distinct VMX episode.
			 */
			invvpid(INVVPID_TYPE_ALL_CONTEXTS, all_vpids);
			invept(INVEPT_TYPE_ALL_CONTEXTS, all_epts);
			vmxoff();
		}

		/* CR4.VMXE is cleared whether or not vmxon had succeeded here. */
		load_cr4(rcr4() & ~CR4_VMXE);
	}

	/*
	 * Module-wide teardown: release the posted-interrupt vector and the
	 * VPID allocator if they exist, then turn VMX off on every host cpu.
	 * Always returns 0.
	 */
	static int
	vmx_cleanup(void)
	{

		/* A negative 'pirvec' means no vector was ever allocated. */
		if (pirvec >= 0) {
			lapic_ipi_free(pirvec);
		}

		if (vpid_unr != NULL) {
			delete_unrhdr(vpid_unr);
			vpid_unr = NULL;
		}

		smp_rendezvous(NULL, vmx_disable, NULL, NULL);

		return (0);
	}

	/*
	 * Enter VMX operation on the current host cpu.  vmx_init() runs this on
	 * every cpu via smp_rendezvous().
	 *
	 * On success the cpu's entry in 'vmxon_enabled' is set; a vmxon()
	 * failure is not reported to the caller and simply leaves it clear.
	 */
	static void
	vmx_enable(void *arg __unused)
	{
		int error;
		uint64_t feature_control;

		/*
		 * Set the VMX enable and lock bits unless both are already set.
		 */
		feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
		if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
		    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
			wrmsr(MSR_IA32_FEATURE_CONTROL,
			    feature_control | IA32_FEATURE_CONTROL_VMX_EN |
			    IA32_FEATURE_CONTROL_LOCK);
		}

		load_cr4(rcr4() | CR4_VMXE);

		/* The first 32 bits of the VMXON region hold the revision id. */
		*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
		error = vmxon(vmxon_region[curcpu]);
		if (error == 0)
			vmxon_enabled[curcpu] = 1;
	}

	/*
	 * Re-issue vmxon() on the current host cpu if VMX had been enabled on
	 * it by vmx_enable().  The result of vmxon() is not checked.
	 */
	static void
	vmx_restore(void)
	{

		if (!vmxon_enabled[curcpu])
			return;

		vmxon(vmxon_region[curcpu]);
	}

	static int
	vmx_init(int ipinum)
	{
	int error, use_tpr_shadow;
	uint64_t basic, fixed0, fixed1, feature_control;
	uint32_t tmp, procbased2_vid_bits;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
	printf("vmx_init: processor does not support VMX operation\n");
	return (ENXIO);
	}

	/*
	* Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	* are set (bits 0 and 2 respectively).
	*/
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
	(feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
	printf("vmx_init: VMX operation disabled by BIOS\n");
	return (ENXIO);
	}

	/*
	* Verify capabilities MSR_VMX_BASIC:
	* - bit 54 indicates support for INS/OUTS decoding
	*/
	basic = rdmsr(MSR_VMX_BASIC);
	if ((basic & (1UL << 54)) == 0) {
	printf("vmx_init: processor does not support desired basic "
	"capabilities\n");
	return (EINVAL);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	MSR_VMX_TRUE_PROCBASED_CTLS,
	PROCBASED_CTLS_ONE_SETTING,
	PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
	printf("vmx_init: processor does not support desired primary "
	"processor-based controls\n");
	return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	MSR_VMX_PROCBASED_CTLS2,
	PROCBASED_CTLS2_ONE_SETTING,
	PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
	printf("vmx_init: processor does not support desired secondary "
	"processor-based controls\n");
	return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
	procbased_ctls2 \|= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
	MSR_VMX_TRUE_PINBASED_CTLS,
	PINBASED_CTLS_ONE_SETTING,
	PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
	printf("vmx_init: processor does not support desired "
	"pin-based controls\n");
	return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
	VM_EXIT_CTLS_ONE_SETTING,
	VM_EXIT_CTLS_ZERO_SETTING,
	&exit_ctls);
	if (error) {
	printf("vmx_init: processor does not support desired "
	"exit controls\n");
	return (error);
	}

	/* Check support for VM-entry controls */
	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
	VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
	&entry_ctls);
	if (error) {
	printf("vmx_init: processor does not support desired "
	"entry controls\n");
	return (error);
	}

	/*
	* Check support for optional features by testing them
	* as individual bits
	*/
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	MSR_VMX_TRUE_PROCBASED_CTLS,
	PROCBASED_HLT_EXITING, 0,
	&tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	MSR_VMX_PROCBASED_CTLS,
	PROCBASED_MTF, 0,
	&tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	MSR_VMX_TRUE_PROCBASED_CTLS,
	PROCBASED_PAUSE_EXITING, 0,
	&tmp) == 0);

	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	MSR_VMX_PROCBASED_CTLS2,
	PROCBASED2_UNRESTRICTED_GUEST, 0,
	&tmp) == 0);

	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
	&tmp) == 0);

	/*
	* Check support for virtual interrupt delivery.
	*/
	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES \|
	PROCBASED2_VIRTUALIZE_X2APIC_MODE \|
	PROCBASED2_APIC_REGISTER_VIRTUALIZATION \|
	PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);

	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
	&tmp) == 0);

	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	procbased2_vid_bits, 0, &tmp);
	if (error == 0 && use_tpr_shadow) {
	virtual_interrupt_delivery = 1;
	TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
	&virtual_interrupt_delivery);
	}

	if (virtual_interrupt_delivery) {
	procbased_ctls \|= PROCBASED_USE_TPR_SHADOW;
	procbased_ctls2 \|= procbased2_vid_bits;
	procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;

	/*
	* No need to emulate accesses to %CR8 if virtual
	* interrupt delivery is enabled.
	*/
	procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
	procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;

	/*
	* Check for Posted Interrupts only if Virtual Interrupt
	* Delivery is enabled.
	*/
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
	MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
	&tmp);
	if (error == 0) {
	pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
	&IDTVEC(justreturn));
	if (pirvec < 0) {
	if (bootverbose) {
	printf("vmx_init: unable to allocate "
	"posted interrupt vector\n");
	}
	} else {
	posted_interrupts = 1;
	TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
	&posted_interrupts);
	}
	}
	}

	if (posted_interrupts)
	pinbased_ctls \|= PINBASED_POSTED_INTERRUPT;

	/* Initialize EPT */
	error = ept_init(ipinum);
	if (error) {
	printf("vmx_init: ept initialization failed (%d)\n", error);
	return (error);
	}

	/*
	* Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	*/
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	* CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	* if unrestricted guest execution is allowed.
	*/
	if (cap_unrestricted_guest)
	cr0_ones_mask &= ~(CR0_PG \| CR0_PE);

	/*
	* Do not allow the guest to set CR0_NW or CR0_CD.
	*/
	cr0_zeros_mask \|= (CR0_NW \| CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	vpid_init();

	vmx_msr_init();

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	vmx_initialized = 1;

	return (0);
	}

	/*
	 * Run the host interrupt handler for 'vector'.
	 *
	 * The handler's entry point is rebuilt from the two offset fields of
	 * the vector's gate descriptor in 'idt' and invoked through
	 * vmx_call_isr().  The KASSERTs document what the descriptor is
	 * expected to look like.
	 */
	static void
	vmx_trigger_hostintr(int vector)
	{
		uintptr_t func;
		struct gate_descriptor *gd;

		/* Validate the vector before using it to index the IDT. */
		KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
		    "invalid vector %d", vector));

		gd = &idt[vector];

		KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
		    vector));
		KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
		    "has invalid type %d", vector, gd->gd_type));
		KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
		    "has invalid dpl %d", vector, gd->gd_dpl));
		KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
		    "for vector %d has invalid selector %d", vector, gd->gd_selector));
		KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
		    "IST %d", vector, gd->gd_ist));

		/* High offset bits go above the low 16 bits. */
		func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
		vmx_call_isr(func);
	}

	/*
	 * Program the guest/host mask and the read shadow for CR0 or CR4 in
	 * 'vmcs'.
	 *
	 * 'which' selects the register and must be 0 or 4; anything else
	 * panics.  The mask is the union of the register's ones and zeros
	 * masks, and 'initial' becomes the shadow value.
	 *
	 * Returns 0 on success or the error from vmcs_setreg().
	 */
	static int
	vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
	{
		int error, mask_ident, shadow_ident;
		uint64_t mask_value;

		if (which != 0 && which != 4)
			panic("vmx_setup_cr_shadow: unknown cr%d", which);

		if (which == 0) {
			mask_ident = VMCS_CR0_MASK;
			mask_value = cr0_ones_mask | cr0_zeros_mask;
			shadow_ident = VMCS_CR0_SHADOW;
		} else {
			mask_ident = VMCS_CR4_MASK;
			mask_value = cr4_ones_mask | cr4_zeros_mask;
			shadow_ident = VMCS_CR4_SHADOW;
		}

		error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
		if (error)
			return (error);

		error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
		if (error)
			return (error);

		return (0);
	}
	/* Convenience wrappers that fix the 'which' argument. */
	#define	vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
	#define	vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))

	/*
	 * Allocate and initialize the per-VM VMX state for 'vm'.
	 *
	 * 'pmap' supplies the top-level page table used to build the EPT
	 * pointer and is recorded in every vcpu context.  All VM_MAXCPU vcpus
	 * get their VMCS cleared, initialized and customized here.
	 *
	 * Returns the new 'struct vmx'.  Unrecoverable failures panic rather
	 * than return an error.
	 */
	static void *
	vmx_vminit(struct vm *vm, pmap_t pmap)
	{
		uint16_t vpid[VM_MAXCPU];
		int i, error;
		struct vmx *vmx;
		struct vmcs *vmcs;
		uint32_t exc_bitmap;

		vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
		if ((uintptr_t)vmx & PAGE_MASK) {
			panic("malloc of struct vmx not aligned on %d byte boundary",
			    PAGE_SIZE);
		}
		vmx->vm = vm;

		vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));

		/*
		 * Clean up EPTP-tagged guest physical and combined mappings
		 *
		 * VMX transitions are not required to invalidate any guest physical
		 * mappings. So, it may be possible for stale guest physical mappings
		 * to be present in the processor TLBs.
		 *
		 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
		 */
		ept_invalidate_mappings(vmx->eptp);

		msr_bitmap_initialize(vmx->msr_bitmap);

		/*
		 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
		 * The guest FSBASE and GSBASE are saved and restored during
		 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
		 * always restored from the vmcs host state area on vm-exit.
		 *
		 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
		 * how they are saved/restored so can be directly accessed by the
		 * guest.
		 *
		 * MSR_EFER is saved and restored in the guest VMCS area on a
		 * VM exit and entry respectively. It is also restored from the
		 * host VMCS area on a VM exit.
		 *
		 * The TSC MSR is exposed read-only. Writes are disallowed as
		 * that will impact the host TSC. If the guest does a write
		 * the "use TSC offsetting" execution control is enabled and the
		 * difference between the host TSC and the guest TSC is written
		 * into the TSC offset in the VMCS.
		 */
		if (guest_msr_rw(vmx, MSR_GSBASE) ||
		    guest_msr_rw(vmx, MSR_FSBASE) ||
		    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
		    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
		    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
		    guest_msr_rw(vmx, MSR_EFER) ||
		    guest_msr_ro(vmx, MSR_TSC))
			panic("vmx_vminit: error setting guest msr access");

		vpid_alloc(vpid, VM_MAXCPU);

		if (virtual_interrupt_delivery) {
			error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
			    APIC_ACCESS_ADDRESS);
			/* XXX this should really return an error to the caller */
			KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
		}

		for (i = 0; i < VM_MAXCPU; i++) {
			vmcs = &vmx->vmcs[i];
			vmcs->identifier = vmx_revision();
			error = vmclear(vmcs);
			if (error != 0) {
				panic("vmx_vminit: vmclear error %d on vcpu %d\n",
				    error, i);
			}

			vmx_msr_guest_init(vmx, i);

			error = vmcs_init(vmcs);
			KASSERT(error == 0, ("vmcs_init error %d", error));

			/*
			 * Customize the VMCS between VMPTRLD and VMCLEAR.  The
			 * vmwrite results are summed and checked once below.
			 */
			VMPTRLD(vmcs);
			error = 0;
			error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
			error += vmwrite(VMCS_EPTP, vmx->eptp);
			error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
			error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
			error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
			error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
			error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
			error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
			error += vmwrite(VMCS_VPID, vpid[i]);

			/* exception bitmap */
			if (vcpu_trace_exceptions(vm, i))
				exc_bitmap = 0xffffffff;
			else
				exc_bitmap = 1 << IDT_MC;
			error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);

			vmx->ctx[i].guest_dr6 = 0xffff0ff0;
			error += vmwrite(VMCS_GUEST_DR7, 0x400);

			if (virtual_interrupt_delivery) {
				error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
				error += vmwrite(VMCS_VIRTUAL_APIC,
				    vtophys(&vmx->apic_page[i]));
				error += vmwrite(VMCS_EOI_EXIT0, 0);
				error += vmwrite(VMCS_EOI_EXIT1, 0);
				error += vmwrite(VMCS_EOI_EXIT2, 0);
				error += vmwrite(VMCS_EOI_EXIT3, 0);
			}
			if (posted_interrupts) {
				error += vmwrite(VMCS_PIR_VECTOR, pirvec);
				error += vmwrite(VMCS_PIR_DESC,
				    vtophys(&vmx->pir_desc[i]));
			}
			VMCLEAR(vmcs);
			KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));

			vmx->cap[i].set = 0;
			vmx->cap[i].proc_ctls = procbased_ctls;
			vmx->cap[i].proc_ctls2 = procbased_ctls2;

			vmx->state[i].nextrip = ~0;
			vmx->state[i].lastcpu = NOCPU;
			vmx->state[i].vpid = vpid[i];

			/*
			 * Set up the CR0/4 shadows, and init the read shadow
			 * to the power-on register value from the Intel Sys Arch.
			 * CR0 - 0x60000010
			 * CR4 - 0
			 */
			error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
			if (error != 0)
				panic("vmx_setup_cr0_shadow %d", error);

			error = vmx_setup_cr4_shadow(vmcs, 0);
			if (error != 0)
				panic("vmx_setup_cr4_shadow %d", error);

			vmx->ctx[i].pmap = pmap;
		}

		return (vmx);
	}

	static int
	vmx_handle_cpuid(struct vm vm, int vcpu, struct vmxctx vmxctx)
	{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid(vm, vcpu,
	(uint32_t*)(&vmxctx->guest_rax),
	(uint32_t*)(&vmxctx->guest_rbx),
	(uint32_t*)(&vmxctx->guest_rcx),
	(uint32_t*)(&vmxctx->guest_rdx));
	return (handled);
	}

	/*
	 * Trace the guest %rip at which 'vcpu' is about to resume.  Expands to
	 * nothing unless the kernel is built with KTR.
	 */
	static __inline void
	vmx_run_trace(struct vmx *vmx, int vcpu)
	{
	#ifdef KTR
		VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
	#endif
	}

	/*
	 * Trace a VM-exit: whether it was handled, its reason (by name) and the
	 * guest 'rip' at which it occurred.  Expands to nothing unless the
	 * kernel is built with KTR.
	 */
	static __inline void
	vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
	    int handled)
	{
	#ifdef KTR
		const char *verdict;

		verdict = handled ? "handled" : "unhandled";
		VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
		    verdict, exit_reason_to_str(exit_reason), rip);
	#endif
	}

	/*
	 * Trace an "astpending" VM-exit and the guest 'rip' at which it
	 * occurred.  Expands to nothing unless the kernel is built with KTR.
	 */
	static __inline void
	vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
	{
	#ifdef KTR
		VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
	#endif
	}

	/*
	 * Per-vcpu counters maintained by vmx_invvpid(): how often an invvpid
	 * was skipped and how often one was actually issued.
	 */
	static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
	static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");

	/*
	 * Flush the TLB mappings tagged with the vpid of 'vcpu'.
	 *
	 * Nothing is done for vpid 0.  If the vcpu is not 'running' the flush
	 * is deferred by forgetting its last host cpu; otherwise it happens
	 * now, unless a pending invept makes it redundant.
	 */
	static __inline void
	vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
	{
		struct vmxstate *state;
		struct invvpid_desc desc;

		state = &vmx->state[vcpu];
		if (state->vpid == 0)
			return;

		if (!running) {
			/*
			 * Set the 'lastcpu' to an invalid host cpu.
			 *
			 * This will invalidate TLB entries tagged with the vcpu's
			 * vpid the next time it runs via vmx_set_pcpu_defaults().
			 */
			state->lastcpu = NOCPU;
			return;
		}

		KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
		    "critical section", __func__, vcpu));

		if (pmap->pm_eptgen != vmx->eptgen[curcpu]) {
			/*
			 * The invvpid can be skipped if an invept is going to
			 * be performed before entering the guest. The invept
			 * will invalidate combined mappings tagged with
			 * 'vmx->eptp' for all vpids.
			 */
			vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
			return;
		}

		/*
		 * Invalidate all mappings tagged with 'vpid'
		 *
		 * We do this because this vcpu was executing on a different host
		 * cpu when it last ran. We do not track whether it invalidated
		 * mappings associated with its 'vpid' during that run. So we must
		 * assume that the mappings associated with 'vpid' on 'curcpu' are
		 * stale and invalidate them.
		 *
		 * Note that we incur this penalty only when the scheduler chooses to
		 * move the thread associated with this vcpu between host cpus.
		 *
		 * Note also that this will invalidate mappings tagged with 'vpid'
		 * for "all" EP4TAs.
		 */
		desc._res1 = 0;
		desc._res2 = 0;
		desc.vpid = state->vpid;
		desc.linear_addr = 0;
		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, desc);
		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
	}

	/*
	 * Refresh host-cpu specific VMCS state when the vcpu runs on a host cpu
	 * other than the one recorded in 'lastcpu'.
	 *
	 * On a change this records the new cpu, counts a migration, rewrites the
	 * host TR/GDTR/GS base fields and calls vmx_invvpid() with running=1.
	 */
	static void
	vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
	{
		struct vmxstate *st = &vmx->state[vcpu];

		/* Still on the same host cpu: nothing to refresh. */
		if (st->lastcpu == curcpu)
			return;

		st->lastcpu = curcpu;
		vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

		vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
		vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
		vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());

		vmx_invvpid(vmx, vcpu, pmap, 1);
	}

	/*
	 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
	 */
	CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

	/*
	 * Turn on "interrupt window exiting" for the vcpu if it is not already on.
	 *
	 * The cached copy in vmx->cap[vcpu].proc_ctls is updated first and then
	 * written to the primary processor-based controls field of the VMCS.
	 */
	static void __inline
	vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
	{

		if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
			vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
			vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
			VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
		}
	}

	/*
	 * Turn off "interrupt window exiting" for the vcpu.
	 *
	 * The control must currently be set (asserted).  The cached proc_ctls
	 * value is updated and then written back to the VMCS.
	 */
	static void __inline
	vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
	{

		KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
		    ("intr_window_exiting not set: %#x",
		    vmx->cap[vcpu].proc_ctls));

		vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);

		VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
	}

	/*
	 * Turn on "NMI window exiting" for the vcpu if it is not already on.
	 *
	 * The cached copy in vmx->cap[vcpu].proc_ctls is updated first and then
	 * written to the primary processor-based controls field of the VMCS.
	 */
	static void __inline
	vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
	{

		if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
			vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
			vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
			VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
		}
	}

	/*
	 * Turn off "NMI window exiting" for the vcpu.
	 *
	 * The control must currently be set (asserted).  The cached proc_ctls
	 * value is updated and then written back to the VMCS.
	 */
	static void __inline
	vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
	{

		KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
		    ("nmi_window_exiting not set %#x",
		    vmx->cap[vcpu].proc_ctls));

		vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);

		VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
	}

	/*
	 * Program the TSC offset for a vcpu.
	 *
	 * TSC offsetting is enabled in the primary processor-based controls if it
	 * was not already, then 'offset' is written to the VMCS TSC offset field.
	 * Returns the result of that vmwrite().
	 */
	int
	vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
	{
		int error;

		if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
			vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
			vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
			VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
		}

		error = vmwrite(VMCS_TSC_OFFSET, offset);

		return (error);
	}

	/*
	 * Guest interruptibility-state bits that are tested before injecting an
	 * NMI (NMI_BLOCKING) or a maskable hardware interrupt (HWINTR_BLOCKING).
	 */
	#define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
				 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
	#define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
				 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)

	/*
	 * Inject a virtual NMI into the guest on the next VM-entry.
	 *
	 * The caller must have established that nothing in NMI_BLOCKING is set in
	 * the guest interruptibility state and that no other event is already
	 * queued in the VM-entry interruption-information field; both are
	 * asserted.  The pending-NMI request is cleared once the event is queued.
	 */
	static void
	vmx_inject_nmi(struct vmx *vmx, int vcpu)
	{
		uint32_t gi, info;

		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
		KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
		    "interruptibility-state %#x", gi));

		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
		KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
		    "VM-entry interruption information %#x", info));

		/*
		 * Inject the virtual NMI. The vector must be the NMI IDT entry
		 * or the VMCS entry check will fail.
		 */
		info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
		vmcs_write(VMCS_ENTRY_INTR_INFO, info);

		VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");

		/* Clear the request */
		vm_nmi_clear(vmx->vm, vcpu);
	}

	static void
	vmx_inject_interrupts(struct vmx vmx, int vcpu, struct vlapic vlapic,
	uint64_t guestrip)
	{
	int vector, need_nmi_exiting, extint_pending;
	uint64_t rflags, entryinfo;
	uint32_t gi, info;

	if (vmx->state[vcpu].nextrip != guestrip) {
	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	if (gi & HWINTR_BLOCKING) {
	VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
	"cleared due to rip change: %#lx/%#lx",
	vmx->state[vcpu].nextrip, guestrip);
	gi &= ~HWINTR_BLOCKING;
	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
	}
	}

	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
	KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
	"intinfo is not valid: %#lx", __func__, entryinfo));

	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
	KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
	"pending exception: %#lx/%#x", __func__, entryinfo, info));

	info = entryinfo;
	vector = info & 0xff;
	if (vector == IDT_BP \|\| vector == IDT_OF) {
	/*
	* VT-x requires #BP and #OF to be injected as software
	* exceptions.
	*/
	info &= ~VMCS_INTR_T_MASK;
	info \|= VMCS_INTR_T_SWEXCEPTION;
	}

	if (info & VMCS_INTR_DEL_ERRCODE)
	vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);

	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
	}

	if (vm_nmi_pending(vmx->vm, vcpu)) {
	/*
	* If there are no conditions blocking NMI injection then
	* inject it directly here otherwise enable "NMI window
	* exiting" to inject it as soon as we can.
	*
	* We also check for STI_BLOCKING because some implementations
	* don't allow NMI injection in this case. If we are running
	* on a processor that doesn't have this restriction it will
	* immediately exit and the NMI will be injected in the
	* "NMI window exiting" handler.
	*/
	need_nmi_exiting = 1;
	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	if ((gi & (HWINTR_BLOCKING \| NMI_BLOCKING)) == 0) {
	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
	if ((info & VMCS_INTR_VALID) == 0) {
	vmx_inject_nmi(vmx, vcpu);
	need_nmi_exiting = 0;
	} else {
	VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
	"due to VM-entry intr info %#x", info);
	}
	} else {
	VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
	"Guest Interruptibility-state %#x", gi);
	}

	if (need_nmi_exiting)
	vmx_set_nmi_window_exiting(vmx, vcpu);
	}

	extint_pending = vm_extint_pending(vmx->vm, vcpu);

	if (!extint_pending && virtual_interrupt_delivery) {
	vmx_inject_pir(vlapic);
	return;
	}

	/*
	* If interrupt-window exiting is already in effect then don't bother
	* checking for pending interrupts. This is just an optimization and
	* not needed for correctness.
	*/
	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
	VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
	"pending int_window_exiting");
	return;
	}

	if (!extint_pending) {
	/* Ask the local apic for a vector to inject */
	if (!vlapic_pending_intr(vlapic, &vector))
	return;

	/*
	* From the Intel SDM, Volume 3, Section "Maskable
	* Hardware Interrupts":
	* - maskable interrupt vectors [16,255] can be delivered
	* through the local APIC.
	*/
	KASSERT(vector >= 16 && vector <= 255,
	("invalid vector %d from local APIC", vector));
	} else {
	/* Ask the legacy pic for a vector to inject */
	vatpic_pending_intr(vmx->vm, &vector);

	/*
	* From the Intel SDM, Volume 3, Section "Maskable
	* Hardware Interrupts":
	* - maskable interrupt vectors [0,255] can be delivered
	* through the INTR pin.
	*/
	KASSERT(vector >= 0 && vector <= 255,
	("invalid vector %d from INTR", vector));
	}

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
	if ((rflags & PSL_I) == 0) {
	VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
	"rflags %#lx", vector, rflags);
	goto cantinject;
	}

	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	if (gi & HWINTR_BLOCKING) {
	VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
	"Guest Interruptibility-state %#x", vector, gi);
	goto cantinject;
	}

	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
	if (info & VMCS_INTR_VALID) {
	/*
	* This is expected and could happen for multiple reasons:
	* - A vectoring VM-entry was aborted due to astpending
	* - A VM-exit happened during event injection.
	* - An exception was injected above.
	* - An NMI was injected above or after "NMI window exiting"
	*/
	VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
	"VM-entry intr info %#x", vector, info);
	goto cantinject;
	}

	/* Inject the interrupt */
	info = VMCS_INTR_T_HWINTR \| VMCS_INTR_VALID;
	info \|= vector;
	vmcs_write(VMCS_ENTRY_INTR_INFO, info);

	if (!extint_pending) {
	/* Update the Local APIC ISR */
	vlapic_intr_accepted(vlapic, vector);
	} else {
	vm_extint_clear(vmx->vm, vcpu);
	vatpic_intr_accepted(vmx->vm, vector);

	/*
	* After we accepted the current ExtINT the PIC may
	* have posted another one. If that is the case, set
	* the Interrupt Window Exiting execution control so
	* we can inject that one too.
	*
	* Also, interrupt window exiting allows us to inject any
	* pending APIC vector that was preempted by the ExtINT
	* as soon as possible. This applies both for the software
	* emulated vlapic and the hardware assisted virtual APIC.
	*/
	vmx_set_int_window_exiting(vmx, vcpu);
	}

	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

	cantinject:
	/*
	* Set the Interrupt Window Exiting execution control so we can inject
	* the interrupt as soon as blocking condition goes away.
	*/
	vmx_set_int_window_exiting(vmx, vcpu);
	}

	/*
	 * If the Virtual NMIs execution control is '1' then the logical processor
	 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
	 * the VMCS. An IRET instruction in VMX non-root operation will remove any
	 * virtual-NMI blocking.
	 *
	 * This unblocking occurs even if the IRET causes a fault. In this case the
	 * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
	 *
	 * This helper sets the NMI-blocking bit in the guest interruptibility
	 * state with a read-modify-write of the VMCS field.
	 */
	static void
	vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
	{
		uint32_t gi;

		VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
		gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
		vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
	}

	/*
	 * Clear the NMI-blocking bit in the guest interruptibility state with a
	 * read-modify-write of the VMCS field.
	 */
	static void
	vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
	{
		uint32_t intr_state;

		VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");

		intr_state = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
		intr_state &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
		vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, intr_state);
	}

	/*
	 * Assert that the NMI-blocking bit is set in the guest interruptibility
	 * state.  Neither 'vmx' nor 'vcpuid' is consulted; the field is read
	 * from the VMCS via vmcs_read().
	 */
	static void
	vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
	{
		uint32_t intr_state;

		intr_state = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
		KASSERT(intr_state & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
		    ("NMI blocking is not in effect %#x", intr_state));
	}

	static int
	vmx_emulate_xsetbv(struct vmx vmx, int vcpu, struct vm_exit vmexit)
	{
	struct vmxctx *vmxctx;
	uint64_t xcrval;
	const struct xsave_limits *limits;

	vmxctx = &vmx->ctx[vcpu];
	limits = vmm_get_xsave_limits();

	/*
	* Note that the processor raises a GP# fault on its own if
	* xsetbv is executed for CPL != 0, so we do not have to
	* emulate that fault here.
	*/

	/* Only xcr0 is supported. */
	if (vmxctx->guest_rcx != 0) {
	vm_inject_gp(vmx->vm, vcpu);
	return (HANDLED);
	}

	/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
	if (!limits->xsave_enabled \|\| !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
	vm_inject_ud(vmx->vm, vcpu);
	return (HANDLED);
	}

	xcrval = vmxctx->guest_rdx << 32 \| (vmxctx->guest_rax & 0xffffffff);
	if ((xcrval & ~limits->xcr0_allowed) != 0) {
	vm_inject_gp(vmx->vm, vcpu);
	return (HANDLED);
	}

	if (!(xcrval & XFEATURE_ENABLED_X87)) {
	vm_inject_gp(vmx->vm, vcpu);
	return (HANDLED);
	}

	/* AVX (YMM_Hi128) requires SSE. */
	if (xcrval & XFEATURE_ENABLED_AVX &&
	(xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
	vm_inject_gp(vmx->vm, vcpu);
	return (HANDLED);
	}

	/*
	* AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
	* ZMM_Hi256, and Hi16_ZMM.
	*/
	if (xcrval & XFEATURE_AVX512 &&
	(xcrval & (XFEATURE_AVX512 \| XFEATURE_AVX)) !=
	(XFEATURE_AVX512 \| XFEATURE_AVX)) {
	vm_inject_gp(vmx->vm, vcpu);
	return (HANDLED);
	}

	/*
	* Intel MPX requires both bound register state flags to be
	* set.
	*/
	if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
	((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
	vm_inject_gp(vmx->vm, vcpu);
	return (HANDLED);
	}

	/*
	* This runs "inside" vmrun() with the guest's FPU state, so
	* modifying xcr0 directly modifies the guest's xcr0, not the
	* host's.
	*/
	load_xcr(0, xcrval);
	return (HANDLED);
	}

	/*
	 * Fetch a guest general purpose register by its numeric identifier
	 * (0..15).
	 *
	 * All registers come from the saved vmxctx except identifier 4 (%rsp),
	 * which is read from the VMCS.  Any other identifier panics.
	 */
	static uint64_t
	vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
	{
		const struct vmxctx *ctx = &vmx->ctx[vcpu];

		switch (ident) {
		case 0:		return (ctx->guest_rax);
		case 1:		return (ctx->guest_rcx);
		case 2:		return (ctx->guest_rdx);
		case 3:		return (ctx->guest_rbx);
		case 4:		return (vmcs_read(VMCS_GUEST_RSP));
		case 5:		return (ctx->guest_rbp);
		case 6:		return (ctx->guest_rsi);
		case 7:		return (ctx->guest_rdi);
		case 8:		return (ctx->guest_r8);
		case 9:		return (ctx->guest_r9);
		case 10:	return (ctx->guest_r10);
		case 11:	return (ctx->guest_r11);
		case 12:	return (ctx->guest_r12);
		case 13:	return (ctx->guest_r13);
		case 14:	return (ctx->guest_r14);
		case 15:	return (ctx->guest_r15);
		default:
			panic("invalid vmx register %d", ident);
		}
	}

	/*
	 * Store 'regval' into the guest general purpose register selected by its
	 * numeric identifier (0..15).
	 *
	 * All registers live in the saved vmxctx except identifier 4 (%rsp),
	 * which is written to the VMCS.  Any other identifier panics.
	 */
	static void
	vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
	{
		struct vmxctx *ctx = &vmx->ctx[vcpu];

		switch (ident) {
		case 0:		ctx->guest_rax = regval;	break;
		case 1:		ctx->guest_rcx = regval;	break;
		case 2:		ctx->guest_rdx = regval;	break;
		case 3:		ctx->guest_rbx = regval;	break;
		case 4:
			vmcs_write(VMCS_GUEST_RSP, regval);
			break;
		case 5:		ctx->guest_rbp = regval;	break;
		case 6:		ctx->guest_rsi = regval;	break;
		case 7:		ctx->guest_rdi = regval;	break;
		case 8:		ctx->guest_r8 = regval;		break;
		case 9:		ctx->guest_r9 = regval;		break;
		case 10:	ctx->guest_r10 = regval;	break;
		case 11:	ctx->guest_r11 = regval;	break;
		case 12:	ctx->guest_r12 = regval;	break;
		case 13:	ctx->guest_r13 = regval;	break;
		case 14:	ctx->guest_r14 = regval;	break;
		case 15:	ctx->guest_r15 = regval;	break;
		default:
			panic("invalid vmx register %d", ident);
		}
	}

	/*
	 * Emulate a guest "mov to %cr0".
	 *
	 * The source register is selected by bits 11:8 of the exit qualification.
	 * The raw value becomes the CR0 read shadow, while the value written to
	 * the real guest CR0 has cr0_ones_mask forced on and cr0_zeros_mask
	 * forced off.  Returns UNHANDLED for any other access type, HANDLED
	 * otherwise.
	 */
	static int
	vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
	{
		uint64_t crval, regval;

		/* We only handle mov to %cr0 at this time */
		if ((exitqual & 0xf0) != 0x00)
			return (UNHANDLED);

		regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);

		vmcs_write(VMCS_CR0_SHADOW, regval);

		crval = regval | cr0_ones_mask;
		crval &= ~cr0_zeros_mask;
		vmcs_write(VMCS_GUEST_CR0, crval);

		if (regval & CR0_PG) {
			uint64_t efer, entry_ctls;

			/*
			 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
			 * the "IA-32e mode guest" bit in VM-entry control must be
			 * equal.
			 */
			efer = vmcs_read(VMCS_GUEST_IA32_EFER);
			if (efer & EFER_LME) {
				efer |= EFER_LMA;
				vmcs_write(VMCS_GUEST_IA32_EFER, efer);
				entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
				entry_ctls |= VM_ENTRY_GUEST_LMA;
				vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
			}
		}

		return (HANDLED);
	}

	/*
	 * Emulate a guest "mov to %cr4".
	 *
	 * The source register is selected by bits 11:8 of the exit qualification.
	 * The raw value becomes the CR4 read shadow, while the value written to
	 * the real guest CR4 has cr4_ones_mask forced on and cr4_zeros_mask
	 * forced off.  Returns UNHANDLED for any other access type, HANDLED
	 * otherwise.
	 */
	static int
	vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
	{
		uint64_t crval, regval;

		/* We only handle mov to %cr4 at this time */
		if ((exitqual & 0xf0) != 0x00)
			return (UNHANDLED);

		regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);

		vmcs_write(VMCS_CR4_SHADOW, regval);

		crval = regval | cr4_ones_mask;
		crval &= ~cr4_zeros_mask;
		vmcs_write(VMCS_GUEST_CR4, crval);

		return (HANDLED);
	}

	/*
	 * Emulate a guest access to %cr8 through the vcpu's local APIC.
	 *
	 * Bit 4 of the exit qualification selects the direction: when set the
	 * value from vlapic_get_cr8() is stored into the guest register, when
	 * clear the guest register is passed to vlapic_set_cr8().  Bits 11:8
	 * select the general purpose register.  Returns UNHANDLED if any of
	 * bits 7:5 are set, HANDLED otherwise.
	 */
	static int
	vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
	{
		struct vlapic *apic;
		int gpr;

		/* We only handle mov %cr8 to/from a register at this time. */
		if ((exitqual & 0xe0) != 0x00)
			return (UNHANDLED);

		apic = vm_lapic(vmx->vm, vcpu);
		gpr = (exitqual >> 8) & 0xf;

		if ((exitqual & 0x10) == 0) {
			/* Guest register -> %cr8 */
			vlapic_set_cr8(apic, vmx_get_guest_reg(vmx, vcpu, gpr));
		} else {
			/* %cr8 -> guest register */
			vmx_set_guest_reg(vmx, vcpu, gpr, vlapic_get_cr8(apic));
		}

		return (HANDLED);
	}

	/*
	 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
	 *
	 * Returns bits 6:5 of the guest SS access-rights field.
	 */
	static int
	vmx_cpl(void)
	{
		uint32_t ss_rights = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);

		return ((ss_rights >> 5) & 0x3);
	}

	/*
	 * Classify the guest's current cpu mode from VMCS state:
	 *  - EFER.LMA set:   64-bit if bit 0x2000 (CS.L) of the CS access rights
	 *                    is set, compatibility mode otherwise;
	 *  - EFER.LMA clear: protected mode if CR0.PE is set, real mode otherwise.
	 */
	static enum vm_cpu_mode
	vmx_cpu_mode(void)
	{
		uint32_t cs_rights;

		if ((vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) == 0) {
			if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE)
				return (CPU_MODE_PROTECTED);
			return (CPU_MODE_REAL);
		}

		cs_rights = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
		if (cs_rights & 0x2000)
			return (CPU_MODE_64BIT);		/* CS.L = 1 */
		return (CPU_MODE_COMPATIBILITY);
	}

	/*
	 * Classify the guest's paging mode from VMCS state, checked in this
	 * order: CR0.PG clear -> flat; CR4.PAE clear -> 32-bit; EFER.LME set ->
	 * 64-bit; otherwise PAE.
	 */
	static enum vm_paging_mode
	vmx_paging_mode(void)
	{

		if ((vmcs_read(VMCS_GUEST_CR0) & CR0_PG) == 0)
			return (PAGING_MODE_FLAT);

		if ((vmcs_read(VMCS_GUEST_CR4) & CR4_PAE) == 0)
			return (PAGING_MODE_32);

		if ((vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) != 0)
			return (PAGING_MODE_64);

		return (PAGING_MODE_PAE);
	}

	static uint64_t
	inout_str_index(struct vmx *vmx, int vcpuid, int in)
	{
	uint64_t val;
	int error;
	enum vm_reg_name reg;

	reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
	error = vmx_getreg(vmx, vcpuid, reg, &val);
	KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
	return (val);
	}

	static uint64_t
	inout_str_count(struct vmx *vmx, int vcpuid, int rep)
	{
	uint64_t val;
	int error;

	if (rep) {
	error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
	KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
	} else {
	val = 1;
	}
	return (val);
	}

	/*
	 * Decode the address size, in bytes, from bits 9:7 of the VM-exit
	 * instruction-information value of a string I/O instruction.
	 *
	 * Encodings 0, 1 and 2 map to 2, 4 and 8 bytes; any other encoding
	 * panics.
	 */
	static int
	inout_str_addrsize(uint32_t inst_info)
	{
		uint32_t size;

		size = (inst_info >> 7) & 0x7;
		switch (size) {
		case 0:
			return (2);	/* 16 bit */
		case 1:
			return (4);	/* 32 bit */
		case 2:
			return (8);	/* 64 bit */
		default:
			/* 'size' is unsigned, so print it with %u. */
			panic("%s: invalid size encoding %u", __func__, size);
		}
	}

	/*
	 * Fill in the segment name and descriptor used by a string I/O
	 * instruction.
	 *
	 * For 'in' the segment is always %es; otherwise it is decoded from bits
	 * 17:15 of 'inst_info' via vm_segment_name().  A vmx_getdesc() failure
	 * trips an assertion.
	 */
	static void
	inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
	    struct vm_inout_str *vis)
	{
		int error, segidx;

		if (!in) {
			segidx = (inst_info >> 15) & 0x7;
			vis->seg_name = vm_segment_name(segidx);
		} else {
			vis->seg_name = VM_REG_GUEST_ES;
		}

		error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
		KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
	}

	/*
	 * Populate '*paging' with the guest's %cr3, CPL, cpu mode and paging
	 * mode, each obtained from the corresponding VMCS-backed helper.
	 */
	static void
	vmx_paging_info(struct vm_guest_paging *paging)
	{

		paging->cr3         = vmcs_guest_cr3();
		paging->cpl         = vmx_cpl();
		paging->cpu_mode    = vmx_cpu_mode();
		paging->paging_mode = vmx_paging_mode();
	}

	/*
	 * Prepare '*vmexit' as an instruction-emulation exit for the access at
	 * guest-physical address 'gpa' / guest-linear address 'gla'.
	 *
	 * Besides the addresses this records the guest paging state and the code
	 * segment base and default operand size: in real mode the base is taken
	 * from the VMCS with cs_d forced to 0, in protected and compatibility
	 * mode both come from the VMCS, and in every other mode both are 0.  The
	 * instruction length is zeroed and the 'vie' decoder state is reset.
	 */
	static void
	vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
	{
		struct vm_guest_paging *pg;
		uint32_t cs_rights;

		pg = &vmexit->u.inst_emul.paging;

		vmexit->exitcode = VM_EXITCODE_INST_EMUL;
		vmexit->inst_length = 0;
		vmexit->u.inst_emul.gpa = gpa;
		vmexit->u.inst_emul.gla = gla;
		vmx_paging_info(pg);

		switch (pg->cpu_mode) {
		case CPU_MODE_REAL:
			vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
			vmexit->u.inst_emul.cs_d = 0;
			break;

		case CPU_MODE_PROTECTED:
		case CPU_MODE_COMPATIBILITY:
			vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
			cs_rights = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
			vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(cs_rights);
			break;

		default:
			vmexit->u.inst_emul.cs_base = 0;
			vmexit->u.inst_emul.cs_d = 0;
			break;
		}

		vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
	}

	/*
	 * Translate an EPT-violation exit qualification into a VM_PROT_* fault
	 * type.  A data write takes precedence over an instruction fetch;
	 * anything else is reported as a read.
	 */
	static int
	ept_fault_type(uint64_t ept_qual)
	{

		if (ept_qual & EPT_VIOLATION_DATA_WRITE)
			return (VM_PROT_WRITE);

		if (ept_qual & EPT_VIOLATION_INST_FETCH)
			return (VM_PROT_EXECUTE);

		return (VM_PROT_READ);
	}

	static boolean_t
	ept_emulation_fault(uint64_t ept_qual)
	{
	int read, write;

	/* EPT fault on an instruction fetch doesn't make sense here */
	if (ept_qual & EPT_VIOLATION_INST_FETCH)
	return (FALSE);

	/* EPT fault must be a read fault or a write fault */
	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
	if ((read \| write) == 0)
	return (FALSE);

	/*
	* The EPT violation must have been caused by accessing a
	* guest-physical address that is a translation of a guest-linear
	* address.
	*/
	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 \|\|
	(ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
	return (FALSE);
	}

	return (TRUE);
	}

	/*
	 * Return 1 if the "virtualize APIC accesses" bit is set in the vcpu's
	 * cached secondary processor-based controls, 0 otherwise.
	 */
	static __inline int
	apic_access_virtualization(struct vmx *vmx, int vcpuid)
	{
		uint32_t ctls2 = vmx->cap[vcpuid].proc_ctls2;

		if (ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES)
			return (1);
		return (0);
	}

	/*
	 * Return 1 if the "virtualize x2APIC mode" bit is set in the vcpu's
	 * cached secondary processor-based controls, 0 otherwise.
	 */
	static __inline int
	x2apic_virtualization(struct vmx *vmx, int vcpuid)
	{
		uint32_t ctls2 = vmx->cap[vcpuid].proc_ctls2;

		if (ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE)
			return (1);
		return (0);
	}

	static int
	vmx_handle_apic_write(struct vmx vmx, int vcpuid, struct vlapic vlapic,
	uint64_t qual)
	{
	int error, handled, offset;
	uint32_t *apic_regs, vector;
	bool retu;

	handled = HANDLED;
	offset = APIC_WRITE_OFFSET(qual);

	if (!apic_access_virtualization(vmx, vcpuid)) {
	/*
	* In general there should not be any APIC write VM-exits
	* unless APIC-access virtualization is enabled.
	*
	* However self-IPI virtualization can legitimately trigger
	* an APIC-write VM-exit so treat it specially.
	*/
	if (x2apic_virtualization(vmx, vcpuid) &&
	offset == APIC_OFFSET_SELF_IPI) {
	apic_regs = (uint32_t *)(vlapic->apic_page);
	vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
	vlapic_self_ipi_handler(vlapic, vector);
	return (HANDLED);
	} else
	return (UNHANDLED);
	}

	switch (offset) {
	case APIC_OFFSET_ID:
	vlapic_id_write_handler(vlapic);
	break;
	case APIC_OFFSET_LDR:
	vlapic_ldr_write_handler(vlapic);
	break;
	case APIC_OFFSET_DFR:
	vlapic_dfr_write_handler(vlapic);
	break;
	case APIC_OFFSET_SVR:
	vlapic_svr_write_handler(vlapic);
	break;
	case APIC_OFFSET_ESR:
	vlapic_esr_write_handler(vlapic);
	break;
	case APIC_OFFSET_ICR_LOW:
	retu = false;
	error = vlapic_icrlo_write_handler(vlapic, &retu);
	if (error != 0 \|\| retu)
	handled = UNHANDLED;
	break;
	case APIC_OFFSET_CMCI_LVT:
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
	vlapic_lvt_write_handler(vlapic, offset);
	break;
	case APIC_OFFSET_TIMER_ICR:
	vlapic_icrtmr_write_handler(vlapic);
	break;
	case APIC_OFFSET_TIMER_DCR:
	vlapic_dcr_write_handler(vlapic);
	break;
	default:
	handled = UNHANDLED;
	break;
	}
	return (handled);
	}

	/*
	 * Return true when APIC-access virtualization is enabled for the vcpu
	 * and 'gpa' lies within the page starting at DEFAULT_APIC_BASE.
	 */
	static bool
	apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
	{

		if (!apic_access_virtualization(vmx, vcpuid))
			return (false);

		if (gpa < DEFAULT_APIC_BASE)
			return (false);

		return (gpa < DEFAULT_APIC_BASE + PAGE_SIZE);
	}

	static int
	vmx_handle_apic_access(struct vmx vmx, int vcpuid, struct vm_exit vmexit)
	{
	uint64_t qual;
	int access_type, offset, allowed;

	if (!apic_access_virtualization(vmx, vcpuid))
	return (UNHANDLED);

	qual = vmexit->u.vmx.exit_qualification;
	access_type = APIC_ACCESS_TYPE(qual);
	offset = APIC_ACCESS_OFFSET(qual);

	allowed = 0;
	if (access_type == 0) {
	/*
	* Read data access to the following registers is expected.
	*/
	switch (offset) {
	case APIC_OFFSET_APR:
	case APIC_OFFSET_PPR:
	case APIC_OFFSET_RRR:
	case APIC_OFFSET_CMCI_LVT:
	case APIC_OFFSET_TIMER_CCR:
	allowed = 1;
	break;
	default:
	break;
	}
	} else if (access_type == 1) {
	/*
	* Write data access to the following registers is expected.
	*/
	switch (offset) {
	case APIC_OFFSET_VER:
	case APIC_OFFSET_APR:
	case APIC_OFFSET_PPR:
	case APIC_OFFSET_RRR:
	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
	case APIC_OFFSET_CMCI_LVT:
	case APIC_OFFSET_TIMER_CCR:
	allowed = 1;
	break;
	default:
	break;
	}
	}

	if (allowed) {
	vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
	VIE_INVALID_GLA);
	}

	/*
	* Regardless of whether the APIC-access is allowed this handler
	* always returns UNHANDLED:
	* - if the access is allowed then it is handled by emulating the
	* instruction that caused the VM-exit (outside the critical section)
	* - if the access is not allowed then it will be converted to an
	* exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
	*/
	return (UNHANDLED);
	}

	/*
	 * Decode the task-switch source from bits 31:30 of the exit
	 * qualification: 0 = CALL, 1 = IRET, 2 = JMP, 3 = task gate in the IDT.
	 */
	static enum task_switch_reason
	vmx_task_switch_reason(uint64_t qual)
	{
		int source;

		source = (qual >> 30) & 0x3;
		switch (source) {
		case 0:		return (TSR_CALL);
		case 1:		return (TSR_IRET);
		case 2:		return (TSR_JMP);
		case 3:		return (TSR_IDT_GATE);
		default:
			panic("%s: invalid reason %d", __func__, source);
		}
	}

	/*
	 * Emulate a guest WRMSR of 'val' to MSR 'num'.
	 *
	 * MSRs recognised by lapic_msr() are routed to lapic_wrmsr(); everything
	 * else goes to vmx_wrmsr().  '*retu' is passed through to the chosen
	 * handler.  Returns that handler's error code.
	 */
	static int
	emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
	{
		int error;

		if (lapic_msr(num))
			error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
		else
			error = vmx_wrmsr(vmx, vcpuid, num, val, retu);

		return (error);
	}

	/*
	 * Emulate a guest RDMSR of MSR 'num'.
	 *
	 * MSRs recognised by lapic_msr() are routed to lapic_rdmsr(); everything
	 * else goes to vmx_rdmsr().  '*retu' is passed through to the chosen
	 * handler.  On success the 64-bit result is split into its low and high
	 * 32-bit halves, which are stored in guest %rax and %rdx respectively.
	 * Returns the handler's error code, or 0 after a successful update.
	 */
	static int
	emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
	{
		struct vmxctx *vmxctx;
		uint64_t result;
		uint32_t eax, edx;
		int error;

		if (lapic_msr(num))
			error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu);
		else
			error = vmx_rdmsr(vmx, vcpuid, num, &result, retu);

		if (error == 0) {
			eax = result;
			vmxctx = &vmx->ctx[vcpuid];
			error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
			KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));

			edx = result >> 32;
			error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
			KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
		}

		return (error);
	}

	static int
	vmx_exit_process(struct vmx vmx, int vcpu, struct vm_exit vmexit)
	{
	int error, errcode, errcode_valid, handled, in;
	struct vmxctx *vmxctx;
	struct vlapic *vlapic;
	struct vm_inout_str *vis;
	struct vm_task_switch *ts;
	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
	uint32_t intr_type, intr_vec, reason;
	uint64_t exitintinfo, qual, gpa;
	bool retu;

	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);

	handled = UNHANDLED;
	vmxctx = &vmx->ctx[vcpu];

	qual = vmexit->u.vmx.exit_qualification;
	reason = vmexit->u.vmx.exit_reason;
	vmexit->exitcode = VM_EXITCODE_BOGUS;

	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);

	/*
	* VM-entry failures during or after loading guest state.
	*
	* These VM-exits are uncommon but must be handled specially
	* as most VM-exit fields are not populated as usual.
	*/
	if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
	VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
	__asm __volatile("int $18");
	return (1);
	}

	/*
	* VM exits that can be triggered during event delivery need to
	* be handled specially by re-injecting the event if the IDT
	* vectoring information field's valid bit is set.
	*
	* See "Information for VM Exits During Event Delivery" in Intel SDM
	* for details.
	*/
	idtvec_info = vmcs_idt_vectoring_info();
	if (idtvec_info & VMCS_IDT_VEC_VALID) {
	idtvec_info &= ~(1 << 12); /* clear undefined bit */
	exitintinfo = idtvec_info;
	if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
	idtvec_err = vmcs_idt_vectoring_err();
	exitintinfo \|= (uint64_t)idtvec_err << 32;
	}
	error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
	KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
	__func__, error));

	/*
	* If 'virtual NMIs' are being used and the VM-exit
	* happened while injecting an NMI during the previous
	* VM-entry, then clear "blocking by NMI" in the
	* Guest Interruptibility-State so the NMI can be
	* reinjected on the subsequent VM-entry.
	*
	* However, if the NMI was being delivered through a task
	* gate, then the new task must start execution with NMIs
	* blocked so don't clear NMI blocking in this case.
	*/
	intr_type = idtvec_info & VMCS_INTR_T_MASK;
	if (intr_type == VMCS_INTR_T_NMI) {
	if (reason != EXIT_REASON_TASK_SWITCH)
	vmx_clear_nmi_blocking(vmx, vcpu);
	else
	vmx_assert_nmi_blocking(vmx, vcpu);
	}

	/*
	* Update VM-entry instruction length if the event being
	* delivered was a software interrupt or software exception.
	*/
	if (intr_type == VMCS_INTR_T_SWINTR \|\|
	intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION \|\|
	intr_type == VMCS_INTR_T_SWEXCEPTION) {
	vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
	}
	}

	switch (reason) {
	case EXIT_REASON_TASK_SWITCH:
	ts = &vmexit->u.task_switch;
	ts->tsssel = qual & 0xffff;
	ts->reason = vmx_task_switch_reason(qual);
	ts->ext = 0;
	ts->errcode_valid = 0;
	vmx_paging_info(&ts->paging);
	/*
	* If the task switch was due to a CALL, JMP, IRET, software
	* interrupt (INT n) or software exception (INT3, INTO),
	* then the saved %rip references the instruction that caused
	* the task switch. The instruction length field in the VMCS
	* is valid in this case.
	*
	* In all other cases (e.g., NMI, hardware exception) the
	* saved %rip is one that would have been saved in the old TSS
	* had the task switch completed normally so the instruction
	* length field is not needed in this case and is explicitly
	* set to 0.
	*/
	if (ts->reason == TSR_IDT_GATE) {
	KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
	("invalid idtvec_info %#x for IDT task switch",
	idtvec_info));
	intr_type = idtvec_info & VMCS_INTR_T_MASK;
	if (intr_type != VMCS_INTR_T_SWINTR &&
	intr_type != VMCS_INTR_T_SWEXCEPTION &&
	intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
	/* Task switch triggered by external event */
	ts->ext = 1;
	vmexit->inst_length = 0;
	if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
	ts->errcode_valid = 1;
	ts->errcode = vmcs_idt_vectoring_err();
	}
	}
	}
	vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
	VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
	"%s errcode 0x%016lx", ts->reason, ts->tsssel,
	ts->ext ? "external" : "internal",
	((uint64_t)ts->errcode << 32) \| ts->errcode_valid);
	break;
	case EXIT_REASON_CR_ACCESS:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
	switch (qual & 0xf) {
	case 0:
	handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
	break;
	case 4:
	handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
	break;
	case 8:
	handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
	break;
	}
	break;
	case EXIT_REASON_RDMSR:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
	retu = false;
	ecx = vmxctx->guest_rcx;
	VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
	error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
	if (error) {
	vmexit->exitcode = VM_EXITCODE_RDMSR;
	vmexit->u.msr.code = ecx;
	} else if (!retu) {
	handled = HANDLED;
	} else {
	/* Return to userspace with a valid exitcode */
	KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
	("emulate_rdmsr retu with bogus exitcode"));
	}
	break;
	case EXIT_REASON_WRMSR:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
	retu = false;
	eax = vmxctx->guest_rax;
	ecx = vmxctx->guest_rcx;
	edx = vmxctx->guest_rdx;
	VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
	ecx, (uint64_t)edx << 32 \| eax);
	error = emulate_wrmsr(vmx, vcpu, ecx,
	(uint64_t)edx << 32 \| eax, &retu);
	if (error) {
	vmexit->exitcode = VM_EXITCODE_WRMSR;
	vmexit->u.msr.code = ecx;
	vmexit->u.msr.wval = (uint64_t)edx << 32 \| eax;
	} else if (!retu) {
	handled = HANDLED;
	} else {
	/* Return to userspace with a valid exitcode */
	KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
	("emulate_wrmsr retu with bogus exitcode"));
	}
	break;
	case EXIT_REASON_HLT:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
	vmexit->exitcode = VM_EXITCODE_HLT;
	vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
	if (virtual_interrupt_delivery)
	vmexit->u.hlt.intr_status =
	vmcs_read(VMCS_GUEST_INTR_STATUS);
	else
	vmexit->u.hlt.intr_status = 0;
	break;
	case EXIT_REASON_MTF:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
	vmexit->exitcode = VM_EXITCODE_MTRAP;
	vmexit->inst_length = 0;
	break;
	case EXIT_REASON_PAUSE:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
	vmexit->exitcode = VM_EXITCODE_PAUSE;
	break;
	case EXIT_REASON_INTR_WINDOW:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
	vmx_clear_int_window_exiting(vmx, vcpu);
	return (1);
	case EXIT_REASON_EXT_INTR:
	/*
	* External interrupts serve only to cause VM exits and allow
	* the host interrupt handler to run.
	*
	* If this external interrupt triggers a virtual interrupt
	* to a VM, then that state will be recorded by the
	* host interrupt handler in the VM's softc. We will inject
	* this virtual interrupt during the subsequent VM enter.
	*/
	intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);

	/*
	* XXX: Ignore this exit if VMCS_INTR_VALID is not set.
	* This appears to be a bug in VMware Fusion?
	*/
	if (!(intr_info & VMCS_INTR_VALID))
	return (1);
	KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
	(intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
	("VM exit interruption info invalid: %#x", intr_info));
	vmx_trigger_hostintr(intr_info & 0xff);

	/*
	* This is special. We want to treat this as an 'handled'
	* VM-exit but not increment the instruction pointer.
	*/
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
	return (1);
	case EXIT_REASON_NMI_WINDOW:
	/* Exit to allow the pending virtual NMI to be injected */
	if (vm_nmi_pending(vmx->vm, vcpu))
	vmx_inject_nmi(vmx, vcpu);
	vmx_clear_nmi_window_exiting(vmx, vcpu);
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
	return (1);
	case EXIT_REASON_INOUT:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
	vmexit->exitcode = VM_EXITCODE_INOUT;
	vmexit->u.inout.bytes = (qual & 0x7) + 1;
	vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
	vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
	vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
	vmexit->u.inout.port = (uint16_t)(qual >> 16);
	vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
	if (vmexit->u.inout.string) {
	inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
	vmexit->exitcode = VM_EXITCODE_INOUT_STR;
	vis = &vmexit->u.inout_str;
	vmx_paging_info(&vis->paging);
	vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
	vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
	vis->index = inout_str_index(vmx, vcpu, in);
	vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
	vis->addrsize = inout_str_addrsize(inst_info);
	inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
	}
	break;
	case EXIT_REASON_CPUID:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
	handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
	break;
	case EXIT_REASON_EXCEPTION:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
	intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
	KASSERT((intr_info & VMCS_INTR_VALID) != 0,
	("VM exit interruption info invalid: %#x", intr_info));

	intr_vec = intr_info & 0xff;
	intr_type = intr_info & VMCS_INTR_T_MASK;

	/*
	* If Virtual NMIs control is 1 and the VM-exit is due to a
	* fault encountered during the execution of IRET then we must
	* restore the state of "virtual-NMI blocking" before resuming
	* the guest.
	*
	* See "Resuming Guest Software after Handling an Exception".
	* See "Information for VM Exits Due to Vectored Events".
	*/
	if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
	(intr_vec != IDT_DF) &&
	(intr_info & EXIT_QUAL_NMIUDTI) != 0)
	vmx_restore_nmi_blocking(vmx, vcpu);

	/*
	* The NMI has already been handled in vmx_exit_handle_nmi().
	*/
	if (intr_type == VMCS_INTR_T_NMI)
	return (1);

	/*
	* Call the machine check handler by hand. Also don't reflect
	* the machine check back into the guest.
	*/
	if (intr_vec == IDT_MC) {
	VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
	__asm __volatile("int $18");
	return (1);
	}

	if (intr_vec == IDT_PF) {
	error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
	KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
	__func__, error));
	}

	/*
	* Software exceptions exhibit trap-like behavior. This in
	* turn requires populating the VM-entry instruction length
	* so that the %rip in the trap frame is past the INT3/INTO
	* instruction.
	*/
	if (intr_type == VMCS_INTR_T_SWEXCEPTION)
	vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);

	/* Reflect all other exceptions back into the guest */
	errcode_valid = errcode = 0;
	if (intr_info & VMCS_INTR_DEL_ERRCODE) {
	errcode_valid = 1;
	errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
	}
	VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
	"the guest", intr_vec, errcode);
	error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
	errcode_valid, errcode, 0);
	KASSERT(error == 0, ("%s: vm_inject_exception error %d",
	__func__, error));
	return (1);

	case EXIT_REASON_EPT_FAULT:
	/*
	* If 'gpa' lies within the address space allocated to
	* memory then this must be a nested page fault otherwise
	* this must be an instruction that accesses MMIO space.
	*/
	gpa = vmcs_gpa();
	if (vm_mem_allocated(vmx->vm, vcpu, gpa) \|\|
	apic_access_fault(vmx, vcpu, gpa)) {
	vmexit->exitcode = VM_EXITCODE_PAGING;
	vmexit->inst_length = 0;
	vmexit->u.paging.gpa = gpa;
	vmexit->u.paging.fault_type = ept_fault_type(qual);
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
	} else if (ept_emulation_fault(qual)) {
	vmexit_inst_emul(vmexit, gpa, vmcs_gla());
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
	}
	/*
	* If Virtual NMIs control is 1 and the VM-exit is due to an
	* EPT fault during the execution of IRET then we must restore
	* the state of "virtual-NMI blocking" before resuming.
	*
	* See description of "NMI unblocking due to IRET" in
	* "Exit Qualification for EPT Violations".
	*/
	if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
	(qual & EXIT_QUAL_NMIUDTI) != 0)
	vmx_restore_nmi_blocking(vmx, vcpu);
	break;
	case EXIT_REASON_VIRTUALIZED_EOI:
	vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
	vmexit->u.ioapic_eoi.vector = qual & 0xFF;
	vmexit->inst_length = 0; /* trap-like */
	break;
	case EXIT_REASON_APIC_ACCESS:
	handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
	break;
	case EXIT_REASON_APIC_WRITE:
	/*
	* APIC-write VM exit is trap-like so the %rip is already
	* pointing to the next instruction.
	*/
	vmexit->inst_length = 0;
	vlapic = vm_lapic(vmx->vm, vcpu);
	handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
	break;
	case EXIT_REASON_XSETBV:
	handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
	break;
	case EXIT_REASON_MONITOR:
	vmexit->exitcode = VM_EXITCODE_MONITOR;
	break;
	case EXIT_REASON_MWAIT:
	vmexit->exitcode = VM_EXITCODE_MWAIT;
	break;
	default:
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
	break;
	}

	if (handled) {
	/*
	* It is possible that control is returned to userland
	* even though we were able to handle the VM exit in the
	* kernel.
	*
	* In such a case we want to make sure that the userland
	* restarts guest execution at the instruction after
	* the one we just processed. Therefore we update the
	* guest rip in the VMCS and in 'vmexit'.
	*/
	vmexit->rip += vmexit->inst_length;
	vmexit->inst_length = 0;
	vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
	} else {
	if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
	/*
	* If this VM exit was not claimed by anybody then
	* treat it as a generic VMX exit.
	*/
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.status = VM_SUCCESS;
	vmexit->u.vmx.inst_type = 0;
	vmexit->u.vmx.inst_error = 0;
	} else {
	/*
	* The exitcode and collateral have been populated.
	* The VM exit will be processed further in userland.
	*/
	}
	}
	return (handled);
	}

	static __inline void
	vmx_exit_inst_error(struct vmxctx vmxctx, int rc, struct vm_exit vmexit)
	{

	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
	("vmx_exit_inst_error: invalid inst_fail_status %d",
	vmxctx->inst_fail_status));

	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.status = vmxctx->inst_fail_status;
	vmexit->u.vmx.inst_error = vmcs_instruction_error();
	vmexit->u.vmx.exit_reason = ~0;
	vmexit->u.vmx.exit_qualification = ~0;

	switch (rc) {
	case VMX_VMRESUME_ERROR:
	case VMX_VMLAUNCH_ERROR:
	case VMX_INVEPT_ERROR:
	vmexit->u.vmx.inst_type = rc;
	break;
	default:
	panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
	}
	}

	/*
	* If the NMI-exiting VM execution control is set to '1' then an NMI in
	* non-root operation causes a VM-exit. NMI blocking is in effect so it is
	* sufficient to simply vector to the NMI handler via a software interrupt.
	* However, this must be done before maskable interrupts are enabled
	* otherwise the "iret" issued by an interrupt handler will incorrectly
	* clear NMI blocking.
	*/
	static __inline void
	vmx_exit_handle_nmi(struct vmx vmx, int vcpuid, struct vm_exit vmexit)
	{
	uint32_t intr_info;

	KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));

	if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
	return;

	intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
	KASSERT((intr_info & VMCS_INTR_VALID) != 0,
	("VM exit interruption info invalid: %#x", intr_info));

	if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
	KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
	"to NMI has invalid vector: %#x", intr_info));
	VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
	__asm __volatile("int $2");
	}
	}

	/*
	 * Switch the debug-register state from host to guest.
	 *
	 * Host DR7, DEBUGCTL, the PSL_T flag and DR0-DR3/DR6 are stashed in
	 * 'vmxctx' (host_* fields) and the guest DR0-DR3/DR6 are loaded from the
	 * guest_* fields.  vmx_run() calls this immediately before
	 * vmx_enter_guest(); vmx_dr_leave_guest() undoes it.
	 *
	 * The order of the steps matters: host debugging is switched off before
	 * any guest value is loaded into a debug register.
	 */
	static __inline void
	vmx_dr_enter_guest(struct vmxctx *vmxctx)
	{
	register_t rflags;

	/* Save host control debug registers. */
	vmxctx->host_dr7 = rdr7();
	vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);

	/*
	* Disable debugging in DR7 and DEBUGCTL to avoid triggering
	* exceptions in the host based on the guest DRx values. The
	* guest DR7 and DEBUGCTL are saved/restored in the VMCS.
	*/
	load_dr7(0);
	wrmsr(MSR_DEBUGCTLMSR, 0);

	/*
	* Disable single stepping the kernel to avoid corrupting the
	* guest DR6. A debugger might still be able to corrupt the
	* guest DR6 by setting a breakpoint after this point and then
	* single stepping.
	*/
	rflags = read_rflags();
	/* Remember only the trap flag so it can be OR-ed back in on exit. */
	vmxctx->host_tf = rflags & PSL_T;
	write_rflags(rflags & ~PSL_T);

	/* Save host debug registers. */
	vmxctx->host_dr0 = rdr0();
	vmxctx->host_dr1 = rdr1();
	vmxctx->host_dr2 = rdr2();
	vmxctx->host_dr3 = rdr3();
	vmxctx->host_dr6 = rdr6();

	/* Restore guest debug registers. */
	load_dr0(vmxctx->guest_dr0);
	load_dr1(vmxctx->guest_dr1);
	load_dr2(vmxctx->guest_dr2);
	load_dr3(vmxctx->guest_dr3);
	load_dr6(vmxctx->guest_dr6);
	}

	/*
	 * Switch the debug-register state from guest back to host.
	 *
	 * The guest DR0-DR3/DR6 are captured into the guest_* fields of 'vmxctx'
	 * and the host values saved by vmx_dr_enter_guest() are reloaded.
	 */
	static __inline void
	vmx_dr_leave_guest(struct vmxctx *vmxctx)
	{

		/* Save guest debug registers. */
		vmxctx->guest_dr0 = rdr0();
		vmxctx->guest_dr1 = rdr1();
		vmxctx->guest_dr2 = rdr2();
		vmxctx->guest_dr3 = rdr3();
		vmxctx->guest_dr6 = rdr6();

		/*
		 * Restore host debug registers.  Restore DR7, DEBUGCTL, and
		 * PSL_T last.
		 */
		load_dr0(vmxctx->host_dr0);
		load_dr1(vmxctx->host_dr1);
		load_dr2(vmxctx->host_dr2);
		load_dr3(vmxctx->host_dr3);
		load_dr6(vmxctx->host_dr6);
		wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl);
		load_dr7(vmxctx->host_dr7);
		/* host_tf holds either 0 or PSL_T, so this only re-arms the flag. */
		write_rflags(read_rflags() | vmxctx->host_tf);
	}

	/*
	 * Run guest vcpu 'vcpu' starting at guest %rip 'rip' and keep re-entering
	 * the guest for as long as the resulting VM exits are handled here.
	 *
	 * arg:    the 'struct vmx' of the virtual machine.
	 * pmap:   must equal the pmap recorded in the vcpu context (asserted).
	 * evinfo: event state polled before every guest entry (suspend,
	 *         rendezvous, idle request).
	 *
	 * Always returns 0.  Why the loop ended is reported through the vm_exit
	 * record obtained from vm_exitinfo(): when the final exit was not handled
	 * its exitcode must differ from VM_EXITCODE_BOGUS, otherwise it must be
	 * VM_EXITCODE_BOGUS; a mismatch panics.
	 */
	static int
	vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
	    struct vm_eventinfo *evinfo)
	{
		int rc, handled, launched;
		struct vmx *vmx;
		struct vm *vm;
		struct vmxctx *vmxctx;
		struct vmcs *vmcs;
		struct vm_exit *vmexit;
		struct vlapic *vlapic;
		uint32_t exit_reason;

		vmx = arg;
		vm = vmx->vm;
		vmcs = &vmx->vmcs[vcpu];
		vmxctx = &vmx->ctx[vcpu];
		vlapic = vm_lapic(vm, vcpu);
		vmexit = vm_exitinfo(vm, vcpu);
		launched = 0;

		KASSERT(vmxctx->pmap == pmap,
		    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));

		vmx_msr_guest_enter(vmx, vcpu);

		VMPTRLD(vmcs);

		/*
		 * XXX
		 * We do this every time because we may setup the virtual machine
		 * from a different process than the one that actually runs it.
		 *
		 * If the life of a virtual machine was spent entirely in the context
		 * of a single process we could do this once in vmx_vminit().
		 */
		vmcs_write(VMCS_HOST_CR3, rcr3());

		vmcs_write(VMCS_GUEST_RIP, rip);
		vmx_set_pcpu_defaults(vmx, vcpu, pmap);
		do {
			KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
			    "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));

			handled = UNHANDLED;
			/*
			 * Interrupts are disabled from this point on until the
			 * guest starts executing. This is done for the following
			 * reasons:
			 *
			 * If an AST is asserted on this thread after the check below,
			 * then the IPI_AST notification will not be lost, because it
			 * will cause a VM exit due to external interrupt as soon as
			 * the guest state is loaded.
			 *
			 * A posted interrupt after 'vmx_inject_interrupts()' will
			 * not be "lost" because it will be held pending in the host
			 * APIC because interrupts are disabled. The pending interrupt
			 * will be recognized as soon as the guest state is loaded.
			 *
			 * The same reasoning applies to the IPI generated by
			 * pmap_invalidate_ept().
			 */
			disable_intr();
			vmx_inject_interrupts(vmx, vcpu, vlapic, rip);

			/*
			 * Check for vcpu suspension after injecting events because
			 * vmx_inject_interrupts() can suspend the vcpu due to a
			 * triple fault.
			 */
			if (vcpu_suspended(evinfo)) {
				enable_intr();
				vm_exit_suspended(vmx->vm, vcpu, rip);
				break;
			}

			if (vcpu_rendezvous_pending(evinfo)) {
				enable_intr();
				vm_exit_rendezvous(vmx->vm, vcpu, rip);
				break;
			}

			if (vcpu_reqidle(evinfo)) {
				enable_intr();
				vm_exit_reqidle(vmx->vm, vcpu, rip);
				break;
			}

			if (vcpu_should_yield(vm, vcpu)) {
				enable_intr();
				vm_exit_astpending(vmx->vm, vcpu, rip);
				vmx_astpending_trace(vmx, vcpu, rip);
				handled = HANDLED;
				break;
			}

			/* Leave the loop without entering a vcpu that is being debugged. */
			if (vcpu_debugged(vm, vcpu)) {
				enable_intr();
				vm_exit_debug(vmx->vm, vcpu, rip);
				break;
			}

			vmx_run_trace(vmx, vcpu);
			vmx_dr_enter_guest(vmxctx);
			rc = vmx_enter_guest(vmxctx, vmx, launched);
			vmx_dr_leave_guest(vmxctx);

			/* Collect some information for VM exit processing */
			vmexit->rip = rip = vmcs_guest_rip();
			vmexit->inst_length = vmexit_instruction_length();
			vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
			vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

			/* Update 'nextrip' */
			vmx->state[vcpu].nextrip = rip;

			if (rc == VMX_GUEST_VMEXIT) {
				/* Must run before interrupts are re-enabled. */
				vmx_exit_handle_nmi(vmx, vcpu, vmexit);
				enable_intr();
				handled = vmx_exit_process(vmx, vcpu, vmexit);
			} else {
				enable_intr();
				vmx_exit_inst_error(vmxctx, rc, vmexit);
			}
			launched = 1;
			vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
			rip = vmexit->rip;
		} while (handled);

		/*
		 * If a VM exit has been handled then the exitcode must be BOGUS
		 * If a VM exit is not handled then the exitcode must not be BOGUS
		 */
		if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
		    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
			panic("Mismatch between handled (%d) and exitcode (%d)",
			    handled, vmexit->exitcode);
		}

		if (!handled)
			vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);

		VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
		    vmexit->exitcode);

		VMCLEAR(vmcs);
		vmx_msr_guest_exit(vmx, vcpu);

		return (0);
	}

	/*
	 * Tear down the per-VM VMX state referenced by 'arg' (a 'struct vmx'):
	 * drop the APIC access page mapping when APIC access virtualization is
	 * active for vcpu 0, release the VPID of every vcpu slot and finally free
	 * the structure itself.
	 */
	static void
	vmx_vmcleanup(void *arg)
	{
		struct vmx *vmx;
		int vcpuid;

		vmx = arg;

		if (apic_access_virtualization(vmx, 0))
			vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);

		for (vcpuid = 0; vcpuid < VM_MAXCPU; vcpuid++)
			vpid_free(vmx->state[vcpuid].vpid);

		free(vmx, M_VMX);
	}

	/*
	 * Map a VM_REG_GUEST_* identifier to the slot in 'vmxctx' that stores it.
	 *
	 * Covers the general purpose registers kept in the context (no RSP),
	 * %cr2 and %dr0-%dr3/%dr6.  Returns NULL for every other identifier.
	 */
	static register_t *
	vmxctx_regptr(struct vmxctx *vmxctx, int reg)
	{
		register_t *slot;

		slot = NULL;

		switch (reg) {
		case VM_REG_GUEST_RAX:
			slot = &vmxctx->guest_rax;
			break;
		case VM_REG_GUEST_RBX:
			slot = &vmxctx->guest_rbx;
			break;
		case VM_REG_GUEST_RCX:
			slot = &vmxctx->guest_rcx;
			break;
		case VM_REG_GUEST_RDX:
			slot = &vmxctx->guest_rdx;
			break;
		case VM_REG_GUEST_RSI:
			slot = &vmxctx->guest_rsi;
			break;
		case VM_REG_GUEST_RDI:
			slot = &vmxctx->guest_rdi;
			break;
		case VM_REG_GUEST_RBP:
			slot = &vmxctx->guest_rbp;
			break;
		case VM_REG_GUEST_R8:
			slot = &vmxctx->guest_r8;
			break;
		case VM_REG_GUEST_R9:
			slot = &vmxctx->guest_r9;
			break;
		case VM_REG_GUEST_R10:
			slot = &vmxctx->guest_r10;
			break;
		case VM_REG_GUEST_R11:
			slot = &vmxctx->guest_r11;
			break;
		case VM_REG_GUEST_R12:
			slot = &vmxctx->guest_r12;
			break;
		case VM_REG_GUEST_R13:
			slot = &vmxctx->guest_r13;
			break;
		case VM_REG_GUEST_R14:
			slot = &vmxctx->guest_r14;
			break;
		case VM_REG_GUEST_R15:
			slot = &vmxctx->guest_r15;
			break;
		case VM_REG_GUEST_CR2:
			slot = &vmxctx->guest_cr2;
			break;
		case VM_REG_GUEST_DR0:
			slot = &vmxctx->guest_dr0;
			break;
		case VM_REG_GUEST_DR1:
			slot = &vmxctx->guest_dr1;
			break;
		case VM_REG_GUEST_DR2:
			slot = &vmxctx->guest_dr2;
			break;
		case VM_REG_GUEST_DR3:
			slot = &vmxctx->guest_dr3;
			break;
		case VM_REG_GUEST_DR6:
			slot = &vmxctx->guest_dr6;
			break;
		default:
			/* Not kept in the software context. */
			break;
		}

		return (slot);
	}

	/*
	 * Read register 'reg' from the software context 'vmxctx' into '*retval'.
	 *
	 * Returns 0 on success or EINVAL when 'reg' is not one of the registers
	 * kept in the context (see vmxctx_regptr()); '*retval' is left untouched
	 * in that case.
	 */
	static int
	vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
	{
		register_t *regp;

		if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
			*retval = *regp;
			return (0);
		} else
			return (EINVAL);
	}

	/*
	 * Store 'val' into register 'reg' of the software context 'vmxctx'.
	 *
	 * Returns 0 on success, or EINVAL (without storing anything) when
	 * vmxctx_regptr() does not know 'reg'.
	 */
	static int
	vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
	{
		register_t *slot;

		slot = vmxctx_regptr(vmxctx, reg);
		if (slot == NULL)
			return (EINVAL);

		*slot = val;
		return (0);
	}

	/*
	 * Report whether the vcpu is currently in an interrupt shadow.
	 *
	 * The guest interruptibility field is read from the vcpu's VMCS and
	 * '*retval' is set to 1 when any HWINTR_BLOCKING bit is set, 0 otherwise.
	 * Returns the error from vmcs_getreg(); on failure '*retval' is 0.
	 */
	static int
	vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
	{
		uint64_t gi;
		int error;

		/* Defined value in case vmcs_getreg() fails without writing 'gi'. */
		gi = 0;
		error = vmcs_getreg(&vmx->vmcs[vcpu], running,
		    VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
		*retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
		return (error);
	}

	/*
	 * Change the interrupt shadow state of a vcpu.
	 *
	 * Only clearing is possible: a non-zero 'val' yields EINVAL.  For
	 * val == 0 the HWINTR_BLOCKING bits are removed from the guest
	 * interruptibility field in the VMCS.  Returns 0 or the error from the
	 * VMCS accessors; the outcome is traced either way.
	 */
	static int
	vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
	{
		struct vmcs *vmcs;
		uint64_t intrstate;
		int error, ident;

		if (val != 0) {
			/*
			 * Forcing the vcpu into an interrupt shadow is not
			 * supported.
			 */
			error = EINVAL;
		} else {
			vmcs = &vmx->vmcs[vcpu];
			ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
			error = vmcs_getreg(vmcs, running, ident, &intrstate);
			if (error == 0) {
				intrstate &= ~HWINTR_BLOCKING;
				error = vmcs_setreg(vmcs, running, ident, intrstate);
			}
		}

		VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
		    error ? "failed" : "succeeded");
		return (error);
	}

	/*
	 * Return the VMCS shadow field that goes with guest register 'reg':
	 * VMCS_CR0_SHADOW for %cr0, VMCS_CR4_SHADOW for %cr4 and -1 for any
	 * register that has no shadow.
	 */
	static int
	vmx_shadow_reg(int reg)
	{

		switch (reg) {
		case VM_REG_GUEST_CR0:
			return (VMCS_CR0_SHADOW);
		case VM_REG_GUEST_CR4:
			return (VMCS_CR4_SHADOW);
		default:
			return (-1);
		}
	}

	/*
	 * Read guest register 'reg' of vcpu 'vcpu' into '*retval'.
	 *
	 * arg is the VM's 'struct vmx'.  The interrupt shadow pseudo-register is
	 * handled specially; other registers are looked up in the software
	 * context first and in the VMCS otherwise.  Returns 0 or an errno value.
	 *
	 * Panics when the vcpu is running on a cpu other than the current one.
	 */
	static int
	vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
	{
		int running, hostcpu;
		struct vmx *vmx = arg;

		running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
		if (running && hostcpu != curcpu)
			panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

		if (reg == VM_REG_GUEST_INTR_SHADOW)
			return (vmx_get_intr_shadow(vmx, vcpu, running, retval));

		if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
			return (0);

		return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
	}

	/*
	 * Write 'val' to guest register 'reg' of vcpu 'vcpu'.
	 *
	 * arg is the VM's 'struct vmx'.  Registers kept in the software context
	 * are updated there; everything else goes to the VMCS, followed by the
	 * fix-ups that a VMCS write requires (entry controls for EFER, shadow
	 * fields for %cr0/%cr4, VPID invalidation for %cr3).  Returns 0 or an
	 * errno value.
	 *
	 * Panics when the vcpu is running on a cpu other than the current one.
	 */
	static int
	vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
	{
		int error, hostcpu, running, shadow;
		uint64_t ctls;
		pmap_t pmap;
		struct vmx *vmx = arg;

		running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
		if (running && hostcpu != curcpu)
			panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

		if (reg == VM_REG_GUEST_INTR_SHADOW)
			return (vmx_modify_intr_shadow(vmx, vcpu, running, val));

		if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
			return (0);

		error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);

		if (error == 0) {
			/*
			 * If the "load EFER" VM-entry control is 1 then the
			 * value of EFER.LMA must be identical to "IA-32e mode guest"
			 * bit in the VM-entry control.
			 */
			if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
			    (reg == VM_REG_GUEST_EFER)) {
				vmcs_getreg(&vmx->vmcs[vcpu], running,
				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
				if (val & EFER_LMA)
					ctls |= VM_ENTRY_GUEST_LMA;
				else
					ctls &= ~VM_ENTRY_GUEST_LMA;
				vmcs_setreg(&vmx->vmcs[vcpu], running,
				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
			}

			shadow = vmx_shadow_reg(reg);
			if (shadow > 0) {
				/*
				 * Store the unmodified value in the shadow
				 */
				error = vmcs_setreg(&vmx->vmcs[vcpu], running,
				    VMCS_IDENT(shadow), val);
			}

			if (reg == VM_REG_GUEST_CR3) {
				/*
				 * Invalidate the guest vcpu's TLB mappings to emulate
				 * the behavior of updating %cr3.
				 *
				 * XXX the processor retains global mappings when %cr3
				 * is updated but vmx_invvpid() does not.
				 */
				pmap = vmx->ctx[vcpu].pmap;
				vmx_invvpid(vmx, vcpu, pmap, running);
			}
		}

		return (error);
	}

	/*
	 * Fetch the segment descriptor identified by 'reg' for vcpu 'vcpu' from
	 * its VMCS into '*desc'.  arg is the VM's 'struct vmx'.  Returns the
	 * result of vmcs_getdesc().
	 *
	 * Panics when the vcpu is running on a cpu other than the current one.
	 */
	static int
	vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
	{
		int hostcpu, running;
		struct vmx *vmx = arg;

		running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
		if (running && hostcpu != curcpu)
			panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);

		return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
	}

	/*
	 * Install '*desc' as the segment descriptor identified by 'reg' in the
	 * VMCS of vcpu 'vcpu'.  arg is the VM's 'struct vmx'.  Returns the result
	 * of vmcs_setdesc().
	 *
	 * Panics when the vcpu is running on a cpu other than the current one.
	 */
	static int
	vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
	{
		int hostcpu, running;
		struct vmx *vmx = arg;

		running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
		if (running && hostcpu != curcpu)
			panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);

		return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
	}

	/*
	 * Query capability 'type' (VM_CAP_*) for vcpu 'vcpu'.
	 *
	 * Returns ENOENT when 'type' is not handled here or the matching cap_*
	 * flag is zero; '*retval' is not written then.  Otherwise returns 0 and
	 * sets '*retval' to 1 if the capability is currently recorded as enabled
	 * in cap[vcpu].set, else 0.
	 */
	static int
	vmx_getcap(void *arg, int vcpu, int type, int *retval)
	{
		struct vmx *vmx = arg;
		int vcap;
		int ret;

		ret = ENOENT;

		vcap = vmx->cap[vcpu].set;

		switch (type) {
		case VM_CAP_HALT_EXIT:
			if (cap_halt_exit)
				ret = 0;
			break;
		case VM_CAP_PAUSE_EXIT:
			if (cap_pause_exit)
				ret = 0;
			break;
		case VM_CAP_MTRAP_EXIT:
			if (cap_monitor_trap)
				ret = 0;
			break;
		case VM_CAP_UNRESTRICTED_GUEST:
			if (cap_unrestricted_guest)
				ret = 0;
			break;
		case VM_CAP_ENABLE_INVPCID:
			if (cap_invpcid)
				ret = 0;
			break;
		default:
			break;
		}

		if (ret == 0)
			*retval = (vcap & (1 << type)) ? 1 : 0;

		return (ret);
	}

	/*
	 * Enable (val != 0) or disable (val == 0) capability 'type' (VM_CAP_*) on
	 * vcpu 'vcpu'.
	 *
	 * Each supported capability maps to one bit in either the primary or the
	 * secondary processor-based controls.  The new control word is written to
	 * the vcpu's VMCS and, if that succeeds, cached in cap[vcpu] together
	 * with the capability's bit in cap[vcpu].set.
	 *
	 * Returns ENOENT when 'type' is not handled here or the matching cap_*
	 * flag is zero, the vmwrite() error if the VMCS update failed, and 0
	 * otherwise.
	 */
	static int
	vmx_setcap(void *arg, int vcpu, int type, int val)
	{
		struct vmx *vmx = arg;
		struct vmcs *vmcs = &vmx->vmcs[vcpu];
		uint32_t baseval;
		uint32_t *pptr;
		int error;
		int flag;
		int reg;
		int retval;

		retval = ENOENT;
		pptr = NULL;
		/* Only meaningful once a case below has set retval to 0. */
		baseval = 0;
		flag = 0;
		reg = 0;

		switch (type) {
		case VM_CAP_HALT_EXIT:
			if (cap_halt_exit) {
				retval = 0;
				pptr = &vmx->cap[vcpu].proc_ctls;
				baseval = *pptr;
				flag = PROCBASED_HLT_EXITING;
				reg = VMCS_PRI_PROC_BASED_CTLS;
			}
			break;
		case VM_CAP_MTRAP_EXIT:
			if (cap_monitor_trap) {
				retval = 0;
				pptr = &vmx->cap[vcpu].proc_ctls;
				baseval = *pptr;
				flag = PROCBASED_MTF;
				reg = VMCS_PRI_PROC_BASED_CTLS;
			}
			break;
		case VM_CAP_PAUSE_EXIT:
			if (cap_pause_exit) {
				retval = 0;
				pptr = &vmx->cap[vcpu].proc_ctls;
				baseval = *pptr;
				flag = PROCBASED_PAUSE_EXITING;
				reg = VMCS_PRI_PROC_BASED_CTLS;
			}
			break;
		case VM_CAP_UNRESTRICTED_GUEST:
			if (cap_unrestricted_guest) {
				retval = 0;
				pptr = &vmx->cap[vcpu].proc_ctls2;
				baseval = *pptr;
				flag = PROCBASED2_UNRESTRICTED_GUEST;
				reg = VMCS_SEC_PROC_BASED_CTLS;
			}
			break;
		case VM_CAP_ENABLE_INVPCID:
			if (cap_invpcid) {
				retval = 0;
				pptr = &vmx->cap[vcpu].proc_ctls2;
				baseval = *pptr;
				flag = PROCBASED2_ENABLE_INVPCID;
				reg = VMCS_SEC_PROC_BASED_CTLS;
			}
			break;
		default:
			break;
		}

		if (retval == 0) {
			if (val) {
				baseval |= flag;
			} else {
				baseval &= ~flag;
			}
			VMPTRLD(vmcs);
			error = vmwrite(reg, baseval);
			VMCLEAR(vmcs);

			if (error) {
				retval = error;
			} else {
				/*
				 * Update optional stored flags, and record
				 * setting
				 */
				if (pptr != NULL) {
					*pptr = baseval;
				}

				if (val) {
					vmx->cap[vcpu].set |= (1 << type);
				} else {
					vmx->cap[vcpu].set &= ~(1 << type);
				}
			}
		}

		return (retval);
	}

	/*
	 * VT-x flavour of a virtual local APIC.
	 *
	 * 'vlapic' has to stay the first member: the handlers in this file
	 * convert between 'struct vlapic *' and 'struct vlapic_vtx *' with a
	 * plain cast.
	 */
	struct vlapic_vtx {
	struct vlapic vlapic;
	/* Posted-interrupt descriptor of this vcpu (&vmx->pir_desc[vcpuid]). */
	struct pir_desc *pir_desc;
	/* Back pointer to the VM-wide VMX state. */
	struct vmx *vmx;
	};

	/*
	 * Trace the state of a posted-interrupt descriptor: the vector being
	 * asserted and its trigger mode, the four PIR words and whether a
	 * notification is going to be sent.
	 *
	 * 'msg' must be a string literal (it is pasted in front of the format
	 * strings).  All other arguments are parenthesized so that arbitrary
	 * expressions may be passed.
	 */
	#define	VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)	\
	do {									\
		VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",	\
		    (level) ? "level" : "edge", (vector));			\
		VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", (pir_desc)->pir[0]);	\
		VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", (pir_desc)->pir[1]);	\
		VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", (pir_desc)->pir[2]);	\
		VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", (pir_desc)->pir[3]);	\
		VCPU_CTR1(vm, vcpuid, msg " notify: %s", (notify) ? "yes" : "no");\
	} while (0)

	/*
	* vlapic->ops handlers that utilize the APICv hardware assist described in
	* Chapter 29 of the Intel SDM.
	*/
	/*
	 * vlapic 'set_intr_ready' handler: mark 'vector' as requested in the
	 * vcpu's posted-interrupt descriptor.
	 *
	 * Returns the result of the 0 -> 1 compare-and-set on the descriptor's
	 * 'pending' word, i.e. non-zero only when this call is the one that
	 * flipped it.  'level' is used for tracing only.
	 */
	static int
	vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
	{
		struct pir_desc *desc;
		uint64_t bit;
		int word, first_pending;

		desc = ((struct vlapic_vtx *)vlapic)->pir_desc;

		/*
		 * Keep track of interrupt requests in the PIR descriptor. This is
		 * because the virtual APIC page pointed to by the VMCS cannot be
		 * modified if the vcpu is running.
		 */
		word = vector / 64;
		bit = 1UL << (vector % 64);
		atomic_set_long(&desc->pir[word], bit);
		first_pending = atomic_cmpset_long(&desc->pending, 0, 1);

		VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, desc, first_pending, vector,
		    level, "vmx_set_intr_ready");
		return (first_pending);
	}

	static int
	vmx_pending_intr(struct vlapic vlapic, int vecptr)
	{
	struct vlapic_vtx *vlapic_vtx;
	struct pir_desc *pir_desc;
	struct LAPIC *lapic;
	uint64_t pending, pirval;
	uint32_t ppr, vpr;
	int i;

	/*
	* This function is only expected to be called from the 'HLT' exit
	* handler which does not care about the vector that is pending.
	*/
	KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	pir_desc = vlapic_vtx->pir_desc;

	pending = atomic_load_acq_long(&pir_desc->pending);
	if (!pending) {
	/*
	* While a virtual interrupt may have already been
	* processed the actual delivery maybe pending the
	* interruptibility of the guest. Recognize a pending
	* interrupt by reevaluating virtual interrupts
	* following Section 29.2.1 in the Intel SDM Volume 3.
	*/
	struct vm_exit *vmexit;
	uint8_t rvi, ppr;

	vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
	KASSERT(vmexit->exitcode == VM_EXITCODE_HLT,
	("vmx_pending_intr: exitcode not 'HLT'"));
	rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT;
	lapic = vlapic->apic_page;
	ppr = lapic->ppr & APIC_TPR_INT;
	if (rvi > ppr) {
	return (1);
	}

	return (0);
	}

	/*
	* If there is an interrupt pending then it will be recognized only
	* if its priority is greater than the processor priority.
	*
	* Special case: if the processor priority is zero then any pending
	* interrupt will be recognized.
	*/
	lapic = vlapic->apic_page;
	ppr = lapic->ppr & APIC_TPR_INT;
	if (ppr == 0)
	return (1);

	VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
	lapic->ppr);

	for (i = 3; i >= 0; i--) {
	pirval = pir_desc->pir[i];
	if (pirval != 0) {
	vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT;
	return (vpr > ppr);
	}
	}
	return (0);
	}

	/*
	 * vlapic 'intr_accepted' handler installed by vmx_vlapic_init() when
	 * virtual_interrupt_delivery is set.  It panics unconditionally; both
	 * arguments are ignored.
	 */
	static void
	vmx_intr_accepted(struct vlapic *vlapic, int vector)
	{

	panic("vmx_intr_accepted: not expected to be called");
	}

	/*
	 * vlapic 'set_tmr' handler: set (level != false) or clear the bit for
	 * 'vector' in the EOI-exit field of the vcpu's VMCS.
	 *
	 * 'vector' must be in 0..255 and the vcpu must not be running (both
	 * asserted); the VMCS is loaded and cleared around the update.
	 */
	static void
	vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
	{
		struct vlapic_vtx *vlapic_vtx;
		struct vmx *vmx;
		struct vmcs *vmcs;
		uint64_t mask, val;

		KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
		KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
		    ("vmx_set_tmr: vcpu cannot be running"));

		vlapic_vtx = (struct vlapic_vtx *)vlapic;
		vmx = vlapic_vtx->vmx;
		vmcs = &vmx->vmcs[vlapic->vcpuid];
		mask = 1UL << (vector % 64);

		VMPTRLD(vmcs);
		val = vmcs_read(VMCS_EOI_EXIT(vector));
		if (level)
			val |= mask;
		else
			val &= ~mask;
		vmcs_write(VMCS_EOI_EXIT(vector), val);
		VMCLEAR(vmcs);
	}

	/*
	 * vlapic 'enable_x2apic_mode' handler: switch the vcpu's secondary
	 * processor-based controls from "virtualize APIC accesses" to "virtualize
	 * x2APIC mode" and write them to its VMCS.
	 *
	 * When called for vcpu 0 the VM-wide state is adjusted as well: the APIC
	 * access page is unmapped and the x2APIC MSRs are allowed in the MSR
	 * bitmap.
	 */
	static void
	vmx_enable_x2apic_mode(struct vlapic *vlapic)
	{
		struct vmx *vmx;
		struct vmcs *vmcs;
		uint32_t proc_ctls2;
		int vcpuid, error;

		vcpuid = vlapic->vcpuid;
		vmx = ((struct vlapic_vtx *)vlapic)->vmx;
		vmcs = &vmx->vmcs[vcpuid];

		proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
		KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
		    ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));

		proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
		proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
		vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;

		VMPTRLD(vmcs);
		vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
		VMCLEAR(vmcs);

		if (vcpuid == 0) {
			/*
			 * The nested page table mappings are shared by all vcpus
			 * so unmap the APIC access page just once.
			 */
			error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
			KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
			    __func__, error));

			/*
			 * The MSR bitmap is shared by all vcpus so modify it only
			 * once in the context of vcpu 0.
			 */
			error = vmx_allow_x2apic_msrs(vmx);
			KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
			    __func__, error));
		}
	}

	/*
	 * vlapic 'post_intr' handler installed by vmx_vlapic_init() when
	 * posted_interrupts is set: send an IPI with vector 'pirvec' to host cpu
	 * 'hostcpu'.  'vlapic' is not used.
	 */
	static void
	vmx_post_intr(struct vlapic *vlapic, int hostcpu)
	{

	ipi_cpu(hostcpu, pirvec);
	}

	/*
	 * Transfer the pending interrupts in the PIR descriptor to the IRR
	 * in the virtual APIC page.
	 *
	 * Nothing happens unless the descriptor's 'pending' word could be moved
	 * from 1 to 0.  Each 64-bit PIR word is atomically read and cleared and
	 * OR-ed into the matching pair of 32-bit IRR registers.  Finally RVI in
	 * the guest interrupt status is raised to the highest vector that was
	 * transferred, provided that makes the status value larger.
	 */
	static void
	vmx_inject_pir(struct vlapic *vlapic)
	{
		struct vlapic_vtx *vlapic_vtx;
		struct pir_desc *pir_desc;
		struct LAPIC *lapic;
		uint64_t val, pirval;
		int rvi, pirbase;
		uint16_t intr_status_old, intr_status_new;

		vlapic_vtx = (struct vlapic_vtx *)vlapic;
		pir_desc = vlapic_vtx->pir_desc;
		if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
			VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
			    "no posted interrupt pending");
			return;
		}

		/*
		 * 'pirbase'/'pirval' end up describing the highest non-empty PIR
		 * word because later words overwrite earlier ones.
		 */
		pirval = 0;
		pirbase = -1;
		lapic = vlapic->apic_page;

		val = atomic_readandclear_long(&pir_desc->pir[0]);
		if (val != 0) {
			lapic->irr0 |= val;
			lapic->irr1 |= val >> 32;
			pirbase = 0;
			pirval = val;
		}

		val = atomic_readandclear_long(&pir_desc->pir[1]);
		if (val != 0) {
			lapic->irr2 |= val;
			lapic->irr3 |= val >> 32;
			pirbase = 64;
			pirval = val;
		}

		val = atomic_readandclear_long(&pir_desc->pir[2]);
		if (val != 0) {
			lapic->irr4 |= val;
			lapic->irr5 |= val >> 32;
			pirbase = 128;
			pirval = val;
		}

		val = atomic_readandclear_long(&pir_desc->pir[3]);
		if (val != 0) {
			lapic->irr6 |= val;
			lapic->irr7 |= val >> 32;
			pirbase = 192;
			pirval = val;
		}

		VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");

		/*
		 * Update RVI so the processor can evaluate pending virtual
		 * interrupts on VM-entry.
		 *
		 * It is possible for pirval to be 0 here, even though the
		 * pending bit has been set. The scenario is:
		 * CPU-Y is sending a posted interrupt to CPU-X, which
		 * is running a guest and processing posted interrupts in h/w.
		 * CPU-X will eventually exit and the state seen in s/w is
		 * the pending bit set, but no PIR bits set.
		 *
		 *      CPU-X                      CPU-Y
		 *   (vm running)              (host running)
		 *   rx posted interrupt
		 *   CLEAR pending bit
		 *                              SET PIR bit
		 *   READ/CLEAR PIR bits
		 *                              SET pending bit
		 *   (vm exit)
		 *   pending bit set, PIR 0
		 */
		if (pirval != 0) {
			rvi = pirbase + flsl(pirval) - 1;
			intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
			/* RVI lives in the low byte; keep the high byte as is. */
			intr_status_new = (intr_status_old & 0xFF00) | rvi;
			if (intr_status_new > intr_status_old) {
				vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
				VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
				    "guest_intr_status changed from 0x%04x to 0x%04x",
				    intr_status_old, intr_status_new);
			}
		}
	}

	/*
	 * Allocate and initialize the virtual local APIC for vcpu 'vcpuid'.
	 *
	 * arg is the VM's 'struct vmx'.  The object is allocated zeroed as a
	 * 'struct vlapic_vtx' and wired to the vcpu's APIC page and
	 * posted-interrupt descriptor.  The VT-x specific handlers are installed
	 * only when virtual_interrupt_delivery / posted_interrupts are set.
	 *
	 * Returns the embedded 'struct vlapic'; release it with
	 * vmx_vlapic_cleanup().
	 */
	static struct vlapic *
	vmx_vlapic_init(void *arg, int vcpuid)
	{
		struct vmx *vmx;
		struct vlapic *vlapic;
		struct vlapic_vtx *vlapic_vtx;

		vmx = arg;

		vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
		vlapic->vm = vmx->vm;
		vlapic->vcpuid = vcpuid;
		vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];

		vlapic_vtx = (struct vlapic_vtx *)vlapic;
		vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
		vlapic_vtx->vmx = vmx;

		if (virtual_interrupt_delivery) {
			vlapic->ops.set_intr_ready = vmx_set_intr_ready;
			vlapic->ops.pending_intr = vmx_pending_intr;
			vlapic->ops.intr_accepted = vmx_intr_accepted;
			vlapic->ops.set_tmr = vmx_set_tmr;
			vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
		}

		if (posted_interrupts)
			vlapic->ops.post_intr = vmx_post_intr;

		/* The ops must be in place before the generic initialization. */
		vlapic_init(vlapic);

		return (vlapic);
	}

	/*
	 * Release a virtual local APIC created by vmx_vlapic_init(): run the
	 * generic vlapic_cleanup() and free the allocation.  'arg' is not used.
	 */
	static void
	vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
	{

		vlapic_cleanup(vlapic);
		free(vlapic, M_VLAPIC);
	}

	/*
	 * Operations vector exporting the Intel VT-x implementation.
	 *
	 * The initializer is positional, so the order of the entries has to match
	 * the member order of 'struct vmm_ops' (declared outside this file --
	 * check it before adding or moving an entry).
	 */
	struct vmm_ops vmm_ops_intel = {
	vmx_init,
	vmx_cleanup,
	vmx_restore,
	vmx_vminit,
	vmx_run,
	vmx_vmcleanup,
	vmx_getreg,
	vmx_setreg,
	vmx_getdesc,
	vmx_setdesc,
	vmx_getcap,
	vmx_setcap,
	ept_vmspace_alloc,
	ept_vmspace_free,
	vmx_vlapic_init,
	vmx_vlapic_cleanup,
	};
	Index: head/sys/amd64/vmm/vmm.c
	===================================================================
	--- head/sys/amd64/vmm/vmm.c (revision 332156)
	+++ head/sys/amd64/vmm/vmm.c (revision 332157)
	@@ -1,2594 +1,2666 @@
	/*-
	* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	*
	* Copyright (c) 2011 NetApp, Inc.
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* $FreeBSD$
	*/

	#include <sys/cdefs.h>
	__FBSDID("$FreeBSD$");

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/kernel.h>
	#include <sys/module.h>
	#include <sys/sysctl.h>
	#include <sys/malloc.h>
	#include <sys/pcpu.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/proc.h>
	#include <sys/rwlock.h>
	#include <sys/sched.h>
	#include <sys/smp.h>
	#include <sys/systm.h>

	#include <vm/vm.h>
	#include <vm/vm_object.h>
	#include <vm/vm_page.h>
	#include <vm/pmap.h>
	#include <vm/vm_map.h>
	#include <vm/vm_extern.h>
	#include <vm/vm_param.h>

	#include <machine/cpu.h>
	#include <machine/pcb.h>
	#include <machine/smp.h>
	#include <machine/md_var.h>
	#include <x86/psl.h>
	#include <x86/apicreg.h>

	#include <machine/vmm.h>
	#include <machine/vmm_dev.h>
	#include <machine/vmm_instruction_emul.h>

	#include "vmm_ioport.h"
	#include "vmm_ktr.h"
	#include "vmm_host.h"
	#include "vmm_mem.h"
	#include "vmm_util.h"
	#include "vatpic.h"
	#include "vatpit.h"
	#include "vhpet.h"
	#include "vioapic.h"
	#include "vlapic.h"
	#include "vpmtmr.h"
	#include "vrtc.h"
	#include "vmm_stat.h"
	#include "vmm_lapic.h"

	#include "io/ppt.h"
	#include "io/iommu.h"

	struct vlapic;

	/*
	* Initialization:
	* (a) allocated when vcpu is created
	* (i) initialized when vcpu is created and when it is reinitialized
	* (o) initialized the first time the vcpu is created
	* (x) initialized before use
	*/
	struct vcpu {
	struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
	enum vcpu_state state; /* (o) vcpu state */
	int hostcpu; /* (o) vcpu's host cpu */
	int reqidle; /* (i) request vcpu to idle */
	struct vlapic vlapic; / (i) APIC device model */
	enum x2apic_state x2apic_state; /* (i) APIC mode */
	uint64_t exitintinfo; /* (i) events pending at VM exit */
	int nmi_pending; /* (i) NMI pending */
	int extint_pending; /* (i) INTR pending */
	int exception_pending; /* (i) exception pending */
	int exc_vector; /* (x) exception collateral */
	int exc_errcode_valid;
	uint32_t exc_errcode;
	struct savefpu guestfpu; / (a,i) guest fpu state */
	uint64_t guest_xcr0; /* (i) guest %xcr0 register */
	void stats; / (a,i) statistics */
	struct vm_exit exitinfo; /* (x) exit reason and collateral */
	uint64_t nextrip; /* (x) next instruction to execute */
	};

	/*
	 * Helpers for the per-vcpu mutex 'mtx'.  The lock is created with
	 * MTX_SPIN and is taken and dropped with the spin-mutex primitives.
	 */
	#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
	#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
	#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
	#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
	#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)

	/*
	 * A guest memory segment: a VM object of 'len' bytes.  A slot whose
	 * 'object' is NULL is unused (see vm_alloc_memseg()/vm_free_memseg()).
	 */
	struct mem_seg {
	size_t len;
	bool sysmem;
	struct vm_object *object;
	};
	/* Number of slots in 'vm->mem_segs[]'. */
	#define VM_MAX_MEMSEGS 3

	/*
	 * A mapping of part of a memory segment into the guest physical address
	 * space: 'len' bytes of segment 'segid', starting at offset 'segoff',
	 * appear at guest physical address 'gpa'.  A slot with 'len' == 0 is free.
	 */
	struct mem_map {
	vm_paddr_t gpa;
	size_t len;
	vm_ooffset_t segoff;
	int segid;
	int prot;
	int flags;
	};
	/* Number of slots in 'vm->mem_maps[]'. */
	#define VM_MAX_MEMMAPS 4

	/*
	* Initialization:
	* (o) initialized the first time the VM is created
	* (i) initialized when VM is created and when it is reinitialized
	* (x) initialized before use
	*/
	struct vm {
	void cookie; / (i) cpu-specific data */
	void iommu; / (x) iommu-specific data */
	struct vhpet vhpet; / (i) virtual HPET */
	struct vioapic vioapic; / (i) virtual ioapic */
	struct vatpic vatpic; / (i) virtual atpic */
	struct vatpit vatpit; / (i) virtual atpit */
	struct vpmtmr vpmtmr; / (i) virtual ACPI PM timer */
	struct vrtc vrtc; / (o) virtual RTC */
	volatile cpuset_t active_cpus; /* (i) active vcpus */
	+ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */
	int suspend; /* (i) stop VM execution */
	volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
	volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
	cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */
	cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */
	void rendezvous_arg; / (x) rendezvous func/arg */
	vm_rendezvous_func_t rendezvous_func;
	struct mtx rendezvous_mtx; /* (o) rendezvous lock */
	struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace vmspace; / (o) guest's address space */
	char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
	struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
	};

	/* Set to 1 once vmm_init() has succeeded; checked by vm_create(). */
	static int vmm_initialized;

	/*
	 * Backend selected by vmm_init().  Every macro below dispatches through
	 * 'ops' and evaluates to the fallback value on the right-hand side of
	 * the conditional when 'ops' is still NULL.
	 */
	static struct vmm_ops *ops;
	#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0)
	#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
	#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0)

	#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
	#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
	#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
	#define VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
	#define VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
	#define VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
	#define VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
	#define VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
	#define VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
	#define VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
	#define VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
	#define VLAPIC_INIT(vmi, vcpu) \
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
	#define VLAPIC_CLEANUP(vmi, vlapic) \
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)

	/*
	 * fpu_start_emulating() sets CR0.TS on top of the current %cr0 value;
	 * fpu_stop_emulating() clears it again with clts().
	 */
	#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
	#define fpu_stop_emulating() clts()

	/* Malloc type used for 'struct vm' allocations in this file. */
	static MALLOC_DEFINE(M_VM, "vm", "vm");

	/* statistics */
	static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

	/* Parent node of the hw.vmm sysctls declared below. */
	SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

	/*
	* Halt the guest if all vcpus are executing a HLT instruction with
	* interrupts disabled.
	*/
	static int halt_detection_enabled = 1;
	SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
	&halt_detection_enabled, 0,
	"Halt VM if all vcpus execute HLT with interrupts disabled");

	/* Assigned in vmm_init(); IPI_AST is used when allocation fails. */
	static int vmm_ipinum;
	SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
	"IPI vector used for vcpu notifications");

	/* Value returned by vcpu_trace_exceptions(). */
	static int trace_guest_exceptions;
	SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
	&trace_guest_exceptions, 0,
	"Trap into hypervisor on all guest exceptions and reflect them back");

	/* Forward declarations for static helpers used before their definition. */
	static void vm_free_memmap(struct vm *vm, int ident);
	static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
	static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);

	#ifdef KTR
	/*
	 * Translate a vcpu state into a short name for trace messages.
	 * Values without a dedicated case are reported as "unknown".
	 */
	static const char *
	vcpu_state2str(enum vcpu_state state)
	{
		const char *name;

		switch (state) {
		case VCPU_IDLE:
			name = "idle";
			break;
		case VCPU_FROZEN:
			name = "frozen";
			break;
		case VCPU_RUNNING:
			name = "running";
			break;
		case VCPU_SLEEPING:
			name = "sleeping";
			break;
		default:
			name = "unknown";
			break;
		}
		return (name);
	}
	#endif

	/*
	 * Release the resources of vcpu 'i'.  The vlapic is always torn down;
	 * the statistics buffer and the FPU save area are freed only when the
	 * VM is being destroyed ('destroy' is true).
	 */
	static void
	vcpu_cleanup(struct vm *vm, int i, bool destroy)
	{
		struct vcpu *vc;

		vc = &vm->vcpu[i];
		VLAPIC_CLEANUP(vm->cookie, vc->vlapic);
		if (!destroy)
			return;

		vmm_stat_free(vc->stats);
		fpu_save_area_free(vc->guestfpu);
	}

	/*
	 * Initialize vcpu 'vcpu_id'.
	 *
	 * With 'create' set the lock, the FPU save area and the statistics
	 * buffer are set up for the first time.  Everything after that block is
	 * (re)done on every call.
	 */
	static void
	vcpu_init(struct vm *vm, int vcpu_id, bool create)
	{
		struct vcpu *vc;

		KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
		    ("vcpu_init: invalid vcpu %d", vcpu_id));

		vc = &vm->vcpu[vcpu_id];

		/* First-time setup only. */
		if (create) {
			KASSERT(!vcpu_lock_initialized(vc), ("vcpu %d already "
			    "initialized", vcpu_id));
			vcpu_lock_init(vc);
			vc->state = VCPU_IDLE;
			vc->hostcpu = NOCPU;
			vc->guestfpu = fpu_save_area_alloc();
			vc->stats = vmm_stat_alloc();
		}

		/* A fresh vlapic is obtained on every (re)initialization. */
		vc->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
		vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);

		/* Clear all pending-event state. */
		vc->reqidle = 0;
		vc->exitintinfo = 0;
		vc->nmi_pending = 0;
		vc->extint_pending = 0;
		vc->exception_pending = 0;

		vc->guest_xcr0 = XFEATURE_ENABLED_X87;
		fpu_save_area_reset(vc->guestfpu);
		vmm_stat_init(vc->stats);
	}

	/*
	 * Return the 'trace_guest_exceptions' setting.  The value is global:
	 * neither 'vm' nor 'vcpuid' is consulted.
	 */
	int
	vcpu_trace_exceptions(struct vm *vm, int vcpuid)
	{

	return (trace_guest_exceptions);
	}

	struct vm_exit *
	vm_exitinfo(struct vm *vm, int cpuid)
	{
	struct vcpu *vcpu;

	if (cpuid < 0 \|\| cpuid >= VM_MAXCPU)
	panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
	}

	/*
	 * Call the backend's 'resume' hook if a backend has been selected.
	 * Installed as 'vmm_resume_p' by vmm_init().
	 */
	static void
	vmm_resume(void)
	{
	VMM_RESUME();
	}

	/*
	 * Module-wide initialization: allocate the notification IPI vector,
	 * initialize the memory subsystem, select the Intel or AMD backend and
	 * run its init hook.
	 *
	 * Returns 0 on success, the vmm_mem_init() error, ENXIO when the CPU is
	 * neither Intel nor AMD, or whatever the backend init hook returns.
	 */
	static int
	vmm_init(void)
	{
		int error;

		vmm_host_state_init();

		if (pti)
			vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn1_pti));
		else
			vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn));
		/* No dedicated vector available: fall back to IPI_AST. */
		if (vmm_ipinum < 0)
			vmm_ipinum = IPI_AST;

		error = vmm_mem_init();
		if (error != 0)
			return (error);

		if (vmm_is_intel()) {
			ops = &vmm_ops_intel;
		} else if (vmm_is_amd()) {
			ops = &vmm_ops_amd;
		} else {
			return (ENXIO);
		}

		vmm_resume_p = vmm_resume;

		return (VMM_INIT(vmm_ipinum));
	}

	/*
	 * Module event handler.
	 *
	 * MOD_LOAD initializes the device layer and then the vmm core; the
	 * module is marked initialized only when vmm_init() succeeds.
	 * MOD_UNLOAD undoes that, provided vmmdev_cleanup() succeeds.
	 * Any other event is accepted and ignored.
	 */
	static int
	vmm_handler(module_t mod, int what, void *arg)
	{
		int error;

		switch (what) {
		case MOD_LOAD:
			vmmdev_init();
			error = vmm_init();
			if (error == 0)
				vmm_initialized = 1;
			break;
		case MOD_UNLOAD:
			error = vmmdev_cleanup();
			if (error != 0)
				break;
			vmm_resume_p = NULL;
			iommu_cleanup();
			/* IPI_AST is the fallback vector and was never allocated. */
			if (vmm_ipinum != IPI_AST)
				lapic_ipi_free(vmm_ipinum);
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error != 0)
				vmm_initialized = 0;
			break;
		default:
			error = 0;
			break;
		}
		return (error);
	}

	/* Module descriptor: the module is named "vmm" and handled by vmm_handler(). */
	static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
	};

	/*
	* vmm initialization has the following dependencies:
	*
	* - VT-x initialization requires smp_rendezvous() and therefore must happen
	* after SMP is fully functional (after SI_SUB_SMP).
	*/
	DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
	MODULE_VERSION(vmm, 1);

	/*
	 * (Re)initialize the state of 'vm'.
	 *
	 * Called with 'create' == true from vm_create() and with 'create' ==
	 * false from vm_reinit().  The virtual RTC is only created when 'create'
	 * is true; every other device model, the cpu sets and all vcpus are set
	 * up on each call.
	 */
	static void
	vm_init(struct vm *vm, bool create)
	{
	int i;

	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
	vm->vrtc = vrtc_init(vm);

	CPU_ZERO(&vm->active_cpus);
	+ CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	/* 'create' is forwarded so one-time vcpu setup happens only once. */
	for (i = 0; i < VM_MAXCPU; i++)
	vcpu_init(vm, i, create);
	}

	int
	vm_create(const char name, struct vm *retvm)
	{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	* If vmm.ko could not be successfully initialized then don't attempt
	* to create the virtual machine.
	*/
	if (!vmm_initialized)
	return (ENXIO);

	if (name == NULL \|\| strlen(name) >= VM_MAX_NAMELEN)
	return (EINVAL);

	vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
	return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK \| M_ZERO);
	strcpy(vm->name, name);
	vm->vmspace = vmspace;
	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);

	vm_init(vm, true);

	*retvm = vm;
	return (0);
	}

	/*
	 * Undo vm_init().
	 *
	 * With 'destroy' == false (VM reset) the virtual RTC is reset instead of
	 * freed, system-memory mappings are kept, and the memory segments and the
	 * vmspace stay in place.  With 'destroy' == true everything is released.
	 */
	static void
	vm_cleanup(struct vm *vm, bool destroy)
	{
		struct mem_map *mm;
		int i;

		ppt_unassign_all(vm);

		if (vm->iommu != NULL)
			iommu_destroy_domain(vm->iommu);

		if (destroy)
			vrtc_cleanup(vm->vrtc);
		else
			vrtc_reset(vm->vrtc);
		vpmtmr_cleanup(vm->vpmtmr);
		vatpit_cleanup(vm->vatpit);
		vhpet_cleanup(vm->vhpet);
		vatpic_cleanup(vm->vatpic);
		vioapic_cleanup(vm->vioapic);

		for (i = 0; i < VM_MAXCPU; i++)
			vcpu_cleanup(vm, i, destroy);

		VMCLEANUP(vm->cookie);

		/*
		 * System memory is removed from the guest address space only when
		 * the VM is destroyed. This is because the mapping remains the same
		 * across VM reset.
		 *
		 * Device memory can be relocated by the guest (e.g. using PCI BARs)
		 * so those mappings are removed on a VM reset.
		 */
		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
			mm = &vm->mem_maps[i];
			if (destroy || !sysmem_mapping(vm, mm))
				vm_free_memmap(vm, i);
		}

		if (destroy) {
			for (i = 0; i < VM_MAX_MEMSEGS; i++)
				vm_free_memseg(vm, i);

			VMSPACE_FREE(vm->vmspace);
			vm->vmspace = NULL;
		}
	}

	/*
	 * Release everything owned by 'vm' and then free the 'struct vm' itself.
	 * 'vm' must not be used after this returns.
	 */
	void
	vm_destroy(struct vm *vm)
	{
	vm_cleanup(vm, true);
	free(vm, M_VM);
	}

	/*
	 * Reset 'vm' by cleaning it up (without destroying it) and initializing
	 * it again.
	 *
	 * Returns 0 on success or EBUSY when the set of suspended vcpus differs
	 * from the set of active vcpus.
	 */
	int
	vm_reinit(struct vm *vm)
	{

		/*
		 * A virtual machine can be reset only if all vcpus are suspended.
		 */
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0)
			return (EBUSY);

		vm_cleanup(vm, false);
		vm_init(vm, false);
		return (0);
	}

	/*
	 * Return the name of 'vm'.  The pointer refers to storage inside 'vm'
	 * itself, not to a copy.
	 */
	const char *
	vm_name(struct vm *vm)
	{
	return (vm->name);
	}

	/*
	 * Map 'len' bytes of host physical memory at 'hpa' into the guest at
	 * 'gpa' through vmm_mmio_alloc().
	 *
	 * Returns 0 on success or ENOMEM when no object could be allocated.
	 */
	int
	vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
	{
		vm_object_t obj;

		obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa);
		if (obj != NULL)
			return (0);
		return (ENOMEM);
	}

	/*
	 * Remove the MMIO mapping of 'len' bytes at guest physical address
	 * 'gpa'.  Always returns 0.
	 */
	int
	vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
	{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
	}

	/*
	 * Report whether 'gpa' lies inside any in-use entry of 'vm->mem_maps[]'
	 * or is PCI passthrough MMIO.
	 *
	 * The caller must be the running vcpu 'vcpuid'; that running state acts
	 * as an implicit lock on 'vm->mem_maps[]' and is asserted under
	 * INVARIANTS.
	 */
	bool
	vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
	{
		struct mem_map *mm;
		int i;

	#ifdef INVARIANTS
		int hostcpu, state;
		state = vcpu_get_state(vm, vcpuid, &hostcpu);
		KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
		    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
	#endif

		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
			mm = &vm->mem_maps[i];
			if (mm->len == 0)
				continue;	/* unused slot */
			if (gpa < mm->gpa)
				continue;
			if (gpa < mm->gpa + mm->len)
				return (true);	/* 'gpa' is sysmem or devmem */
		}

		/* Not in any mapping: it may still be pci passthru mmio. */
		return (ppt_is_mmio(vm, gpa) ? true : false);
	}

	/*
	 * Allocate memory segment 'ident' with a backing object of 'len' bytes.
	 *
	 * Returns 0 on success; EINVAL if 'ident' is out of range, 'len' is zero
	 * or not page aligned, or the slot is in use with different attributes;
	 * EEXIST if the slot is in use with identical 'len' and 'sysmem'; ENOMEM
	 * if the object could not be allocated.
	 */
	int
	vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
	{
		struct mem_seg *seg;
		vm_object_t obj;

		if (ident < 0 || ident >= VM_MAX_MEMSEGS)
			return (EINVAL);

		if (len == 0 || (len & PAGE_MASK))
			return (EINVAL);

		seg = &vm->mem_segs[ident];
		if (seg->object != NULL) {
			if (seg->len == len && seg->sysmem == sysmem)
				return (EEXIST);
			else
				return (EINVAL);
		}

		obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
		if (obj == NULL)
			return (ENOMEM);

		seg->len = len;
		seg->object = obj;
		seg->sysmem = sysmem;
		return (0);
	}

	/*
	 * Look up memory segment 'ident' and copy its attributes to whichever of
	 * 'len', 'sysmem' and 'objptr' are non-NULL.
	 *
	 * Returns 0 on success or EINVAL if 'ident' is out of range.  An unused
	 * slot is not an error; its stored values are returned as they are.
	 */
	int
	vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
	    vm_object_t *objptr)
	{
		struct mem_seg *seg;

		if (ident < 0 || ident >= VM_MAX_MEMSEGS)
			return (EINVAL);

		seg = &vm->mem_segs[ident];
		if (len)
			*len = seg->len;
		if (sysmem)
			*sysmem = seg->sysmem;
		if (objptr)
			*objptr = seg->object;
		return (0);
	}

	/*
	 * Release memory segment 'ident': drop the reference on its object and
	 * zero the slot.  Nothing happens when the slot has no object.
	 */
	void
	vm_free_memseg(struct vm *vm, int ident)
	{
		struct mem_seg *seg;

		KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
		    ("%s: invalid memseg ident %d", __func__, ident));

		seg = &vm->mem_segs[ident];
		if (seg->object == NULL)
			return;

		vm_object_deallocate(seg->object);
		bzero(seg, sizeof(*seg));
	}

	/*
	 * Map bytes ['first', 'first' + 'len') of memory segment 'segid' into
	 * the guest address space at 'gpa', optionally wiring the range.
	 *
	 * Returns 0 on success; EINVAL for a bad 'prot', 'flags', 'segid', an
	 * unallocated segment, a range outside the segment or unaligned values;
	 * ENOSPC if 'vm->mem_maps[]' has no free slot; EFAULT if the mapping or
	 * the wiring fails.
	 */
	int
	vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
	    size_t len, int prot, int flags)
	{
		struct mem_seg *seg;
		struct mem_map *m, *map;
		vm_ooffset_t last;
		int i, error;

		if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
			return (EINVAL);

		if (flags & ~VM_MEMMAP_F_WIRED)
			return (EINVAL);

		if (segid < 0 || segid >= VM_MAX_MEMSEGS)
			return (EINVAL);

		seg = &vm->mem_segs[segid];
		if (seg->object == NULL)
			return (EINVAL);

		/* 'first >= last' also rejects a zero 'len' and wrap-around. */
		last = first + len;
		if (first < 0 || first >= last || last > seg->len)
			return (EINVAL);

		if ((gpa | first | last) & PAGE_MASK)
			return (EINVAL);

		/* Find a free slot before touching the address space. */
		map = NULL;
		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
			m = &vm->mem_maps[i];
			if (m->len == 0) {
				map = m;
				break;
			}
		}

		if (map == NULL)
			return (ENOSPC);

		error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
		    len, 0, VMFS_NO_SPACE, prot, prot, 0);
		if (error != KERN_SUCCESS)
			return (EFAULT);

		vm_object_reference(seg->object);

		if (flags & VM_MEMMAP_F_WIRED) {
			error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
			    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
			if (error != KERN_SUCCESS) {
				/* Back out the mapping created above. */
				vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
				return (EFAULT);
			}
		}

		map->gpa = gpa;
		map->len = len;
		map->segoff = first;
		map->segid = segid;
		map->prot = prot;
		map->flags = flags;
		return (0);
	}

	/*
	 * Find the in-use mapping with the lowest guest physical address that is
	 * greater than or equal to '*gpa'.
	 *
	 * On success '*gpa' is updated to the start of that mapping, its other
	 * attributes are stored through whichever of 'segid', 'segoff', 'len',
	 * 'prot' and 'flags' are non-NULL, and 0 is returned.  ENOENT is
	 * returned when no such mapping exists.
	 */
	int
	vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
	    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
	{
		struct mem_map *mm, *mmnext;
		int i;

		mmnext = NULL;
		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
			mm = &vm->mem_maps[i];
			if (mm->len == 0 || mm->gpa < *gpa)
				continue;
			if (mmnext == NULL || mm->gpa < mmnext->gpa)
				mmnext = mm;
		}

		if (mmnext != NULL) {
			*gpa = mmnext->gpa;
			if (segid)
				*segid = mmnext->segid;
			if (segoff)
				*segoff = mmnext->segoff;
			if (len)
				*len = mmnext->len;
			if (prot)
				*prot = mmnext->prot;
			if (flags)
				*flags = mmnext->flags;
			return (0);
		} else {
			return (ENOENT);
		}
	}

	/*
	 * Remove mapping 'ident' from the guest address space and zero its slot.
	 * A slot with a zero length is left alone.
	 */
	static void
	vm_free_memmap(struct vm *vm, int ident)
	{
		struct mem_map *mm;
		int error;

		mm = &vm->mem_maps[ident];
		if (mm->len == 0)
			return;

		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
		    mm->gpa + mm->len);
		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
		    __func__, error));
		bzero(mm, sizeof(*mm));
	}

	static __inline bool
	sysmem_mapping(struct vm vm, struct mem_map mm)
	{

	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
	return (true);
	else
	return (false);
	}

	/*
	 * Return the highest end address ('gpa' + 'len') over all system-memory
	 * mappings of 'vm', or 0 when there are none.
	 */
	static vm_paddr_t
	sysmem_maxaddr(struct vm *vm)
	{
		struct mem_map *mm;
		vm_paddr_t end, maxaddr;
		int i;

		maxaddr = 0;
		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
			mm = &vm->mem_maps[i];
			if (!sysmem_mapping(vm, mm))
				continue;
			end = mm->gpa + mm->len;
			if (maxaddr < end)
				maxaddr = end;
		}
		return (maxaddr);
	}

	/*
	 * Move the pages of every wired system-memory mapping between the host
	 * IOMMU domain and the VM's IOMMU domain, one page at a time.
	 *
	 * With 'map' true each page is added to 'vm->iommu' and removed from the
	 * host domain, and the mapping is tagged VM_MEMMAP_F_IOMMU.  With 'map'
	 * false only mappings carrying that tag are processed, in the opposite
	 * direction, and the tag is cleared.
	 */
	static void
	vm_iommu_modify(struct vm *vm, boolean_t map)
	{
		int i, sz;
		vm_paddr_t gpa, hpa;
		struct mem_map *mm;
		void *vp, *cookie, *host_domain;

		sz = PAGE_SIZE;
		host_domain = iommu_host_domain();

		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
			mm = &vm->mem_maps[i];
			if (!sysmem_mapping(vm, mm))
				continue;

			if (map) {
				KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
				    ("iommu map found invalid memmap %#lx/%#lx/%#x",
				    mm->gpa, mm->len, mm->flags));
				if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
					continue;
				mm->flags |= VM_MEMMAP_F_IOMMU;
			} else {
				if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
					continue;
				mm->flags &= ~VM_MEMMAP_F_IOMMU;
				KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
				    ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
				    mm->gpa, mm->len, mm->flags));
			}

			gpa = mm->gpa;
			while (gpa < mm->gpa + mm->len) {
				vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
				    &cookie);
				KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
				    vm_name(vm), gpa));

				/* The hold was only needed to obtain 'vp'. */
				vm_gpa_release(cookie);

				hpa = DMAP_TO_PHYS((uintptr_t)vp);
				if (map) {
					iommu_create_mapping(vm->iommu, gpa, hpa, sz);
					iommu_remove_mapping(host_domain, hpa, sz);
				} else {
					iommu_remove_mapping(vm->iommu, gpa, sz);
					iommu_create_mapping(host_domain, hpa, hpa, sz);
				}

				gpa += PAGE_SIZE;
			}
		}

		/*
		 * Invalidate the cached translations associated with the domain
		 * from which pages were removed.
		 */
		if (map)
			iommu_invalidate_tlb(host_domain);
		else
			iommu_invalidate_tlb(vm->iommu);
	}

	#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
	#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)

	/*
	 * Detach the passthrough device at 'bus'/'slot'/'func' from 'vm'.  When
	 * no assigned devices remain, the VM's memory is unmapped from its IOMMU
	 * domain.
	 *
	 * Returns 0 on success or the ppt_unassign_device() error.
	 */
	int
	vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
	{
		int error;

		error = ppt_unassign_device(vm, bus, slot, func);
		if (error != 0)
			return (error);

		/* Last device gone: give the pages back to the host domain. */
		if (ppt_assigned_devices(vm) == 0)
			vm_iommu_unmap(vm);

		return (0);
	}

	/*
	 * Attach the passthrough device at 'bus'/'slot'/'func' to 'vm'.  For the
	 * first device an IOMMU domain covering all system memory is created and
	 * populated beforehand.
	 *
	 * Returns ENXIO if the domain cannot be created, otherwise the result of
	 * ppt_assign_device().
	 */
	int
	vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
	{

		/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
		if (ppt_assigned_devices(vm) == 0) {
			KASSERT(vm->iommu == NULL,
			    ("vm_assign_pptdev: iommu must be NULL"));
			vm->iommu = iommu_create_domain(sysmem_maxaddr(vm));
			if (vm->iommu == NULL)
				return (ENXIO);
			vm_iommu_map(vm);
		}

		return (ppt_assign_device(vm, bus, slot, func));
	}

	/*
	 * Hold the page backing guest physical address 'gpa' and return a host
	 * address for it through the direct map, preserving the offset of 'gpa'
	 * within its page.
	 *
	 * ['gpa', 'gpa' + 'len') must not cross a page boundary; the function
	 * panics otherwise.  Only system-memory mappings are searched.  On
	 * success '*cookie' is the value to pass to vm_gpa_release(); on failure
	 * both '*cookie' and the return value are NULL.
	 *
	 * 'vcpuid' may be -1, in which case INVARIANTS kernels assert that every
	 * vcpu is frozen rather than just one.
	 */
	void *
	vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
	    void **cookie)
	{
		int i, count, pageoff;
		struct mem_map *mm;
		vm_page_t m;
	#ifdef INVARIANTS
		/*
		 * All vcpus are frozen by ioctls that modify the memory map
		 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->mem_maps[]' stability is
		 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
		 */
		int state;
		KASSERT(vcpuid >= -1 && vcpuid < VM_MAXCPU, ("%s: invalid vcpuid %d",
		    __func__, vcpuid));
		for (i = 0; i < VM_MAXCPU; i++) {
			if (vcpuid == -1 || vcpuid == i) {
				state = vcpu_get_state(vm, i, NULL);
				KASSERT(state == VCPU_FROZEN,
				    ("%s: invalid vcpu state %d", __func__, state));
			}
		}
	#endif
		pageoff = gpa & PAGE_MASK;
		if (len > PAGE_SIZE - pageoff)
			panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

		count = 0;
		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
			mm = &vm->mem_maps[i];
			if (!sysmem_mapping(vm, mm))
				continue;
			if (gpa < mm->gpa)
				continue;
			if (gpa < mm->gpa + mm->len) {
				/* Only the first containing mapping is tried. */
				count = vm_fault_quick_hold_pages(
				    &vm->vmspace->vm_map, trunc_page(gpa),
				    PAGE_SIZE, reqprot, &m, 1);
				break;
			}
		}

		if (count != 1) {
			*cookie = NULL;
			return (NULL);
		}

		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	}

	/*
	 * Drop the page hold taken by vm_gpa_hold().  'cookie' is the value that
	 * vm_gpa_hold() stored through its 'cookie' argument and is used as a
	 * vm_page_t here.
	 */
	void
	vm_gpa_release(void *cookie)
	{
	vm_page_t m = cookie;

	/* The unhold is done with the page lock held. */
	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
	}

	/*
	 * Read register 'reg' of vcpu 'vcpu' into '*retval' via the backend.
	 *
	 * Returns EINVAL if 'vcpu' is out of range or 'reg' is VM_REG_LAST or
	 * above; otherwise the backend's result.  Only the upper bound of 'reg'
	 * is checked here.
	 */
	int
	vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
	{

		if (vcpu < 0 || vcpu >= VM_MAXCPU)
			return (EINVAL);

		if (reg >= VM_REG_LAST)
			return (EINVAL);

		return (VMGETREG(vm->cookie, vcpu, reg, retval));
	}

	/*
	 * Write 'val' to register 'reg' of vcpu 'vcpuid' via the backend.  A
	 * successful write to VM_REG_GUEST_RIP also updates the vcpu's 'nextrip'.
	 *
	 * Returns EINVAL if 'vcpuid' is out of range or 'reg' is VM_REG_LAST or
	 * above; otherwise the backend's result.
	 */
	int
	vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
	{
		struct vcpu *vcpu;
		int error;

		if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
			return (EINVAL);

		if (reg >= VM_REG_LAST)
			return (EINVAL);

		error = VMSETREG(vm->cookie, vcpuid, reg, val);
		if (error || reg != VM_REG_GUEST_RIP)
			return (error);

		/* Set 'nextrip' to match the value of %rip */
		VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
		vcpu = &vm->vcpu[vcpuid];
		vcpu->nextrip = val;
		return (0);
	}

	/* Return TRUE when 'reg' names the guest IDTR or GDTR. */
	static boolean_t
	is_descriptor_table(int reg)
	{

		if (reg == VM_REG_GUEST_IDTR || reg == VM_REG_GUEST_GDTR)
			return (TRUE);
		return (FALSE);
	}

	/*
	 * Return TRUE when 'reg' names one of the guest segment registers
	 * (ES, CS, SS, DS, FS, GS), TR or LDTR.
	 */
	static boolean_t
	is_segment_register(int reg)
	{
		boolean_t match;

		switch (reg) {
		case VM_REG_GUEST_ES:
		case VM_REG_GUEST_CS:
		case VM_REG_GUEST_SS:
		case VM_REG_GUEST_DS:
		case VM_REG_GUEST_FS:
		case VM_REG_GUEST_GS:
		case VM_REG_GUEST_TR:
		case VM_REG_GUEST_LDTR:
			match = TRUE;
			break;
		default:
			match = FALSE;
			break;
		}
		return (match);
	}

	int
	vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
	struct seg_desc *desc)
	{

	if (vcpu < 0 \|\| vcpu >= VM_MAXCPU)
	return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
	return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
	}

	int
	vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
	struct seg_desc *desc)
	{
	if (vcpu < 0 \|\| vcpu >= VM_MAXCPU)
	return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
	return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
	}

	/*
	 * Load the guest FPU state of 'vcpu' onto the current cpu.
	 *
	 * The steps are order dependent: host state is flushed first, CR0.TS is
	 * cleared around the restore, and CR0.TS is set again at the end so that
	 * host FPU use traps while guest state is loaded.
	 */
	static void
	restore_guest_fpustate(struct vcpu *vcpu)
	{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
	load_xcr(0, vcpu->guest_xcr0);

	/*
	* The FPU is now "dirty" with the guest's state so turn on emulation
	* to trap any access to the FPU by the host.
	*/
	fpu_start_emulating();
	}

	/*
	 * Save the guest FPU state of 'vcpu' from the current cpu.
	 *
	 * Panics if CR0.TS is clear on entry, i.e. if FPU emulation was not left
	 * enabled by restore_guest_fpustate().  CR0.TS is set again on return.
	 */
	static void
	save_guest_fpustate(struct vcpu *vcpu)
	{

	if ((rcr0() & CR0_TS) == 0)
	panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
	vcpu->guest_xcr0 = rxcr(0);
	load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
	}

	/* Incremented in vm_handle_hlt() by the ticks spent in each sleep. */
	static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

	/*
	 * Move vcpu 'vcpuid' to 'newstate'.  The vcpu lock must be held.
	 *
	 * With 'from_idle' true the call first sleeps until the vcpu is in
	 * VCPU_IDLE, asking it to idle on each pass.  With 'from_idle' false the
	 * vcpu must not currently be idle.
	 *
	 * Returns 0 on success or EBUSY if the transition is not allowed.  On
	 * success 'hostcpu' is set to the current cpu when entering VCPU_RUNNING
	 * and to NOCPU otherwise, and sleepers on 'vcpu->state' are woken when
	 * the new state is VCPU_IDLE.
	 */
	static int
	vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
	bool from_idle)
	{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	* State transitions from the vmmdev_ioctl() must always begin from
	* the VCPU_IDLE state. This guarantees that there is only a single
	* ioctl() operating on a vcpu at any point.
	*/
	if (from_idle) {
	while (vcpu->state != VCPU_IDLE) {
	vcpu->reqidle = 1;
	vcpu_notify_event_locked(vcpu, false);
	VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
	"idle requested", vcpu_state2str(vcpu->state));
	/* The sleep times out after 'hz' ticks and the loop re-checks. */
	msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
	}
	} else {
	KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
	"vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
	KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
	"mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
	KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
	"vcpu that is not running", vcpu->hostcpu));
	}

	/*
	* The following state transitions are allowed:
	* IDLE -> FROZEN -> IDLE
	* FROZEN -> RUNNING -> FROZEN
	* FROZEN -> SLEEPING -> FROZEN
	*/
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
	error = (newstate != VCPU_FROZEN);
	break;
	case VCPU_FROZEN:
	error = (newstate == VCPU_FROZEN);
	break;
	default:
	error = 1;
	break;
	}

	if (error)
	return (EBUSY);

	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
	vcpu_state2str(vcpu->state), vcpu_state2str(newstate));

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
	vcpu->hostcpu = curcpu;
	else
	vcpu->hostcpu = NOCPU;

	/* Wake threads sleeping in the 'from_idle' loop above. */
	if (newstate == VCPU_IDLE)
	wakeup(&vcpu->state);

	return (0);
	}

	static void
	vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
	{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
	panic("Error %d setting state to %d\n", error, newstate);
	}

	static void
	vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
	{
	int error;

	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
	panic("Error %d setting state to %d", error, newstate);
	}

	/*
	 * Publish 'func' as the VM's rendezvous function; NULL ends the
	 * rendezvous.  The caller must hold 'rendezvous_mtx'.
	 */
	static void
	vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
	{

	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

	/*
	* Update 'rendezvous_func' and execute a write memory barrier to
	* ensure that it is visible across all host cpus. This is not needed
	* for correctness but it does ensure that all the vcpus will notice
	* that the rendezvous is requested immediately.
	*/
	vm->rendezvous_func = func;
	wmb();
	}

	/*
	 * Trace helper for rendezvous code: uses the per-vcpu trace macro when
	 * 'vcpuid' is non-negative and the per-VM one otherwise.
	 */
	#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \
	do { \
	if (vcpuid >= 0) \
	VCPU_CTR0(vm, vcpuid, fmt); \
	else \
	VM_CTR0(vm, fmt); \
	} while (0)

	static void
	vm_handle_rendezvous(struct vm *vm, int vcpuid)
	{

	KASSERT(vcpuid == -1 \|\| (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	while (vm->rendezvous_func != NULL) {
	/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
	CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);

	if (vcpuid != -1 &&
	CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
	!CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
	VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
	(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
	CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
	}
	if (CPU_CMP(&vm->rendezvous_req_cpus,
	&vm->rendezvous_done_cpus) == 0) {
	VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
	vm_set_rendezvous_func(vm, NULL);
	wakeup(&vm->rendezvous_func);
	break;
	}
	RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
	mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
	"vmrndv", 0);
	}
	mtx_unlock(&vm->rendezvous_mtx);
	}

	/*
	 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
	 *
	 * 'intr_disabled' tells whether the guest executed HLT with interrupts
	 * disabled; in that case pending interrupts do not end the sleep and the
	 * vcpu takes part in halt detection.  'retu' is not referenced.
	 *
	 * Always returns 0.  If every active vcpu ends up halted the VM is
	 * suspended with VM_SUSPEND_HALT.
	 */
	static int
	vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
	{
		struct vcpu *vcpu;
		const char *wmesg;
		int t, vcpu_halted, vm_halted;

		KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

		vcpu = &vm->vcpu[vcpuid];
		vcpu_halted = 0;
		vm_halted = 0;

		vcpu_lock(vcpu);
		while (1) {
			/*
			 * Do a final check for pending NMI or interrupts before
			 * really putting this thread to sleep. Also check for
			 * software events that would cause this vcpu to wakeup.
			 *
			 * These interrupts/events could have happened after the
			 * vcpu returned from VMRUN() and before it acquired the
			 * vcpu lock above.
			 */
			if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
				break;
			if (vm_nmi_pending(vm, vcpuid))
				break;
			if (!intr_disabled) {
				if (vm_extint_pending(vm, vcpuid) ||
				    vlapic_pending_intr(vcpu->vlapic, NULL)) {
					break;
				}
			}

			/* Don't go to sleep if the vcpu thread needs to yield */
			if (vcpu_should_yield(vm, vcpuid))
				break;

	+ if (vcpu_debugged(vm, vcpuid))
	+ break;
	+
			/*
			 * Some Linux guests implement "halt" by having all vcpus
			 * execute HLT with interrupts disabled. 'halted_cpus' keeps
			 * track of the vcpus that have entered this state. When all
			 * vcpus enter the halted state the virtual machine is halted.
			 */
			if (intr_disabled) {
				wmesg = "vmhalt";
				VCPU_CTR0(vm, vcpuid, "Halted");
				if (!vcpu_halted && halt_detection_enabled) {
					vcpu_halted = 1;
					CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
				}
				if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
					vm_halted = 1;
					break;
				}
			} else {
				wmesg = "vmidle";
			}

			t = ticks;
			vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
			/*
			 * XXX msleep_spin() cannot be interrupted by signals so
			 * wake up periodically to check pending signals.
			 */
			msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
			vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
			vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
		}

		/* Leave the halted set only if this call joined it. */
		if (vcpu_halted)
			CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

		vcpu_unlock(vcpu);

		/* The suspend is requested after the vcpu lock is dropped. */
		if (vm_halted)
			vm_suspend(vm, VM_SUSPEND_HALT);

		return (0);
	}

	/*
	 * Handle a nested paging fault recorded in the exit information of vcpu
	 * 'vcpuid'.
	 *
	 * Read and write faults are first offered to
	 * pmap_emulate_accessed_dirty(); if that does not resolve the fault, or
	 * for execute faults, the fault is passed to vm_fault() on the guest's
	 * vm_map.  'retu' is not referenced.
	 *
	 * Returns 0 when the fault was resolved and EFAULT otherwise.
	 */
	static int
	vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
	{
		int rv, ftype;
		struct vm_map *map;
		struct vcpu *vcpu;
		struct vm_exit *vme;

		vcpu = &vm->vcpu[vcpuid];
		vme = &vcpu->exitinfo;

		KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
		    __func__, vme->inst_length));

		ftype = vme->u.paging.fault_type;
		KASSERT(ftype == VM_PROT_READ ||
		    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
		    ("vm_handle_paging: invalid fault_type %d", ftype));

		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
			rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
			    vme->u.paging.gpa, ftype);
			if (rv == 0) {
				VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
				    ftype == VM_PROT_READ ? "accessed" : "dirty",
				    vme->u.paging.gpa);
				goto done;
			}
		}

		map = &vm->vmspace->vm_map;
		rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

		VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
		    "ftype = %d", rv, vme->u.paging.gpa, ftype);

		if (rv != KERN_SUCCESS)
			return (EFAULT);
	done:
		return (0);
	}

	/*
	 * Handle a VM_EXITCODE_INST_EMUL exit: fetch (if not already present in
	 * 'vie'), decode and emulate the instruction that faulted on 'gpa'.
	 *
	 * Accesses that fall inside the local APIC, I/O APIC or HPET windows are
	 * emulated here through the matching mmio read/write handlers.  For any
	 * other address, or when decoding fails, '*retu' is set to true and 0 is
	 * returned so the exit is completed in userspace.
	 *
	 * Returns the error from instruction fetch or emulation; a fetch that
	 * reports a guest fault returns that (possibly zero) error without
	 * emulating anything.
	 */
	static int
	vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
	{
		struct vie *vie;
		struct vcpu *vcpu;
		struct vm_exit *vme;
		uint64_t gla, gpa, cs_base;
		struct vm_guest_paging *paging;
		mem_region_read_t mread;
		mem_region_write_t mwrite;
		enum vm_cpu_mode cpu_mode;
		int cs_d, error, fault;

		vcpu = &vm->vcpu[vcpuid];
		vme = &vcpu->exitinfo;

		KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
		    __func__, vme->inst_length));

		gla = vme->u.inst_emul.gla;
		gpa = vme->u.inst_emul.gpa;
		cs_base = vme->u.inst_emul.cs_base;
		cs_d = vme->u.inst_emul.cs_d;
		vie = &vme->u.inst_emul.vie;
		paging = &vme->u.inst_emul.paging;
		cpu_mode = paging->cpu_mode;

		VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);

		/* Fetch, decode and emulate the faulting instruction */
		if (vie->num_valid == 0) {
			error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
			    cs_base, VIE_INST_SIZE, vie, &fault);
		} else {
			/*
			 * The instruction bytes have already been copied into 'vie'
			 */
			error = fault = 0;
		}
		if (error || fault)
			return (error);

		if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
			VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
			    vme->rip + cs_base);
			*retu = true;	    /* dump instruction bytes in userspace */
			return (0);
		}

		/*
		 * Update 'nextrip' based on the length of the emulated instruction.
		 */
		vme->inst_length = vie->num_processed;
		vcpu->nextrip += vie->num_processed;
		VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
		    "decoding", vcpu->nextrip);

		/* return to userland unless this is an in-kernel emulated device */
		if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
			mread = lapic_mmio_read;
			mwrite = lapic_mmio_write;
		} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
			mread = vioapic_mmio_read;
			mwrite = vioapic_mmio_write;
		} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
			mread = vhpet_mmio_read;
			mwrite = vhpet_mmio_write;
		} else {
			*retu = true;
			return (0);
		}

		error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
		    mread, mwrite, retu);

		return (error);
	}

	/*
	 * Handle a VM_EXITCODE_SUSPENDED exit: mark 'vcpuid' as suspended and
	 * wait until the set of suspended vcpus equals the set of active vcpus,
	 * servicing rendezvous requests while waiting.  Afterwards every
	 * suspended vcpu is notified, '*retu' is set to true and 0 is returned.
	 */
	static int
	vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
	{
		int i, done;
		struct vcpu *vcpu;

		done = 0;
		vcpu = &vm->vcpu[vcpuid];

		CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

		/*
		 * Wait until all 'active_cpus' have suspended themselves.
		 *
		 * Since a VM may be suspended at any time including when one or
		 * more vcpus are doing a rendezvous we need to call the rendezvous
		 * handler while we are waiting to prevent a deadlock.
		 */
		vcpu_lock(vcpu);
		while (1) {
			if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
				VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
				break;
			}

			if (vm->rendezvous_func == NULL) {
				VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
				vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
				msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
				vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
			} else {
				VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
				/* The rendezvous handler is called with the vcpu unlocked. */
				vcpu_unlock(vcpu);
				vm_handle_rendezvous(vm, vcpuid);
				vcpu_lock(vcpu);
			}
		}
		vcpu_unlock(vcpu);

		/*
		 * Wakeup the other sleeping vcpus and return to userspace.
		 */
		for (i = 0; i < VM_MAXCPU; i++) {
			if (CPU_ISSET(i, &vm->suspended_cpus)) {
				vcpu_notify_event(vm, i, false);
			}
		}

		*retu = true;
		return (0);
	}

	/*
	 * Handle a VM_EXITCODE_REQIDLE exit: clear the vcpu's 'reqidle' flag
	 * under the vcpu lock and ask for a return to userspace via '*retu'.
	 * Always returns 0.
	 */
	static int
	vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
	{
		struct vcpu *vcpu = &vm->vcpu[vcpuid];

		vcpu_lock(vcpu);
		KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
		vcpu->reqidle = 0;
		vcpu_unlock(vcpu);
		*retu = true;
		return (0);
	}

	int
	vm_suspend(struct vm *vm, enum vm_suspend_how how)
	{
	int i;

	if (how <= VM_SUSPEND_NONE \|\| how >= VM_SUSPEND_LAST)
	return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
	VM_CTR2(vm, "virtual machine already suspended %d/%d",
	vm->suspend, how);
	return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	* Notify all active vcpus that they are now suspended.
	*/
	for (i = 0; i < VM_MAXCPU; i++) {
	if (CPU_ISSET(i, &vm->active_cpus))
	vcpu_notify_event(vm, i, false);
	}

	return (0);
	}

	/*
	 * Populate the exit record of 'vcpuid' as a VM_EXITCODE_SUSPENDED exit
	 * at 'rip', carrying the VM's current suspend reason.  The instruction
	 * length is zeroed.
	 */
	void
	vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
	{
		struct vm_exit *vme;

		KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
		    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

		vme = vm_exitinfo(vm, vcpuid);
		vme->exitcode = VM_EXITCODE_SUSPENDED;
		vme->u.suspended.how = vm->suspend;
		vme->inst_length = 0;
		vme->rip = rip;
	}

	/*
	 * Populate the exit record of 'vcpuid' as a VM_EXITCODE_DEBUG exit at
	 * 'rip' with a zero instruction length.
	 */
	void
	+vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
	+{
	+ struct vm_exit *vmexit;
	+
	+ vmexit = vm_exitinfo(vm, vcpuid);
	+ vmexit->rip = rip;
	+ vmexit->inst_length = 0;
	+ vmexit->exitcode = VM_EXITCODE_DEBUG;
	+}
	+
	/*
	 * Populate the exit record of 'vcpuid' as a VM_EXITCODE_RENDEZVOUS exit
	 * at 'rip' with a zero instruction length, and bump the
	 * VMEXIT_RENDEZVOUS statistic.  Asserts that a rendezvous function is
	 * currently installed.
	 */
	+void
	vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
	{
	struct vm_exit *vmexit;

	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
	}

	/*
	 * Populate the exit record of 'vcpuid' as a VM_EXITCODE_REQIDLE exit at
	 * 'rip' with a zero instruction length, and count it in VMEXIT_REQIDLE.
	 */
	void
	vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
	{
		struct vm_exit *vme = vm_exitinfo(vm, vcpuid);

		vme->exitcode = VM_EXITCODE_REQIDLE;
		vme->inst_length = 0;
		vme->rip = rip;
		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
	}

	/*
	 * Populate the exit record of 'vcpuid' for a pending AST.  The exit is
	 * reported with VM_EXITCODE_BOGUS at 'rip' with a zero instruction
	 * length, and is counted in VMEXIT_ASTPENDING.
	 */
	void
	vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
	{
		struct vm_exit *vme = vm_exitinfo(vm, vcpuid);

		vme->exitcode = VM_EXITCODE_BOGUS;
		vme->inst_length = 0;
		vme->rip = rip;
		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
	}

	int
	vm_run(struct vm vm, struct vm_run vmrun)
	{
	struct vm_eventinfo evinfo;
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	return (EINVAL);

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
	return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
	return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	evinfo.rptr = &vm->rendezvous_func;
	evinfo.sptr = &vm->suspend;
	evinfo.iptr = &vcpu->reqidle;
	restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
	retu = false;
	vcpu->nextrip = vme->rip + vme->inst_length;
	switch (vme->exitcode) {
	case VM_EXITCODE_REQIDLE:
	error = vm_handle_reqidle(vm, vcpuid, &retu);
	break;
	case VM_EXITCODE_SUSPENDED:
	error = vm_handle_suspend(vm, vcpuid, &retu);
	break;
	case VM_EXITCODE_IOAPIC_EOI:
	vioapic_process_eoi(vm, vcpuid,
	vme->u.ioapic_eoi.vector);
	break;
	case VM_EXITCODE_RENDEZVOUS:
	vm_handle_rendezvous(vm, vcpuid);
	error = 0;
	break;
	case VM_EXITCODE_HLT:
	intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
	error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
	break;
	case VM_EXITCODE_PAGING:
	error = vm_handle_paging(vm, vcpuid, &retu);
	break;
	case VM_EXITCODE_INST_EMUL:
	error = vm_handle_inst_emul(vm, vcpuid, &retu);
	break;
	case VM_EXITCODE_INOUT:
	case VM_EXITCODE_INOUT_STR:
	error = vm_handle_inout(vm, vcpuid, vme, &retu);
	break;
	case VM_EXITCODE_MONITOR:
	case VM_EXITCODE_MWAIT:
	vm_inject_ud(vm, vcpuid);
	break;
	default:
	retu = true; /* handled in userland */
	break;
	}
	}

	if (error == 0 && retu == false)
	goto restart;

	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
	}

	int
	vm_restart_instruction(void *arg, int vcpuid)
	{
	struct vm *vm;
	struct vcpu *vcpu;
	enum vcpu_state state;
	uint64_t rip;
	int error;

	vm = arg;
	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	state = vcpu_get_state(vm, vcpuid, NULL);
	if (state == VCPU_RUNNING) {
	/*
	* When a vcpu is "running" the next instruction is determined
	* by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
	* Thus setting 'inst_length' to zero will cause the current
	* instruction to be restarted.
	*/
	vcpu->exitinfo.inst_length = 0;
	VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
	"setting inst_length to zero", vcpu->exitinfo.rip);
	} else if (state == VCPU_FROZEN) {
	/*
	* When a vcpu is "frozen" it is outside the critical section
	* around VMRUN() and 'nextrip' points to the next instruction.
	* Thus instruction restart is achieved by setting 'nextrip'
	* to the vcpu's %rip.
	*/
	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
	KASSERT(!error, ("%s: error %d getting rip", __func__, error));
	VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
	"nextrip from %#lx to %#lx", vcpu->nextrip, rip);
	vcpu->nextrip = rip;
	} else {
	panic("%s: invalid state %d", __func__, state);
	}
	return (0);
	}

	/*
	 * Record 'info' as the vcpu's 'exitintinfo'.
	 *
	 * If VM_INTINFO_VALID is set the value is sanity-checked first: an NMI
	 * must use vector IDT_NMI, a hardware exception must use a vector below
	 * 32, and no VM_INTINFO_RSVD bit may be set.  If the valid bit is clear,
	 * 0 is stored instead.
	 *
	 * Returns 0 on success or EINVAL for a bad vcpu id or a failed check.
	 */
	int
	vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
	{
		struct vcpu *vcpu;
		int type, vector;

		if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
			return (EINVAL);

		vcpu = &vm->vcpu[vcpuid];

		if (info & VM_INTINFO_VALID) {
			type = info & VM_INTINFO_TYPE;
			vector = info & 0xff;
			if (type == VM_INTINFO_NMI && vector != IDT_NMI)
				return (EINVAL);
			if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
				return (EINVAL);
			if (info & VM_INTINFO_RSVD)
				return (EINVAL);
		} else {
			info = 0;
		}
		VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
		vcpu->exitintinfo = info;
		return (0);
	}

	/*
	 * Classification returned by exception_class() and consumed by
	 * nested_fault() when deciding whether two events combine into a
	 * double fault.
	 */
	enum exc_class {
	EXC_BENIGN,
	EXC_CONTRIBUTORY,
	EXC_PAGEFAULT
	};

	#define IDT_VE 20 /* Virtualization Exception (Intel specific) */

	/*
	 * Classify the event described by 'info' (which must have
	 * VM_INTINFO_VALID set) as benign, contributory or page-fault class.
	 *
	 * Hardware interrupts, software interrupts and NMIs are always benign.
	 * Everything else is classified purely by its vector.
	 */
	static enum exc_class
	exception_class(uint64_t info)
	{
		int kind, vec;

		KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
		kind = info & VM_INTINFO_TYPE;
		vec = info & 0xff;

		/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
		if (kind == VM_INTINFO_HWINTR || kind == VM_INTINFO_SWINTR ||
		    kind == VM_INTINFO_NMI)
			return (EXC_BENIGN);

		/*
		 * Hardware exception.
		 *
		 * SVM and VT-x use identical type values to represent NMI,
		 * hardware interrupt and software interrupt.
		 *
		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
		 * for exceptions except #BP and #OF. #BP and #OF use a type
		 * value of '5' or '6'. Therefore we don't check for explicit
		 * values of 'type' to classify 'intinfo' into a hardware
		 * exception.
		 */
		if (vec == IDT_PF || vec == IDT_VE)
			return (EXC_PAGEFAULT);

		if (vec == IDT_DE || vec == IDT_TS || vec == IDT_NP ||
		    vec == IDT_SS || vec == IDT_GP)
			return (EXC_CONTRIBUTORY);

		return (EXC_BENIGN);
	}

	/*
	 * Combine two valid events: 'info1' (the event being delivered when the
	 * exit happened) and 'info2' (the newly pending exception).
	 *
	 * If 'info1' is a double fault the VM is suspended with
	 * VM_SUSPEND_TRIPLEFAULT, '*retinfo' is cleared and 0 is returned.
	 * Otherwise '*retinfo' receives either a synthesized double fault or
	 * 'info2' unchanged, and 1 is returned.
	 */
	static int
	nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
	    uint64_t *retinfo)
	{
		enum exc_class exc1, exc2;
		int type1, vector1;

		KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
		KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));

		/*
		 * If an exception occurs while attempting to call the double-fault
		 * handler the processor enters shutdown mode (aka triple fault).
		 */
		type1 = info1 & VM_INTINFO_TYPE;
		vector1 = info1 & 0xff;
		if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
			VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
			    info1, info2);
			vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
			*retinfo = 0;
			return (0);
		}

		/*
		 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
		 */
		exc1 = exception_class(info1);
		exc2 = exception_class(info2);
		if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
		    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
			/* Convert nested fault into a double fault. */
			*retinfo = IDT_DF;
			*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
			*retinfo |= VM_INTINFO_DEL_ERRCODE;
		} else {
			/* Handle exceptions serially */
			*retinfo = info2;
		}
		return (1);
	}

	/*
	 * Encode the vcpu's pending exception, if any, as an 'intinfo' value:
	 * the vector in the low 8 bits, marked valid and as a hardware
	 * exception, with the error code in bits 63:32 when one is to be
	 * delivered.  Returns 0 when no exception is pending.
	 */
	static uint64_t
	vcpu_exception_intinfo(struct vcpu *vcpu)
	{
		uint64_t info = 0;

		if (vcpu->exception_pending) {
			info = vcpu->exc_vector & 0xff;
			info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
			if (vcpu->exc_errcode_valid) {
				info |= VM_INTINFO_DEL_ERRCODE;
				info |= (uint64_t)vcpu->exc_errcode << 32;
			}
		}
		return (info);
	}

	/*
	 * Compute the event to inject on the next guest entry of 'vcpuid'.
	 *
	 * Consumes (clears) both the saved 'exitintinfo' and any pending
	 * exception.  If both are valid they are merged by nested_fault();
	 * if only one is valid it is passed through.
	 *
	 * Returns non-zero with the event in '*retinfo' when there is something
	 * to inject, and 0 otherwise ('*retinfo' is left untouched when neither
	 * source is valid).
	 */
	int
	vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
	{
		struct vcpu *vcpu;
		uint64_t info1, info2;
		int valid;

		KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));

		vcpu = &vm->vcpu[vcpuid];

		info1 = vcpu->exitintinfo;
		vcpu->exitintinfo = 0;

		info2 = 0;
		if (vcpu->exception_pending) {
			info2 = vcpu_exception_intinfo(vcpu);
			vcpu->exception_pending = 0;
			VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
			    vcpu->exc_vector, info2);
		}

		if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
			valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
		} else if (info1 & VM_INTINFO_VALID) {
			*retinfo = info1;
			valid = 1;
		} else if (info2 & VM_INTINFO_VALID) {
			*retinfo = info2;
			valid = 1;
		} else {
			valid = 0;
		}

		if (valid) {
			VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
			    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
		}

		return (valid);
	}

	/*
	 * Report, without consuming them, the vcpu's saved 'exitintinfo' in
	 * '*info1' and its pending exception (encoded by
	 * vcpu_exception_intinfo()) in '*info2'.
	 *
	 * Returns 0 on success or EINVAL for an out-of-range vcpu id.
	 */
	int
	vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
	{
		struct vcpu *vcpu;

		if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
			return (EINVAL);

		vcpu = &vm->vcpu[vcpuid];
		*info1 = vcpu->exitintinfo;
		*info2 = vcpu_exception_intinfo(vcpu);
		return (0);
	}

	/*
	 * Mark exception 'vector' as pending on 'vcpuid'.
	 *
	 * 'errcode' is delivered only if 'errcode_valid' is set and the guest
	 * has CR0.PE set.  The guest interrupt shadow is cleared, and if
	 * 'restart_instruction' is non-zero the current instruction is set up
	 * to be re-executed.
	 *
	 * Returns EINVAL for a bad vcpu id, a vector outside 0..31 or IDT_DF;
	 * EBUSY if another exception is already pending; 0 on success.
	 */
	int
	vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
	    uint32_t errcode, int restart_instruction)
	{
		struct vcpu *vcpu;
		uint64_t regval;
		int error;

		if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
			return (EINVAL);

		if (vector < 0 || vector >= 32)
			return (EINVAL);

		/*
		 * A double fault exception should never be injected directly into
		 * the guest. It is a derived exception that results from specific
		 * combinations of nested faults.
		 */
		if (vector == IDT_DF)
			return (EINVAL);

		vcpu = &vm->vcpu[vcpuid];

		if (vcpu->exception_pending) {
			VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
			    "pending exception %d", vector, vcpu->exc_vector);
			return (EBUSY);
		}

		if (errcode_valid) {
			/*
			 * Exceptions don't deliver an error code in real mode.
			 */
			error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
			KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
			if (!(regval & CR0_PE))
				errcode_valid = 0;
		}

		/*
		 * From section 26.6.1 "Interruptibility State" in Intel SDM:
		 *
		 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
		 * one instruction or incurs an exception.
		 */
		error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
		KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
		    __func__, error));

		if (restart_instruction)
			vm_restart_instruction(vm, vcpuid);

		vcpu->exception_pending = 1;
		vcpu->exc_vector = vector;
		vcpu->exc_errcode = errcode;
		vcpu->exc_errcode_valid = errcode_valid;
		VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
		return (0);
	}

	/*
	 * Inject exception 'vector' into 'vcpuid' and request that the current
	 * instruction be restarted.  'vmarg' is the struct vm pointer.  Failure
	 * of vm_inject_exception() is treated as an invariant violation.
	 */
	void
	vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
	    int errcode)
	{
		struct vm *vm = vmarg;
		int error;

		/* The trailing 1 asks for the faulting instruction to be restarted. */
		error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
		    errcode, 1);
		KASSERT(error == 0, ("vm_inject_exception error %d", error));
	}

	/*
	 * Inject a page fault into 'vcpuid': load the guest's CR2 with 'cr2' and
	 * then raise IDT_PF with 'error_code' through vm_inject_fault().
	 * 'vmarg' is the struct vm pointer.
	 */
	void
	vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
	{
		struct vm *vm = vmarg;
		int rc;

		VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
		    error_code, cr2);

		rc = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
		KASSERT(rc == 0, ("vm_set_register(cr2) error %d", rc));

		vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
	}

	/* Per-vcpu statistic; incremented by vm_nmi_clear(). */
	static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

	int
	vm_inject_nmi(struct vm *vm, int vcpuid)
	{
	struct vcpu *vcpu;

	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
	}

	int
	vm_nmi_pending(struct vm *vm, int vcpuid)
	{
	struct vcpu *vcpu;

	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
	}

	void
	vm_nmi_clear(struct vm *vm, int vcpuid)
	{
	struct vcpu *vcpu;

	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
	panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
	}

	/* Per-vcpu statistic; incremented by vm_extint_clear(). */
	static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

	int
	vm_inject_extint(struct vm *vm, int vcpuid)
	{
	struct vcpu *vcpu;

	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
	}

	int
	vm_extint_pending(struct vm *vm, int vcpuid)
	{
	struct vcpu *vcpu;

	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->extint_pending);
	}

	void
	vm_extint_clear(struct vm *vm, int vcpuid)
	{
	struct vcpu *vcpu;

	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->extint_pending == 0)
	panic("vm_extint_clear: inconsistent extint_pending state");

	vcpu->extint_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
	}

	int
	vm_get_capability(struct vm vm, int vcpu, int type, int retval)
	{
	if (vcpu < 0 \|\| vcpu >= VM_MAXCPU)
	return (EINVAL);

	if (type < 0 \|\| type >= VM_CAP_MAX)
	return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
	}

	int
	vm_set_capability(struct vm *vm, int vcpu, int type, int val)
	{
	if (vcpu < 0 \|\| vcpu >= VM_MAXCPU)
	return (EINVAL);

	if (type < 0 \|\| type >= VM_CAP_MAX)
	return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
	}

	/*
	 * Return the vlapic pointer stored for vcpu 'cpu'.
	 * NOTE(review): 'cpu' is used as an array index without a range check
	 * here; callers presumably validate it - confirm.
	 */
	struct vlapic *
	vm_lapic(struct vm *vm, int cpu)
	{
	return (vm->vcpu[cpu].vlapic);
	}

	/* Return the VM's 'vioapic' pointer. */
	struct vioapic *
	vm_ioapic(struct vm *vm)
	{

	return (vm->vioapic);
	}

	/* Return the VM's 'vhpet' pointer. */
	struct vhpet *
	vm_hpet(struct vm *vm)
	{

	return (vm->vhpet);
	}

	/*
	 * Return non-zero if the PCI device 'bus'/'slot'/'func' is listed in one
	 * of the kernel environment variables "pptdevs", "pptdevs2" or
	 * "pptdevs3".  Each variable holds space-separated "b/s/f" triples.
	 */
	boolean_t
	vmm_is_pptdev(int bus, int slot, int func)
	{
		int found, i, n;
		int b, s, f;
		char *val, *cp, *cp2;

		/*
		 * XXX
		 * The length of an environment variable is limited to 128 bytes which
		 * puts an upper limit on the number of passthru devices that may be
		 * specified using a single environment variable.
		 *
		 * Work around this by scanning multiple environment variable
		 * names instead of a single one - yuck!
		 */
		const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

		/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
		found = 0;
		for (i = 0; names[i] != NULL && !found; i++) {
			cp = val = kern_getenv(names[i]);
			while (cp != NULL && *cp != '\0') {
				/* Temporarily terminate the current token. */
				if ((cp2 = strchr(cp, ' ')) != NULL)
					*cp2 = '\0';

				n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
				if (n == 3 && bus == b && slot == s && func == f) {
					found = 1;
					break;
				}

				/* Restore the separator and step past it. */
				if (cp2 != NULL)
					*cp2++ = ' ';

				cp = cp2;
			}
			freeenv(val);
		}
		return (found);
	}

	/* Return the VM's 'iommu' pointer. */
	void *
	vm_iommu_domain(struct vm *vm)
	{

	return (vm->iommu);
	}

	/*
	 * Change the state of 'vcpuid' to 'newstate' by calling
	 * vcpu_set_state_locked() with the vcpu lock held, passing 'from_idle'
	 * through.  Returns that function's result; panics on an out-of-range
	 * vcpu id.
	 */
	int
	vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
	    bool from_idle)
	{
		int error;
		struct vcpu *vcpu;

		if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
			panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

		vcpu = &vm->vcpu[vcpuid];

		vcpu_lock(vcpu);
		error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
		vcpu_unlock(vcpu);

		return (error);
	}

	enum vcpu_state
	vcpu_get_state(struct vm vm, int vcpuid, int hostcpu)
	{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	panic("vm_get_run_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
	*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
	}

	int
	vm_activate_cpu(struct vm *vm, int vcpuid)
	{

	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
	return (EBUSY);

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
	return (0);
	}

	+/*
	+ * Place 'vcpuid' in the VM's 'debug_cpus' set and notify it.  A 'vcpuid'
	+ * of -1 applies to every active vcpu.
	+ *
	+ * Returns EINVAL if 'vcpuid' is out of range or names a vcpu that is not
	+ * active, and 0 otherwise.
	+ */
	+int
	+vm_suspend_cpu(struct vm *vm, int vcpuid)
	+{
	+	int i;
	+
	+	if (vcpuid < -1 || vcpuid >= VM_MAXCPU)
	+		return (EINVAL);
	+
	+	if (vcpuid == -1) {
	+		vm->debug_cpus = vm->active_cpus;
	+		for (i = 0; i < VM_MAXCPU; i++) {
	+			if (CPU_ISSET(i, &vm->active_cpus))
	+				vcpu_notify_event(vm, i, false);
	+		}
	+	} else {
	+		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
	+			return (EINVAL);
	+
	+		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
	+		vcpu_notify_event(vm, vcpuid, false);
	+	}
	+	return (0);
	+}
	+
	+/*
	+ * Remove 'vcpuid' from the VM's 'debug_cpus' set.  A 'vcpuid' of -1
	+ * empties the whole set.
	+ *
	+ * Returns EINVAL if 'vcpuid' is out of range or names a vcpu that is not
	+ * currently in 'debug_cpus', and 0 otherwise.
	+ */
	+int
	+vm_resume_cpu(struct vm *vm, int vcpuid)
	+{
	+
	+	if (vcpuid < -1 || vcpuid >= VM_MAXCPU)
	+		return (EINVAL);
	+
	+	if (vcpuid == -1) {
	+		CPU_ZERO(&vm->debug_cpus);
	+	} else {
	+		if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
	+			return (EINVAL);
	+
	+		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
	+	}
	+	return (0);
	+}
	+
	/*
	 * Return non-zero if 'vcpuid' is a member of the VM's 'debug_cpus' set.
	 * No range check is performed on 'vcpuid' here.
	 */
	+int
	+vcpu_debugged(struct vm *vm, int vcpuid)
	+{
	+
	+ return (CPU_ISSET(vcpuid, &vm->debug_cpus));
	+}
	+
	/* Return a copy of the VM's 'active_cpus' set. */
	cpuset_t
	vm_active_cpus(struct vm *vm)
	{

	return (vm->active_cpus);
	+}
	+
	/* Return a copy of the VM's 'debug_cpus' set. */
	+cpuset_t
	+vm_debug_cpus(struct vm *vm)
	+{
	+
	+ return (vm->debug_cpus);
	}

	/* Return a copy of the VM's 'suspended_cpus' set. */
	cpuset_t
	vm_suspended_cpus(struct vm *vm)
	{

	return (vm->suspended_cpus);
	}

	/*
	 * Return the 'stats' pointer stored for 'vcpuid'.  No range check is
	 * performed on 'vcpuid' here.
	 */
	void *
	vcpu_stats(struct vm *vm, int vcpuid)
	{

	return (vm->vcpu[vcpuid].stats);
	}

	int
	vm_get_x2apic_state(struct vm vm, int vcpuid, enum x2apic_state state)
	{
	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
	}

	int
	vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
	{
	if (vcpuid < 0 \|\| vcpuid >= VM_MAXCPU)
	return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
	return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
	}

	/*
	 * Make sure a vcpu notices a pending event promptly:
	 * - a sleeping vcpu thread is woken up;
	 * - a vcpu running on another host cpu is interrupted, either with a
	 *   posted interrupt through its vlapic ('lapic_intr' true) or with a
	 *   plain IPI, so that it traps into the hypervisor.
	 *
	 * The "_locked" suffix follows vcpu_notify_event(), which takes the vcpu
	 * lock around this call.
	 */
	static void
	vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
	{
		int hostcpu;

		hostcpu = vcpu->hostcpu;

		if (vcpu->state != VCPU_RUNNING) {
			KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
			    "with hostcpu %d", vcpu->state, hostcpu));
			if (vcpu->state == VCPU_SLEEPING)
				wakeup_one(vcpu);
			return;
		}

		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));

		/*
		 * If the 'vcpu' is running on 'curcpu' then it must
		 * be sending a notification to itself (e.g. SELF_IPI).
		 * The pending event will be picked up when the vcpu
		 * transitions back to guest context.
		 */
		if (hostcpu == curcpu)
			return;

		if (lapic_intr)
			vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum);
		else
			ipi_cpu(hostcpu, vmm_ipinum);
	}

	/*
	 * Locked wrapper: take the lock of 'vcpuid', call
	 * vcpu_notify_event_locked() with 'lapic_intr', and drop the lock.
	 * No range check is performed on 'vcpuid' here.
	 */
	void
	vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
	{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu, lapic_intr);
	vcpu_unlock(vcpu);
	}

	/* Return the VM's 'vmspace' pointer. */
	struct vmspace *
	vm_get_vmspace(struct vm *vm)
	{

	return (vm->vmspace);
	}

	/*
	 * Translate an APIC id into a vcpu id.  'vm' is not consulted.
	 */
	int
	vm_apicid2vcpuid(struct vm *vm, int apicid)
	{
		int vcpuid;

		/*
		 * XXX apic id is assumed to be numerically identical to vcpu id
		 */
		vcpuid = apicid;
		return (vcpuid);
	}

	/*
	 * Run 'func(arg)' as a rendezvous across the vcpus in 'dest'.
	 *
	 * 'vcpuid' identifies the calling vcpu, or is -1.  If another
	 * rendezvous is already in progress the caller first takes part in it
	 * and then retries.  Once its own request is installed, every vcpu in
	 * 'dest' is notified and the caller enters vm_handle_rendezvous().
	 *
	 * Must be called without any locks held.
	 */
	void
	vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
	    vm_rendezvous_func_t func, void *arg)
	{
		int i;

		/*
		 * Enforce that this function is called without any locks
		 */
		WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
		KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
		    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

	restart:
		mtx_lock(&vm->rendezvous_mtx);
		if (vm->rendezvous_func != NULL) {
			/*
			 * If a rendezvous is already in progress then we need to
			 * call the rendezvous handler in case this 'vcpuid' is one
			 * of the targets of the rendezvous.
			 */
			RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
			mtx_unlock(&vm->rendezvous_mtx);
			vm_handle_rendezvous(vm, vcpuid);
			goto restart;
		}
		KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
		    "rendezvous is still in progress"));

		RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
		vm->rendezvous_req_cpus = dest;
		CPU_ZERO(&vm->rendezvous_done_cpus);
		vm->rendezvous_arg = arg;
		vm_set_rendezvous_func(vm, func);
		mtx_unlock(&vm->rendezvous_mtx);

		/*
		 * Wake up any sleeping vcpus and trigger a VM-exit in any running
		 * vcpus so they handle the rendezvous as soon as possible.
		 */
		for (i = 0; i < VM_MAXCPU; i++) {
			if (CPU_ISSET(i, &dest))
				vcpu_notify_event(vm, i, false);
		}

		vm_handle_rendezvous(vm, vcpuid);
	}

	/* Return the VM's 'vatpic' pointer. */
	struct vatpic *
	vm_atpic(struct vm *vm)
	{
	return (vm->vatpic);
	}

	/* Return the VM's 'vatpit' pointer. */
	struct vatpit *
	vm_atpit(struct vm *vm)
	{
	return (vm->vatpit);
	}

	/* Return the VM's 'vpmtmr' pointer. */
	struct vpmtmr *
	vm_pmtmr(struct vm *vm)
	{

	return (vm->vpmtmr);
	}

	/* Return the VM's 'vrtc' pointer. */
	struct vrtc *
	vm_rtc(struct vm *vm)
	{

	return (vm->vrtc);
	}

	enum vm_reg_name
	vm_segment_name(int seg)
	{
	static enum vm_reg_name seg_names[] = {
	VM_REG_GUEST_ES,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_SS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
	}

	void
	vm_copy_teardown(struct vm vm, int vcpuid, struct vm_copyinfo copyinfo,
	int num_copyinfo)
	{
	int idx;

	for (idx = 0; idx < num_copyinfo; idx++) {
	if (copyinfo[idx].cookie != NULL)
	vm_gpa_release(copyinfo[idx].cookie);
	}
	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
	}

	/*
	 * Prepare 'copyinfo' for copying 'len' bytes to or from guest linear
	 * address 'gla'.
	 *
	 * The range is split at page boundaries; each piece is translated with
	 * vm_gla2gpa() and then held with vm_gpa_hold() using protection 'prot'.
	 * 'copyinfo' must have room for every piece ('num_copyinfo' entries).
	 *
	 * Returns:
	 *   0       with '*fault' == 0 on success;
	 *   error   from vm_gla2gpa() if translation fails or reports a guest
	 *           fault (in the latter case '*fault' is set by vm_gla2gpa());
	 *   EFAULT  if a page could not be held, after tearing down the array.
	 */
	int
	vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
	    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
	    int num_copyinfo, int *fault)
	{
		int error, idx, nused;
		size_t n, off, remaining;
		void *hva, *cookie;
		uint64_t gpa;

		bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);

		/* Pass 1: translate, one entry per page touched. */
		nused = 0;
		remaining = len;
		while (remaining > 0) {
			KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
			error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
			if (error || *fault)
				return (error);
			off = gpa & PAGE_MASK;
			n = min(remaining, PAGE_SIZE - off);
			copyinfo[nused].gpa = gpa;
			copyinfo[nused].len = n;
			remaining -= n;
			gla += n;
			nused++;
		}

		/* Pass 2: hold each translated piece. */
		for (idx = 0; idx < nused; idx++) {
			hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
			    copyinfo[idx].len, prot, &cookie);
			if (hva == NULL)
				break;
			copyinfo[idx].hva = hva;
			copyinfo[idx].cookie = cookie;
		}

		if (idx != nused) {
			vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
			return (EFAULT);
		} else {
			*fault = 0;
			return (0);
		}
	}

	/*
	 * Copy 'len' bytes from the regions described by 'copyinfo' into the
	 * buffer 'kaddr', consuming the entries in order.
	 * NOTE(review): the loop assumes the entry lengths sum exactly to
	 * 'len', as vm_copy_setup() produces for the same length - confirm.
	 */
	void
	vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
	    size_t len)
	{
		char *dst;
		int idx;

		dst = kaddr;
		idx = 0;
		while (len > 0) {
			bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
			len -= copyinfo[idx].len;
			dst += copyinfo[idx].len;
			idx++;
		}
	}

	/*
	 * Copy 'len' bytes from the buffer 'kaddr' into the regions described
	 * by 'copyinfo', consuming the entries in order.
	 * NOTE(review): the loop assumes the entry lengths sum exactly to
	 * 'len', as vm_copy_setup() produces for the same length - confirm.
	 */
	void
	vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
	    struct vm_copyinfo *copyinfo, size_t len)
	{
		const char *src;
		int idx;

		src = kaddr;
		idx = 0;
		while (len > 0) {
			bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
			len -= copyinfo[idx].len;
			src += copyinfo[idx].len;
			idx++;
		}
	}

	/*
	* Return the amount of in-use and wired memory for the VM. Since
	* these are global stats, only return the values for vCPU 0.
	*/
	VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
	VMM_STAT_DECLARE(VMM_MEM_WIRED);

	static void
	vm_get_rescnt(struct vm vm, int vcpu, struct vmm_stat_type stat)
	{

	if (vcpu == 0) {
	vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
	PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
	}

	static void
	vm_get_wiredcnt(struct vm vm, int vcpu, struct vmm_stat_type stat)
	{

	if (vcpu == 0) {
	vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
	PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
	}
	}

	/* Bind each memory statistic to the callback that computes it. */
	VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
	VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
	Index: head/sys/amd64/vmm/vmm_dev.c
	===================================================================
	--- head/sys/amd64/vmm/vmm_dev.c (revision 332156)
	+++ head/sys/amd64/vmm/vmm_dev.c (revision 332157)
	@@ -1,1070 +1,1080 @@
	/*-
	* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	*
	* Copyright (c) 2011 NetApp, Inc.
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* $FreeBSD$
	*/

	#include <sys/cdefs.h>
	__FBSDID("$FreeBSD$");

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/queue.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/malloc.h>
	#include <sys/conf.h>
	#include <sys/sysctl.h>
	#include <sys/libkern.h>
	#include <sys/ioccom.h>
	#include <sys/mman.h>
	#include <sys/uio.h>

	#include <vm/vm.h>
	#include <vm/pmap.h>
	#include <vm/vm_map.h>
	#include <vm/vm_object.h>

	#include <machine/vmparam.h>
	#include <machine/vmm.h>
	#include <machine/vmm_instruction_emul.h>
	#include <machine/vmm_dev.h>

	#include "vmm_lapic.h"
	#include "vmm_stat.h"
	#include "vmm_mem.h"
	#include "io/ppt.h"
	#include "io/vatpic.h"
	#include "io/vioapic.h"
	#include "io/vhpet.h"
	#include "io/vrtc.h"

	/*
	 * Per-device-memory-segment state, linked onto the 'devmem' list of a
	 * vmmdev_softc.
	 * NOTE(review): field meanings below are inferred from names only -
	 * confirm against devmem_create_cdev().
	 */
	struct devmem_softc {
	int segid; /* presumably the memory segment id - TODO confirm */
	char *name;
	struct cdev *cdev;
	struct vmmdev_softc *sc; /* presumably the owning VM's softc - TODO confirm */
	SLIST_ENTRY(devmem_softc) link;
	};

	struct vmmdev_softc {
	struct vm vm; / vm instance cookie */
	struct cdev *cdev;
	SLIST_ENTRY(vmmdev_softc) link;
	SLIST_HEAD(, devmem_softc) devmem;
	int flags;
	};
	#define VSC_LINKED 0x01

	/* List of all vmmdev_softc instances; searched by vmmdev_lookup(). */
	static SLIST_HEAD(, vmmdev_softc) head;

	/*
	 * vmmdev_lookup() carries a (currently compiled-out) assertion that
	 * this mutex is owned by its caller.
	 */
	static struct mtx vmmdev_mtx;

	static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");

	SYSCTL_DECL(_hw_vmm);

	/* Forward declarations for the devmem helpers defined later in the file. */
	static int devmem_create_cdev(const char *vmname, int id, char *devmem);
	static void devmem_destroy(void *arg);

	static int
	vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
	{
	int error;

	if (vcpu < 0 \|\| vcpu >= VM_MAXCPU)
	return (EINVAL);

	error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
	return (error);
	}

	static void
	vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
	{
	enum vcpu_state state;

	state = vcpu_get_state(sc->vm, vcpu, NULL);
	if (state != VCPU_FROZEN) {
	panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
	vcpu, state);
	}

	vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
	}

	/*
	 * Freeze every vcpu of the VM, in ascending order.
	 *
	 * If freezing any vcpu fails, the vcpus frozen so far are released again
	 * (in descending order) and the error is returned.
	 */
	static int
	vcpu_lock_all(struct vmmdev_softc *sc)
	{
		int error, vcpu;

		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
			error = vcpu_lock_one(sc, vcpu);
			if (error) {
				/* Undo the vcpus that were already frozen. */
				while (--vcpu >= 0)
					vcpu_unlock_one(sc, vcpu);
				return (error);
			}
		}

		return (error);
	}

	/*
	 * Release every vcpu of the VM, in ascending order.
	 */
	static void
	vcpu_unlock_all(struct vmmdev_softc *sc)
	{
		int vcpu;

		vcpu = 0;
		while (vcpu < VM_MAXCPU) {
			vcpu_unlock_one(sc, vcpu);
			vcpu++;
		}
	}

	/*
	 * Find the softc of the virtual machine called 'name' on the global list.
	 * Returns NULL if no virtual machine has that name.
	 */
	static struct vmmdev_softc *
	vmmdev_lookup(const char *name)
	{
		struct vmmdev_softc *sc;

	#ifdef notyet	/* XXX kernel is not compiled with invariants */
		mtx_assert(&vmmdev_mtx, MA_OWNED);
	#endif

		SLIST_FOREACH(sc, &head, link) {
			if (strcmp(name, vm_name(sc->vm)) == 0)
				return (sc);
		}

		return (NULL);
	}

	/*
	 * Return the softc stored in the 'si_drv1' field of a vmm cdev.
	 */
	static struct vmmdev_softc *
	vmmdev_lookup2(struct cdev *cdev)
	{
		struct vmmdev_softc *sc;

		sc = cdev->si_drv1;
		return (sc);
	}

	static int
	vmmdev_rw(struct cdev cdev, struct uio uio, int flags)
	{
	int error, off, c, prot;
	vm_paddr_t gpa;
	void hpa, cookie;
	struct vmmdev_softc *sc;

	sc = vmmdev_lookup2(cdev);
	if (sc == NULL)
	return (ENXIO);

	/*
	* Get a read lock on the guest memory map by freezing any vcpu.
	*/
	error = vcpu_lock_one(sc, VM_MAXCPU - 1);
	if (error)
	return (error);

	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
	while (uio->uio_resid > 0 && error == 0) {
	gpa = uio->uio_offset;
	off = gpa & PAGE_MASK;
	c = min(uio->uio_resid, PAGE_SIZE - off);

	/*
	* The VM has a hole in its physical memory map. If we want to
	* use 'dd' to inspect memory beyond the hole we need to
	* provide bogus data for memory that lies in the hole.
	*
	* Since this device does not support lseek(2), dd(1) will
	* read(2) blocks of data to simulate the lseek(2).
	*/
	hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie);
	if (hpa == NULL) {
	if (uio->uio_rw == UIO_READ)
	error = uiomove(__DECONST(void *, zero_region),
	c, uio);
	else
	error = EFAULT;
	} else {
	error = uiomove(hpa, c, uio);
	vm_gpa_release(cookie);
	}
	}
	vcpu_unlock_one(sc, VM_MAXCPU - 1);
	return (error);
	}

	CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);

	static int
	get_memseg(struct vmmdev_softc sc, struct vm_memseg mseg)
	{
	struct devmem_softc *dsc;
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
	if (error \|\| mseg->len == 0)
	return (error);

	if (!sysmem) {
	SLIST_FOREACH(dsc, &sc->devmem, link) {
	if (dsc->segid == mseg->segid)
	break;
	}
	KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
	__func__, mseg->segid));
	error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
	} else {
	bzero(mseg->name, sizeof(mseg->name));
	}

	return (error);
	}

	static int
	alloc_memseg(struct vmmdev_softc sc, struct vm_memseg mseg)
	{
	char *name;
	int error;
	bool sysmem;

	error = 0;
	name = NULL;
	sysmem = true;

	if (VM_MEMSEG_NAME(mseg)) {
	sysmem = false;
	name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
	error = copystr(mseg->name, name, SPECNAMELEN + 1, 0);
	if (error)
	goto done;
	}

	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
	if (error)
	goto done;

	if (VM_MEMSEG_NAME(mseg)) {
	error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
	if (error)
	vm_free_memseg(sc->vm, mseg->segid);
	else
	name = NULL; /* freed when 'cdev' is destroyed */
	}
	done:
	free(name, M_VMMDEV);
	return (error);
	}

	/*
	 * Read 'count' registers of 'vcpu': regval[i] receives the value of
	 * register regnum[i].  Stops at, and returns, the first error from
	 * vm_get_register(); returns 0 when 'count' is 0.
	 */
	static int
	vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
	    uint64_t *regval)
	{
		unsigned int i;
		int error;

		error = 0;
		for (i = 0; i < count; i++) {
			error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
			if (error)
				break;
		}
		return (error);
	}

	/*
	 * Write 'count' registers of 'vcpu': register regnum[i] is set to
	 * regval[i].  Stops at, and returns, the first error from
	 * vm_set_register(); returns 0 when 'count' is 0.
	 */
	static int
	vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
	    uint64_t *regval)
	{
		unsigned int i;
		int error;

		error = 0;
		for (i = 0; i < count; i++) {
			error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
			if (error)
				break;
		}
		return (error);
	}

	/*
	 * ioctl(2) handler for the vmm cdev.
	 *
	 * The first switch decides what has to be frozen before the command
	 * runs: a single vcpu (state_changed == 1), every vcpu
	 * (state_changed == 2) or nothing.  The second switch dispatches the
	 * command.  Whatever was frozen is released before returning; if the
	 * freeze itself fails the command is not run and nothing is released.
	 *
	 * Returns ENXIO if the cdev has no softc, ENOTTY for an unknown command,
	 * otherwise the error produced by the command handler.
	 */
	static int
	vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
	    struct thread *td)
	{
		int error, vcpu, state_changed, size;
		cpuset_t *cpuset;
		struct vmmdev_softc *sc;
		struct vm_register *vmreg;
		struct vm_seg_desc *vmsegdesc;
		struct vm_register_set *vmregset;
		struct vm_run *vmrun;
		struct vm_exception *vmexc;
		struct vm_lapic_irq *vmirq;
		struct vm_lapic_msi *vmmsi;
		struct vm_ioapic_irq *ioapic_irq;
		struct vm_isa_irq *isa_irq;
		struct vm_isa_irq_trigger *isa_irq_trigger;
		struct vm_capability *vmcap;
		struct vm_pptdev *pptdev;
		struct vm_pptdev_mmio *pptmmio;
		struct vm_pptdev_msi *pptmsi;
		struct vm_pptdev_msix *pptmsix;
		struct vm_nmi *vmnmi;
		struct vm_stats *vmstats;
		struct vm_stat_desc *statdesc;
		struct vm_x2apic *x2apic;
		struct vm_gpa_pte *gpapte;
		struct vm_suspend *vmsuspend;
		struct vm_gla2gpa *gg;
		struct vm_activate_cpu *vac;
		struct vm_cpuset *vm_cpuset;
		struct vm_intinfo *vmii;
		struct vm_rtc_time *rtctime;
		struct vm_rtc_data *rtcdata;
		struct vm_memmap *mm;
		uint64_t *regvals;
		int *regnums;

		sc = vmmdev_lookup2(cdev);
		if (sc == NULL)
			return (ENXIO);

		error = 0;
		vcpu = -1;
		state_changed = 0;

		/*
		 * Some VMM ioctls can operate only on vcpus that are not running.
		 */
		switch (cmd) {
		case VM_RUN:
		case VM_GET_REGISTER:
		case VM_SET_REGISTER:
		case VM_GET_SEGMENT_DESCRIPTOR:
		case VM_SET_SEGMENT_DESCRIPTOR:
		case VM_GET_REGISTER_SET:
		case VM_SET_REGISTER_SET:
		case VM_INJECT_EXCEPTION:
		case VM_GET_CAPABILITY:
		case VM_SET_CAPABILITY:
		case VM_PPTDEV_MSI:
		case VM_PPTDEV_MSIX:
		case VM_SET_X2APIC_STATE:
		case VM_GLA2GPA:
		case VM_GLA2GPA_NOFAULT:
		case VM_ACTIVATE_CPU:
		case VM_SET_INTINFO:
		case VM_GET_INTINFO:
		case VM_RESTART_INSTRUCTION:
			/*
			 * XXX fragile, handle with care
			 * Assumes that the first field of the ioctl data is the vcpu.
			 */
			vcpu = *(int *)data;
			error = vcpu_lock_one(sc, vcpu);
			if (error)
				goto done;
			state_changed = 1;
			break;

		case VM_MAP_PPTDEV_MMIO:
		case VM_BIND_PPTDEV:
		case VM_UNBIND_PPTDEV:
		case VM_ALLOC_MEMSEG:
		case VM_MMAP_MEMSEG:
		case VM_REINIT:
			/*
			 * ioctls that operate on the entire virtual machine must
			 * prevent all vcpus from running.
			 */
			error = vcpu_lock_all(sc);
			if (error)
				goto done;
			state_changed = 2;
			break;

		case VM_GET_MEMSEG:
		case VM_MMAP_GETNEXT:
			/*
			 * Lock a vcpu to make sure that the memory map cannot be
			 * modified while it is being inspected.
			 */
			vcpu = VM_MAXCPU - 1;
			error = vcpu_lock_one(sc, vcpu);
			if (error)
				goto done;
			state_changed = 1;
			break;

		default:
			break;
		}

		switch(cmd) {
		case VM_RUN:
			vmrun = (struct vm_run *)data;
			error = vm_run(sc->vm, vmrun);
			break;
		case VM_SUSPEND:
			vmsuspend = (struct vm_suspend *)data;
			error = vm_suspend(sc->vm, vmsuspend->how);
			break;
		case VM_REINIT:
			error = vm_reinit(sc->vm);
			break;
		case VM_STAT_DESC: {
			statdesc = (struct vm_stat_desc *)data;
			error = vmm_stat_desc_copy(statdesc->index,
			    statdesc->desc, sizeof(statdesc->desc));
			break;
		}
		case VM_STATS: {
			CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
			vmstats = (struct vm_stats *)data;
			getmicrotime(&vmstats->tv);
			error = vmm_stat_copy(sc->vm, vmstats->cpuid,
			    &vmstats->num_entries, vmstats->statbuf);
			break;
		}
		case VM_PPTDEV_MSI:
			pptmsi = (struct vm_pptdev_msi *)data;
			error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
			    pptmsi->bus, pptmsi->slot, pptmsi->func,
			    pptmsi->addr, pptmsi->msg,
			    pptmsi->numvec);
			break;
		case VM_PPTDEV_MSIX:
			pptmsix = (struct vm_pptdev_msix *)data;
			error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
			    pptmsix->bus, pptmsix->slot,
			    pptmsix->func, pptmsix->idx,
			    pptmsix->addr, pptmsix->msg,
			    pptmsix->vector_control);
			break;
		case VM_MAP_PPTDEV_MMIO:
			pptmmio = (struct vm_pptdev_mmio *)data;
			error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
			    pptmmio->func, pptmmio->gpa, pptmmio->len,
			    pptmmio->hpa);
			break;
		case VM_BIND_PPTDEV:
			pptdev = (struct vm_pptdev *)data;
			error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
			    pptdev->func);
			break;
		case VM_UNBIND_PPTDEV:
			pptdev = (struct vm_pptdev *)data;
			error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
			    pptdev->func);
			break;
		case VM_INJECT_EXCEPTION:
			vmexc = (struct vm_exception *)data;
			error = vm_inject_exception(sc->vm, vmexc->cpuid,
			    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
			    vmexc->restart_instruction);
			break;
		case VM_INJECT_NMI:
			vmnmi = (struct vm_nmi *)data;
			error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
			break;
		case VM_LAPIC_IRQ:
			vmirq = (struct vm_lapic_irq *)data;
			error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
			break;
		case VM_LAPIC_LOCAL_IRQ:
			vmirq = (struct vm_lapic_irq *)data;
			error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
			    vmirq->vector);
			break;
		case VM_LAPIC_MSI:
			vmmsi = (struct vm_lapic_msi *)data;
			error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
			break;
		case VM_IOAPIC_ASSERT_IRQ:
			ioapic_irq = (struct vm_ioapic_irq *)data;
			error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
			break;
		case VM_IOAPIC_DEASSERT_IRQ:
			ioapic_irq = (struct vm_ioapic_irq *)data;
			error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
			break;
		case VM_IOAPIC_PULSE_IRQ:
			ioapic_irq = (struct vm_ioapic_irq *)data;
			error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
			break;
		case VM_IOAPIC_PINCOUNT:
			/* The ioctl data is a plain int that receives the count. */
			*(int *)data = vioapic_pincount(sc->vm);
			break;
		case VM_ISA_ASSERT_IRQ:
			isa_irq = (struct vm_isa_irq *)data;
			error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
			if (error == 0 && isa_irq->ioapic_irq != -1)
				error = vioapic_assert_irq(sc->vm,
				    isa_irq->ioapic_irq);
			break;
		case VM_ISA_DEASSERT_IRQ:
			isa_irq = (struct vm_isa_irq *)data;
			error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
			if (error == 0 && isa_irq->ioapic_irq != -1)
				error = vioapic_deassert_irq(sc->vm,
				    isa_irq->ioapic_irq);
			break;
		case VM_ISA_PULSE_IRQ:
			isa_irq = (struct vm_isa_irq *)data;
			error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
			if (error == 0 && isa_irq->ioapic_irq != -1)
				error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
			break;
		case VM_ISA_SET_IRQ_TRIGGER:
			isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
			error = vatpic_set_irq_trigger(sc->vm,
			    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
			break;
		case VM_MMAP_GETNEXT:
			mm = (struct vm_memmap *)data;
			error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
			    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
			break;
		case VM_MMAP_MEMSEG:
			mm = (struct vm_memmap *)data;
			error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
			    mm->len, mm->prot, mm->flags);
			break;
		case VM_ALLOC_MEMSEG:
			error = alloc_memseg(sc, (struct vm_memseg *)data);
			break;
		case VM_GET_MEMSEG:
			error = get_memseg(sc, (struct vm_memseg *)data);
			break;
		case VM_GET_REGISTER:
			vmreg = (struct vm_register *)data;
			error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
			    &vmreg->regval);
			break;
		case VM_SET_REGISTER:
			vmreg = (struct vm_register *)data;
			error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
			    vmreg->regval);
			break;
		case VM_SET_SEGMENT_DESCRIPTOR:
			vmsegdesc = (struct vm_seg_desc *)data;
			error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
			    vmsegdesc->regnum,
			    &vmsegdesc->desc);
			break;
		case VM_GET_SEGMENT_DESCRIPTOR:
			vmsegdesc = (struct vm_seg_desc *)data;
			error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
			    vmsegdesc->regnum,
			    &vmsegdesc->desc);
			break;
		case VM_GET_REGISTER_SET:
			vmregset = (struct vm_register_set *)data;
			/* Bound the caller-supplied count before allocating. */
			if (vmregset->count > VM_REG_LAST) {
				error = EINVAL;
				break;
			}
			regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
			    M_WAITOK);
			regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
			    M_WAITOK);
			error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
			    vmregset->count);
			if (error == 0)
				error = vm_get_register_set(sc->vm, vmregset->cpuid,
				    vmregset->count, regnums, regvals);
			if (error == 0)
				error = copyout(regvals, vmregset->regvals,
				    sizeof(regvals[0]) * vmregset->count);
			free(regvals, M_VMMDEV);
			free(regnums, M_VMMDEV);
			break;
		case VM_SET_REGISTER_SET:
			vmregset = (struct vm_register_set *)data;
			/* Bound the caller-supplied count before allocating. */
			if (vmregset->count > VM_REG_LAST) {
				error = EINVAL;
				break;
			}
			regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
			    M_WAITOK);
			regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
			    M_WAITOK);
			error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
			    vmregset->count);
			if (error == 0)
				error = copyin(vmregset->regvals, regvals,
				    sizeof(regvals[0]) * vmregset->count);
			if (error == 0)
				error = vm_set_register_set(sc->vm, vmregset->cpuid,
				    vmregset->count, regnums, regvals);
			free(regvals, M_VMMDEV);
			free(regnums, M_VMMDEV);
			break;
		case VM_GET_CAPABILITY:
			vmcap = (struct vm_capability *)data;
			error = vm_get_capability(sc->vm, vmcap->cpuid,
			    vmcap->captype,
			    &vmcap->capval);
			break;
		case VM_SET_CAPABILITY:
			vmcap = (struct vm_capability *)data;
			error = vm_set_capability(sc->vm, vmcap->cpuid,
			    vmcap->captype,
			    vmcap->capval);
			break;
		case VM_SET_X2APIC_STATE:
			x2apic = (struct vm_x2apic *)data;
			error = vm_set_x2apic_state(sc->vm,
			    x2apic->cpuid, x2apic->state);
			break;
		case VM_GET_X2APIC_STATE:
			x2apic = (struct vm_x2apic *)data;
			error = vm_get_x2apic_state(sc->vm,
			    x2apic->cpuid, &x2apic->state);
			break;
		case VM_GET_GPA_PMAP:
			gpapte = (struct vm_gpa_pte *)data;
			pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
			    gpapte->gpa, gpapte->pte, &gpapte->ptenum);
			error = 0;
			break;
		case VM_GET_HPET_CAPABILITIES:
			error = vhpet_getcap((struct vm_hpet_cap *)data);
			break;
		case VM_GLA2GPA: {
			CTASSERT(PROT_READ == VM_PROT_READ);
			CTASSERT(PROT_WRITE == VM_PROT_WRITE);
			CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
			gg = (struct vm_gla2gpa *)data;
			error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
			    gg->prot, &gg->gpa, &gg->fault);
			KASSERT(error == 0 || error == EFAULT,
			    ("%s: vm_gla2gpa unknown error %d", __func__, error));
			break;
		}
		case VM_GLA2GPA_NOFAULT:
			gg = (struct vm_gla2gpa *)data;
			error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
			    gg->gla, gg->prot, &gg->gpa, &gg->fault);
			KASSERT(error == 0 || error == EFAULT,
			    ("%s: vm_gla2gpa unknown error %d", __func__, error));
			break;
		case VM_ACTIVATE_CPU:
			vac = (struct vm_activate_cpu *)data;
			error = vm_activate_cpu(sc->vm, vac->vcpuid);
			break;
		case VM_GET_CPUS:
			error = 0;
			vm_cpuset = (struct vm_cpuset *)data;
			size = vm_cpuset->cpusetsize;
			/*
			 * 'size' is a signed int.  Reject negative values first:
			 * compared against sizeof() a negative value is converted
			 * to a huge unsigned one and would pass the lower bound,
			 * and it trivially passes the upper bound as well.
			 */
			if (size < 0 || (size_t)size < sizeof(cpuset_t) ||
			    size > CPU_MAXSIZE / NBBY) {
				error = ERANGE;
				break;
			}
			cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
			if (vm_cpuset->which == VM_ACTIVE_CPUS)
				*cpuset = vm_active_cpus(sc->vm);
			else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
				*cpuset = vm_suspended_cpus(sc->vm);
			else if (vm_cpuset->which == VM_DEBUG_CPUS)
				*cpuset = vm_debug_cpus(sc->vm);
			else
				error = EINVAL;
			if (error == 0)
				error = copyout(cpuset, vm_cpuset->cpus, size);
			free(cpuset, M_TEMP);
			break;
		case VM_SUSPEND_CPU:
			vac = (struct vm_activate_cpu *)data;
			error = vm_suspend_cpu(sc->vm, vac->vcpuid);
			break;
		case VM_RESUME_CPU:
			vac = (struct vm_activate_cpu *)data;
			error = vm_resume_cpu(sc->vm, vac->vcpuid);
			break;
		case VM_SET_INTINFO:
			vmii = (struct vm_intinfo *)data;
			error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
			break;
		case VM_GET_INTINFO:
			vmii = (struct vm_intinfo *)data;
			error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
			    &vmii->info2);
			break;
		case VM_RTC_WRITE:
			rtcdata = (struct vm_rtc_data *)data;
			error = vrtc_nvram_write(sc->vm, rtcdata->offset,
			    rtcdata->value);
			break;
		case VM_RTC_READ:
			rtcdata = (struct vm_rtc_data *)data;
			error = vrtc_nvram_read(sc->vm, rtcdata->offset,
			    &rtcdata->value);
			break;
		case VM_RTC_SETTIME:
			rtctime = (struct vm_rtc_time *)data;
			error = vrtc_set_time(sc->vm, rtctime->secs);
			break;
		case VM_RTC_GETTIME:
			error = 0;
			rtctime = (struct vm_rtc_time *)data;
			rtctime->secs = vrtc_get_time(sc->vm);
			break;
		case VM_RESTART_INSTRUCTION:
			/* 'vcpu' was read from the ioctl data in the first switch. */
			error = vm_restart_instruction(sc->vm, vcpu);
			break;
		default:
			error = ENOTTY;
			break;
		}

		/* Release whatever the first switch froze. */
		if (state_changed == 1)
			vcpu_unlock_one(sc, vcpu);
		else if (state_changed == 2)
			vcpu_unlock_all(sc);

	done:
		/* Make sure that no handler returns a bogus value like ERESTART */
		KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
		return (error);
	}

	/*
	 * mmap(2) handler for the vmm cdev.
	 *
	 * Walks the guest memory map looking for a mapping that fully contains
	 * [*offset, *offset + mapsize).  If that mapping is backed by a system
	 * memory segment, a reference to the segment's VM object is returned in
	 * '*objp' and '*offset' is rewritten to the offset within the segment.
	 *
	 * Returns EINVAL for an executable mapping, an empty or wrapping range,
	 * a cdev without a softc, or a range backed by a non-system segment;
	 * otherwise the error from freezing the vcpu or from vm_mmap_getnext().
	 */
	static int
	vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
	    struct vm_object **objp, int nprot)
	{
		struct vmmdev_softc *sc;
		vm_paddr_t gpa;
		size_t len;
		vm_ooffset_t segoff, first, last;
		int error, found, segid;
		bool sysmem;

		first = *offset;
		last = first + mapsize;
		if ((nprot & PROT_EXEC) || first < 0 || first >= last)
			return (EINVAL);

		sc = vmmdev_lookup2(cdev);
		if (sc == NULL) {
			/* virtual machine is in the process of being created */
			return (EINVAL);
		}

		/*
		 * Get a read lock on the guest memory map by freezing any vcpu.
		 */
		error = vcpu_lock_one(sc, VM_MAXCPU - 1);
		if (error)
			return (error);

		gpa = 0;
		found = 0;
		while (!found) {
			error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
			    NULL, NULL);
			if (error)
				break;

			if (first >= gpa && last <= gpa + len)
				found = 1;
			else
				gpa += len;	/* continue after this mapping */
		}

		if (found) {
			error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
			KASSERT(error == 0 && *objp != NULL,
			    ("%s: invalid memory segment %d", __func__, segid));
			if (sysmem) {
				vm_object_reference(*objp);
				*offset = segoff + (first - gpa);
			} else {
				error = EINVAL;
			}
		}
		vcpu_unlock_one(sc, VM_MAXCPU - 1);
		return (error);
	}

	/*
	 * Tear down a virtual machine softc.  Used as the callback passed to
	 * destroy_dev_sched_cb() and called directly on the error paths of
	 * sysctl_vmm_create().
	 *
	 * All vcpus are frozen, the devmem softcs are freed, the cdev (if still
	 * set) and the vm (if set) are destroyed, the softc is taken off the
	 * global list when VSC_LINKED is set, and finally the softc is freed.
	 */
	static void
	vmmdev_destroy(void *arg)
	{
		struct vmmdev_softc *sc;
		struct devmem_softc *dsc;
		int error;

		sc = arg;

		error = vcpu_lock_all(sc);
		KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));

		/* Drain the devmem list one entry at a time. */
		for (;;) {
			dsc = SLIST_FIRST(&sc->devmem);
			if (dsc == NULL)
				break;
			KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
			SLIST_REMOVE_HEAD(&sc->devmem, link);
			free(dsc->name, M_VMMDEV);
			free(dsc, M_VMMDEV);
		}

		if (sc->cdev != NULL) {
			destroy_dev(sc->cdev);
		}

		if (sc->vm != NULL) {
			vm_destroy(sc->vm);
		}

		if ((sc->flags & VSC_LINKED) != 0) {
			mtx_lock(&vmmdev_mtx);
			SLIST_REMOVE(&head, sc, vmmdev_softc, link);
			mtx_unlock(&vmmdev_mtx);
		}

		free(sc, M_VMMDEV);
	}

	static int
	sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
	{
	int error;
	char buf[VM_MAX_NAMELEN];
	struct devmem_softc *dsc;
	struct vmmdev_softc *sc;
	struct cdev *cdev;

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 \|\| req->newptr == NULL)
	return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	if (sc == NULL \|\| sc->cdev == NULL) {
	mtx_unlock(&vmmdev_mtx);
	return (EINVAL);
	}

	/*
	* The 'cdev' will be destroyed asynchronously when 'si_threadcount'
	* goes down to 0 so we should not do it again in the callback.
	*
	* Setting 'sc->cdev' to NULL is also used to indicate that the VM
	* is scheduled for destruction.
	*/
	cdev = sc->cdev;
	sc->cdev = NULL;
	mtx_unlock(&vmmdev_mtx);

	/*
	* Schedule all cdevs to be destroyed:
	*
	* - any new operations on the 'cdev' will return an error (ENXIO).
	*
	* - when the 'si_threadcount' dwindles down to zero the 'cdev' will
	* be destroyed and the callback will be invoked in a taskqueue
	* context.
	*
	* - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
	*/
	SLIST_FOREACH(dsc, &sc->devmem, link) {
	KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
	destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
	}
	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
	return (0);
	}
	SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING \| CTLFLAG_RW,
	NULL, 0, sysctl_vmm_destroy, "A", NULL);

	/*
	 * Character device switch for the per-VM cdev: ioctl, mmap and
	 * read/write entry points.  Reads and writes share one handler.
	 */
	static struct cdevsw vmmdevsw = {
		.d_version	= D_VERSION,
		.d_name		= "vmmdev",
		.d_read		= vmmdev_rw,
		.d_write	= vmmdev_rw,
		.d_ioctl	= vmmdev_ioctl,
		.d_mmap_single	= vmmdev_mmap_single,
	};

	static int
	sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
	{
	int error;
	struct vm *vm;
	struct cdev *cdev;
	struct vmmdev_softc sc, sc2;
	char buf[VM_MAX_NAMELEN];

	strlcpy(buf, "beavis", sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 \|\| req->newptr == NULL)
	return (error);

	mtx_lock(&vmmdev_mtx);
	sc = vmmdev_lookup(buf);
	mtx_unlock(&vmmdev_mtx);
	if (sc != NULL)
	return (EEXIST);

	error = vm_create(buf, &vm);
	if (error != 0)
	return (error);

	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK \| M_ZERO);
	sc->vm = vm;
	SLIST_INIT(&sc->devmem);

	/*
	* Lookup the name again just in case somebody sneaked in when we
	* dropped the lock.
	*/
	mtx_lock(&vmmdev_mtx);
	sc2 = vmmdev_lookup(buf);
	if (sc2 == NULL) {
	SLIST_INSERT_HEAD(&head, sc, link);
	sc->flags \|= VSC_LINKED;
	}
	mtx_unlock(&vmmdev_mtx);

	if (sc2 != NULL) {
	vmmdev_destroy(sc);
	return (EEXIST);
	}

	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
	UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
	if (error != 0) {
	vmmdev_destroy(sc);
	return (error);
	}

	mtx_lock(&vmmdev_mtx);
	sc->cdev = cdev;
	sc->cdev->si_drv1 = sc;
	mtx_unlock(&vmmdev_mtx);

	return (0);
	}
	SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING \| CTLFLAG_RW,
	NULL, 0, sysctl_vmm_create, "A", NULL);

	/*
	 * Initialize 'vmmdev_mtx', the mutex used by the vmm device code in this
	 * file.
	 */
	void
	vmmdev_init(void)
	{
	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
	}

	/*
	 * Report whether the vmm device code can be cleaned up: returns 0 when
	 * the global list of virtual machines is empty and EBUSY otherwise.
	 */
	int
	vmmdev_cleanup(void)
	{

		return (SLIST_EMPTY(&head) ? 0 : EBUSY);
	}

	/*
	 * mmap(2) handler for a devmem cdev.
	 *
	 * On success a reference to the VM object backing the device memory
	 * segment is returned in '*objp'.
	 *
	 * Returns ENXIO if the cdev has no softc yet, EINVAL for an executable
	 * mapping, an empty or wrapping range, or a range that extends past the
	 * end of the segment, or the error from freezing the vcpu.
	 */
	static int
	devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
	    struct vm_object **objp, int nprot)
	{
		struct devmem_softc *dsc;
		vm_ooffset_t first, last;
		size_t seglen;
		int error;
		bool sysmem;

		dsc = cdev->si_drv1;
		if (dsc == NULL) {
			/* 'cdev' has been created but is not ready for use */
			return (ENXIO);
		}

		first = *offset;
		last = *offset + len;
		if ((nprot & PROT_EXEC) || first < 0 || first >= last)
			return (EINVAL);

		error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1);
		if (error)
			return (error);

		error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
		KASSERT(error == 0 && !sysmem && *objp != NULL,
		    ("%s: invalid devmem segment %d", __func__, dsc->segid));

		vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1);

		/* The requested range must lie entirely within the segment. */
		if (seglen >= last) {
			vm_object_reference(*objp);
			return (0);
		} else {
			return (EINVAL);
		}
	}

	/*
	 * Character device switch for a devmem cdev; mmap is the only entry
	 * point it provides.
	 */
	static struct cdevsw devmemsw = {
		.d_version	= D_VERSION,
		.d_name		= "devmem",
		.d_mmap_single	= devmem_mmap_single,
	};

	/*
	 * Create the cdev "vmm.io/<vmname>.<devname>" for device memory segment
	 * 'segid' and link a new devmem softc onto the virtual machine's list.
	 *
	 * On success the softc keeps the 'devname' pointer.  On failure the
	 * caller still owns 'devname'.
	 *
	 * Returns the error from make_dev_p(), or ENODEV if the virtual machine
	 * currently has no cdev.
	 */
	static int
	devmem_create_cdev(const char *vmname, int segid, char *devname)
	{
		struct devmem_softc *dsc;
		struct vmmdev_softc *sc;
		struct cdev *cdev;
		int error;

		error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
		    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
		if (error)
			return (error);

		dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);

		mtx_lock(&vmmdev_mtx);
		sc = vmmdev_lookup(vmname);
		KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
		if (sc->cdev == NULL) {
			/* virtual machine is being created or destroyed */
			mtx_unlock(&vmmdev_mtx);
			free(dsc, M_VMMDEV);
			destroy_dev_sched_cb(cdev, NULL, NULL);
			return (ENODEV);
		}

		dsc->segid = segid;
		dsc->name = devname;
		dsc->cdev = cdev;
		dsc->sc = sc;
		SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
		mtx_unlock(&vmmdev_mtx);

		/* The 'cdev' is ready for use after 'si_drv1' is initialized */
		cdev->si_drv1 = dsc;
		return (0);
	}

	/*
	 * Callback passed to destroy_dev_sched_cb() for a devmem cdev: marks the
	 * devmem softc as no longer having a cdev or an owning virtual machine.
	 * The softc itself and its name are freed later by vmmdev_destroy().
	 */
	static void
	devmem_destroy(void *arg)
	{
		struct devmem_softc *dsc;

		dsc = arg;
		KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));

		dsc->cdev = NULL;
		dsc->sc = NULL;
	}

File Metadata

Mime Type: text/x-diff
Expires: Wed, Nov 12, 11:44 PM (1 h, 3 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 25184755
Default Alt Text: (321 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions