diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 11f38c926cc3..7347c41dd311 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1,1578 +1,1643 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include +#include #include #include +#include #include "vmmapi.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) /* * Size of the guard region before and after the virtual address space * mapping the guest physical memory. This must be a multiple of the * superpage size for performance reasons. */ #define VM_MMAP_GUARD_SIZE (4 * MB) #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) struct vmctx { int fd; uint32_t lowmem_limit; int memflags; size_t lowmem; size_t highmem; char *baseaddr; char *name; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) static int vm_device_open(const char *name) { int fd, len; char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); /* Open the device file */ fd = open(vmfile, O_RDWR, 0); free(vmfile); return (fd); } int vm_create(const char *name) { /* Try to load vmm(4) module before creating a guest. 
*/ if (modfind("vmm") < 0) kldload("vmm"); return (CREATE((char *)name)); } struct vmctx * vm_open(const char *name) { struct vmctx *vm; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = -1; vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; return (vm); err: vm_destroy(vm); return (NULL); } void vm_destroy(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); DESTROY(vm->name); free(vm); } int vm_parse_memsize(const char *optarg, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(optarg, &endptr, 0); if (*optarg != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. */ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(optarg, ret_memsize); return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx) { return (ctx->lowmem_limit); } void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { ctx->lowmem_limit = limit; } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } int vm_get_memflags(struct vmctx *ctx) { return (ctx->memflags); } /* * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot) { struct vm_memmap memmap; int error, flags; memmap.gpa = gpa; memmap.segid = segid; memmap.segoff = off; memmap.len = len; memmap.prot = prot; memmap.flags = 0; if (ctx->memflags & VM_MEM_F_WIRED) memmap.flags |= VM_MEMMAP_F_WIRED; /* * If this mapping already exists then don't create it again. This * is the common case for SYSMEM mappings created by bhyveload(8). */ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); if (error == 0 && gpa == memmap.gpa) { if (segid != memmap.segid || off != memmap.segoff || prot != memmap.prot || flags != memmap.flags) { errno = EEXIST; return (-1); } else { return (0); } } error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } +int +vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, + size_t *lowmem_size, size_t *highmem_size) +{ + + *guest_baseaddr = ctx->baseaddr; + *lowmem_size = ctx->lowmem; + *highmem_size = ctx->highmem; + return (0); +} + int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct vm_memmap memmap; int error; bzero(&memmap, sizeof(struct vm_memmap)); memmap.gpa = *gpa; error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); if (error == 0) { *gpa = memmap.gpa; *segid = memmap.segid; *segoff = memmap.segoff; *len = memmap.len; *prot = memmap.prot; *flags = memmap.flags; } return (error); } /* * Return 0 if the segments are identical and non-zero otherwise. * * This is slightly complicated by the fact that only device memory segments * are named. */ static int cmpseg(size_t len, const char *str, size_t len2, const char *str2) { if (len == len2) { if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) return (0); } return (-1); } static int vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { struct vm_memseg memseg; size_t n; int error; /* * If the memory segment has already been created then just return. 
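[Editor's note: a minimal, hypothetical lifecycle sketch, not part of this change, tying together vm_create(), vm_open(), vm_parse_memsize(), vm_setup_memory() and vm_destroy() as declared above. The guest name "demo", the "512M" size string, the demo_* function name and the err(3) error handling are illustrative assumptions only; header order follows what in-tree consumers such as bhyve(8) typically use.]

#include <sys/types.h>

#include <machine/vmm.h>
#include <vmmapi.h>

#include <err.h>
#include <stdlib.h>

/* Create a VM named "demo", map 512MB of guest memory, then tear it down. */
static void
demo_vm_lifecycle(void)
{
        struct vmctx *ctx;
        size_t memsize;

        if (vm_parse_memsize("512M", &memsize) != 0)
                err(1, "vm_parse_memsize");
        if (vm_create("demo") != 0)
                err(1, "vm_create");
        if ((ctx = vm_open("demo")) == NULL)
                err(1, "vm_open");
        if (vm_setup_memory(ctx, memsize, VM_MMAP_ALL) != 0)
                err(1, "vm_setup_memory");

        /* ... configure vcpus and run the guest here ... */

        vm_destroy(ctx);        /* closes the fd and destroys the kernel VM */
}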
* This is the usual case for the SYSMEM segment created by userspace * loaders like bhyveload(8). */ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, sizeof(memseg.name)); if (error) return (error); if (memseg.len != 0) { if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { errno = EINVAL; return (-1); } else { return (0); } } bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { errno = ENAMETOOLONG; return (-1); } } error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); return (error); } int vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, size_t bufsize) { struct vm_memseg memseg; size_t n; int error; memseg.segid = segid; error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); if (error == 0) { *lenp = memseg.len; n = strlcpy(namebuf, memseg.name, bufsize); if (n >= bufsize) { errno = ENAMETOOLONG; error = -1; } } return (error); } static int setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); if (error) return (error); flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap into the process address space on the host */ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); if (ptr == MAP_FAILED) return (-1); return (0); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { size_t objsize, len; vm_paddr_t gpa; char *baseaddr, *ptr; int error; assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then * create another 'highmem' segment above 4GB for the remainder. */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; ctx->highmem = memsize - ctx->lowmem_limit; objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; objsize = ctx->lowmem; } error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); if (error) return (error); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); baseaddr = ptr + VM_MMAP_GUARD_SIZE; if (ctx->highmem > 0) { gpa = 4*GB; len = ctx->highmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } if (ctx->lowmem > 0) { gpa = 0; len = ctx->lowmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } ctx->baseaddr = baseaddr; return (0); } /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. * * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. * The instruction emulation code depends on this behavior. 
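[Editor's note: an illustrative sketch, not part of this change, of the address-space layout established above and of the new vm_rev_map_gpa() reverse mapping added just below. It assumes a guest configured with the default 3 GB lowmem limit plus memory above 4 GB, so GPAs [0, 3GB) and [4GB, top) are backed and the [3GB, 4GB) MMIO hole is not; the demo_* name and the example GPAs are made up.]

#include <assert.h>
#include <stdio.h>

#include <machine/vmm.h>
#include <vmmapi.h>

/* Assumes 3GB of lowmem and additional guest memory above 4GB. */
static void
demo_layout_and_reverse_map(struct vmctx *ctx)
{
        void *hva;
        vm_paddr_t gpa;

        assert(vm_map_gpa(ctx, 0, 4096) != NULL);               /* lowmem */
        assert(vm_map_gpa(ctx, 3UL << 30, 4096) == NULL);       /* MMIO hole */
        assert(vm_map_gpa(ctx, 4UL << 30, 4096) != NULL);       /* highmem */

        /* Round-trip: map a guest page, then recover its GPA from the HVA. */
        hva = vm_map_gpa(ctx, 0x1000, 4096);
        gpa = vm_rev_map_gpa(ctx, hva);
        printf("hva %p -> gpa 0x%lx\n", hva, (unsigned long)gpa);
}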
*/ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { if (ctx->lowmem > 0) { if (gaddr < ctx->lowmem && len <= ctx->lowmem && gaddr + len <= ctx->lowmem) return (ctx->baseaddr + gaddr); } if (ctx->highmem > 0) { if (gaddr >= 4*GB) { if (gaddr < 4*GB + ctx->highmem && len <= ctx->highmem && gaddr + len <= 4*GB + ctx->highmem) return (ctx->baseaddr + gaddr); } } return (NULL); } +vm_paddr_t +vm_rev_map_gpa(struct vmctx *ctx, void *addr) +{ + vm_paddr_t offaddr; + + offaddr = (char *)addr - ctx->baseaddr; + + if (ctx->lowmem > 0) + if (offaddr >= 0 && offaddr <= ctx->lowmem) + return (offaddr); + + if (ctx->highmem > 0) + if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem) + return (offaddr); + + return ((vm_paddr_t)-1); +} + +/* TODO: maximum size for vmname */ +int +vm_get_name(struct vmctx *ctx, char *buf, size_t max_len) +{ + + if (strlcpy(buf, ctx->name, max_len) >= max_len) + return (EINVAL); + return (0); +} + size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem); } void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { char pathname[MAXPATHLEN]; size_t len2; char *base, *ptr; int fd, error, flags; fd = -1; ptr = MAP_FAILED; if (name == NULL || strlen(name) == 0) { errno = EINVAL; goto done; } error = vm_alloc_memseg(ctx, segid, len, name); if (error) goto done; strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); strlcat(pathname, ctx->name, sizeof(pathname)); strlcat(pathname, ".", sizeof(pathname)); strlcat(pathname, name, sizeof(pathname)); fd = open(pathname, O_RDWR); if (fd < 0) goto done; /* * Stake out a contiguous region covering the device memory and the * adjoining guard regions. */ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (base == MAP_FAILED) goto done; flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap the devmem region in the host address space */ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); done: if (fd >= 0) close(fd); return (ptr); } int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) { int error; error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, &seg_desc->access); return (error); } int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; vmreg.regval = val; error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vmctx *ctx, int vcpu, int reg, 
uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset); return (error); } int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset); return (error); } int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); vmrun.cpuid = vcpu; error = ioctl(ctx->fd, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); return (error); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vm_exception exc; exc.cpuid = vcpu; exc.vector = vector; exc.error_code = errcode; exc.error_code_valid = errcode_valid; exc.restart_instruction = restart_instruction; return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); } int vm_apicid2vcpu(struct vmctx *ctx, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. 
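[Editor's note: a hedged sketch, not part of this change, of the basic vcpu run loop built on vm_set_register()/vm_run() above. The 0xfff0 reset vector matches what vcpu_reset() below programs; real consumers dispatch every vm_exit reason to device emulation, which is elided here, and the demo_* name is an assumption.]

#include <machine/vmm.h>
#include <vmmapi.h>

#include <err.h>
#include <stdio.h>

/* Run one vcpu until the guest executes HLT. */
static void
demo_run_vcpu(struct vmctx *ctx, int vcpu)
{
        struct vm_exit vmexit;

        (void)vm_activate_cpu(ctx, vcpu);       /* no-op if already active */
        if (vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, 0xfff0) != 0)
                err(1, "vm_set_register");
        for (;;) {
                if (vm_run(ctx, vcpu, &vmexit) != 0)
                        err(1, "vm_run");
                if (vmexit.exitcode == VM_EXITCODE_HLT)
                        break;
                /* hand other exit reasons to device emulation here */
        }
        printf("vcpu %d halted at rip 0x%lx\n", vcpu, (unsigned long)vmexit.rip);
}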
*/ return (apicid); } int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); } int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq)); } int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) { struct vm_lapic_msi vmmsi; bzero(&vmmsi, sizeof(vmmsi)); vmmsi.addr = addr; vmmsi.msg = msg; return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); } int vm_ioapic_assert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); } int vm_ioapic_pincount(struct vmctx *ctx, int *pincount) { return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); } int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); } int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); } int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); } int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger) { struct vm_isa_irq_trigger isa_irq_trigger; bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); isa_irq_trigger.atpic_irq = atpic_irq; isa_irq_trigger.trigger = trigger; return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); } int vm_inject_nmi(struct vmctx *ctx, int vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); vmnmi.cpuid = vcpu; return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); } static const char *capstrmap[] = { [VM_CAP_HALT_EXIT] = "hlt_exit", [VM_CAP_MTRAP_EXIT] = "mtrap_exit", [VM_CAP_PAUSE_EXIT] = "pause_exit", [VM_CAP_UNRESTRICTED_GUEST] = "unrestricted_guest", [VM_CAP_ENABLE_INVPCID] = "enable_invpcid", [VM_CAP_BPT_EXIT] = "bpt_exit", }; int vm_capability_name2type(const char *capname) { int i; for (i = 0; i < nitems(capstrmap); i++) { if (strcmp(capstrmap[i], capname) == 0) return (i); } return (-1); } const char * vm_capability_type2name(int type) { if (type >= 0 && type < nitems(capstrmap)) return (capstrmap[type]); return (NULL); } int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; error = ioctl(ctx->fd, 
VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); pptmsi.vcpu = vcpu; pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.msg = msg; pptmsi.addr = addr; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); pptmsix.vcpu = vcpu; pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) { int error; static struct vm_stats vmstats; vmstats.cpuid = vcpu; error = ioctl(ctx->fd, VM_STATS, &vmstats); if (error == 0) { if (ret_entries) *ret_entries = vmstats.num_entries; if (ret_tv) *ret_tv = vmstats.tv; return (vmstats.statbuf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; x2apic.state = state; error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. 
IA-32 Processor States Following Power-up, Reset or INIT */ int vcpu_reset(struct vmctx *vmctx, int vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; cr0 = CR0_NE; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, 
desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; /* XXX cr2, debug registers */ error = 0; done: return (error); } int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) { int error; struct vm_hpet_cap cap; bzero(&cap, sizeof(struct vm_hpet_cap)); error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); if (capabilities != NULL) *capabilities = cap.capabilities; return (error); } int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } int vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #ifndef min #define min(a,b) (((a) < (b)) ? (a) : (b)) #endif int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault) { void *va; uint64_t gpa; int error, i, n, off; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(len, PAGE_SIZE - off); va = vm_map_gpa(ctx, gpa, n); if (va == NULL) return (EFAULT); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt) { return; } void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, 
VM_DEBUG_CPUS, cpus)); } int vm_activate_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); return (error); } int vm_suspend_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); return (error); } int vm_resume_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); return (error); } int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; vmii.info1 = info1; error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); return (error); } int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; rtcdata.value = value; error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); return (error); } int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); if (error == 0) *retval = rtcdata.value; return (error); } int vm_rtc_settime(struct vmctx *ctx, time_t secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); rtctime.secs = secs; error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); return (error); } int vm_rtc_gettime(struct vmctx *ctx, time_t *secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); if (error == 0) *secs = rtctime.secs; return (error); } int vm_restart_instruction(void *arg, int vcpu) { struct vmctx *ctx = arg; return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } +int +vm_snapshot_req(struct vm_snapshot_meta *meta) +{ + + if (ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", + __func__, meta->dev_name, errno); +#endif + return (-1); + } + return (0); +} + +int +vm_restore_time(struct vmctx *ctx) +{ + int dummy; + + dummy = 0; + return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); +} + int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { struct vm_cpu_topology topology; bzero(&topology, sizeof (struct vm_cpu_topology)); topology.sockets = sockets; topology.cores = cores; topology.threads = threads; topology.maxcpus = maxcpus; return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); } int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { struct vm_cpu_topology topology; int error; bzero(&topology, sizeof (struct vm_cpu_topology)); error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); if (error == 0) { *sockets = topology.sockets; *cores = topology.cores; *threads = topology.threads; *maxcpus = topology.maxcpus; } return (error); } int 
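[Editor's note: a sketch, not part of this change, of how a save/restore path might use the new vm_snapshot_req() and vm_restore_time() wrappers added above. It assumes struct vm_snapshot_meta comes from machine/vmm_snapshot.h as introduced by this series, and that the caller's snapshot machinery has already filled in the metadata (context, device name, data buffer); only fields shown in this diff are touched.]

#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
#include <vmmapi.h>

#include <stdio.h>

/* Forward one device's snapshot request to the kernel. */
static int
demo_snapshot_dev(struct vm_snapshot_meta *meta)
{
        if (vm_snapshot_req(meta) != 0) {
                fprintf(stderr, "snapshot of %s failed\n", meta->dev_name);
                return (-1);
        }
        return (0);
}

/* After all devices are restored, let the kernel fix up guest timers. */
static int
demo_finish_restore(struct vmctx *ctx)
{
        return (vm_restore_time(ctx));
}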
vm_get_device_fd(struct vmctx *ctx) { return (ctx->fd); } const cap_ioctl_t * vm_get_ioctls(size_t *len) { cap_ioctl_t *cmds; /* keep in sync with machine/vmm_dev.h */ static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER, VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR, VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY }; if (len == NULL) { cmds = malloc(sizeof(vm_ioctl_cmds)); if (cmds == NULL) return (NULL); bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); return (cmds); } *len = nitems(vm_ioctl_cmds); return (NULL); } diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index 9819cda16bd9..2b026031b50f 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -1,240 +1,269 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include +#include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. */ #define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vmctx; +struct vm_snapshot_meta; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. 
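[Editor's note: an illustrative sketch, not part of this change, of the vm_get_ioctls() calling convention above, roughly as a Capsicum-sandboxed consumer would use it: a non-NULL argument only reports the count, a NULL argument returns a malloc'd copy of the command list. cap_ioctls_limit() is the standard Capsicum call; the demo_* name and the simplified error handling are assumptions.]

#include <sys/types.h>
#include <sys/capsicum.h>
#include <sys/cdefs.h>

#include <machine/vmm.h>
#include <vmmapi.h>

#include <err.h>
#include <stdlib.h>

/* Restrict the VM device descriptor to the whitelisted ioctls. */
static void
demo_limit_vm_fd(struct vmctx *ctx)
{
        const cap_ioctl_t *cmds;
        size_t ncmds;

        (void)vm_get_ioctls(&ncmds);            /* fetch the count */
        if ((cmds = vm_get_ioctls(NULL)) == NULL)
                err(1, "vm_get_ioctls");
        if (cap_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) != 0)
                err(1, "cap_ioctls_limit");
        free(__DECONST(void *, cmds));
}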
*/ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; /* * 'flags' value passed to 'vm_set_memflags()'. */ #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ /* * Identifiers for memory segments: * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. * - the remaining identifiers can be used to create devmem segments. */ enum { VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, }; /* * Get the length and name of the memory segment identified by 'segid'. * Note that system memory segments are identified with a nul name. * * Returns 0 on success and non-zero otherwise. */ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, size_t namesiz); /* * Iterate over the guest address space. This function finds an address range * that starts at an address >= *gpa. * * Returns 0 if the next address range was found and non-zero otherwise. */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); + +int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, + size_t *lowmem_size, size_t *highmem_size); + /* * Create a device memory segment identified by 'segid'. * * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. */ void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t segoff, size_t len, int prot); int vm_create(const char *name); int vm_get_device_fd(struct vmctx *ctx); struct vmctx *vm_open(const char *name); void vm_destroy(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +/* inverse operation to vm_map_gpa - extract guest address from host pointer */ +vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); int vm_gla2gpa_nofault(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); +int vm_get_name(struct vmctx *ctx, char *buffer, size_t max_len); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_get_register_set(struct vmctx *ctx, int vcpu, 
unsigned int count, const int *regnums, uint64_t *regvals); int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); int vm_inject_nmi(struct vmctx *ctx, int vcpu); int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); const cap_ioctl_t *vm_get_ioctls(size_t *len); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s); int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accommodate all GPA segments. 
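[Editor's note: a hedged sketch, not part of this change, of the GLA-to-GPA translation interface described in the comment that continues below: read a 32-bit value from a guest linear address via vm_copy_setup()/vm_copyin()/vm_copy_teardown(). The caller is assumed to already hold the vcpu's vm_guest_paging state; DEMO_MAXIOV and the demo_* name are made up for the example.]

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <machine/vmm.h>
#include <vmmapi.h>

#define DEMO_MAXIOV     2       /* a 4-byte read spans at most two pages */

/*
 * Returns 0 on success, -1 if the translation failed or an exception was
 * injected into the guest.
 */
static int
demo_read_guest_u32(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, uint32_t *valp)
{
        struct iovec iov[DEMO_MAXIOV];
        int error, fault;

        error = vm_copy_setup(ctx, vcpu, paging, gla, sizeof(*valp),
            PROT_READ, iov, DEMO_MAXIOV, &fault);
        if (error != 0 || fault)
                return (-1);
        vm_copyin(ctx, vcpu, iov, valp, sizeof(*valp));
        vm_copy_teardown(ctx, vcpu, iov, DEMO_MAXIOV);
        return (0);
}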
* * retval fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Error */ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, struct iovec *guest_iov, size_t len); void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt); /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vmctx *ctx, int vcpu); int vm_suspend_cpu(struct vmctx *ctx, int vcpu); int vm_resume_cpu(struct vmctx *ctx, int vcpu); /* CPU topology */ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); /* * FreeBSD specific APIs */ int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); + +/* + * Save and restore + */ + +#define MAX_SNAPSHOT_VMNAME 100 + +enum checkpoint_opcodes { + START_CHECKPOINT = 0, + START_SUSPEND = 1, +}; + +struct checkpoint_op { + unsigned int op; + char snapshot_filename[MAX_SNAPSHOT_VMNAME]; +}; + +int vm_snapshot_req(struct vm_snapshot_meta *meta); +int vm_restore_time(struct vmctx *ctx); + #endif /* _VMMAPI_H_ */ diff --git a/share/man/man5/src.conf.5 b/share/man/man5/src.conf.5 index da3a1f9c3044..4d28f019e100 100644 --- a/share/man/man5/src.conf.5 +++ b/share/man/man5/src.conf.5 @@ -1,1781 +1,1788 @@ .\" DO NOT EDIT-- this file is @generated by tools/build/options/makeman. .\" $FreeBSD$ -.Dd April 30, 2020 +.Dd May 4, 2020 .Dt SRC.CONF 5 .Os .Sh NAME .Nm src.conf .Nd "source build options" .Sh DESCRIPTION The .Nm file contains settings that will apply to every build involving the .Fx source tree; see .Xr build 7 . .Pp The .Nm file uses the standard makefile syntax. However, .Nm should not specify any dependencies to .Xr make 1 . Instead, .Nm is to set .Xr make 1 variables that control the aspects of how the system builds. .Pp The default location of .Nm is .Pa /etc/src.conf , though an alternative location can be specified in the .Xr make 1 variable .Va SRCCONF . Overriding the location of .Nm may be necessary if the system-wide settings are not suitable for a particular build. For instance, setting .Va SRCCONF to .Pa /dev/null effectively resets all build controls to their defaults. .Pp The only purpose of .Nm is to control the compilation of the .Fx source code, which is usually located in .Pa /usr/src . As a rule, the system administrator creates .Nm when the values of certain control variables need to be changed from their defaults. 
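[Editor's note: returning to the save/restore declarations added to vmmapi.h above, a hypothetical sketch of filling in the new struct checkpoint_op, the control message a management tool (e.g. bhyvectl) would prepare when requesting a snapshot. How the request is delivered to the bhyve process, for example over a control socket, is outside this diff and omitted; the demo_* name is an assumption.]

#include <string.h>

#include <machine/vmm.h>
#include <vmmapi.h>

/* Prepare a checkpoint (or suspend) request for the running VM. */
static void
demo_prepare_checkpoint(struct checkpoint_op *op, const char *file, int suspend)
{
        memset(op, 0, sizeof(*op));
        op->op = suspend ? START_SUSPEND : START_CHECKPOINT;
        (void)strlcpy(op->snapshot_filename, file,
            sizeof(op->snapshot_filename));
}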
.Pp In addition, control variables can be specified for a particular build via the .Fl D option of .Xr make 1 or in its environment; see .Xr environ 7 . .Pp The environment of .Xr make 1 for the build can be controlled via the .Va SRC_ENV_CONF variable, which defaults to .Pa /etc/src-env.conf . Some examples that may only be set in this file are .Va WITH_DIRDEPS_BUILD , and .Va WITH_META_MODE , and .Va MAKEOBJDIRPREFIX as they are environment-only variables. .Pp The values of variables are ignored regardless of their setting; even if they would be set to .Dq Li FALSE or .Dq Li NO . The presence of an option causes it to be honored by .Xr make 1 . .Pp This list provides a name and short description for variables that can be used for source builds. .Bl -tag -width indent .It Va WITHOUT_ACCT Set to not build process accounting tools such as .Xr accton 8 and .Xr sa 8 . .It Va WITHOUT_ACPI Set to not build .Xr acpiconf 8 , .Xr acpidump 8 and related programs. .It Va WITHOUT_APM Set to not build .Xr apm 8 , .Xr apmd 8 and related programs. .It Va WITHOUT_ASSERT_DEBUG Set to compile programs and libraries without the .Xr assert 3 checks. .It Va WITHOUT_AT Set to not build .Xr at 1 and related utilities. .It Va WITHOUT_ATM Set to not build programs and libraries related to ATM networking. .It Va WITHOUT_AUDIT Set to not build audit support into system programs. .It Va WITHOUT_AUTHPF Set to not build .Xr authpf 8 . .It Va WITHOUT_AUTOFS Set to not build .Xr autofs 5 related programs, libraries, and kernel modules. .It Va WITHOUT_AUTO_OBJ Disable automatic creation of objdirs. This is enabled by default if the wanted OBJDIR is writable by the current user. .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITH_BEARSSL Build the BearSSL library. .Pp BearSSL is a tiny SSL library suitable for embedded environments. For details see .Lk http://www.BearSSL.org/ .Pp This library is currently only used to perform signature verification and related operations for Verified Exec and .Xr loader 8 . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_LOADER_EFI_SECUREBOOT (unless .Va WITHOUT_LOADER_EFI_SECUREBOOT is set explicitly) .It Va WITH_LOADER_VERIEXEC (unless .Va WITHOUT_LOADER_VERIEXEC is set explicitly) .It Va WITH_LOADER_VERIEXEC_VECTX (unless .Va WITHOUT_LOADER_VERIEXEC_VECTX is set explicitly) .It Va WITH_VERIEXEC (unless .Va WITHOUT_VERIEXEC is set explicitly) .El .It Va WITHOUT_BHYVE Set to not build or install .Xr bhyve 8 , associated utilities, and examples. .Pp This option only affects amd64/amd64. +.It Va WITH_BHYVE_SNAPSHOT +Set to include support for save and restore (snapshots) in +.Xr bhyve 8 +and +.Xr bhyvectl 8 . +.Pp +This option only affects amd64/amd64. .It Va WITH_BIND_NOW Build all binaries with the .Dv DF_BIND_NOW flag set to indicate that the run-time loader should perform all relocation processing at process startup rather than on demand. .It Va WITHOUT_BINUTILS Do not build or install GNU .Xr as 1 and .Xr objdump 1 as part of the normal system build. .Pp This is a default setting on arm64/aarch64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_BINUTILS Build and install GNU .Xr as 1 on i386 and amd64, and .Xr objdump 1 as part of the normal system build. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, i386/i386, mips/mips, mips/mips64, powerpc/powerpc and powerpc/powerpc64. 
.It Va WITHOUT_BINUTILS_BOOTSTRAP Do not build GNU binutils as part of the bootstrap process. .Pp This is a default setting on arm/armv6, arm/armv7, arm64/aarch64, mips/mips, mips/mips64, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_BINUTILS_BOOTSTRAP Build GNU binutils as part of the bootstrap process. .Pp This is a default setting on amd64/amd64 and i386/i386. .It Va WITHOUT_BLACKLIST Set this if you do not want to build .Xr blacklistd 8 and .Xr blacklistctl 8 . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_BLACKLIST_SUPPORT (unless .Va WITH_BLACKLIST_SUPPORT is set explicitly) .El .It Va WITHOUT_BLACKLIST_SUPPORT Set to build some programs without .Xr libblacklist 3 support, like .Xr fingerd 8 , .Xr ftpd 8 , and .Xr sshd 8 . .It Va WITHOUT_BLUETOOTH Set to not build Bluetooth related kernel modules, programs and libraries. .It Va WITHOUT_BOOT Set to not build the boot blocks and loader. .It Va WITHOUT_BOOTPARAMD Set to not build or install .Xr bootparamd 8 . .It Va WITHOUT_BOOTPD Set to not build or install .Xr bootpd 8 . .It Va WITHOUT_BSDINSTALL Set to not build .Xr bsdinstall 8 , .Xr sade 8 , and related programs. .It Va WITHOUT_BSD_CPIO Set to not build the BSD licensed version of cpio based on .Xr libarchive 3 . .It Va WITH_BSD_GREP Install BSD-licensed grep as '[ef]grep' instead of GNU grep. .It Va WITHOUT_BSNMP Set to not build or install .Xr bsnmpd 1 and related libraries and data files. .It Va WITHOUT_BZIP2 Set to not build contributed bzip2 software as a part of the base system. .Bf -symbolic The option has no effect yet. .Ef When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_BZIP2_SUPPORT (unless .Va WITH_BZIP2_SUPPORT is set explicitly) .El .It Va WITHOUT_BZIP2_SUPPORT Set to build some programs without optional bzip2 support. .It Va WITHOUT_CALENDAR Set to not build .Xr calendar 1 . .It Va WITHOUT_CAPSICUM Set to not build Capsicum support into system programs. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CASPER .El .It Va WITHOUT_CAROOT Set to not add the trusted certificates from the Mozilla NSS bundle to base. .It Va WITHOUT_CASPER Set to not build Casper program and related libraries. .It Va WITH_CCACHE_BUILD Set to use .Xr ccache 1 for the build. No configuration is required except to install the .Sy devel/ccache package. When using with .Xr distcc 1 , set .Sy CCACHE_PREFIX=/usr/local/bin/distcc . The default cache directory of .Pa $HOME/.ccache will be used, which can be overridden by setting .Sy CCACHE_DIR . The .Sy CCACHE_COMPILERCHECK option defaults to .Sy content when using the in-tree bootstrap compiler, and .Sy mtime when using an external compiler. The .Sy CCACHE_CPP2 option is used for Clang but not GCC. .Pp Sharing a cache between multiple work directories requires using a layout similar to .Pa /some/prefix/src .Pa /some/prefix/obj and an environment such as: .Bd -literal -offset indent CCACHE_BASEDIR='${SRCTOP:H}' MAKEOBJDIRPREFIX='${SRCTOP:H}/obj' .Ed .Pp See .Xr ccache 1 for more configuration options. .It Va WITHOUT_CCD Set to not build .Xr geom_ccd 4 and related utilities. .It Va WITHOUT_CDDL Set to not build code licensed under Sun's CDDL. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CTF .It .Va WITHOUT_LOADER_ZFS .It .Va WITHOUT_ZFS .El .It Va WITHOUT_CLANG Set to not build the Clang C/C++ compiler during the regular phase of the build. 
When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CLANG_EXTRAS .It .Va WITHOUT_CLANG_FULL .It .Va WITHOUT_LLVM_COV .El .It Va WITHOUT_CLANG_BOOTSTRAP Set to not build the Clang C/C++ compiler during the bootstrap phase of the build. To be able to build the system, either gcc or clang bootstrap must be enabled unless an alternate compiler is provided via XCC. .It Va WITH_CLANG_EXTRAS Set to build additional clang and llvm tools, such as bugpoint and clang-format. .It Va WITHOUT_CLANG_FULL Set to avoid building the ARCMigrate, Rewriter and StaticAnalyzer components of the Clang C/C++ compiler. .It Va WITHOUT_CLANG_IS_CC Do not install links to the Clang C/C++ compiler as .Pa /usr/bin/cc , .Pa /usr/bin/c++ and .Pa /usr/bin/cpp . .It Va WITHOUT_CPP Set to not build .Xr cpp 1 . .It Va WITHOUT_CROSS_COMPILER Set to not build any cross compiler in the cross-tools stage of buildworld. When compiling a different version of .Fx than what is installed on the system, provide an alternate compiler with XCC to ensure success. When compiling with an identical version of .Fx to the host, this option may be safely used. This option may also be safe when the host version of .Fx is close to the sources being built, but all bets are off if there have been any changes to the toolchain between the versions. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_BINUTILS_BOOTSTRAP .It .Va WITHOUT_CLANG_BOOTSTRAP .It .Va WITHOUT_ELFTOOLCHAIN_BOOTSTRAP .It .Va WITHOUT_LLD_BOOTSTRAP .El .It Va WITHOUT_CRYPT Set to not build any crypto code. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DMAGENT .It .Va WITHOUT_KERBEROS .It .Va WITHOUT_KERBEROS_SUPPORT .It .Va WITHOUT_LDNS .It .Va WITHOUT_LDNS_UTILS .It .Va WITHOUT_OPENSSH .It .Va WITHOUT_OPENSSL .It .Va WITHOUT_PKGBOOTSTRAP .It .Va WITHOUT_SVN .It .Va WITHOUT_SVNLITE .It .Va WITHOUT_UNBOUND .It .Va WITHOUT_WIRELESS .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GSSAPI (unless .Va WITH_GSSAPI is set explicitly) .El .It Va WITH_CTF Set to compile with CTF (Compact C Type Format) data. CTF data encapsulates a reduced form of debugging information similar to DWARF and the venerable stabs and is required for DTrace. .It Va WITHOUT_CUSE Set to not build CUSE-related programs and libraries. .It Va WITHOUT_CXGBETOOL Set to not build .Xr cxgbetool 8 .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_CXGBETOOL Set to build .Xr cxgbetool 8 .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386 and powerpc/powerpc64. .It Va WITHOUT_CXX Set to not build .Xr c++ 1 and related libraries. It will also prevent building of .Xr gperf 1 and .Xr devd 8 . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CLANG .It .Va WITHOUT_CLANG_EXTRAS .It .Va WITHOUT_CLANG_FULL .It .Va WITHOUT_DTRACE_TESTS .It .Va WITHOUT_GOOGLETEST .It .Va WITHOUT_LLVM_COV .It .Va WITHOUT_TESTS .El .It Va WITHOUT_DEBUG_FILES Set to avoid building or installing standalone debug files for each executable binary and shared library. .It Va WITHOUT_DIALOG Set to not build .Xr dialog 1 , .Xr dialog 3 , .Xr dpv 1 , and .Xr dpv 3 . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_BSDINSTALL .El .It Va WITHOUT_DICT Set to not build the Webster dictionary files. .It Va WITH_DIRDEPS_BUILD This is an experimental build system. 
For details see http://www.crufty.net/sjg/docs/freebsd-meta-mode.htm. Build commands can be seen from the top-level with: .Dl make show-valid-targets The build is driven by dirdeps.mk using .Va DIRDEPS stored in Makefile.depend files found in each directory. .Pp The build can be started from anywhere, and behaves the same. The initial instance of .Xr make 1 recursively reads .Va DIRDEPS from .Pa Makefile.depend , computing a graph of tree dependencies from the current origin. Setting .Va NO_DIRDEPS skips checking dirdep dependencies and will only build in the current and child directories. .Va NO_DIRDEPS_BELOW skips building any dirdeps and only build the current directory. .Pp This also utilizes the .Va WITH_META_MODE logic for incremental builds. .Pp The build hides commands executed unless .Va NO_SILENT is defined. .Pp Note that there is currently no mass install feature for this. .Pp When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITH_INSTALL_AS_USER .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_META_MODE (unless .Va WITHOUT_META_MODE is set explicitly) .It Va WITH_STAGING (unless .Va WITHOUT_STAGING is set explicitly) .It Va WITH_STAGING_MAN (unless .Va WITHOUT_STAGING_MAN is set explicitly) .It Va WITH_STAGING_PROG (unless .Va WITHOUT_STAGING_PROG is set explicitly) .It Va WITH_SYSROOT (unless .Va WITHOUT_SYSROOT is set explicitly) .El .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITH_DIRDEPS_CACHE Cache result of dirdeps.mk which can save significant time for subsequent builds. Depends on .Va WITH_DIRDEPS_BUILD . .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_DMAGENT Set to not build dma Mail Transport Agent. .It Va WITHOUT_DOCCOMPRESS Set to not install compressed system documentation. Only the uncompressed version will be installed. .It Va WITH_DTRACE_TESTS Set to build and install the DTrace test suite in .Pa /usr/tests/cddl/usr.sbin/dtrace . This test suite is considered experimental on architectures other than amd64/amd64 and running it may cause system instability. .It Va WITHOUT_DYNAMICROOT Set this if you do not want to link .Pa /bin and .Pa /sbin dynamically. .It Va WITHOUT_EE Set to not build and install .Xr edit 1 , .Xr ee 1 , and related programs. .It Va WITHOUT_EFI Set not to build .Xr efivar 3 and .Xr efivar 8 . .Pp This is a default setting on mips/mips, mips/mips64, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_EFI Set to build .Xr efivar 3 and .Xr efivar 8 . .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64 and i386/i386. .It Va WITHOUT_ELFTOOLCHAIN_BOOTSTRAP Set to not build ELF Tool Chain tools (addr2line, nm, size, strings and strip) as part of the bootstrap process. .Bf -symbolic An alternate bootstrap tool chain must be provided. .Ef .It Va WITHOUT_EXAMPLES Set to avoid installing examples to .Pa /usr/share/examples/ . .It Va WITH_EXPERIMENTAL Set to include experimental features in the build. .It Va WITH_EXTRA_TCP_STACKS Set to build extra TCP stack modules. .It Va WITHOUT_FDT Set to not build Flattened Device Tree support as part of the base system. This includes the device tree compiler (dtc) and libfdt support library. .It Va WITHOUT_FILE Set to not build .Xr file 1 and related programs. 
When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_SVNLITE .El .It Va WITHOUT_FINGER Set to not build or install .Xr finger 1 and .Xr fingerd 8 . .It Va WITHOUT_FLOPPY Set to not build or install programs for operating floppy disk driver. .It Va WITHOUT_FMTREE Set to not build and install .Pa /usr/sbin/fmtree . .It Va WITHOUT_FORMAT_EXTENSIONS Set to not enable .Fl fformat-extensions when compiling the kernel. Also disables all format checking. .It Va WITHOUT_FORTH Set to build bootloaders without Forth support. .It Va WITHOUT_FP_LIBC Set to build .Nm libc without floating-point support. .It Va WITHOUT_FREEBSD_UPDATE Set to not build .Xr freebsd-update 8 . .It Va WITHOUT_FTP Set to not build or install .Xr ftp 1 and .Xr ftpd 8 . .It Va WITHOUT_GAMES Set to not build games. .It Va WITHOUT_GDB Set to not build .Xr gdb 1 . .Pp This is a default setting on arm64/aarch64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_GDB Set to build .Xr gdb 1 . .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, i386/i386, mips/mips, mips/mips64, powerpc/powerpc and powerpc/powerpc64. .It Va WITHOUT_GNU_DIFF Set to not build GNU .Xr diff 1 and .Xr diff3 1 . .It Va WITHOUT_GNU_GREP Set to not build GNU .Xr grep 1 . .It Va WITH_GNU_GREP_COMPAT Set this option to include GNU extensions in .Xr bsdgrep 1 by linking against libgnuregex. .It Va WITHOUT_GOOGLETEST Set to neither build nor install .Lb libgmock , .Lb libgtest , and dependent tests. .Pp This is a default setting on mips/mips and mips/mips64. .It Va WITH_GOOGLETEST Set to build and install .Lb libgmock , .Lb libgtest , and dependent tests. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_GPIO Set to not build .Xr gpioctl 8 as part of the base system. .It Va WITHOUT_GSSAPI Set to not build libgssapi. .It Va WITHOUT_HAST Set to not build .Xr hastd 8 and related utilities. .It Va WITH_HESIOD Set to build Hesiod support. .It Va WITHOUT_HTML Set to not build HTML docs. .It Va WITHOUT_HYPERV Set to not build or install HyperV utilities. .Pp This is a default setting on arm/armv6, arm/armv7, arm64/aarch64, mips/mips, mips/mips64, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_HYPERV Set to build or install HyperV utilities. .Pp This is a default setting on amd64/amd64 and i386/i386. .It Va WITHOUT_ICONV Set to not build iconv as part of libc. .It Va WITHOUT_INCLUDES Set to not install header files. This option used to be spelled .Va NO_INCS . .Bf -symbolic The option does not work for build targets. .Ef .It Va WITHOUT_INET Set to not build programs and libraries related to IPv4 networking. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_INET_SUPPORT .El .It Va WITHOUT_INET6 Set to not build programs and libraries related to IPv6 networking. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_INET6_SUPPORT .El .It Va WITHOUT_INET6_SUPPORT Set to build libraries, programs, and kernel modules without IPv6 support. .It Va WITHOUT_INETD Set to not build .Xr inetd 8 . .It Va WITHOUT_INET_SUPPORT Set to build libraries, programs, and kernel modules without IPv4 support. .It Va WITHOUT_INSTALLLIB Set this to not install optional libraries. For example, when creating a .Xr nanobsd 8 image. .Bf -symbolic The option does not work for build targets. 
.Ef .It Va WITH_INSTALL_AS_USER Set to make install targets succeed for non-root users by installing files with owner and group attributes set to that of the user running the .Xr make 1 command. The user still must set the .Va DESTDIR variable to point to a directory where the user has write permissions. .It Va WITHOUT_IPFILTER Set to not build IP Filter package. .It Va WITHOUT_IPFW Set to not build IPFW tools. .It Va WITHOUT_IPSEC_SUPPORT Set to not build the kernel with .Xr ipsec 4 support. This option is needed for .Xr ipsec 4 and .Xr tcpmd5 4 . .It Va WITHOUT_ISCSI Set to not build .Xr iscsid 8 and related utilities. .It Va WITHOUT_JAIL Set to not build tools for the support of jails; e.g., .Xr jail 8 . .It Va WITHOUT_KDUMP Set to not build .Xr kdump 1 and .Xr truss 1 . .It Va WITHOUT_KERBEROS Set this to not build Kerberos 5 (KTH Heimdal). When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GSSAPI (unless .Va WITH_GSSAPI is set explicitly) .It Va WITHOUT_KERBEROS_SUPPORT (unless .Va WITH_KERBEROS_SUPPORT is set explicitly) .El .It Va WITHOUT_KERBEROS_SUPPORT Set to build some programs without Kerberos support, like .Xr ssh 1 , .Xr telnet 1 , .Xr sshd 8 , and .Xr telnetd 8 . .It Va WITH_KERNEL_RETPOLINE Set to enable the "retpoline" mitigation for CVE-2017-5715 in the kernel build. .It Va WITHOUT_KERNEL_SYMBOLS Set to not install kernel symbol files. .Bf -symbolic This option is recommended for those people who have small root partitions. .Ef .It Va WITHOUT_KVM Set to not build the .Nm libkvm library as a part of the base system. .Bf -symbolic The option has no effect yet. .Ef When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_KVM_SUPPORT (unless .Va WITH_KVM_SUPPORT is set explicitly) .El .It Va WITHOUT_KVM_SUPPORT Set to build some programs without optional .Nm libkvm support. .It Va WITHOUT_LDNS Setting this variable will prevent the LDNS library from being built. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_LDNS_UTILS .It .Va WITHOUT_UNBOUND .El .It Va WITHOUT_LDNS_UTILS Setting this variable will prevent building the LDNS utilities .Xr drill 1 and .Xr host 1 . .It Va WITHOUT_LEGACY_CONSOLE Set to not build programs that support a legacy PC console; e.g., .Xr kbdcontrol 1 and .Xr vidcontrol 1 . .It Va WITHOUT_LIB32 On 64-bit platforms, set to not build 32-bit library set and a .Nm ld-elf32.so.1 runtime linker. .Pp This is a default setting on arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_LIBCPLUSPLUS Set to avoid building libcxxrt and libc++. .It Va WITHOUT_LIBPTHREAD Set to not build the .Nm libpthread providing library, .Nm libthr . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_LIBTHR .El .It Va WITH_LIBSOFT On armv6 only, set to enable soft float ABI compatibility libraries. This option is for transitioning to the new hard float ABI. .It Va WITHOUT_LIBTHR Set to not build the .Nm libthr (1:1 threading) library. .It Va WITHOUT_LLD Set to not build LLVM's lld linker. .It Va WITHOUT_LLDB Set to not build the LLDB debugger. .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_LLDB Set to build the LLDB debugger. .Pp This is a default setting on amd64/amd64, arm64/aarch64 and i386/i386. 
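.Pp
Whether one of these per-architecture defaults applies to the machine at hand can usually be checked from the top of the source tree, for example:
.Dl make -C /usr/src showconfig | grep LLDB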
.It Va WITHOUT_LLD_BOOTSTRAP Set to not build the LLD linker during the bootstrap phase of the build. To be able to build the system, either Binutils or LLD bootstrap must be enabled unless an alternate linker is provided via XLD. .It Va WITHOUT_LLD_IS_LD Set to use GNU binutils ld as the system linker, instead of LLVM's LLD. .It Va WITHOUT_LLVM_ASSERTIONS Set to disable debugging assertions in LLVM. .It Va WITHOUT_LLVM_COV Set to not build the .Xr llvm-cov 1 tool. .It Va WITHOUT_LLVM_TARGET_AARCH64 Set to not build LLVM target support for AArch64. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_ALL Set to only build the required LLVM target support. This option is preferred to specific target support options. When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_LLVM_TARGET_AARCH64 (unless .Va WITH_LLVM_TARGET_AARCH64 is set explicitly) .It Va WITHOUT_LLVM_TARGET_ARM (unless .Va WITH_LLVM_TARGET_ARM is set explicitly) .It Va WITHOUT_LLVM_TARGET_MIPS (unless .Va WITH_LLVM_TARGET_MIPS is set explicitly) .It Va WITHOUT_LLVM_TARGET_POWERPC (unless .Va WITH_LLVM_TARGET_POWERPC is set explicitly) .It Va WITHOUT_LLVM_TARGET_RISCV (unless .Va WITH_LLVM_TARGET_RISCV is set explicitly) .El .It Va WITHOUT_LLVM_TARGET_ARM Set to not build LLVM target support for ARM. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITH_LLVM_TARGET_BPF Set to build LLVM target support for BPF. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_MIPS Set to not build LLVM target support for MIPS. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_POWERPC Set to not build LLVM target support for PowerPC. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_RISCV Set to not build LLVM target support for RISC-V. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_X86 Set to not build LLVM target support for X86. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITH_LOADER_EFI_SECUREBOOT Enable building .Xr loader 8 with support for verification based on certificates obtained from UEFI. .Pp .It Va WITH_LOADER_FIREWIRE Enable firewire support in /boot/loader on x86. This option is a nop on all other platforms. .It Va WITH_LOADER_FORCE_LE Set to force the powerpc boot loader to launch the kernel in little endian mode. .It Va WITHOUT_LOADER_GELI Disable inclusion of GELI crypto support in the boot chain binaries. .Pp This is a default setting on powerpc/powerpc and powerpc/powerpc64. .It Va WITH_LOADER_GELI Set to build GELI bootloader support. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, mips/mips64, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_LOADER_LUA Set to not build LUA bindings for the boot loader. .Pp This is a default setting on powerpc/powerpc and powerpc/powerpc64. .It Va WITH_LOADER_LUA Set to build LUA bindings for the boot loader. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, mips/mips64, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_LOADER_OFW Disable building of openfirmware bootloader components. 
.Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, mips/mips64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_LOADER_OFW Set to build openfirmware bootloader components. .Pp This is a default setting on powerpc/powerpc and powerpc/powerpc64. .It Va WITHOUT_LOADER_UBOOT Disable building of ubldr. .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_LOADER_UBOOT Set to build ubldr. .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc and powerpc/powerpc64. .It Va WITH_LOADER_VERBOSE Set to build with extra verbose debugging in the loader. May explode already nearly too large loader over the limit. Use with care. .It Va WITH_LOADER_VERIEXEC Enable building .Xr loader 8 with support for verification similar to Verified Exec. .Pp Depends on .Va WITH_BEARSSL . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_LOADER_EFI_SECUREBOOT (unless .Va WITHOUT_LOADER_EFI_SECUREBOOT is set explicitly) .It Va WITH_LOADER_VERIEXEC_VECTX (unless .Va WITHOUT_LOADER_VERIEXEC_VECTX is set explicitly) .El .It Va WITH_LOADER_VERIEXEC_PASS_MANIFEST Enable building .Xr loader 8 with support to pass a verified manifest to the kernel. The kernel has to be built with a module to parse the manifest. .Pp Depends on .Va WITH_LOADER_VERIEXEC . .It Va WITHOUT_LOADER_ZFS Set to not build ZFS file system boot loader support. .It Va WITHOUT_LOCALES Set to not build localization files; see .Xr locale 1 . .It Va WITHOUT_LOCATE Set to not build .Xr locate 1 and related programs. .It Va WITHOUT_LPR Set to not build .Xr lpr 1 and related programs. .It Va WITHOUT_LS_COLORS Set to build .Xr ls 1 without support for colors to distinguish file types. .It Va WITHOUT_LZMA_SUPPORT Set to build some programs without optional lzma compression support. .It Va WITHOUT_MAIL Set to not build any mail support (MUA or MTA). When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DMAGENT .It .Va WITHOUT_MAILWRAPPER .It .Va WITHOUT_SENDMAIL .El .It Va WITHOUT_MAILWRAPPER Set to not build the .Xr mailwrapper 8 MTA selector. .It Va WITHOUT_MAKE Set to not install .Xr make 1 and related support files. .It Va WITHOUT_MAKE_CHECK_USE_SANDBOX Set to not execute .Dq Li "make check" in limited sandbox mode. This option should be paired with .Va WITH_INSTALL_AS_USER if executed as an unprivileged user. See .Xr tests 7 for more details. .It Va WITHOUT_MAN Set to not build manual pages. When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_MAN_UTILS (unless .Va WITH_MAN_UTILS is set explicitly) .El .It Va WITHOUT_MANCOMPRESS Set to not to install compressed man pages. Only the uncompressed versions will be installed. .It Va WITHOUT_MAN_UTILS Set to not build utilities for manual pages, .Xr apropos 1 , .Xr makewhatis 1 , .Xr man 1 , .Xr whatis 1 , .Xr manctl 8 , and related support files. .It Va WITH_META_MODE Create .Xr make 1 meta files when building, which can provide a reliable incremental build when using .Xr filemon 4 . The meta file is created in OBJDIR as .Pa target.meta . These meta files track the command that was executed, its output, and the current directory. The .Xr filemon 4 module is required unless .Va NO_FILEMON is defined. When the module is loaded, any files used by the commands executed are tracked as dependencies for the target in its meta file. 
The target is considered out-of-date and rebuilt if any of these conditions are true compared to the last build: .Bl -bullet -compact .It The command to execute changes. .It The current working directory changes. .It The target's meta file is missing. .It The target's meta file is missing filemon data when filemon is loaded and a previous run did not have it loaded. .It [requires .Xr filemon 4 ] Files read, executed or linked to are newer than the target. .It [requires .Xr filemon 4 ] Files read, written, executed or linked are missing. .El The meta files can also be useful for debugging. .Pp The build hides commands that are executed unless .Va NO_SILENT is defined. Errors cause .Xr make 1 to show some of its environment for further debugging. .Pp The build operates as it normally would otherwise. This option originally invoked a different build system but that was renamed to .Va WITH_DIRDEPS_BUILD . .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_MLX5TOOL Set to not build .Xr mlx5tool 8 .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_MLX5TOOL Set to build .Xr mlx5tool 8 .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386 and powerpc/powerpc64. .It Va WITHOUT_NDIS Set to not build programs and libraries related to NDIS emulation support. .It Va WITHOUT_NETCAT Set to not build .Xr nc 1 utility. .It Va WITHOUT_NETGRAPH Set to not build applications to support .Xr netgraph 4 . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_ATM .It .Va WITHOUT_BLUETOOTH .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_NETGRAPH_SUPPORT (unless .Va WITH_NETGRAPH_SUPPORT is set explicitly) .El .It Va WITHOUT_NETGRAPH_SUPPORT Set to build libraries, programs, and kernel modules without netgraph support. .It Va WITHOUT_NIS Set to not build .Xr NIS 8 support and related programs. If set, you might need to adopt your .Xr nsswitch.conf 5 and remove .Sq nis entries. .It Va WITHOUT_NLS Set to not build NLS catalogs. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_NLS_CATALOGS .El .It Va WITHOUT_NLS_CATALOGS Set to not build NLS catalog support for .Xr csh 1 . .It Va WITHOUT_NS_CACHING Set to disable name caching in the .Pa nsswitch subsystem. The generic caching daemon, .Xr nscd 8 , will not be built either if this option is set. .It Va WITHOUT_NTP Set to not build .Xr ntpd 8 and related programs. .It Va WITHOUT_NVME Set to not build nvme related tools and kernel modules. .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_NVME Set to build nvme related tools and kernel modules. .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386 and powerpc/powerpc64. .It Va WITH_OFED Set to build the .Dq "OpenFabrics Enterprise Distribution" Infiniband software stack. .It Va WITH_OFED_EXTRA Set to build the non-essential components of the .Dq "OpenFabrics Enterprise Distribution" Infiniband software stack, mostly examples. .It Va WITH_OPENLDAP Enable building openldap support for kerberos. .It Va WITHOUT_OPENMP Set to not build LLVM's OpenMP runtime. .Pp This is a default setting on arm/armv6, arm/armv7, arm64/aarch64, mips/mips, mips/mips64, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. 
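.Pp
For example, on a platform where the OpenMP runtime is built by default, adding
.Dq Li WITHOUT_OPENMP=yes
to
.Pa /etc/src.conf
skips it on the next
.Dq Li "make buildworld" .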
.It Va WITH_OPENMP Set to build LLVM's OpenMP runtime. .Pp This is a default setting on amd64/amd64, i386/i386 and powerpc/powerpc64. .It Va WITHOUT_OPENSSH Set to not build OpenSSH. .It Va WITHOUT_OPENSSL Set to not build OpenSSL. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DMAGENT .It .Va WITHOUT_KERBEROS .It .Va WITHOUT_KERBEROS_SUPPORT .It .Va WITHOUT_LDNS .It .Va WITHOUT_LDNS_UTILS .It .Va WITHOUT_OPENSSH .It .Va WITHOUT_PKGBOOTSTRAP .It .Va WITHOUT_SVN .It .Va WITHOUT_SVNLITE .It .Va WITHOUT_UNBOUND .It .Va WITHOUT_WIRELESS .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GSSAPI (unless .Va WITH_GSSAPI is set explicitly) .El .It Va WITHOUT_PAM Set to not build PAM library and modules. .Bf -symbolic This option is deprecated and does nothing. .Ef When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_PAM_SUPPORT (unless .Va WITH_PAM_SUPPORT is set explicitly) .El .It Va WITHOUT_PAM_SUPPORT Set to build some programs without PAM support, particularly .Xr ftpd 8 and .Xr ppp 8 . .It Va WITHOUT_PF Set to not build PF firewall package. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_AUTHPF .El .It Va WITH_PIE Build dynamically linked binaries as Position-Independent Executable (PIE). .It Va WITHOUT_PKGBOOTSTRAP Set to not build .Xr pkg 7 bootstrap tool. .It Va WITHOUT_PMC Set to not build .Xr pmccontrol 8 and related programs. .It Va WITHOUT_PORTSNAP Set to not build or install .Xr portsnap 8 and related files. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_FREEBSD_UPDATE .El .It Va WITHOUT_PPP Set to not build .Xr ppp 8 and related programs. .It Va WITHOUT_PROFILE Set to not build profiled libraries for use with .Xr gprof 8 . .Pp This is a default setting on mips/mips64. .It Va WITH_PROFILE Set to build profiled libraries for use with .Xr gprof 8 . .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_QUOTAS Set to not build .Xr quota 1 and related programs. .It Va WITHOUT_RADIUS_SUPPORT Set to not build radius support into various applications, like .Xr pam_radius 8 and .Xr ppp 8 . .It Va WITH_RATELIMIT Set to build the system with rate limit support. .Pp This makes .Dv SO_MAX_PACING_RATE effective in .Xr getsockopt 2 , and .Ar txrlimit support in .Xr ifconfig 8 , by proxy. .It Va WITHOUT_RBOOTD Set to not build or install .Xr rbootd 8 . .It Va WITH_REPRODUCIBLE_BUILD Set to exclude build metadata (such as the build time, user, or host) from the kernel, boot loaders, and uname output, so that builds produce bit-for-bit identical output. .It Va WITHOUT_RESCUE Set to not build .Xr rescue 8 . .It Va WITH_RETPOLINE Set to build the base system with the retpoline speculative execution vulnerability mitigation for CVE-2017-5715. .It Va WITHOUT_ROUTED Set to not build .Xr routed 8 utility. .It Va WITH_RPCBIND_WARMSTART_SUPPORT Set to build .Xr rpcbind 8 with warmstart support. .It Va WITHOUT_SENDMAIL Set to not build .Xr sendmail 8 and related programs. .It Va WITHOUT_SERVICESDB Set to not install .Pa /var/db/services.db . .It Va WITHOUT_SETUID_LOGIN Set this to disable the installation of .Xr login 1 as a set-user-ID root program. .It Va WITHOUT_SHAREDOCS Set to not build the .Bx 4.4 legacy docs. .It Va WITHOUT_SHARED_TOOLCHAIN Set to build the toolchain binaries as statically linked executables. 
The set includes .Xr cc 1 , .Xr make 1 and necessary utilities like assembler, linker and library archive manager. .It Va WITH_SORT_THREADS Set to enable threads in .Xr sort 1 . .It Va WITHOUT_SOURCELESS Set to not build kernel modules that include sourceless code (either microcode or native code for host CPU). When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_SOURCELESS_HOST .It .Va WITHOUT_SOURCELESS_UCODE .El .It Va WITHOUT_SOURCELESS_HOST Set to not build kernel modules that include sourceless native code for host CPU. .It Va WITHOUT_SOURCELESS_UCODE Set to not build kernel modules that include sourceless microcode. .It Va WITHOUT_SSP Set to not build world with propolice stack smashing protection. .Pp This is a default setting on mips/mips and mips/mips64. .It Va WITH_SSP Set to build world with propolice stack smashing protection. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_STAGING Enable staging of files to a stage tree. This can be best thought of as auto-install to .Va DESTDIR with some extra meta data to ensure dependencies can be tracked. Depends on .Va WITH_DIRDEPS_BUILD . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_STAGING_MAN (unless .Va WITHOUT_STAGING_MAN is set explicitly) .It Va WITH_STAGING_PROG (unless .Va WITHOUT_STAGING_PROG is set explicitly) .El .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITH_STAGING_MAN Enable staging of man pages to stage tree. .It Va WITH_STAGING_PROG Enable staging of PROGs to stage tree. .It Va WITH_STALE_STAGED Check staged files are not stale. .It Va WITHOUT_STATS Set to neither build nor install .Lb libstats and dependent binaries. .It Va WITH_SVN Set to install .Xr svnlite 1 as .Xr svn 1 . .It Va WITHOUT_SVNLITE Set to not build .Xr svnlite 1 and related programs. .It Va WITHOUT_SYSCONS Set to not build .Xr syscons 4 support files such as keyboard maps, fonts, and screen output maps. .It Va WITH_SYSROOT Enable use of sysroot during build. Depends on .Va WITH_DIRDEPS_BUILD . .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_SYSTEM_COMPILER Set to not opportunistically skip building a cross-compiler during the bootstrap phase of the build. Normally, if the currently installed compiler matches the planned bootstrap compiler type and revision, then it will not be built. This does not prevent a compiler from being built for installation though, only for building one for the build itself. The .Va WITHOUT_CLANG option controls that. .It Va WITHOUT_SYSTEM_LINKER Set to not opportunistically skip building a cross-linker during the bootstrap phase of the build. Normally, if the currently installed linker matches the planned bootstrap linker type and revision, then it will not be built. This does not prevent a linker from being built for installation though, only for building one for the build itself. The .Va WITHOUT_LLD and .Va WITHOUT_BINUTILS options control those. .Pp This option is only relevant when .Va WITH_LLD_BOOTSTRAP is set. .It Va WITHOUT_TALK Set to not build or install .Xr talk 1 and .Xr talkd 8 . .It Va WITHOUT_TCP_WRAPPERS Set to not build or install .Xr tcpd 8 , and related utilities. .It Va WITHOUT_TCSH Set to not build and install .Pa /bin/csh (which is .Xr tcsh 1 ) . 
.It Va WITHOUT_TELNET Set to not build .Xr telnet 1 and related programs. .It Va WITHOUT_TESTS Set to not build or install the .Fx Test Suite in .Pa /usr/tests/ . See .Xr tests 7 for more details. This also disables the build of all test-related dependencies, including ATF. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DTRACE_TESTS .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GOOGLETEST (unless .Va WITH_GOOGLETEST is set explicitly) .It Va WITHOUT_TESTS_SUPPORT (unless .Va WITH_TESTS_SUPPORT is set explicitly) .El .It Va WITHOUT_TESTS_SUPPORT Set to disable the build of all test-related dependencies, including ATF. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_GOOGLETEST .El .It Va WITHOUT_TEXTPROC Set to not build programs used for text processing. .It Va WITHOUT_TFTP Set to not build or install .Xr tftp 1 and .Xr tftpd 8 . .It Va WITHOUT_TOOLCHAIN Set to not install headers or programs used for program development, such as compilers and debuggers. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_BINUTILS .It .Va WITHOUT_CLANG .It .Va WITHOUT_CLANG_EXTRAS .It .Va WITHOUT_CLANG_FULL .It .Va WITHOUT_GDB .It .Va WITHOUT_INCLUDES .It .Va WITHOUT_LLD .It .Va WITHOUT_LLDB .It .Va WITHOUT_LLVM_COV .El .It Va WITHOUT_UNBOUND Set to not build .Xr unbound 8 and related programs. .It Va WITHOUT_UNIFIED_OBJDIR Set to use the historical object directory format for .Xr build 7 targets. For native builds and builds done directly in sub-directories the format .Pa ${MAKEOBJDIRPREFIX}/${.CURDIR} is used, while for cross-builds .Pa ${MAKEOBJDIRPREFIX}/${TARGET}.${TARGET_ARCH}/${.CURDIR} is used. .Pp This option is transitional and will be removed before the 12.0 release, at which time .Va WITH_UNIFIED_OBJDIR will be enabled permanently. .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_USB Set to not build USB-related programs and libraries. .It Va WITHOUT_USB_GADGET_EXAMPLES Set to not build USB gadget kernel modules. .It Va WITHOUT_UTMPX Set to not build user accounting tools such as .Xr last 1 , .Xr users 1 , .Xr who 1 , .Xr ac 8 , .Xr lastlogin 8 and .Xr utx 8 . .It Va WITH_VERIEXEC Enable building .Xr veriexec 8 , which loads the contents of verified manifests into the kernel for use by .Xr mac_veriexec 4 . .Pp Depends on .Va WITH_BEARSSL . .It Va WITHOUT_VI Set to not build and install vi, view, ex and related programs. .It Va WITHOUT_VT Set to not build .Xr vt 4 support files (fonts and keymaps). .It Va WITHOUT_WARNS Set this to not add warning flags to the compiler invocations. Useful as a temporary workaround when code enters the tree which triggers warnings in environments that differ from the original developer's. .It Va WITHOUT_WIRELESS Set to not build programs used for 802.11 wireless networks, especially .Xr wpa_supplicant 8 and .Xr hostapd 8 . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_WIRELESS_SUPPORT (unless .Va WITH_WIRELESS_SUPPORT is set explicitly) .El .It Va WITHOUT_WIRELESS_SUPPORT Set to build libraries, programs, and kernel modules without 802.11 wireless support. .It Va WITHOUT_WPA_SUPPLICANT_EAPOL Build .Xr wpa_supplicant 8 without support for the IEEE 802.1X protocol and without support for EAP-PEAP, EAP-TLS, EAP-LEAP, and EAP-TTLS protocols (usable only via 802.1X).
.It Va WITHOUT_ZFS Set to not build ZFS file system kernel module, libraries, and user commands. .It Va WITHOUT_ZONEINFO Set to not build the timezone database. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_ZONEINFO_LEAPSECONDS_SUPPORT .El .It Va WITH_ZONEINFO_LEAPSECONDS_SUPPORT Set to build leapsecond information in to the timezone database. .El .Sh FILES .Bl -tag -compact -width Pa .It Pa /etc/src.conf .It Pa /etc/src-env.conf .It Pa /usr/share/mk/bsd.own.mk .El .Sh SEE ALSO .Xr make 1 , .Xr make.conf 5 , .Xr build 7 , .Xr ports 7 .Sh HISTORY The .Nm file appeared in .Fx 7.0 . .Sh AUTHORS This manual page was autogenerated by .An tools/build/options/makeman . diff --git a/share/mk/src.opts.mk b/share/mk/src.opts.mk index fc03abf50c61..5b2c76452a27 100644 --- a/share/mk/src.opts.mk +++ b/share/mk/src.opts.mk @@ -1,519 +1,520 @@ # $FreeBSD$ # # Option file for FreeBSD /usr/src builds. # # Users define WITH_FOO and WITHOUT_FOO on the command line or in /etc/src.conf # and /etc/make.conf files. These translate in the build system to MK_FOO={yes,no} # with sensible (usually) defaults. # # Makefiles must include bsd.opts.mk after defining specific MK_FOO options that # are applicable for that Makefile (typically there are none, but sometimes there # are exceptions). Recursive makes usually add MK_FOO=no for options that they wish # to omit from that make. # # Makefiles must include bsd.mkopt.mk before they test the value of any MK_FOO # variable. # # Makefiles may also assume that this file is included by src.opts.mk should it # need variables defined there prior to the end of the Makefile where # bsd.{subdir,lib.bin}.mk is traditionally included. # # The old-style YES_FOO and NO_FOO are being phased out. No new instances of them # should be added. Old instances should be removed since they were just to # bridge the gap between FreeBSD 4 and FreeBSD 5. # # Makefiles should never test WITH_FOO or WITHOUT_FOO directly (although an # exception is made for _WITHOUT_SRCONF which turns off this mechanism # completely inside bsd.*.mk files). # .if !target(____) ____: .include # # Define MK_* variables (which are either "yes" or "no") for users # to set via WITH_*/WITHOUT_* in /etc/src.conf and override in the # make(1) environment. # These should be tested with `== "no"' or `!= "no"' in makefiles. # The NO_* variables should only be set by makefiles for variables # that haven't been converted over. # # These options are used by the src builds. Those listed in # __DEFAULT_YES_OPTIONS default to 'yes' and will build unless turned # off. __DEFAULT_NO_OPTIONS will default to 'no' and won't build # unless turned on. Any options listed in 'BROKEN_OPTIONS' will be # hard-wired to 'no'. "Broken" here means not working or # not-appropriate and/or not supported. It doesn't imply something is # wrong with the code. There's not a single good word for this, so # BROKEN was selected as the least imperfect one considered at the # time. Options are added to BROKEN_OPTIONS list on a per-arch basis. # At this time, there's no provision for mutually incompatible options. 
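#
# As an illustration of the mechanism described above (a hypothetical
# consumer, not part of this file): a Makefile never tests WITH_FOO or
# WITHOUT_FOO itself; it includes this file and branches on the derived
# MK_FOO value, along these lines:
#
#	.include <src.opts.mk>
#	.if ${MK_BHYVE_SNAPSHOT} != "no"
#	CFLAGS+=	-DBHYVE_SNAPSHOT
#	.endif
#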
__DEFAULT_YES_OPTIONS = \ ACCT \ ACPI \ APM \ AT \ ATM \ AUDIT \ AUTHPF \ AUTOFS \ BHYVE \ BINUTILS \ BLACKLIST \ BLUETOOTH \ BOOT \ BOOTPARAMD \ BOOTPD \ BSD_CPIO \ BSDINSTALL \ BSNMP \ BZIP2 \ CALENDAR \ CAPSICUM \ CAROOT \ CASPER \ CCD \ CDDL \ CLANG \ CLANG_BOOTSTRAP \ CLANG_IS_CC \ CPP \ CROSS_COMPILER \ CRYPT \ CUSE \ CXX \ CXGBETOOL \ DIALOG \ DICT \ DMAGENT \ DYNAMICROOT \ EE \ EFI \ ELFTOOLCHAIN_BOOTSTRAP \ EXAMPLES \ FDT \ FILE \ FINGER \ FLOPPY \ FMTREE \ FORTH \ FP_LIBC \ FREEBSD_UPDATE \ FTP \ GAMES \ GDB \ GNU_DIFF \ GNU_GREP \ GOOGLETEST \ GPIO \ HAST \ HTML \ HYPERV \ ICONV \ INET \ INET6 \ INETD \ IPFILTER \ IPFW \ ISCSI \ JAIL \ KDUMP \ KVM \ LDNS \ LDNS_UTILS \ LEGACY_CONSOLE \ LIBCPLUSPLUS \ LIBPTHREAD \ LIBTHR \ LLD \ LLD_BOOTSTRAP \ LLD_IS_LD \ LLVM_ASSERTIONS \ LLVM_COV \ LLVM_TARGET_ALL \ LOADER_GELI \ LOADER_LUA \ LOADER_OFW \ LOADER_UBOOT \ LOCALES \ LOCATE \ LPR \ LS_COLORS \ LZMA_SUPPORT \ MAIL \ MAILWRAPPER \ MAKE \ MLX5TOOL \ NDIS \ NETCAT \ NETGRAPH \ NLS_CATALOGS \ NS_CACHING \ NTP \ NVME \ OFED \ OPENSSL \ PAM \ PF \ PKGBOOTSTRAP \ PMC \ PORTSNAP \ PPP \ QUOTAS \ RADIUS_SUPPORT \ RBOOTD \ RESCUE \ ROUTED \ SENDMAIL \ SERVICESDB \ SETUID_LOGIN \ SHARED_TOOLCHAIN \ SHAREDOCS \ SOURCELESS \ SOURCELESS_HOST \ SOURCELESS_UCODE \ STATS \ SVNLITE \ SYSCONS \ SYSTEM_COMPILER \ SYSTEM_LINKER \ TALK \ TCP_WRAPPERS \ TCSH \ TELNET \ TEXTPROC \ TFTP \ UNBOUND \ USB \ UTMPX \ VI \ VT \ WIRELESS \ WPA_SUPPLICANT_EAPOL \ ZFS \ LOADER_ZFS \ ZONEINFO __DEFAULT_NO_OPTIONS = \ BEARSSL \ + BHYVE_SNAPSHOT \ BSD_GREP \ CLANG_EXTRAS \ DTRACE_TESTS \ EXPERIMENTAL \ GNU_GREP_COMPAT \ HESIOD \ LIBSOFT \ LOADER_FIREWIRE \ LOADER_FORCE_LE \ LOADER_VERBOSE \ LOADER_VERIEXEC_PASS_MANIFEST \ OFED_EXTRA \ OPENLDAP \ REPRODUCIBLE_BUILD \ RPCBIND_WARMSTART_SUPPORT \ SORT_THREADS \ SVN \ ZONEINFO_LEAPSECONDS_SUPPORT \ # LEFT/RIGHT. Left options which default to "yes" unless their corresponding # RIGHT option is disabled. __DEFAULT_DEPENDENT_OPTIONS= \ CLANG_FULL/CLANG \ LOADER_VERIEXEC/BEARSSL \ LOADER_EFI_SECUREBOOT/LOADER_VERIEXEC \ LOADER_VERIEXEC_VECTX/LOADER_VERIEXEC \ VERIEXEC/BEARSSL \ # MK_*_SUPPORT options which default to "yes" unless their corresponding # MK_* variable is set to "no". # .for var in \ BLACKLIST \ BZIP2 \ INET \ INET6 \ KERBEROS \ KVM \ NETGRAPH \ PAM \ TESTS \ WIRELESS __DEFAULT_DEPENDENT_OPTIONS+= ${var}_SUPPORT/${var} .endfor # # Default behaviour of some options depends on the architecture. Unfortunately # this means that we have to test TARGET_ARCH (the buildworld case) as well # as MACHINE_ARCH (the non-buildworld case). Normally TARGET_ARCH is not # used at all in bsd.*.mk, but we have to make an exception here if we want # to allow defaults for some things like clang to vary by target architecture. # Additional, per-target behavior should be rarely added only after much # gnashing of teeth and grinding of gears. # .if defined(TARGET_ARCH) __T=${TARGET_ARCH} .else __T=${MACHINE_ARCH} .endif # All supported backends for LLVM_TARGET_XXX __LLVM_TARGETS= \ aarch64 \ arm \ mips \ powerpc \ riscv \ x86 __LLVM_TARGET_FILT= C/(amd64|i386)/x86/:C/powerpc.*/powerpc/:C/armv[67]/arm/:C/riscv.*/riscv/:C/mips.*/mips/ .for __llt in ${__LLVM_TARGETS} # Default enable the given TARGET's LLVM_TARGET support .if ${__T:${__LLVM_TARGET_FILT}} == ${__llt} __DEFAULT_YES_OPTIONS+= LLVM_TARGET_${__llt:${__LLVM_TARGET_FILT}:tu} # aarch64 needs arm for -m32 support. 
.elif ${__T} == "aarch64" && ${__llt:Marm*} != "" __DEFAULT_DEPENDENT_OPTIONS+= LLVM_TARGET_ARM/LLVM_TARGET_AARCH64 # Default the rest of the LLVM_TARGETs to the value of MK_LLVM_TARGET_ALL. .else __DEFAULT_DEPENDENT_OPTIONS+= LLVM_TARGET_${__llt:${__LLVM_TARGET_FILT}:tu}/LLVM_TARGET_ALL .endif .endfor __DEFAULT_NO_OPTIONS+=LLVM_TARGET_BPF .include # In-tree binutils/gcc are older versions without modern architecture support. .if ${__T} == "aarch64" || ${__T:Mriscv*} != "" BROKEN_OPTIONS+=BINUTILS BINUTILS_BOOTSTRAP GDB .endif .if ${__T} == "amd64" || ${__T} == "i386" __DEFAULT_YES_OPTIONS+=BINUTILS_BOOTSTRAP .else __DEFAULT_NO_OPTIONS+=BINUTILS_BOOTSTRAP .endif .if ${__T:Mriscv*} != "" BROKEN_OPTIONS+=OFED .endif .if ${__T} == "aarch64" || ${__T} == "amd64" || ${__T} == "i386" __DEFAULT_YES_OPTIONS+=LLDB .else __DEFAULT_NO_OPTIONS+=LLDB .endif # LIB32 is supported on amd64, mips64, and powerpc64 .if (${__T} == "amd64" || ${__T:Mmips64*} || ${__T} == "powerpc64") __DEFAULT_YES_OPTIONS+=LIB32 .else BROKEN_OPTIONS+=LIB32 .endif # Only doing soft float API stuff on armv6 and armv7 .if ${__T} != "armv6" && ${__T} != "armv7" BROKEN_OPTIONS+=LIBSOFT .endif .if ${__T:Mmips*} # GOOGLETEST cannot currently be compiled on mips due to external circumstances. # Notably, the freebsd-gcc port isn't linking in libgcc so we end up trying ot # link to a hidden symbol. LLVM would successfully link this in, but some of # the mips variants are broken under LLVM until LLVM 10. GOOGLETEST should be # marked no longer broken with the switch to LLVM. BROKEN_OPTIONS+=GOOGLETEST SSP .endif # EFI doesn't exist on mips, powerpc, or riscv. .if ${__T:Mmips*} || ${__T:Mpowerpc*} || ${__T:Mriscv*} BROKEN_OPTIONS+=EFI .endif # OFW is only for powerpc, exclude others .if ${__T:Mpowerpc*} == "" BROKEN_OPTIONS+=LOADER_OFW .endif # UBOOT is only for arm, mips and powerpc, exclude others .if ${__T:Marm*} == "" && ${__T:Mmips*} == "" && ${__T:Mpowerpc*} == "" BROKEN_OPTIONS+=LOADER_UBOOT .endif # GELI and Lua in loader currently cause boot failures on powerpc. # Further debugging is required -- probably they are just broken on big # endian systems generically (they jump to null pointers or try to read # crazy high addresses, which is typical of endianness problems). .if ${__T:Mpowerpc*} BROKEN_OPTIONS+=LOADER_GELI LOADER_LUA .endif .if ${__T:Mmips64*} # profiling won't work on MIPS64 because there is only assembly for o32 BROKEN_OPTIONS+=PROFILE .endif .if ${__T} != "aarch64" && ${__T} != "amd64" && ${__T} != "i386" && \ ${__T} != "powerpc64" BROKEN_OPTIONS+=CXGBETOOL BROKEN_OPTIONS+=MLX5TOOL .endif # HyperV is currently x86-only .if ${__T} != "amd64" && ${__T} != "i386" BROKEN_OPTIONS+=HYPERV .endif # NVME is only aarch64, x86 and powerpc64 .if ${__T} != "aarch64" && ${__T} != "amd64" && ${__T} != "i386" && \ ${__T} != "powerpc64" BROKEN_OPTIONS+=NVME .endif .if ${__T} == "amd64" || ${__T} == "i386" || ${__T} == "powerpc64" __DEFAULT_YES_OPTIONS+=OPENMP .else __DEFAULT_NO_OPTIONS+=OPENMP .endif .include # # Force some options off if their dependencies are off. # Order is somewhat important. 
# .if ${MK_CAPSICUM} == "no" MK_CASPER:= no .endif .if ${MK_LIBPTHREAD} == "no" MK_LIBTHR:= no .endif .if ${MK_SOURCELESS} == "no" MK_SOURCELESS_HOST:= no MK_SOURCELESS_UCODE:= no .endif .if ${MK_CDDL} == "no" MK_ZFS:= no MK_LOADER_ZFS:= no MK_CTF:= no .endif .if ${MK_CRYPT} == "no" MK_OPENSSL:= no MK_OPENSSH:= no MK_KERBEROS:= no MK_KERBEROS_SUPPORT:= no .endif .if ${MK_CXX} == "no" MK_CLANG:= no MK_GOOGLETEST:= no MK_TESTS:= no .endif .if ${MK_DIALOG} == "no" MK_BSDINSTALL:= no .endif .if ${MK_FILE} == "no" MK_SVNLITE:= no .endif .if ${MK_MAIL} == "no" MK_MAILWRAPPER:= no MK_SENDMAIL:= no MK_DMAGENT:= no .endif .if ${MK_NETGRAPH} == "no" MK_ATM:= no MK_BLUETOOTH:= no .endif .if ${MK_NLS} == "no" MK_NLS_CATALOGS:= no .endif .if ${MK_OPENSSL} == "no" MK_DMAGENT:= no MK_OPENSSH:= no MK_KERBEROS:= no MK_KERBEROS_SUPPORT:= no MK_LDNS:= no MK_PKGBOOTSTRAP:= no MK_SVN:= no MK_SVNLITE:= no MK_WIRELESS:= no .endif .if ${MK_LDNS} == "no" MK_LDNS_UTILS:= no MK_UNBOUND:= no .endif .if ${MK_PF} == "no" MK_AUTHPF:= no .endif .if ${MK_OFED} == "no" MK_OFED_EXTRA:= no .endif .if ${MK_PORTSNAP} == "no" # freebsd-update depends on phttpget from portsnap MK_FREEBSD_UPDATE:= no .endif .if ${MK_TESTS} == "no" MK_DTRACE_TESTS:= no .endif .if ${MK_TESTS_SUPPORT} == "no" MK_GOOGLETEST:= no .endif .if ${MK_ZONEINFO} == "no" MK_ZONEINFO_LEAPSECONDS_SUPPORT:= no .endif .if ${MK_CROSS_COMPILER} == "no" MK_BINUTILS_BOOTSTRAP:= no MK_CLANG_BOOTSTRAP:= no MK_ELFTOOLCHAIN_BOOTSTRAP:= no MK_LLD_BOOTSTRAP:= no .endif .if ${MK_TOOLCHAIN} == "no" MK_BINUTILS:= no MK_CLANG:= no MK_GDB:= no MK_INCLUDES:= no MK_LLD:= no MK_LLDB:= no .endif .if ${MK_CLANG} == "no" MK_CLANG_EXTRAS:= no MK_CLANG_FULL:= no MK_LLVM_COV:= no .endif .if ${MK_LOADER_VERIEXEC} == "no" MK_LOADER_VERIEXEC_PASS_MANIFEST := no .endif # # MK_* options whose default value depends on another option. # .for vv in \ GSSAPI/KERBEROS \ MAN_UTILS/MAN .if defined(WITH_${vv:H}) MK_${vv:H}:= yes .elif defined(WITHOUT_${vv:H}) MK_${vv:H}:= no .else MK_${vv:H}:= ${MK_${vv:T}} .endif .endfor # # Set defaults for the MK_*_SUPPORT variables. # .endif # !target(____) diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index a08c90ed20be..70909510c983 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -1,746 +1,770 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include #include +struct vm_snapshot_meta; + #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. * A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. 
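 *
 * For example (assuming SPECNAMELEN is 255, as on FreeBSD 13):
 * 255 - VM_MAX_PREFIXLEN (10) - VM_MAX_SUFFIXLEN (15) - 1 == 229.
 * With the older SPECNAMELEN of 63 the same formula yields 37.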
*/ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MIN_NAMELEN 6 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vm; struct vm_exception; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; +enum snapshot_req; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +typedef int (*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta); +typedef int (*vmi_snapshot_vmcx_t)(void *vmi, struct vm_snapshot_meta *meta, + int vcpu); +typedef int (*vmi_restore_tsc_t)(void *vmi, int vcpuid, uint64_t now); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; + + /* checkpoint operations */ + vmi_snapshot_t vmsnapshot; + vmi_snapshot_vmcx_t vmcx_snapshot; + vmi_restore_tsc_t vm_restore_tsc; }; extern struct vmm_ops vmm_ops_intel; extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); /* * APIs that modify the guest memory map require all vcpus to be frozen. 
*/ int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); /* * APIs that inspect the guest memory map require only a *single* vcpu to * be frozen. This acts like a read lock on the guest memory map since any * modification requires *all* vcpus to be frozen. */ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object **objptr); vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); int vm_inject_extint(struct vm *vm, int vcpu); int vm_extint_pending(struct vm *vm, int vcpuid); void vm_extint_clear(struct vm *vm, int vcpuid); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); int vm_suspend_cpu(struct vm *vm, int vcpu); int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); +int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); +int vm_restore_time(struct vm *vm); + #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * If the rendezvous is being initiated from a vcpu context then the * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. 
The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); int vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vm *vm, int vcpuid); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. */ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vm *vm, int vcpu) { if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) return (1); else if (curthread->td_owepreempt) return (1); else return (0); } #endif void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. 
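 *
 * (Illustrative usage, not a prescription: a VM-entry path typically calls
 * this function and, when it returns non-zero, programs the hardware's
 * event-injection field with the returned 'info' before resuming the guest.)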
*/ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); +/* + * Function used to keep track of the guest's TSC offset. The + * offset is used by the virutalization extensions to provide a consistent + * value for the Time Stamp Counter to the guest. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_set_tsc_offset(struct vm *vm, int vcpu_id, uint64_t offset); + enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vm *vm, int vcpuid); #endif /* KERNEL */ #define VM_MAXCPU 16 /* maximum virtual cpus */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. 
MOV) */ uint16_t op_flags; }; _Static_assert(sizeof(struct vie_op) == 4, "ABI"); _Static_assert(_Alignof(struct vie_op) == 2, "ABI"); #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ vex_present:1, /* VEX prefixed */ vex_l:1, /* L bit */ index:4, /* SIB byte */ base:4; /* SIB byte */ uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; uint8_t vex_reg:4, /* vvvv: first source register specifier */ vex_pp:2, /* pp */ _sparebits:2; uint8_t _sparebytes[2]; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ uint8_t _sparebyte; struct vie_op op; /* opcode description */ }; _Static_assert(sizeof(struct vie) == 64, "ABI"); _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI"); _Static_assert(__offsetof(struct vie, scale) == 24, "ABI"); _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI"); enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. 
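As a hedged illustration of how a consumer walks 'struct vm_exit' (userspace can include this header as <machine/vmm.h>), the fragment below only peeks at an I/O port exit and ignores everything else; real dispatch in bhyve(8) is considerably richer.

#include <stdio.h>

static void
example_print_exit(const struct vm_exit *vme)
{
	switch (vme->exitcode) {
	case VM_EXITCODE_INOUT:
		/* 'bytes' is 1, 2 or 4; 'eax' carries the value for an OUT. */
		printf("%s port %#x, %d bytes at rip %#lx\n",
		    vme->u.inout.in ? "in" : "out", vme->u.inout.port,
		    vme->u.inout.bytes, vme->rip);
		break;
	case VM_EXITCODE_BOGUS:
		break;		/* spurious exit: simply resume the vcpu */
	default:
		break;
	}
}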
*/ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. */ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { int inst_length; } bpt; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); } static __inline void vm_inject_gp(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); } static __inline void vm_inject_ac(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); } void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); int vm_restart_instruction(void *vm, int vcpuid); #endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index bd806e7678f4..21775b70718e 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -1,425 +1,436 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ +struct vm_snapshot_meta; + #ifdef _KERNEL void vmmdev_init(void); int vmmdev_cleanup(void); #endif struct vm_memmap { vm_paddr_t gpa; int segid; /* memory segment */ vm_ooffset_t segoff; /* offset into memory segment */ size_t len; /* mmap length */ int prot; /* RWX */ int flags; }; #define VM_MEMMAP_F_WIRED 0x01 #define VM_MEMMAP_F_IOMMU 0x02 #define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? 
(m)->name : NULL) struct vm_memseg { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; }; struct vm_register { int cpuid; int regnum; /* enum vm_reg_name */ uint64_t regval; }; struct vm_seg_desc { /* data or code segment */ int cpuid; int regnum; /* enum vm_reg_name */ struct seg_desc desc; }; struct vm_register_set { int cpuid; unsigned int count; const int *regnums; /* enum vm_reg_name */ uint64_t *regvals; }; struct vm_run { int cpuid; struct vm_exit vm_exit; }; struct vm_exception { int cpuid; int vector; uint32_t error_code; int error_code_valid; int restart_instruction; }; struct vm_lapic_msi { uint64_t msg; uint64_t addr; }; struct vm_lapic_irq { int cpuid; int vector; }; struct vm_ioapic_irq { int irq; }; struct vm_isa_irq { int atpic_irq; int ioapic_irq; }; struct vm_isa_irq_trigger { int atpic_irq; enum vm_intr_trigger trigger; }; struct vm_capability { int cpuid; enum vm_cap_type captype; int capval; int allcpus; }; struct vm_pptdev { int bus; int slot; int func; }; struct vm_pptdev_mmio { int bus; int slot; int func; vm_paddr_t gpa; vm_paddr_t hpa; size_t len; }; struct vm_pptdev_msi { int vcpu; int bus; int slot; int func; int numvec; /* 0 means disabled */ uint64_t msg; uint64_t addr; }; struct vm_pptdev_msix { int vcpu; int bus; int slot; int func; int idx; uint64_t msg; uint32_t vector_control; uint64_t addr; }; struct vm_nmi { int cpuid; }; #define MAX_VM_STATS 64 struct vm_stats { int cpuid; /* in */ int num_entries; /* out */ struct timeval tv; uint64_t statbuf[MAX_VM_STATS]; }; struct vm_stat_desc { int index; /* in */ char desc[128]; /* out */ }; struct vm_x2apic { int cpuid; enum x2apic_state state; }; struct vm_gpa_pte { uint64_t gpa; /* in */ uint64_t pte[4]; /* out */ int ptenum; }; struct vm_hpet_cap { uint32_t capabilities; /* lower 32 bits of HPET capabilities */ }; struct vm_suspend { enum vm_suspend_how how; }; struct vm_gla2gpa { int vcpuid; /* inputs */ int prot; /* PROT_READ or PROT_WRITE */ uint64_t gla; struct vm_guest_paging paging; int fault; /* outputs */ uint64_t gpa; }; struct vm_activate_cpu { int vcpuid; }; struct vm_cpuset { int which; int cpusetsize; cpuset_t *cpus; }; #define VM_ACTIVE_CPUS 0 #define VM_SUSPENDED_CPUS 1 #define VM_DEBUG_CPUS 2 struct vm_intinfo { int vcpuid; uint64_t info1; uint64_t info2; }; struct vm_rtc_time { time_t secs; }; struct vm_rtc_data { int offset; uint8_t value; }; struct vm_cpu_topology { uint16_t sockets; uint16_t cores; uint16_t threads; uint16_t maxcpus; }; enum { /* general routines */ IOCNUM_ABIVERS = 0, IOCNUM_RUN = 1, IOCNUM_SET_CAPABILITY = 2, IOCNUM_GET_CAPABILITY = 3, IOCNUM_SUSPEND = 4, IOCNUM_REINIT = 5, /* memory apis */ IOCNUM_MAP_MEMORY = 10, /* deprecated */ IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ IOCNUM_GET_GPA_PMAP = 12, IOCNUM_GLA2GPA = 13, IOCNUM_ALLOC_MEMSEG = 14, IOCNUM_GET_MEMSEG = 15, IOCNUM_MMAP_MEMSEG = 16, IOCNUM_MMAP_GETNEXT = 17, IOCNUM_GLA2GPA_NOFAULT = 18, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, IOCNUM_GET_REGISTER = 21, IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, IOCNUM_SET_REGISTER_SET = 24, IOCNUM_GET_REGISTER_SET = 25, /* interrupt injection */ IOCNUM_GET_INTINFO = 28, IOCNUM_SET_INTINFO = 29, IOCNUM_INJECT_EXCEPTION = 30, IOCNUM_LAPIC_IRQ = 31, IOCNUM_INJECT_NMI = 32, IOCNUM_IOAPIC_ASSERT_IRQ = 33, IOCNUM_IOAPIC_DEASSERT_IRQ = 34, IOCNUM_IOAPIC_PULSE_IRQ = 35, IOCNUM_LAPIC_MSI = 36, IOCNUM_LAPIC_LOCAL_IRQ = 37, IOCNUM_IOAPIC_PINCOUNT = 38, IOCNUM_RESTART_INSTRUCTION = 39, /* PCI pass-thru */ IOCNUM_BIND_PPTDEV = 40, 
IOCNUM_UNBIND_PPTDEV = 41, IOCNUM_MAP_PPTDEV_MMIO = 42, IOCNUM_PPTDEV_MSI = 43, IOCNUM_PPTDEV_MSIX = 44, /* statistics */ IOCNUM_VM_STATS = 50, IOCNUM_VM_STAT_DESC = 51, /* kernel device state */ IOCNUM_SET_X2APIC_STATE = 60, IOCNUM_GET_X2APIC_STATE = 61, IOCNUM_GET_HPET_CAPABILITIES = 62, /* CPU Topology */ IOCNUM_SET_TOPOLOGY = 63, IOCNUM_GET_TOPOLOGY = 64, /* legacy interrupt injection */ IOCNUM_ISA_ASSERT_IRQ = 80, IOCNUM_ISA_DEASSERT_IRQ = 81, IOCNUM_ISA_PULSE_IRQ = 82, IOCNUM_ISA_SET_IRQ_TRIGGER = 83, /* vm_cpuset */ IOCNUM_ACTIVATE_CPU = 90, IOCNUM_GET_CPUSET = 91, IOCNUM_SUSPEND_CPU = 92, IOCNUM_RESUME_CPU = 93, /* RTC */ IOCNUM_RTC_READ = 100, IOCNUM_RTC_WRITE = 101, IOCNUM_RTC_SETTIME = 102, IOCNUM_RTC_GETTIME = 103, + + /* checkpoint */ + IOCNUM_SNAPSHOT_REQ = 113, + + IOCNUM_RESTORE_TIME = 115 }; #define VM_RUN \ _IOWR('v', IOCNUM_RUN, struct vm_run) #define VM_SUSPEND \ _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) #define VM_REINIT \ _IO('v', IOCNUM_REINIT) #define VM_ALLOC_MEMSEG \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) #define VM_GET_MEMSEG \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) #define VM_MMAP_MEMSEG \ _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) #define VM_MMAP_GETNEXT \ _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) #define VM_SET_REGISTER \ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) #define VM_GET_REGISTER \ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) #define VM_SET_SEGMENT_DESCRIPTOR \ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_GET_SEGMENT_DESCRIPTOR \ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_SET_REGISTER_SET \ _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) #define VM_GET_REGISTER_SET \ _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) #define VM_INJECT_EXCEPTION \ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) #define VM_LAPIC_IRQ \ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) #define VM_LAPIC_LOCAL_IRQ \ _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq) #define VM_LAPIC_MSI \ _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi) #define VM_IOAPIC_ASSERT_IRQ \ _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_DEASSERT_IRQ \ _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_PULSE_IRQ \ _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_PINCOUNT \ _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int) #define VM_ISA_ASSERT_IRQ \ _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq) #define VM_ISA_DEASSERT_IRQ \ _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq) #define VM_ISA_PULSE_IRQ \ _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq) #define VM_ISA_SET_IRQ_TRIGGER \ _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger) #define VM_SET_CAPABILITY \ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) #define VM_GET_CAPABILITY \ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) #define VM_BIND_PPTDEV \ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) #define VM_UNBIND_PPTDEV \ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) #define VM_MAP_PPTDEV_MMIO \ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) #define VM_PPTDEV_MSI \ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) #define VM_PPTDEV_MSIX \ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct 
vm_stat_desc) #define VM_SET_X2APIC_STATE \ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_X2APIC_STATE \ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_HPET_CAPABILITIES \ _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) #define VM_SET_TOPOLOGY \ _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_TOPOLOGY \ _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_GPA_PMAP \ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) #define VM_GLA2GPA \ _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) #define VM_GLA2GPA_NOFAULT \ _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) #define VM_ACTIVATE_CPU \ _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) #define VM_GET_CPUS \ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) #define VM_SUSPEND_CPU \ _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) #define VM_RESUME_CPU \ _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) #define VM_SET_INTINFO \ _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) #define VM_GET_INTINFO \ _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) #define VM_RTC_WRITE \ _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) #define VM_RTC_READ \ _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data) #define VM_RTC_SETTIME \ _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) #define VM_RTC_GETTIME \ _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) #define VM_RESTART_INSTRUCTION \ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) +#define VM_SNAPSHOT_REQ \ + _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta) +#define VM_RESTORE_TIME \ + _IOWR('v', IOCNUM_RESTORE_TIME, int) #endif diff --git a/sys/amd64/include/vmm_snapshot.h b/sys/amd64/include/vmm_snapshot.h new file mode 100644 index 000000000000..6ba25a5dae2e --- /dev/null +++ b/sys/amd64/include/vmm_snapshot.h @@ -0,0 +1,156 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Flavius Anton + * Copyright (c) 2016 Mihai Tiganus + * Copyright (c) 2016-2019 Mihai Carabas + * Copyright (c) 2017-2019 Darius Mihai + * Copyright (c) 2017-2019 Elena Mihailescu + * Copyright (c) 2018-2019 Sergiu Weisz + * All rights reserved. + * The bhyve-snapshot feature was developed under sponsorships + * from Matthew Grooms. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
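To make the new ioctls concrete, here is a hedged userspace sketch of requesting a save of one kernel-side structure through VM_SNAPSHOT_REQ; 'vmfd' is an already-open /dev/vmm/<name> descriptor, the buffer size is arbitrary, and the request only succeeds on a kernel built with the snapshot option. The fields follow the struct vm_snapshot_meta definition introduced in vmm_snapshot.h below; ctx and dev_name are left unset since they matter only for the userspace device path.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_snapshot.h>
#include <stddef.h>
#include <stdint.h>

static int
example_snapshot_struct(int vmfd, enum snapshot_req what,
    uint8_t *buf, size_t buflen, size_t *used)
{
	struct vm_snapshot_meta meta = {
		.dev_req = what,		/* e.g. STRUCT_VLAPIC */
		.op = VM_SNAPSHOT_SAVE,
		.buffer = {
			.buf_start = buf,
			.buf_size = buflen,
			.buf = buf,
			.buf_rem = buflen,
		},
	};

	if (ioctl(vmfd, VM_SNAPSHOT_REQ, &meta) != 0)
		return (-1);

	/* Per the header's comment, the snapshot length is buf_size - buf_rem. */
	*used = buflen - meta.buffer.buf_rem;
	return (0);
}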
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_SNAPSHOT_ +#define _VMM_SNAPSHOT_ + +#include +#include +#ifndef _KERNEL +#include +#endif + +struct vmctx; + +enum snapshot_req { + STRUCT_VMX, + STRUCT_VIOAPIC, + STRUCT_VM, + STRUCT_VLAPIC, + VM_MEM, + STRUCT_VHPET, + STRUCT_VMCX, + STRUCT_VATPIC, + STRUCT_VATPIT, + STRUCT_VPMTMR, + STRUCT_VRTC, +}; + +struct vm_snapshot_buffer { + /* + * R/O for device-specific functions; + * written by generic snapshot functions. + */ + uint8_t *const buf_start; + const size_t buf_size; + + /* + * R/W for device-specific functions used to keep track of buffer + * current position and remaining size. + */ + uint8_t *buf; + size_t buf_rem; + + /* + * Length of the snapshot is either determined as (buf_size - buf_rem) + * or (buf - buf_start) -- the second variation returns a signed value + * so it may not be appropriate. + * + * Use vm_get_snapshot_size(meta). + */ +}; + +enum vm_snapshot_op { + VM_SNAPSHOT_SAVE, + VM_SNAPSHOT_RESTORE, +}; + +struct vm_snapshot_meta { + struct vmctx *ctx; + void *dev_data; + const char *dev_name; /* identify userspace devices */ + enum snapshot_req dev_req; /* identify kernel structs */ + + struct vm_snapshot_buffer buffer; + + enum vm_snapshot_op op; +}; + + +void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op); +int vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta); +size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta); +int vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, + struct vm_snapshot_meta *meta); +int vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta); + +#define SNAPSHOT_BUF_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_buf((DATA), (LEN), (META)); \ + if ((RES) != 0) { \ + vm_snapshot_buf_err(#DATA, (META)->op); \ + goto LABEL; \ + } \ +} while (0) + +#define SNAPSHOT_VAR_OR_LEAVE(DATA, META, RES, LABEL) \ + SNAPSHOT_BUF_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) + +/* + * Address variables are pointers to guest memory. + * + * When RNULL != 0, do not enforce invalid address checks; instead, make the + * pointer NULL at restore time. + */ +#define SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ADDR, LEN, RNULL, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_guest2host_addr((void **)&(ADDR), (LEN), (RNULL), \ + (META)); \ + if ((RES) != 0) { \ + if ((RES) == EFAULT) \ + fprintf(stderr, "%s: invalid address: %s\r\n", \ + __func__, #ADDR); \ + goto LABEL; \ + } \ +} while (0) + +/* compare the value in the meta buffer with the data */ +#define SNAPSHOT_BUF_CMP_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_buf_cmp((DATA), (LEN), (META)); \ + if ((RES) != 0) { \ + vm_snapshot_buf_err(#DATA, (META)->op); \ + goto LABEL; \ + } \ +} while (0) + +#define SNAPSHOT_VAR_CMP_OR_LEAVE(DATA, META, RES, LABEL) \ + SNAPSHOT_BUF_CMP_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) + +#endif diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index d3ba62b4b19c..f9660024fe0c 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -1,2305 +1,2662 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_ktr.h" #include "vmm_ioport.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "x86.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" #include "svm_msr.h" #include "npt.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); /* * SVM CPUID function 0x8000_000A, edx bit decoding. */ #define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ #define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ #define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ #define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ #define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ #define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ #define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ #define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ #define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ #define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ #define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ #define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ VMCB_CACHE_IOPM | \ VMCB_CACHE_I | \ VMCB_CACHE_TPR | \ VMCB_CACHE_CR2 | \ VMCB_CACHE_CR | \ VMCB_CACHE_DR | \ VMCB_CACHE_DT | \ VMCB_CACHE_SEG | \ VMCB_CACHE_NP) static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, 0, NULL); static MALLOC_DEFINE(M_SVM, "svm", "svm"); static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); static uint32_t svm_feature = ~0U; /* AMD SVM features. */ SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, "SVM features advertised by CPUID.8000000AH:EDX"); static int disable_npf_assist; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, &disable_npf_assist, 0, NULL); /* Maximum ASIDs supported by the processor */ static uint32_t nasid; SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, "Number of ASIDs supported by this processor"); /* Current ASID generation for each host cpu */ static struct asid asid[MAXCPU]; /* * SVM host state saved area of size 4KB for each core. 
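Looping back to the SNAPSHOT_*_OR_LEAVE macros introduced in vmm_snapshot.h above, this is a hedged sketch of the call pattern a device model's snapshot callback is expected to follow; the softc layout and field names here are invented purely for illustration.

struct example_softc {
	uint32_t csr;
	uint8_t fifo[64];
};

static int
example_dev_snapshot(struct vm_snapshot_meta *meta)
{
	struct example_softc *sc = meta->dev_data;
	int ret;

	/* The same sequence both saves and restores, keyed off meta->op. */
	SNAPSHOT_VAR_OR_LEAVE(sc->csr, meta, ret, done);
	SNAPSHOT_BUF_OR_LEAVE(sc->fifo, sizeof(sc->fifo), meta, ret, done);
done:
	return (ret);
}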
*/ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); static __inline int flush_by_asid(void) { return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); } static __inline int decode_assist(void) { return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); } static void svm_disable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer &= ~EFER_SVM; wrmsr(MSR_EFER, efer); } /* * Disable SVM on all CPUs. */ static int svm_cleanup(void) { smp_rendezvous(NULL, svm_disable, NULL, NULL); return (0); } /* * Verify that all the features required by bhyve are available. */ static int check_svm_features(void) { u_int regs[4]; /* CPUID Fn8000_000A is for SVM */ do_cpuid(0x8000000A, regs); svm_feature &= regs[3]; /* * The number of ASIDs can be configured to be less than what is * supported by the hardware but not more. */ if (nasid == 0 || nasid > regs[1]) nasid = regs[1]; KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); /* bhyve requires the Nested Paging feature */ if (!(svm_feature & AMD_CPUID_SVM_NP)) { printf("SVM: Nested Paging feature not available.\n"); return (ENXIO); } /* bhyve requires the NRIP Save feature */ if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { printf("SVM: NRIP Save feature not available.\n"); return (ENXIO); } return (0); } static void svm_enable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer |= EFER_SVM; wrmsr(MSR_EFER, efer); wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); } /* * Return 1 if SVM is enabled on this processor and 0 otherwise. */ static int svm_available(void) { uint64_t msr; /* Section 15.4 Enabling SVM from APM2. */ if ((amd_feature2 & AMDID2_SVM) == 0) { printf("SVM: not available.\n"); return (0); } msr = rdmsr(MSR_VM_CR); if ((msr & VM_CR_SVMDIS) != 0) { printf("SVM: disabled by BIOS.\n"); return (0); } return (1); } static int svm_init(int ipinum) { int error, cpu; if (!svm_available()) return (ENXIO); error = check_svm_features(); if (error) return (error); vmcb_clean &= VMCB_CACHE_DEFAULT; for (cpu = 0; cpu < MAXCPU; cpu++) { /* * Initialize the host ASIDs to their "highest" valid values. * * The next ASID allocation will rollover both 'gen' and 'num' * and start off the sequence at {1,1}. */ asid[cpu].gen = ~0UL; asid[cpu].num = nasid - 1; } svm_msr_init(); svm_npt_init(ipinum); /* Enable SVM on all CPUs */ smp_rendezvous(NULL, svm_enable, NULL, NULL); return (0); } static void svm_restore(void) { svm_enable(NULL); } +#ifdef BHYVE_SNAPSHOT +int +svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset) +{ + int error; + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + ctrl->tsc_offset = offset; + + svm_set_dirty(sc, vcpu, VMCB_CACHE_I); + VCPU_CTR1(sc->vm, vcpu, "tsc offset changed to %#lx", offset); + + error = vm_set_tsc_offset(sc->vm, vcpu, offset); + + return (error); +} +#endif + /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 #define MSR_PENTIUM_END 0x1FFF /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000UL #define MSR_AMD6TH_END 0xC0001FFFUL /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000UL #define MSR_AMD7TH_END 0xC0011FFFUL /* * Get the index and bit position for a MSR in permission bitmap. 
* Two bits are used for each MSR: lower bit for read and higher bit for write. */ static int svm_msr_index(uint64_t msr, int *index, int *bit) { uint32_t base, off; *index = -1; *bit = (msr % 4) * 2; base = 0; if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) { *index = msr / 4; return (0); } base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { off = (msr - MSR_AMD6TH_START); *index = (off + base) / 4; return (0); } base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { off = (msr - MSR_AMD7TH_START); *index = (off + base) / 4; return (0); } return (EINVAL); } /* * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. */ static void svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) { int index, bit, error; error = svm_msr_index(msr, &index, &bit); KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, ("%s: invalid index %d for msr %#lx", __func__, index, msr)); KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " "msr %#lx", __func__, bit, msr)); if (read) perm_bitmap[index] &= ~(1UL << bit); if (write) perm_bitmap[index] &= ~(2UL << bit); } static void svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, true); } static void svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, false); } static __inline int svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) { struct vmcb_ctrl *ctrl; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); return (ctrl->intercept[idx] & bitmask ? 1 : 0); } static __inline void svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, int enabled) { struct vmcb_ctrl *ctrl; uint32_t oldval; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intercept[idx]; if (enabled) ctrl->intercept[idx] |= bitmask; else ctrl->intercept[idx] &= ~bitmask; if (ctrl->intercept[idx] != oldval) { svm_set_dirty(sc, vcpu, VMCB_CACHE_I); VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); } } static __inline void svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 0); } static __inline void svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 1); } static void vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, uint64_t msrpm_base_pa, uint64_t np_pml4) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; uint32_t mask; int n; ctrl = svm_get_vmcb_ctrl(sc, vcpu); state = svm_get_vmcb_state(sc, vcpu); ctrl->iopm_base_pa = iopm_base_pa; ctrl->msrpm_base_pa = msrpm_base_pa; /* Enable nested paging */ ctrl->np_enable = 1; ctrl->n_cr3 = np_pml4; /* * Intercept accesses to the control registers that are not shadowed * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. */ for (n = 0; n < 16; n++) { mask = (BIT(n) << 16) | BIT(n); if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); else svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); } /* * Intercept everything when tracing guest exceptions otherwise * just intercept machine check exception. 
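A short worked example for the control-register intercept mask computed in vmcb_init() above; the low 16 bits of intercept[VMCB_CR_INTCPT] are the CRx read intercepts and the high 16 bits the writes.

/*
 * n == 0 (CR0): mask = (BIT(0) << 16) | BIT(0) = 0x00010001; CR0 is
 *     shadowed in the VMCB, so both bits are left disabled (no intercept).
 * n == 5 (CR5): mask = (BIT(5) << 16) | BIT(5) = 0x00200020; CR5 is not
 *     shadowed, so its read and write intercepts are both enabled.
 */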
*/ if (vcpu_trace_exceptions(sc->vm, vcpu)) { for (n = 0; n < 32; n++) { /* * Skip unimplemented vectors in the exception bitmap. */ if (n == 2 || n == 9) { continue; } svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); } } else { svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); } /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_FERR_FREEZE); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); /* * From section "Canonicalization and Consistency Checks" in APMv2 * the VMRUN intercept bit must be set to pass the consistency check. */ svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); /* * The ASID will be set to a non-zero value just before VMRUN. */ ctrl->asid = 0; /* * Section 15.21.1, Interrupt Masking in EFLAGS * Section 15.21.2, Virtualizing APIC.TPR * * This must be set for %rflag and %cr8 isolation of guest and host. */ ctrl->v_intr_masking = 1; /* Enable Last Branch Record aka LBR for debugging */ ctrl->lbr_virt_en = 1; state->dbgctl = BIT(0); /* EFER_SVM must always be set when the guest is executing */ state->efer = EFER_SVM; /* Set up the PAT to power-on state */ state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Set up DR6/7 to power-on state */ state->dr6 = DBREG_DR6_RESERVED1; state->dr7 = DBREG_DR7_RESERVED1; } /* * Initialize a virtual machine. */ static void * svm_vminit(struct vm *vm, pmap_t pmap) { struct svm_softc *svm_sc; struct svm_vcpu *vcpu; vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; int i; uint16_t maxcpus; svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); if (((uintptr_t)svm_sc & PAGE_MASK) != 0) panic("malloc of svm_softc not aligned on page boundary"); svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); if (svm_sc->msr_bitmap == NULL) panic("contigmalloc of SVM MSR bitmap failed"); svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); if (svm_sc->iopm_bitmap == NULL) panic("contigmalloc of SVM IO bitmap failed"); svm_sc->vm = vm; svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); /* * Intercept read and write accesses to all MSRs. */ memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); /* * Access to the following MSRs is redirected to the VMCB when the * guest is executing. Therefore it is safe to allow the guest to * read/write these MSRs directly without hypervisor involvement. 
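A companion worked example for the MSR permission-bitmap arithmetic in svm_msr_index() and svm_msr_perm() above, using MSR_EFER (0xC0000080), which the initialization just below leaves read-only pass-through.

/*
 *   base  = MSR_PENTIUM_END - MSR_PENTIUM_START + 1 = 0x2000
 *   off   = 0xC0000080 - MSR_AMD6TH_START           = 0x80
 *   index = (off + base) / 4 = 0x2080 / 4            = 0x820
 *   bit   = (0xC0000080 % 4) * 2                     = 0
 * svm_msr_rd_ok(MSR_EFER) therefore clears only bit 0 (read) of
 * perm_bitmap[0x820]; bit 1 (write) stays set, so a guest RDMSR of EFER
 * is pass-through while WRMSR still causes a #VMEXIT.
 */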
*/ svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); /* * Intercept writes to make sure that the EFER_SVM bit is not cleared. */ svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); /* Intercept access to all I/O ports. */ memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); iopm_pa = vtophys(svm_sc->iopm_bitmap); msrpm_pa = vtophys(svm_sc->msr_bitmap); pml4_pa = svm_sc->nptp; maxcpus = vm_get_maxcpus(svm_sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = svm_get_vcpu(svm_sc, i); vcpu->nextrip = ~0; vcpu->lastcpu = NOCPU; vcpu->vmcb_pa = vtophys(&vcpu->vmcb); vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); svm_msr_guest_init(svm_sc, i); } return (svm_sc); } /* * Collateral for a generic SVM VM-exit. */ static void vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) { vme->exitcode = VM_EXITCODE_SVM; vme->u.svm.exitcode = code; vme->u.svm.exitinfo1 = info1; vme->u.svm.exitinfo2 = info2; } static int svm_cpl(struct vmcb_state *state) { /* * From APMv2: * "Retrieve the CPL from the CPL field in the VMCB, not * from any segment DPL" */ return (state->cpl); } static enum vm_cpu_mode svm_vcpu_mode(struct vmcb *vmcb) { struct vmcb_segment seg; struct vmcb_state *state; int error; state = &vmcb->state; if (state->efer & EFER_LMA) { error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, error)); /* * Section 4.8.1 for APM2, check if Code Segment has * Long attribute set in descriptor. */ if (seg.attrib & VMCB_CS_ATTRIB_L) return (CPU_MODE_64BIT); else return (CPU_MODE_COMPATIBILITY); } else if (state->cr0 & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) { if ((cr0 & CR0_PG) == 0) return (PAGING_MODE_FLAT); if ((cr4 & CR4_PAE) == 0) return (PAGING_MODE_32); if (efer & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } /* * ins/outs utility routines */ static uint64_t svm_inout_str_index(struct svm_regctx *regs, int in) { uint64_t val; val = in ? regs->sctx_rdi : regs->sctx_rsi; return (val); } static uint64_t svm_inout_str_count(struct svm_regctx *regs, int rep) { uint64_t val; val = rep ? 
regs->sctx_rcx : 1; return (val); } static void svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { /* The segment field has standard encoding */ s = (info1 >> 10) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); } static int svm_inout_str_addrsize(uint64_t info1) { uint32_t size; size = (info1 >> 7) & 0x7; switch (size) { case 1: return (2); /* 16 bit */ case 2: return (4); /* 32 bit */ case 4: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) { struct vmcb_state *state; state = &vmcb->state; paging->cr3 = state->cr3; paging->cpl = svm_cpl(state); paging->cpu_mode = svm_vcpu_mode(vmcb); paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, state->efer); } #define UNHANDLED 0 /* * Handle guest I/O intercept. */ static int svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_regctx *regs; struct vm_inout_str *vis; uint64_t info1; int inout_string; state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); regs = svm_get_guest_regctx(svm_sc, vcpu); info1 = ctrl->exitinfo1; inout_string = info1 & BIT(2) ? 1 : 0; /* * The effective segment number in EXITINFO1[12:10] is populated * only if the processor has the DecodeAssist capability. * * XXX this is not specified explicitly in APMv2 but can be verified * empirically. */ if (inout_string && !decode_assist()) return (UNHANDLED); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; vmexit->u.inout.string = inout_string; vmexit->u.inout.rep = (info1 & BIT(3)) ? 
1 : 0; vmexit->u.inout.bytes = (info1 >> 4) & 0x7; vmexit->u.inout.port = (uint16_t)(info1 >> 16); vmexit->u.inout.eax = (uint32_t)(state->rax); if (inout_string) { vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); vis->rflags = state->rflags; vis->cr0 = state->cr0; vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); vis->addrsize = svm_inout_str_addrsize(info1); svm_inout_str_seginfo(svm_sc, vcpu, info1, vmexit->u.inout.in, vis); } return (UNHANDLED); } static int npf_fault_type(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_W) return (VM_PROT_WRITE); else if (exitinfo1 & VMCB_NPF_INFO1_ID) return (VM_PROT_EXECUTE); else return (VM_PROT_READ); } static bool svm_npf_emul_fault(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_ID) { return (false); } if (exitinfo1 & VMCB_NPF_INFO1_GPT) { return (false); } if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { return (false); } return (true); } static void svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) { struct vm_guest_paging *paging; struct vmcb_segment seg; struct vmcb_ctrl *ctrl; char *inst_bytes; int error, inst_len; ctrl = &vmcb->ctrl; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = VIE_INVALID_GLA; svm_paging_info(vmcb, paging); error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); switch(paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = seg.base; vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = seg.base; /* * Section 4.8.1 of APM2, Default Operand Size or D bit. */ vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? 1 : 0; break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } /* * Copy the instruction bytes into 'vie' if available. */ if (decode_assist() && !disable_npf_assist) { inst_len = ctrl->inst_len; inst_bytes = ctrl->inst_bytes; } else { inst_len = 0; inst_bytes = NULL; } vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); } #ifdef KTR static const char * intrtype_to_str(int intr_type) { switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: return ("hwintr"); case VMCB_EVENTINJ_TYPE_NMI: return ("nmi"); case VMCB_EVENTINJ_TYPE_INTn: return ("swintr"); case VMCB_EVENTINJ_TYPE_EXCEPTION: return ("exception"); default: panic("%s: unknown intr_type %d", __func__, intr_type); } } #endif /* * Inject an event to vcpu as described in section 15.20, "Event injection". 
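A hedged walk-through of how the EXITINFO1 bits classified by npf_fault_type() and svm_npf_emul_fault() above steer a nested page fault; the actual dispatch lives in svm_vmexit()'s VMCB_EXIT_NPF case further down.

/*
 * Example: the guest writes to an MMIO address that is not backed by RAM.
 *   VMCB_NPF_INFO1_W   set   -> npf_fault_type() == VM_PROT_WRITE
 *   VMCB_NPF_INFO1_ID  clear -> not an instruction fetch
 *   VMCB_NPF_INFO1_GPT clear -> fault not inside the guest page tables
 *   VMCB_NPF_INFO1_GPA set   -> EXITINFO2 carries a valid faulting GPA
 * svm_npf_emul_fault() returns true and the exit becomes
 * VM_EXITCODE_INST_EMUL; had the GPA been covered by a memory segment,
 * vm_mem_allocated() would have routed it to VM_EXITCODE_PAGING instead.
 */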
*/ static void svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, uint32_t error, bool ec_valid) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event already pending %#lx", __func__, ctrl->eventinj)); KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", __func__, vector)); switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: case VMCB_EVENTINJ_TYPE_NMI: case VMCB_EVENTINJ_TYPE_INTn: break; case VMCB_EVENTINJ_TYPE_EXCEPTION: if (vector >= 0 && vector <= 31 && vector != 2) break; /* FALLTHROUGH */ default: panic("%s: invalid intr_type/vector: %d/%d", __func__, intr_type, vector); } ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; if (ec_valid) { ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; ctrl->eventinj |= (uint64_t)error << 32; VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", intrtype_to_str(intr_type), vector, error); } else { VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", intrtype_to_str(intr_type), vector); } } static void svm_update_virqinfo(struct svm_softc *sc, int vcpu) { struct vm *vm; struct vlapic *vlapic; struct vmcb_ctrl *ctrl; vm = sc->vm; vlapic = vm_lapic(vm, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); /* Update %cr8 in the emulated vlapic */ vlapic_set_cr8(vlapic, ctrl->v_tpr); /* Virtual interrupt injection is not used. */ KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " "v_intr_vector %d", __func__, ctrl->v_intr_vector)); } static void svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) { struct vmcb_ctrl *ctrl; uint64_t intinfo; ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); intinfo = ctrl->exitintinfo; if (!VMCB_EXITINTINFO_VALID(intinfo)) return; /* * From APMv2, Section "Intercepts during IDT interrupt delivery" * * If a #VMEXIT happened during event delivery then record the event * that was being delivered. 
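For reference, a worked encoding of the EVENTINJ value that svm_eventinject() above builds, taking #GP(0) as the example; macro names are from vmcb.h and the bit positions follow APMv2 section 15.20.

/*
 *   eventinj = IDT_GP                                  (vector, bits 7:0 == 13)
 *            | (VMCB_EVENTINJ_TYPE_EXCEPTION << 8)     (type, bits 10:8)
 *            | VMCB_EVENTINJ_VALID
 *            | VMCB_EVENTINJ_EC_VALID
 *            | ((uint64_t)0 << 32);                    (error code, bits 63:32)
 */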
*/ VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); } #ifdef INVARIANTS static __inline int vintr_intercept_enabled(struct svm_softc *sc, int vcpu) { return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR)); } #endif static __inline void enable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); KASSERT(vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be enabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); ctrl->v_irq = 1; ctrl->v_ign_tpr = 1; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static __inline void disable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(!vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be disabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); ctrl->v_irq = 0; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static int svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val) { struct vmcb_ctrl *ctrl; int oldval, newval; ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intr_shadow; newval = val ? 1 : 0; if (newval != oldval) { ctrl->intr_shadow = newval; VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval); } return (0); } static int svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); *val = ctrl->intr_shadow; return (0); } /* * Once an NMI is injected it blocks delivery of further NMIs until the handler * executes an IRET. The IRET intercept is enabled when an NMI is injected to * to track when the vcpu is done handling the NMI. */ static int nmi_blocked(struct svm_softc *sc, int vcpu) { int blocked; blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); return (blocked); } static void enable_nmi_blocking(struct svm_softc *sc, int vcpu) { KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); } static void clear_nmi_blocking(struct svm_softc *sc, int vcpu) { int error; KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); /* * When the IRET intercept is cleared the vcpu will attempt to execute * the "iret" when it runs next. However, it is possible to inject * another NMI into the vcpu before the "iret" has actually executed. * * For e.g. if the "iret" encounters a #NPF when accessing the stack * it will trap back into the hypervisor. If an NMI is pending for * the vcpu it will be injected into the guest. * * XXX this needs to be fixed */ svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); /* * Set 'intr_shadow' to prevent an NMI from being injected on the * immediate VMRUN. 
*/ error = svm_modify_intr_shadow(sc, vcpu, 1); KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); } #define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL static int svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) { struct vm_exit *vme; struct vmcb_state *state; uint64_t changed, lma, oldval; int error; state = svm_get_vmcb_state(sc, vcpu); oldval = state->efer; VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ changed = oldval ^ newval; if (newval & EFER_MBZ_BITS) goto gpf; /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ if (changed & EFER_LME) { if (state->cr0 & CR0_PG) goto gpf; } /* EFER.LMA = EFER.LME & CR0.PG */ if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) lma = EFER_LMA; else lma = 0; if ((newval & EFER_LMA) != lma) goto gpf; if (newval & EFER_NXE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) goto gpf; } /* * XXX bhyve does not enforce segment limits in 64-bit mode. Until * this is fixed flag guest attempt to set EFER_LMSLE as an error. */ if (newval & EFER_LMSLE) { vme = vm_exitinfo(sc->vm, vcpu); vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); *retu = true; return (0); } if (newval & EFER_FFXSR) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) goto gpf; } if (newval & EFER_TCE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) goto gpf; } error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); return (0); gpf: vm_inject_gp(sc->vm, vcpu); return (0); } static int emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); else if (num == MSR_EFER) error = svm_write_efer(sc, vcpu, val, retu); else error = svm_wrmsr(sc, vcpu, num, val, retu); return (error); } static int emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) { struct vmcb_state *state; struct svm_regctx *ctx; uint64_t result; int error; if (lapic_msr(num)) error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); else error = svm_rdmsr(sc, vcpu, num, &result, retu); if (error == 0) { state = svm_get_vmcb_state(sc, vcpu); ctx = svm_get_guest_regctx(sc, vcpu); state->rax = result & 0xffffffff; ctx->sctx_rdx = result >> 32; } return (error); } #ifdef KTR static const char * exit_reason_to_str(uint64_t reason) { static char reasonbuf[32]; switch (reason) { case VMCB_EXIT_INVALID: return ("invalvmcb"); case VMCB_EXIT_SHUTDOWN: return ("shutdown"); case VMCB_EXIT_NPF: return ("nptfault"); case VMCB_EXIT_PAUSE: return ("pause"); case VMCB_EXIT_HLT: return ("hlt"); case VMCB_EXIT_CPUID: return ("cpuid"); case VMCB_EXIT_IO: return ("inout"); case VMCB_EXIT_MC: return ("mchk"); case VMCB_EXIT_INTR: return ("extintr"); case VMCB_EXIT_NMI: return ("nmi"); case VMCB_EXIT_VINTR: return ("vintr"); case VMCB_EXIT_MSR: return ("msr"); case VMCB_EXIT_IRET: return ("iret"); case VMCB_EXIT_MONITOR: return ("monitor"); case VMCB_EXIT_MWAIT: return ("mwait"); default: snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); return (reasonbuf); } } #endif /* KTR */ /* * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs * that are due to instruction intercepts as well as MSR and IOIO intercepts * and exceptions caused by INT3, INTO and BOUND instructions. * * Return 1 if the nRIP is valid and 0 otherwise. */ static int nrip_valid(uint64_t exitcode) { switch (exitcode) { case 0x00 ... 
0x0F: /* read of CR0 through CR15 */ case 0x10 ... 0x1F: /* write of CR0 through CR15 */ case 0x20 ... 0x2F: /* read of DR0 through DR15 */ case 0x30 ... 0x3F: /* write of DR0 through DR15 */ case 0x43: /* INT3 */ case 0x44: /* INTO */ case 0x45: /* BOUND */ case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ return (1); default: return (0); } } static int svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct svm_regctx *ctx; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; int error, errcode_valid, handled, idtvec, reflect; bool retu; ctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb = svm_get_vmcb(svm_sc, vcpu); state = &vmcb->state; ctrl = &vmcb->ctrl; handled = 0; code = ctrl->exitcode; info1 = ctrl->exitinfo1; info2 = ctrl->exitinfo2; vmexit->exitcode = VM_EXITCODE_BOGUS; vmexit->rip = state->rip; vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); /* * #VMEXIT(INVALID) needs to be handled early because the VMCB is * in an inconsistent state and can trigger assertions that would * never happen otherwise. */ if (code == VMCB_EXIT_INVALID) { vm_exit_svm(vmexit, code, info1, info2); return (0); } KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " "injection valid bit is set %#lx", __func__, ctrl->eventinj)); KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", vmexit->inst_length, code, info1, info2)); svm_update_virqinfo(svm_sc, vcpu); svm_save_intinfo(svm_sc, vcpu); switch (code) { case VMCB_EXIT_IRET: /* * Restart execution at "iret" but with the intercept cleared. */ vmexit->inst_length = 0; clear_nmi_blocking(svm_sc, vcpu); handled = 1; break; case VMCB_EXIT_VINTR: /* interrupt window exiting */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); handled = 1; break; case VMCB_EXIT_INTR: /* external interrupt */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); handled = 1; break; case VMCB_EXIT_NMI: /* external NMI */ handled = 1; break; case 0x40 ... 0x5F: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); reflect = 1; idtvec = code - 0x40; switch (idtvec) { case IDT_MC: /* * Call the machine check handler by hand. Also don't * reflect the machine check back into the guest. */ reflect = 0; VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); break; case IDT_PF: error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, info2); KASSERT(error == 0, ("%s: error %d updating cr2", __func__, error)); /* fallthru */ case IDT_NP: case IDT_SS: case IDT_GP: case IDT_AC: case IDT_TS: errcode_valid = 1; break; case IDT_DF: errcode_valid = 1; info1 = 0; break; case IDT_BP: case IDT_OF: case IDT_BR: /* * The 'nrip' field is populated for INT3, INTO and * BOUND exceptions and this also implies that * 'inst_length' is non-zero. * * Reset 'inst_length' to zero so the guest %rip at * event injection is identical to what it was when * the exception originally happened. 
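A small worked example for the inst_length bookkeeping discussed above, which svm_vmexit() computes as ctrl->nrip - state->rip whenever nrip_valid() accepts the exit code.

/*
 * A CPUID intercept falls in the 0x65 ... 0x7C range accepted by
 * nrip_valid(), so with state->rip == 0x1000 and ctrl->nrip == 0x1002 the
 * exit carries inst_length == 2 and, once handled, %rip is advanced past
 * the 2-byte instruction. For #BP/#OF/#BR reflection the code above
 * deliberately resets inst_length to 0 so the exception is injected at
 * the %rip where it originally occurred.
 */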
*/ VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " "to zero before injecting exception %d", vmexit->inst_length, idtvec); vmexit->inst_length = 0; /* fallthru */ default: errcode_valid = 0; info1 = 0; break; } KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " "when reflecting exception %d into guest", vmexit->inst_length, idtvec)); if (reflect) { /* Reflect the exception back into the guest */ VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " "%d/%#x into the guest", idtvec, (int)info1); error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, errcode_valid, info1, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); } handled = 1; break; case VMCB_EXIT_MSR: /* MSR access. */ eax = state->rax; ecx = ctx->sctx_rcx; edx = ctx->sctx_rdx; retu = false; if (info1) { vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); val = (uint64_t)edx << 32 | eax; VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", ecx, val); if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = val; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } } else { VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } } break; case VMCB_EXIT_IO: handled = svm_handle_io(svm_sc, vcpu, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); break; case VMCB_EXIT_CPUID: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); handled = x86_emulate_cpuid(svm_sc->vm, vcpu, (uint32_t *)&state->rax, (uint32_t *)&ctx->sctx_rbx, (uint32_t *)&ctx->sctx_rcx, (uint32_t *)&ctx->sctx_rdx); break; case VMCB_EXIT_HLT: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = state->rflags; break; case VMCB_EXIT_PAUSE: vmexit->exitcode = VM_EXITCODE_PAUSE; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); break; case VMCB_EXIT_NPF: /* EXITINFO2 contains the faulting guest physical address */ if (info1 & VMCB_NPF_INFO1_RSV) { VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " "reserved bits set: info1(%#lx) info2(%#lx)", info1, info2); } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = info2; vmexit->u.paging.fault_type = npf_fault_type(info1); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " "on gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } else if (svm_npf_emul_fault(info1)) { svm_handle_inst_emul(vmcb, info2, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " "for gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } break; case VMCB_EXIT_MONITOR: vmexit->exitcode = VM_EXITCODE_MONITOR; break; case VMCB_EXIT_MWAIT: vmexit->exitcode = VM_EXITCODE_MWAIT; break; default: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", handled ? 
"handled" : "unhandled", exit_reason_to_str(code), vmexit->rip, vmexit->inst_length); if (handled) { vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; state->rip = vmexit->rip; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic SVM exit. */ vm_exit_svm(vmexit, code, info1, info2); } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } return (handled); } static void svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) { uint64_t intinfo; if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) return; KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " "valid: %#lx", __func__, intinfo)); svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), VMCB_EXITINTINFO_VECTOR(intinfo), VMCB_EXITINTINFO_EC(intinfo), VMCB_EXITINTINFO_EC_VALID(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); } /* * Inject event to virtual cpu. */ static void svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_vcpu *vcpustate; uint8_t v_tpr; int vector, need_intr_window; int extint_pending; state = svm_get_vmcb_state(sc, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); vcpustate = svm_get_vcpu(sc, vcpu); need_intr_window = 0; if (vcpustate->nextrip != state->rip) { ctrl->intr_shadow = 0; VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vcpustate->nextrip, state->rip); } /* * Inject pending events or exceptions for this vcpu. * * An event might be pending because the previous #VMEXIT happened * during event delivery (i.e. ctrl->exitintinfo). * * An event might also be pending because an exception was injected * by the hypervisor (e.g. #PF during instruction emulation). */ svm_inj_intinfo(sc, vcpu); /* NMI event has priority over interrupts. */ if (vm_nmi_pending(sc->vm, vcpu)) { if (nmi_blocked(sc, vcpu)) { /* * Can't inject another NMI if the guest has not * yet executed an "iret" after the last NMI. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " "to NMI-blocking"); } else if (ctrl->intr_shadow) { /* * Can't inject an NMI if the vcpu is in an intr_shadow. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " "interrupt shadow"); need_intr_window = 1; goto done; } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { /* * If there is already an exception/interrupt pending * then defer the NMI until after that. */ VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " "eventinj %#lx", ctrl->eventinj); /* * Use self-IPI to trigger a VM-exit as soon as * possible after the event injection is completed. * * This works only if the external interrupt exiting * is at a lower priority than the event injection. * * Although not explicitly specified in APMv2 the * relative priorities were verified empirically. */ ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? 
*/ } else { vm_nmi_clear(sc->vm, vcpu); /* Inject NMI, vector number is not used */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, IDT_NMI, 0, false); /* virtual NMI blocking is now in effect */ enable_nmi_blocking(sc, vcpu); VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); } } extint_pending = vm_extint_pending(sc->vm, vcpu); if (!extint_pending) { if (!vlapic_pending_intr(vlapic, &vector)) goto done; KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(sc->vm, &vector); KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* * If the guest has disabled interrupts or is in an interrupt shadow * then we cannot inject the pending interrupt. */ if ((state->rflags & PSL_I) == 0) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, state->rflags); need_intr_window = 1; goto done; } if (ctrl->intr_shadow) { VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " "interrupt shadow", vector); need_intr_window = 1; goto done; } if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "eventinj %#lx", vector, ctrl->eventinj); need_intr_window = 1; goto done; } svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); if (!extint_pending) { vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(sc->vm, vcpu); vatpic_intr_accepted(sc->vm, vector); } /* * Force a VM-exit as soon as the vcpu is ready to accept another * interrupt. This is done because the PIC might have another vector * that it wants to inject. Also, if the APIC has a pending interrupt * that was preempted by the ExtInt then it allows us to inject the * APIC vector as soon as possible. */ need_intr_window = 1; done: /* * The guest can modify the TPR by writing to %CR8. In guest mode * the processor reflects this write to V_TPR without hypervisor * intervention. * * The guest can also modify the TPR by writing to it via the memory * mapped APIC page. In this case, the write will be emulated by the * hypervisor. For this reason V_TPR must be updated before every * VMRUN. */ v_tpr = vlapic_get_cr8(vlapic); KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); if (ctrl->v_tpr != v_tpr) { VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", ctrl->v_tpr, v_tpr); ctrl->v_tpr = v_tpr; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } if (need_intr_window) { /* * We use V_IRQ in conjunction with the VINTR intercept to * trap into the hypervisor as soon as a virtual interrupt * can be delivered. * * Since injected events are not subject to intercept checks * we need to ensure that the V_IRQ is not actually going to * be delivered on VM entry. The KASSERT below enforces this. */ KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, ("Bogus intr_window_exiting: eventinj (%#lx), " "intr_shadow (%u), rflags (%#lx)", ctrl->eventinj, ctrl->intr_shadow, state->rflags)); enable_intr_window_exiting(sc, vcpu); } else { disable_intr_window_exiting(sc, vcpu); } } static __inline void restore_host_tss(void) { struct system_segment_descriptor *tss_sd; /* * The TSS descriptor was in use prior to launching the guest so it * has been marked busy. * * 'ltr' requires the descriptor to be marked available so change the * type to "64-bit available TSS". 
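 * (SDT_SYSTSS is the "available" 64-bit TSS type; the ltr instruction
 * below marks the descriptor busy again.)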
*/ tss_sd = PCPU_GET(tss); tss_sd->sd_type = SDT_SYSTSS; ltr(GSEL(GPROC0_SEL, SEL_KPL)); } static void check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) { struct svm_vcpu *vcpustate; struct vmcb_ctrl *ctrl; long eptgen; bool alloc_asid; KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " "active on cpu %u", __func__, thiscpu)); vcpustate = svm_get_vcpu(sc, vcpuid); ctrl = svm_get_vmcb_ctrl(sc, vcpuid); /* * The TLB entries associated with the vcpu's ASID are not valid * if either of the following conditions is true: * * 1. The vcpu's ASID generation is different than the host cpu's * ASID generation. This happens when the vcpu migrates to a new * host cpu. It can also happen when the number of vcpus executing * on a host cpu is greater than the number of ASIDs available. * * 2. The pmap generation number is different than the value cached in * the 'vcpustate'. This happens when the host invalidates pages * belonging to the guest. * * asidgen eptgen Action * mismatch mismatch * 0 0 (a) * 0 1 (b1) or (b2) * 1 0 (c) * 1 1 (d) * * (a) There is no mismatch in eptgen or ASID generation and therefore * no further action is needed. * * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is * retained and the TLB entries associated with this ASID * are flushed by VMRUN. * * (b2) If the cpu does not support FlushByAsid then a new ASID is * allocated. * * (c) A new ASID is allocated. * * (d) A new ASID is allocated. */ alloc_asid = false; eptgen = pmap->pm_eptgen; ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; if (vcpustate->asid.gen != asid[thiscpu].gen) { alloc_asid = true; /* (c) and (d) */ } else if (vcpustate->eptgen != eptgen) { if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ else alloc_asid = true; /* (b2) */ } else { /* * This is the common case (a). */ KASSERT(!alloc_asid, ("ASID allocation not necessary")); KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); } if (alloc_asid) { if (++asid[thiscpu].num >= nasid) { asid[thiscpu].num = 1; if (++asid[thiscpu].gen == 0) asid[thiscpu].gen = 1; /* * If this cpu does not support "flush-by-asid" * then flush the entire TLB on a generation * bump. Subsequent ASID allocation in this * generation can be done without a TLB flush. */ if (!flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; } vcpustate->asid.gen = asid[thiscpu].gen; vcpustate->asid.num = asid[thiscpu].num; ctrl->asid = vcpustate->asid.num; svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); /* * If this cpu supports "flush-by-asid" then the TLB * was not flushed after the generation bump. The TLB * is flushed selectively after every new ASID allocation. */ if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; } vcpustate->eptgen = eptgen; KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); KASSERT(ctrl->asid == vcpustate->asid.num, ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); } static __inline void disable_gintr(void) { __asm __volatile("clgi"); } static __inline void enable_gintr(void) { __asm __volatile("stgi"); } static __inline void svm_dr_enter_guest(struct svm_regctx *gctx) { /* Save host control debug registers. */ gctx->host_dr7 = rdr7(); gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); /* * Disable debugging in DR7 and DEBUGCTL to avoid triggering * exceptions in the host based on the guest DRx values. The * guest DR6, DR7, and DEBUGCTL are saved/restored in the * VMCB. */ load_dr7(0); wrmsr(MSR_DEBUGCTLMSR, 0); /* Save host debug registers. 
*/ gctx->host_dr0 = rdr0(); gctx->host_dr1 = rdr1(); gctx->host_dr2 = rdr2(); gctx->host_dr3 = rdr3(); gctx->host_dr6 = rdr6(); /* Restore guest debug registers. */ load_dr0(gctx->sctx_dr0); load_dr1(gctx->sctx_dr1); load_dr2(gctx->sctx_dr2); load_dr3(gctx->sctx_dr3); } static __inline void svm_dr_leave_guest(struct svm_regctx *gctx) { /* Save guest debug registers. */ gctx->sctx_dr0 = rdr0(); gctx->sctx_dr1 = rdr1(); gctx->sctx_dr2 = rdr2(); gctx->sctx_dr3 = rdr3(); /* * Restore host debug registers. Restore DR7 and DEBUGCTL * last. */ load_dr0(gctx->host_dr0); load_dr1(gctx->host_dr1); load_dr2(gctx->host_dr2); load_dr3(gctx->host_dr3); load_dr6(gctx->host_dr6); wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); load_dr7(gctx->host_dr7); } /* * Start vcpu with specified RIP. */ static int svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { struct svm_regctx *gctx; struct svm_softc *svm_sc; struct svm_vcpu *vcpustate; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct vm_exit *vmexit; struct vlapic *vlapic; struct vm *vm; uint64_t vmcb_pa; int handled; uint16_t ldt_sel; svm_sc = arg; vm = svm_sc->vm; vcpustate = svm_get_vcpu(svm_sc, vcpu); state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); vmexit = vm_exitinfo(vm, vcpu); vlapic = vm_lapic(vm, vcpu); gctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; if (vcpustate->lastcpu != curcpu) { /* * Force new ASID allocation by invalidating the generation. */ vcpustate->asid.gen = 0; /* * Invalidate the VMCB state cache by marking all fields dirty. */ svm_set_dirty(svm_sc, vcpu, 0xffffffff); /* * XXX * Setting 'vcpustate->lastcpu' here is bit premature because * we may return from this function without actually executing * the VMRUN instruction. This could happen if a rendezvous * or an AST is pending on the first time through the loop. * * This works for now but any new side-effects of vcpu * migration should take this case into account. */ vcpustate->lastcpu = curcpu; vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); } svm_msr_guest_enter(svm_sc, vcpu); /* Update Guest RIP */ state->rip = rip; do { /* * Disable global interrupts to guarantee atomicity during * loading of guest state. This includes not only the state * loaded by the "vmrun" instruction but also software state * maintained by the hypervisor: suspended and rendezvous * state, NPT generation number, vlapic interrupts etc. */ disable_gintr(); if (vcpu_suspended(evinfo)) { enable_gintr(); vm_exit_suspended(vm, vcpu, state->rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_gintr(); vm_exit_rendezvous(vm, vcpu, state->rip); break; } if (vcpu_reqidle(evinfo)) { enable_gintr(); vm_exit_reqidle(vm, vcpu, state->rip); break; } /* We are asked to give the cpu by scheduler. */ if (vcpu_should_yield(vm, vcpu)) { enable_gintr(); vm_exit_astpending(vm, vcpu, state->rip); break; } if (vcpu_debugged(vm, vcpu)) { enable_gintr(); vm_exit_debug(vm, vcpu, state->rip); break; } /* * #VMEXIT resumes the host with the guest LDTR, so * save the current LDT selector so it can be restored * after an exit. The userspace hypervisor probably * doesn't use a LDT, but save and restore it to be * safe. */ ldt_sel = sldt(); svm_inj_interrupts(svm_sc, vcpu, vlapic); /* Activate the nested pmap on 'curcpu' */ CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); /* * Check the pmap generation and the ASID generation to * ensure that the vcpu does not use stale TLB mappings. 
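 * check_asid() either keeps the current ASID, schedules a flush of the
 * guest's TLB entries via ctrl->tlb_ctrl, or allocates a fresh ASID, as
 * described in the table above that function.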
*/ check_asid(svm_sc, vcpu, pmap, curcpu); ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; vcpustate->dirty = 0; VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); /* Launch Virtual Machine. */ VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); svm_dr_enter_guest(gctx); svm_launch(vmcb_pa, gctx, get_pcpu()); svm_dr_leave_guest(gctx); CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); /* * The host GDTR and IDTR is saved by VMRUN and restored * automatically on #VMEXIT. However, the host TSS needs * to be restored explicitly. */ restore_host_tss(); /* Restore host LDTR. */ lldt(ldt_sel); /* #VMEXIT disables interrupts so re-enable them here. */ enable_gintr(); /* Update 'nextrip' */ vcpustate->nextrip = state->rip; /* Handle #VMEXIT and if required return to user space. */ handled = svm_vmexit(svm_sc, vcpu, vmexit); } while (handled); svm_msr_guest_exit(svm_sc, vcpu); return (0); } static void svm_vmcleanup(void *arg) { struct svm_softc *sc = arg; contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM); contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM); free(sc, M_SVM); } static register_t * swctx_regptr(struct svm_regctx *regctx, int reg) { switch (reg) { case VM_REG_GUEST_RBX: return (®ctx->sctx_rbx); case VM_REG_GUEST_RCX: return (®ctx->sctx_rcx); case VM_REG_GUEST_RDX: return (®ctx->sctx_rdx); case VM_REG_GUEST_RDI: return (®ctx->sctx_rdi); case VM_REG_GUEST_RSI: return (®ctx->sctx_rsi); case VM_REG_GUEST_RBP: return (®ctx->sctx_rbp); case VM_REG_GUEST_R8: return (®ctx->sctx_r8); case VM_REG_GUEST_R9: return (®ctx->sctx_r9); case VM_REG_GUEST_R10: return (®ctx->sctx_r10); case VM_REG_GUEST_R11: return (®ctx->sctx_r11); case VM_REG_GUEST_R12: return (®ctx->sctx_r12); case VM_REG_GUEST_R13: return (®ctx->sctx_r13); case VM_REG_GUEST_R14: return (®ctx->sctx_r14); case VM_REG_GUEST_R15: return (®ctx->sctx_r15); case VM_REG_GUEST_DR0: return (®ctx->sctx_dr0); case VM_REG_GUEST_DR1: return (®ctx->sctx_dr1); case VM_REG_GUEST_DR2: return (®ctx->sctx_dr2); case VM_REG_GUEST_DR3: return (®ctx->sctx_dr3); default: return (NULL); } } static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_get_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *val = *reg; return (0); } VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); return (EINVAL); } static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_modify_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *reg = val; return (0); } if (ident == VM_REG_GUEST_ENTRY_INST_LENGTH) { /* Ignore. */ return (0); } /* * XXX deal with CR3 and invalidate TLB entries tagged with the * vcpu's ASID. This needs to be treated differently depending on * whether 'running' is true/false. 
*/ VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); return (EINVAL); } +#ifdef BHYVE_SNAPSHOT +static int +svm_snapshot_reg(void *arg, int vcpu, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = svm_getreg(arg, vcpu, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = svm_setreg(arg, vcpu, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif + static int svm_setcap(void *arg, int vcpu, int type, int val) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT, val); break; case VM_CAP_PAUSE_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE, val); break; case VM_CAP_UNRESTRICTED_GUEST: /* Unrestricted guest execution cannot be disabled in SVM */ if (val == 0) error = EINVAL; break; default: error = ENOENT; break; } return (error); } static int svm_getcap(void *arg, int vcpu, int type, int *retval) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT); break; case VM_CAP_PAUSE_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE); break; case VM_CAP_UNRESTRICTED_GUEST: *retval = 1; /* unrestricted guest is always enabled */ break; default: error = ENOENT; break; } return (error); } static struct vlapic * svm_vlapic_init(void *arg, int vcpuid) { struct svm_softc *svm_sc; struct vlapic *vlapic; svm_sc = arg; vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = svm_sc->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; vlapic_init(vlapic); return (vlapic); } static void svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_SVM_VLAPIC); } +#ifdef BHYVE_SNAPSHOT +static int +svm_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta) +{ + /* struct svm_softc is AMD's representation for SVM softc */ + struct svm_softc *sc; + struct svm_vcpu *vcpu; + struct vmcb *vmcb; + uint64_t val; + int i; + int ret; + + sc = arg; + + KASSERT(sc != NULL, ("%s: arg was NULL", __func__)); + + SNAPSHOT_VAR_OR_LEAVE(sc->nptp, meta, ret, done); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu = &sc->vcpu[i]; + vmcb = &vcpu->vmcb; + + /* VMCB fields for virtual cpu i */ + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.v_tpr, meta, ret, done); + val = vmcb->ctrl.v_tpr; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.v_tpr = val; + + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.asid, meta, ret, done); + val = vmcb->ctrl.np_enable; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.np_enable = val; + + val = vmcb->ctrl.intr_shadow; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.intr_shadow = val; + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.tlb_ctrl, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad1, + sizeof(vmcb->state.pad1), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cpl, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad2, + sizeof(vmcb->state.pad2), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.efer, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad3, + sizeof(vmcb->state.pad3), + meta, ret, 
done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr4, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr7, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr6, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rflags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rip, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad4, + sizeof(vmcb->state.pad4), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rsp, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad5, + sizeof(vmcb->state.pad5), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rax, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.star, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.lstar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cstar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sfmask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.kernelgsbase, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_cs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_esp, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_eip, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr2, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad6, + sizeof(vmcb->state.pad6), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.g_pat, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dbgctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_from, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_to, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_from, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_to, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad7, + sizeof(vmcb->state.pad7), + meta, ret, done); + + /* Snapshot swctx for virtual cpu i */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbp, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rcx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rsi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r8, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r9, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r10, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r11, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r12, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r13, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r14, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r15, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr3, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr6, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr7, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_debugctl, meta, ret, + done); + + /* Restore other svm_vcpu struct fields */ + + 
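+ /*
+  * Note: SNAPSHOT_VAR_OR_LEAVE() copies a field out of the kernel when
+  * meta->op is VM_SNAPSHOT_SAVE and back in when it is
+  * VM_SNAPSHOT_RESTORE, so the same call sequence serves both
+  * directions.  The fields below are per-vcpu software state that
+  * lives outside the VMCB.
+  */
+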
/* Restore NEXTRIP field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); + + /* Restore lastcpu field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->lastcpu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->dirty, meta, ret, done); + + /* Restore EPTGEN field - EPT is Extended Page Tabel */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->eptgen, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.gen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.num, meta, ret, done); + + /* Set all caches dirty */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + svm_set_dirty(sc, i, VMCB_CACHE_ASID); + svm_set_dirty(sc, i, VMCB_CACHE_IOPM); + svm_set_dirty(sc, i, VMCB_CACHE_I); + svm_set_dirty(sc, i, VMCB_CACHE_TPR); + svm_set_dirty(sc, i, VMCB_CACHE_CR2); + svm_set_dirty(sc, i, VMCB_CACHE_CR); + svm_set_dirty(sc, i, VMCB_CACHE_DT); + svm_set_dirty(sc, i, VMCB_CACHE_SEG); + svm_set_dirty(sc, i, VMCB_CACHE_NP); + } + } + + if (meta->op == VM_SNAPSHOT_RESTORE) + flush_by_asid(); + +done: + return (ret); +} + +static int +svm_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + int err, running, hostcpu; + + sc = (struct svm_softc *)arg; + err = 0; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + vmcb = svm_get_vmcb(sc, vcpu); + + running = vcpu_is_running(sc->vm, vcpu, &hostcpu); + if (running && hostcpu !=curcpu) { + printf("%s: %s%d is running", __func__, vm_name(sc->vm), vcpu); + return (EINVAL); + } + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR0, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR2, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR3, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR4, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DR7, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RAX, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RSP, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RIP, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RFLAGS, meta); + + /* Guest segments */ + /* ES */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_ES, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_ES, meta); + + /* CS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_CS, meta); + + /* SS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_SS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_SS, meta); + + /* DS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_DS, meta); + + /* FS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_FS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_FS, meta); + + /* GS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_GS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GS, meta); + + /* TR */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_TR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_TR, meta); + + /* LDTR */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_LDTR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_LDTR, meta); + + /* EFER */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_EFER, meta); + + /* IDTR and GDTR */ + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_IDTR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GDTR, meta); + + /* Specific AMD registers */ + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_CS, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + 
VMCB_ACCESS(VMCB_OFF_SYSENTER_ESP, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_EIP, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_NPT_BASE, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_CR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_DR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXC_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_INST1_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_INST2_INTERCEPT, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_TLB_CTRL, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO1, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO2, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINTINFO, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_VIRQ, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_GUEST_PAT, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_BAR, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PAGE, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_LT, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PT, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_IO_PERM, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_MSR_PERM, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_ASID, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXIT_REASON, 8), meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_INTR_SHADOW, meta); + + return (err); +} + +static int +svm_restore_tsc(void *arg, int vcpu, uint64_t offset) +{ + int err; + + err = svm_set_tsc_offset(arg, vcpu, offset); + + return (err); +} +#endif + struct vmm_ops vmm_ops_amd = { .init = svm_init, .cleanup = svm_cleanup, .resume = svm_restore, .vminit = svm_vminit, .vmrun = svm_vmrun, .vmcleanup = svm_vmcleanup, .vmgetreg = svm_getreg, .vmsetreg = svm_setreg, .vmgetdesc = vmcb_getdesc, .vmsetdesc = vmcb_setdesc, .vmgetcap = svm_getcap, .vmsetcap = svm_setcap, .vmspace_alloc = svm_npt_alloc, .vmspace_free = svm_npt_free, .vlapic_init = svm_vlapic_init, .vlapic_cleanup = svm_vlapic_cleanup, +#ifdef BHYVE_SNAPSHOT + .vmsnapshot = svm_snapshot_vmi, + .vmcx_snapshot = svm_snapshot_vmcx, + .vm_restore_tsc = svm_restore_tsc, +#endif }; diff --git a/sys/amd64/vmm/amd/svm.h b/sys/amd64/vmm/amd/svm.h index 66b584fc95b1..30e58b9e130f 100644 --- a/sys/amd64/vmm/amd/svm.h +++ b/sys/amd64/vmm/amd/svm.h @@ -1,70 +1,74 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SVM_H_ #define _SVM_H_ struct pcpu; +struct svm_softc; /* * Guest register state that is saved outside the VMCB. */ struct svm_regctx { register_t sctx_rbp; register_t sctx_rbx; register_t sctx_rcx; register_t sctx_rdx; register_t sctx_rdi; register_t sctx_rsi; register_t sctx_r8; register_t sctx_r9; register_t sctx_r10; register_t sctx_r11; register_t sctx_r12; register_t sctx_r13; register_t sctx_r14; register_t sctx_r15; register_t sctx_dr0; register_t sctx_dr1; register_t sctx_dr2; register_t sctx_dr3; register_t host_dr0; register_t host_dr1; register_t host_dr2; register_t host_dr3; register_t host_dr6; register_t host_dr7; uint64_t host_debugctl; }; void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); +#ifdef BHYVE_SNAPSHOT +int svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset); +#endif #endif /* _SVM_H_ */ diff --git a/sys/amd64/vmm/amd/svm_msr.c b/sys/amd64/vmm/amd/svm_msr.c index 67c43100f168..12046de4dbb9 100644 --- a/sys/amd64/vmm/amd/svm_msr.c +++ b/sys/amd64/vmm/amd/svm_msr.c @@ -1,173 +1,180 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include "svm.h" #include "vmcb.h" #include "svm_softc.h" #include "svm_msr.h" #ifndef MSR_AMDK8_IPM #define MSR_AMDK8_IPM 0xc0010055 #endif enum { IDX_MSR_LSTAR, IDX_MSR_CSTAR, IDX_MSR_STAR, IDX_MSR_SF_MASK, HOST_MSR_NUM /* must be the last enumeration */ }; static uint64_t host_msrs[HOST_MSR_NUM]; void svm_msr_init(void) { /* * It is safe to cache the values of the following MSRs because they * don't change based on curcpu, curproc or curthread. */ host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); } void svm_msr_guest_init(struct svm_softc *sc, int vcpu) { /* * All the MSRs accessible to the guest are either saved/restored by * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored * by VMSAVE/VMLOAD (e.g., MSR_GSBASE). * * There are no guest MSRs that are saved/restored "by hand" so nothing * more to do here. */ return; } void svm_msr_guest_enter(struct svm_softc *sc, int vcpu) { /* * Save host MSRs (if any) and restore guest MSRs (if any). */ } void svm_msr_guest_exit(struct svm_softc *sc, int vcpu) { /* * Save guest MSRs (if any) and restore host MSRs. */ wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); /* MSR_KGSBASE will be restored on the way back to userspace */ } int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, bool *retu) { int error = 0; switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: *result = 0; break; case MSR_MTRRcap: case MSR_MTRRdefType: case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: case MSR_MTRR64kBase: case MSR_SYSCFG: case MSR_AMDK8_IPM: case MSR_EXTFEATURES: *result = 0; break; default: error = EINVAL; break; } return (error); } int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error = 0; switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: break; /* ignore writes */ case MSR_MTRRcap: vm_inject_gp(sc->vm, vcpu); break; case MSR_MTRRdefType: case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: case MSR_MTRR64kBase: case MSR_SYSCFG: break; /* Ignore writes */ case MSR_AMDK8_IPM: /* * Ignore writes to the "Interrupt Pending Message" MSR. */ break; case MSR_K8_UCODE_UPDATE: /* * Ignore writes to microcode update register. */ break; +#ifdef BHYVE_SNAPSHOT + case MSR_TSC: + error = svm_set_tsc_offset(sc, vcpu, val - rdtsc()); + break; +#endif case MSR_EXTFEATURES: break; default: error = EINVAL; break; } return (error); } diff --git a/sys/amd64/vmm/amd/vmcb.c b/sys/amd64/vmm/amd/vmcb.c index 5075b6986730..59baa06112f2 100644 --- a/sys/amd64/vmm/amd/vmcb.c +++ b/sys/amd64/vmm/amd/vmcb.c @@ -1,454 +1,560 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include +#include #include "vmm_ktr.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" /* * The VMCB aka Virtual Machine Control Block is a 4KB aligned page * in memory that describes the virtual machine. * * The VMCB contains: * - instructions or events in the guest to intercept * - control bits that modify execution environment of the guest * - guest processor state (e.g. general purpose registers) */ /* * Return VMCB segment area. */ static struct vmcb_segment * vmcb_segptr(struct vmcb *vmcb, int type) { struct vmcb_state *state; struct vmcb_segment *seg; state = &vmcb->state; switch (type) { case VM_REG_GUEST_CS: seg = &state->cs; break; case VM_REG_GUEST_DS: seg = &state->ds; break; case VM_REG_GUEST_ES: seg = &state->es; break; case VM_REG_GUEST_FS: seg = &state->fs; break; case VM_REG_GUEST_GS: seg = &state->gs; break; case VM_REG_GUEST_SS: seg = &state->ss; break; case VM_REG_GUEST_GDTR: seg = &state->gdt; break; case VM_REG_GUEST_IDTR: seg = &state->idt; break; case VM_REG_GUEST_LDTR: seg = &state->ldt; break; case VM_REG_GUEST_TR: seg = &state->tr; break; default: seg = NULL; break; } return (seg); } static int vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident, uint64_t *val) { struct vmcb *vmcb; int off, bytes; char *ptr; vmcb = svm_get_vmcb(softc, vcpu); off = VMCB_ACCESS_OFFSET(ident); bytes = VMCB_ACCESS_BYTES(ident); if ((off + bytes) >= sizeof (struct vmcb)) return (EINVAL); ptr = (char *)vmcb; if (!write) *val = 0; switch (bytes) { case 8: case 4: case 2: if (write) memcpy(ptr + off, val, bytes); else memcpy(val, ptr + off, bytes); break; default: VCPU_CTR1(softc->vm, vcpu, "Invalid size %d for VMCB access: %d", bytes); return (EINVAL); } /* Invalidate all VMCB state cached by h/w. */ if (write) svm_set_dirty(softc, vcpu, 0xffffffff); return (0); } /* * Read from segment selector, control and general purpose register of VMCB. 
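 * The 'ident' argument is either one of the VM_REG_* identifiers handled
 * below or a raw offset/width pair encoded with VMCB_ACCESS(), in which
 * case the access is forwarded to vmcb_access().  vmcb_write() follows
 * the same convention.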
*/ int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_segment *seg; int err; vmcb = svm_get_vmcb(sc, vcpu); state = &vmcb->state; err = 0; if (VMCB_ACCESS_OK(ident)) return (vmcb_access(sc, vcpu, 0, ident, retval)); switch (ident) { case VM_REG_GUEST_CR0: *retval = state->cr0; break; case VM_REG_GUEST_CR2: *retval = state->cr2; break; case VM_REG_GUEST_CR3: *retval = state->cr3; break; case VM_REG_GUEST_CR4: *retval = state->cr4; break; case VM_REG_GUEST_DR6: *retval = state->dr6; break; case VM_REG_GUEST_DR7: *retval = state->dr7; break; case VM_REG_GUEST_EFER: *retval = state->efer; break; case VM_REG_GUEST_RAX: *retval = state->rax; break; case VM_REG_GUEST_RFLAGS: *retval = state->rflags; break; case VM_REG_GUEST_RIP: *retval = state->rip; break; case VM_REG_GUEST_RSP: *retval = state->rsp; break; case VM_REG_GUEST_CS: case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_SS: case VM_REG_GUEST_LDTR: case VM_REG_GUEST_TR: seg = vmcb_segptr(vmcb, ident); KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", __func__, ident)); *retval = seg->selector; break; case VM_REG_GUEST_GDTR: case VM_REG_GUEST_IDTR: /* GDTR and IDTR don't have segment selectors */ err = EINVAL; break; default: err = EINVAL; break; } return (err); } /* * Write to segment selector, control and general purpose register of VMCB. */ int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_segment *seg; int err, dirtyseg; vmcb = svm_get_vmcb(sc, vcpu); state = &vmcb->state; dirtyseg = 0; err = 0; if (VMCB_ACCESS_OK(ident)) return (vmcb_access(sc, vcpu, 1, ident, &val)); switch (ident) { case VM_REG_GUEST_CR0: state->cr0 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_CR2: state->cr2 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2); break; case VM_REG_GUEST_CR3: state->cr3 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_CR4: state->cr4 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_DR6: state->dr6 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); break; case VM_REG_GUEST_DR7: state->dr7 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); break; case VM_REG_GUEST_EFER: /* EFER_SVM must always be set when the guest is executing */ state->efer = val | EFER_SVM; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_RAX: state->rax = val; break; case VM_REG_GUEST_RFLAGS: state->rflags = val; break; case VM_REG_GUEST_RIP: state->rip = val; break; case VM_REG_GUEST_RSP: state->rsp = val; break; case VM_REG_GUEST_CS: case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_SS: dirtyseg = 1; /* FALLTHROUGH */ case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_LDTR: case VM_REG_GUEST_TR: seg = vmcb_segptr(vmcb, ident); KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", __func__, ident)); seg->selector = val; if (dirtyseg) svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); break; case VM_REG_GUEST_GDTR: case VM_REG_GUEST_IDTR: /* GDTR and IDTR don't have segment selectors */ err = EINVAL; break; default: err = EINVAL; break; } return (err); } int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2) { struct vmcb_segment *seg; seg = vmcb_segptr(vmcb, ident); if (seg != NULL) { bcopy(seg, seg2, sizeof(struct vmcb_segment)); return (0); } else { return (EINVAL); } } int vmcb_setdesc(void *arg, int vcpu, int reg, 
struct seg_desc *desc) { struct vmcb *vmcb; struct svm_softc *sc; struct vmcb_segment *seg; uint16_t attrib; sc = arg; vmcb = svm_get_vmcb(sc, vcpu); seg = vmcb_segptr(vmcb, reg); KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", __func__, reg)); seg->base = desc->base; seg->limit = desc->limit; if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { /* * Map seg_desc access to VMCB attribute format. * * SVM uses the 'P' bit in the segment attributes to indicate a * NULL segment so clear it if the segment is marked unusable. */ attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF); if (SEG_DESC_UNUSABLE(desc->access)) { attrib &= ~0x80; } seg->attrib = attrib; } VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), " "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib); switch (reg) { case VM_REG_GUEST_CS: case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_SS: svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); break; case VM_REG_GUEST_GDTR: case VM_REG_GUEST_IDTR: svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); break; default: break; } return (0); } int vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { struct vmcb *vmcb; struct svm_softc *sc; struct vmcb_segment *seg; sc = arg; vmcb = svm_get_vmcb(sc, vcpu); seg = vmcb_segptr(vmcb, reg); KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", __func__, reg)); desc->base = seg->base; desc->limit = seg->limit; desc->access = 0; if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { /* Map seg_desc access to VMCB attribute format */ desc->access = ((seg->attrib & 0xF00) << 4) | (seg->attrib & 0xFF); /* * VT-x uses bit 16 to indicate a segment that has been loaded * with a NULL selector (aka unusable). The 'desc->access' * field is interpreted in the VT-x format by the * processor-independent code. * * SVM uses the 'P' bit to convey the same information so * convert it into the VT-x format. For more details refer to * section "Segment State in the VMCB" in APMv2. 
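 *
 * Concretely, the 12-bit VMCB attrib field keeps type/S/DPL/P in bits
 * 7:0 and AVL/L/D/G in bits 11:8, whereas the VT-x access-rights word
 * keeps AVL/L/D/G in bits 15:12; the 4-bit shift above accounts for
 * that difference, and a clear P bit is translated into the VT-x
 * "unusable" bit (bit 16) below.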
*/ if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) { if ((desc->access & 0x80) == 0) desc->access |= 0x10000; /* Unusable segment */ } } return (0); } + +#ifdef BHYVE_SNAPSHOT +int +vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val) +{ + int error = 0; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto err; + } + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vm_get_register(sc->vm, vcpu, ident, val); + +err: + return (error); +} + +int +vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val) +{ + int error = 0; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto err; + } + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vm_set_register(sc->vm, vcpu, ident, val); + +err: + return (error); +} + +int +vmcb_snapshot_desc(void *arg, int vcpu, int reg, struct vm_snapshot_meta *meta) +{ + int ret; + struct seg_desc desc; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getdesc(arg, vcpu, reg, &desc); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + + ret = vmcb_setdesc(arg, vcpu, reg, &desc); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getany(sc, vcpu, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcb_setany(sc, vcpu, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/amd/vmcb.h b/sys/amd64/vmm/amd/vmcb.h index ec7caa91f95e..dd2c90cf25ea 100644 --- a/sys/amd64/vmm/amd/vmcb.h +++ b/sys/amd64/vmm/amd/vmcb.h @@ -1,336 +1,346 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMCB_H_ #define _VMCB_H_ -struct svm_softc; - #define BIT(n) (1ULL << n) /* * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15 * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B */ /* vmcb_ctrl->intercept[] array indices */ #define VMCB_CR_INTCPT 0 #define VMCB_DR_INTCPT 1 #define VMCB_EXC_INTCPT 2 #define VMCB_CTRL1_INTCPT 3 #define VMCB_CTRL2_INTCPT 4 /* intercept[VMCB_CTRL1_INTCPT] fields */ #define VMCB_INTCPT_INTR BIT(0) #define VMCB_INTCPT_NMI BIT(1) #define VMCB_INTCPT_SMI BIT(2) #define VMCB_INTCPT_INIT BIT(3) #define VMCB_INTCPT_VINTR BIT(4) #define VMCB_INTCPT_CR0_WRITE BIT(5) #define VMCB_INTCPT_IDTR_READ BIT(6) #define VMCB_INTCPT_GDTR_READ BIT(7) #define VMCB_INTCPT_LDTR_READ BIT(8) #define VMCB_INTCPT_TR_READ BIT(9) #define VMCB_INTCPT_IDTR_WRITE BIT(10) #define VMCB_INTCPT_GDTR_WRITE BIT(11) #define VMCB_INTCPT_LDTR_WRITE BIT(12) #define VMCB_INTCPT_TR_WRITE BIT(13) #define VMCB_INTCPT_RDTSC BIT(14) #define VMCB_INTCPT_RDPMC BIT(15) #define VMCB_INTCPT_PUSHF BIT(16) #define VMCB_INTCPT_POPF BIT(17) #define VMCB_INTCPT_CPUID BIT(18) #define VMCB_INTCPT_RSM BIT(19) #define VMCB_INTCPT_IRET BIT(20) #define VMCB_INTCPT_INTn BIT(21) #define VMCB_INTCPT_INVD BIT(22) #define VMCB_INTCPT_PAUSE BIT(23) #define VMCB_INTCPT_HLT BIT(24) #define VMCB_INTCPT_INVPG BIT(25) #define VMCB_INTCPT_INVPGA BIT(26) #define VMCB_INTCPT_IO BIT(27) #define VMCB_INTCPT_MSR BIT(28) #define VMCB_INTCPT_TASK_SWITCH BIT(29) #define VMCB_INTCPT_FERR_FREEZE BIT(30) #define VMCB_INTCPT_SHUTDOWN BIT(31) /* intercept[VMCB_CTRL2_INTCPT] fields */ #define VMCB_INTCPT_VMRUN BIT(0) #define VMCB_INTCPT_VMMCALL BIT(1) #define VMCB_INTCPT_VMLOAD BIT(2) #define VMCB_INTCPT_VMSAVE BIT(3) #define VMCB_INTCPT_STGI BIT(4) #define VMCB_INTCPT_CLGI BIT(5) #define VMCB_INTCPT_SKINIT BIT(6) #define VMCB_INTCPT_RDTSCP BIT(7) #define VMCB_INTCPT_ICEBP BIT(8) #define VMCB_INTCPT_WBINVD BIT(9) #define VMCB_INTCPT_MONITOR BIT(10) #define VMCB_INTCPT_MWAIT BIT(11) #define VMCB_INTCPT_MWAIT_ARMED BIT(12) #define VMCB_INTCPT_XSETBV BIT(13) /* VMCB TLB control */ #define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ #define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ #define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ #define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ /* VMCB state caching */ #define VMCB_CACHE_NONE 0 /* No caching */ #define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */ #define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ #define VMCB_CACHE_ASID BIT(2) /* ASID */ #define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ #define VMCB_CACHE_NP BIT(4) /* Nested Paging */ #define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ #define VMCB_CACHE_DR BIT(6) /* Debug registers */ #define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ #define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ #define VMCB_CACHE_CR2 BIT(9) /* page fault address */ #define VMCB_CACHE_LBR BIT(10) /* Last branch */ /* VMCB control event injection */ #define 
VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ #define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ /* Event types that can be injected */ #define VMCB_EVENTINJ_TYPE_INTR 0 #define VMCB_EVENTINJ_TYPE_NMI 2 #define VMCB_EVENTINJ_TYPE_EXCEPTION 3 #define VMCB_EVENTINJ_TYPE_INTn 4 /* VMCB exit code, APM vol2 Appendix C */ #define VMCB_EXIT_MC 0x52 #define VMCB_EXIT_INTR 0x60 #define VMCB_EXIT_NMI 0x61 #define VMCB_EXIT_VINTR 0x64 #define VMCB_EXIT_PUSHF 0x70 #define VMCB_EXIT_POPF 0x71 #define VMCB_EXIT_CPUID 0x72 #define VMCB_EXIT_IRET 0x74 #define VMCB_EXIT_PAUSE 0x77 #define VMCB_EXIT_HLT 0x78 #define VMCB_EXIT_IO 0x7B #define VMCB_EXIT_MSR 0x7C #define VMCB_EXIT_SHUTDOWN 0x7F #define VMCB_EXIT_VMSAVE 0x83 #define VMCB_EXIT_MONITOR 0x8A #define VMCB_EXIT_MWAIT 0x8B #define VMCB_EXIT_NPF 0x400 #define VMCB_EXIT_INVALID -1 /* * Nested page fault. * Bit definitions to decode EXITINFO1. */ #define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */ #define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */ #define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */ #define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */ #define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */ #define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */ #define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */ /* * EXITINTINFO, Interrupt exit info for all intrecepts. * Section 15.7.2, Intercepts during IDT Interrupt Delivery. */ #define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF) #define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7) #define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0) #define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0) #define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF) /* Offset of various VMCB fields. */ #define VMCB_OFF_CTRL(x) (x) #define VMCB_OFF_STATE(x) ((x) + 0x400) #define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0) #define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4) #define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8) #define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC) #define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10) #define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40) #define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48) #define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50) #define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58) #define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C) #define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60) #define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70) #define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78) #define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80) #define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88) #define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98) #define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0) #define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0) #define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0) #define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8) #define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228) #define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230) #define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238) #define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268) /* * Encode the VMCB offset and bytes that we want to read from VMCB. 
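 *
 * For example, VMCB_ACCESS(VMCB_OFF_ASID, 4) evaluates to 0x80040058:
 * bit 31 marks the identifier as a raw VMCB access, bits 19:16 carry
 * the width (4 bytes) and bits 11:0 the offset (0x58).  The
 * VMCB_ACCESS_OK/BYTES/OFFSET macros decode it again in vmcb_read(),
 * vmcb_write() and vmcb_access().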
*/ #define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \ ((o) & 0xFFF)) #define VMCB_ACCESS_OK(v) ((v) & 0x80000000 ) #define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF) #define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF) #ifdef _KERNEL + +struct svm_softc; +struct vm_snapshot_meta; + /* VMCB save state area segment format */ struct vmcb_segment { uint16_t selector; uint16_t attrib; uint32_t limit; uint64_t base; } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb_segment) == 16); /* Code segment descriptor attribute in 12 bit format as saved by VMCB. */ #define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */ #define VMCB_CS_ATTRIB_D BIT(10) /* OPerand size bit. */ /* * The VMCB is divided into two areas - the first one contains various * control bits including the intercept vector and the second one contains * the guest state. */ /* VMCB control area - padded up to 1024 bytes */ struct vmcb_ctrl { uint32_t intercept[5]; /* all intercepts */ uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */ uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */ uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */ uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ uint32_t asid; /* 0x58: Guest ASID */ uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. */ uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */ uint8_t v_irq:1; /* Is virtual interrupt pending? */ uint8_t :7; /* Padding */ uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */ uint8_t v_ign_tpr:1; uint8_t :3; uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */ uint8_t :7; uint8_t v_intr_vector; /* 0x64: Vector for virtual interrupt. */ uint8_t pad3[3]; /* 0x65-0x67 Reserved. */ uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */ uint64_t :63; uint64_t exitcode; /* 0x70, Exitcode */ uint64_t exitinfo1; /* 0x78, EXITINFO1 */ uint64_t exitinfo2; /* 0x80, EXITINFO2 */ uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ uint64_t np_enable:1; /* 0x90, Nested paging enable. */ uint64_t :63; uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */ uint64_t eventinj; /* 0xA8, Event injection. */ uint64_t n_cr3; /* B0, Nested page table. */ uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */ uint64_t :63; uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ uint32_t :32; /* 0xC4: Reserved */ uint64_t nrip; /* 0xC8: Guest next nRIP. 
*/ uint8_t inst_len; /* 0xD0: #NPF decode assist */ uint8_t inst_bytes[15]; uint8_t padd6[0x320]; } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb_ctrl) == 1024); struct vmcb_state { struct vmcb_segment es; struct vmcb_segment cs; struct vmcb_segment ss; struct vmcb_segment ds; struct vmcb_segment fs; struct vmcb_segment gs; struct vmcb_segment gdt; struct vmcb_segment ldt; struct vmcb_segment idt; struct vmcb_segment tr; uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */ uint8_t cpl; uint8_t pad2[4]; uint64_t efer; uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */ uint64_t cr4; uint64_t cr3; /* Guest CR3 */ uint64_t cr0; uint64_t dr7; uint64_t dr6; uint64_t rflags; uint64_t rip; uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */ uint64_t rsp; uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */ uint64_t rax; uint64_t star; uint64_t lstar; uint64_t cstar; uint64_t sfmask; uint64_t kernelgsbase; uint64_t sysenter_cs; uint64_t sysenter_esp; uint64_t sysenter_eip; uint64_t cr2; uint8_t pad6[0x20]; uint64_t g_pat; uint64_t dbgctl; uint64_t br_from; uint64_t br_to; uint64_t int_from; uint64_t int_to; uint8_t pad7[0x968]; /* Reserved up to end of VMCB */ } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb_state) == 0xC00); struct vmcb { struct vmcb_ctrl ctrl; struct vmcb_state state; } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb) == PAGE_SIZE); CTASSERT(offsetof(struct vmcb, state) == 0x400); int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval); int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val); int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); +#ifdef BHYVE_SNAPSHOT +int vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val); +int vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val); +int vmcb_snapshot_desc(void *arg, int vcpu, int reg, + struct vm_snapshot_meta *meta); +int vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident, + struct vm_snapshot_meta *meta); +#endif #endif /* _KERNEL */ #endif /* _VMCB_H_ */ diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index 7632ba930f37..4ccdc1f61f34 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -1,521 +1,645 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ +#include "opt_bhyve_snapshot.h" #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include +#include #include "vmm_host.h" #include "vmx_cpufunc.h" #include "vmcs.h" #include "ept.h" #include "vmx.h" #ifdef DDB #include #endif SYSCTL_DECL(_hw_vmm_vmx); static int no_flush_rsb; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, no_flush_rsb, CTLFLAG_RW, &no_flush_rsb, 0, "Do not flush RSB upon vmexit"); static uint64_t vmcs_fix_regval(uint32_t encoding, uint64_t val) { switch (encoding) { case VMCS_GUEST_CR0: val = vmx_fix_cr0(val); break; case VMCS_GUEST_CR4: val = vmx_fix_cr4(val); break; default: break; } return (val); } static uint32_t vmcs_field_encoding(int ident) { switch (ident) { case VM_REG_GUEST_CR0: return (VMCS_GUEST_CR0); case VM_REG_GUEST_CR3: return (VMCS_GUEST_CR3); case VM_REG_GUEST_CR4: return (VMCS_GUEST_CR4); case VM_REG_GUEST_DR7: return (VMCS_GUEST_DR7); case VM_REG_GUEST_RSP: return (VMCS_GUEST_RSP); case VM_REG_GUEST_RIP: return (VMCS_GUEST_RIP); case VM_REG_GUEST_RFLAGS: return (VMCS_GUEST_RFLAGS); case VM_REG_GUEST_ES: return (VMCS_GUEST_ES_SELECTOR); case VM_REG_GUEST_CS: return (VMCS_GUEST_CS_SELECTOR); case VM_REG_GUEST_SS: return (VMCS_GUEST_SS_SELECTOR); case VM_REG_GUEST_DS: return (VMCS_GUEST_DS_SELECTOR); case VM_REG_GUEST_FS: return (VMCS_GUEST_FS_SELECTOR); case VM_REG_GUEST_GS: return (VMCS_GUEST_GS_SELECTOR); case VM_REG_GUEST_TR: return (VMCS_GUEST_TR_SELECTOR); case VM_REG_GUEST_LDTR: return (VMCS_GUEST_LDTR_SELECTOR); case VM_REG_GUEST_EFER: return (VMCS_GUEST_IA32_EFER); case VM_REG_GUEST_PDPTE0: return (VMCS_GUEST_PDPTE0); case VM_REG_GUEST_PDPTE1: return (VMCS_GUEST_PDPTE1); case VM_REG_GUEST_PDPTE2: return (VMCS_GUEST_PDPTE2); case VM_REG_GUEST_PDPTE3: return (VMCS_GUEST_PDPTE3); case VM_REG_GUEST_ENTRY_INST_LENGTH: return (VMCS_ENTRY_INST_LENGTH); default: return (-1); } } static int vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) { switch (seg) { case VM_REG_GUEST_ES: *base = VMCS_GUEST_ES_BASE; *lim = VMCS_GUEST_ES_LIMIT; *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; break; case VM_REG_GUEST_CS: *base = VMCS_GUEST_CS_BASE; *lim = VMCS_GUEST_CS_LIMIT; *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; break; case VM_REG_GUEST_SS: *base = VMCS_GUEST_SS_BASE; *lim = VMCS_GUEST_SS_LIMIT; *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; break; case VM_REG_GUEST_DS: *base = VMCS_GUEST_DS_BASE; *lim = VMCS_GUEST_DS_LIMIT; *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; break; case VM_REG_GUEST_FS: *base = VMCS_GUEST_FS_BASE; *lim = VMCS_GUEST_FS_LIMIT; *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; break; case VM_REG_GUEST_GS: *base = VMCS_GUEST_GS_BASE; *lim = VMCS_GUEST_GS_LIMIT; *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; break; case VM_REG_GUEST_TR: *base = VMCS_GUEST_TR_BASE; *lim = VMCS_GUEST_TR_LIMIT; *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; break; case VM_REG_GUEST_LDTR: *base = VMCS_GUEST_LDTR_BASE; *lim = VMCS_GUEST_LDTR_LIMIT; *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; break; case VM_REG_GUEST_IDTR: *base = 
VMCS_GUEST_IDTR_BASE; *lim = VMCS_GUEST_IDTR_LIMIT; *acc = VMCS_INVALID_ENCODING; break; case VM_REG_GUEST_GDTR: *base = VMCS_GUEST_GDTR_BASE; *lim = VMCS_GUEST_GDTR_LIMIT; *acc = VMCS_INVALID_ENCODING; break; default: return (EINVAL); } return (0); } int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval) { int error; uint32_t encoding; /* * If we need to get at vmx-specific state in the VMCS we can bypass * the translation of 'ident' to 'encoding' by simply setting the * sign bit. As it so happens the upper 16 bits are reserved (i.e * set to 0) in the encodings for the VMCS so we are free to use the * sign bit. */ if (ident < 0) encoding = ident & 0x7fffffff; else encoding = vmcs_field_encoding(ident); if (encoding == (uint32_t)-1) return (EINVAL); if (!running) VMPTRLD(vmcs); error = vmread(encoding, retval); if (!running) VMCLEAR(vmcs); return (error); } int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val) { int error; uint32_t encoding; if (ident < 0) encoding = ident & 0x7fffffff; else encoding = vmcs_field_encoding(ident); if (encoding == (uint32_t)-1) return (EINVAL); val = vmcs_fix_regval(encoding, val); if (!running) VMPTRLD(vmcs); error = vmwrite(encoding, val); if (!running) VMCLEAR(vmcs); return (error); } int vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) { int error; uint32_t base, limit, access; error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); if (error != 0) panic("vmcs_setdesc: invalid segment register %d", seg); if (!running) VMPTRLD(vmcs); if ((error = vmwrite(base, desc->base)) != 0) goto done; if ((error = vmwrite(limit, desc->limit)) != 0) goto done; if (access != VMCS_INVALID_ENCODING) { if ((error = vmwrite(access, desc->access)) != 0) goto done; } done: if (!running) VMCLEAR(vmcs); return (error); } int vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) { int error; uint32_t base, limit, access; uint64_t u64; error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); if (error != 0) panic("vmcs_getdesc: invalid segment register %d", seg); if (!running) VMPTRLD(vmcs); if ((error = vmread(base, &u64)) != 0) goto done; desc->base = u64; if ((error = vmread(limit, &u64)) != 0) goto done; desc->limit = u64; if (access != VMCS_INVALID_ENCODING) { if ((error = vmread(access, &u64)) != 0) goto done; desc->access = u64; } done: if (!running) VMCLEAR(vmcs); return (error); } int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) { int error; VMPTRLD(vmcs); /* * Guest MSRs are saved in the VM-exit MSR-store area. * Guest MSRs are loaded from the VM-entry MSR-load area. * Both areas point to the same location in memory. */ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) goto done; if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) goto done; if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) goto done; if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) goto done; error = 0; done: VMCLEAR(vmcs); return (error); } int vmcs_init(struct vmcs *vmcs) { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; uint64_t pat, fsbase, idtrbase; codesel = vmm_get_host_codesel(); datasel = vmm_get_host_datasel(); tsssel = vmm_get_host_tsssel(); /* * Make sure we have a "current" VMCS to work with. 
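 *
 * VMPTRLD() makes this VMCS the current one on this cpu so that the
 * vmwrite()s that follow operate on it; the VMCS is VMCLEAR()ed again
 * before returning, mirroring how vmcs_getreg()/vmcs_setreg() above
 * bracket their accesses when the vcpu is not running.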
*/ VMPTRLD(vmcs); /* Host state */ /* Initialize host IA32_PAT MSR */ pat = vmm_get_host_pat(); if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) goto done; /* Load the IA32_EFER MSR */ efer = vmm_get_host_efer(); if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) goto done; /* Load the control registers */ cr0 = vmm_get_host_cr0(); if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) goto done; cr4 = vmm_get_host_cr4() | CR4_VMXE; if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) goto done; /* Load the segment selectors */ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) goto done; /* * Load the Base-Address for %fs and idtr. * * Note that we exclude %gs, tss and gdtr here because their base * address is pcpu specific. */ fsbase = vmm_get_host_fsbase(); if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) goto done; idtrbase = vmm_get_host_idtrbase(); if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) goto done; /* instruction pointer */ if (no_flush_rsb) { if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest)) != 0) goto done; } else { if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest_flush_rsb)) != 0) goto done; } /* link pointer */ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) goto done; done: VMCLEAR(vmcs); return (error); } +#ifdef BHYVE_SNAPSHOT +int +vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val) +{ + int error; + + if (!running) + VMPTRLD(vmcs); + + error = vmread(ident, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val) +{ + int error; + + if (!running) + VMPTRLD(vmcs); + + error = vmwrite(ident, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getreg(vmcs, running, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcs_setreg(vmcs, running, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg, + struct vm_snapshot_meta *meta) +{ + int ret; + struct seg_desc desc; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getdesc(vmcs, running, seg, &desc); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + + ret = vmcs_setdesc(vmcs, running, seg, &desc); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcs_snapshot_any(struct vmcs *vmcs, int 
running, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getany(vmcs, running, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcs_setany(vmcs, running, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif + #ifdef DDB extern int vmxon_enabled[]; DB_SHOW_COMMAND(vmcs, db_show_vmcs) { uint64_t cur_vmcs, val; uint32_t exit; if (!vmxon_enabled[curcpu]) { db_printf("VMX not enabled\n"); return; } if (have_addr) { db_printf("Only current VMCS supported\n"); return; } vmptrst(&cur_vmcs); if (cur_vmcs == VMCS_INITIAL) { db_printf("No current VM context\n"); return; } db_printf("VMCS: %jx\n", cur_vmcs); db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID)); db_printf("Activity: "); val = vmcs_read(VMCS_GUEST_ACTIVITY); switch (val) { case 0: db_printf("Active"); break; case 1: db_printf("HLT"); break; case 2: db_printf("Shutdown"); break; case 3: db_printf("Wait for SIPI"); break; default: db_printf("Unknown: %#lx", val); } db_printf("\n"); exit = vmcs_read(VMCS_EXIT_REASON); if (exit & 0x80000000) db_printf("Entry Failure Reason: %u\n", exit & 0xffff); else db_printf("Exit Reason: %u\n", exit & 0xffff); db_printf("Qualification: %#lx\n", vmcs_exit_qualification()); db_printf("Guest Linear Address: %#lx\n", vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)); switch (exit & 0x8000ffff) { case EXIT_REASON_EXCEPTION: case EXIT_REASON_EXT_INTR: val = vmcs_read(VMCS_EXIT_INTR_INFO); db_printf("Interrupt Type: "); switch (val >> 8 & 0x7) { case 0: db_printf("external"); break; case 2: db_printf("NMI"); break; case 3: db_printf("HW exception"); break; case 4: db_printf("SW exception"); break; default: db_printf("?? %lu", val >> 8 & 0x7); break; } db_printf(" Vector: %lu", val & 0xff); if (val & 0x800) db_printf(" Error Code: %lx", vmcs_read(VMCS_EXIT_INTR_ERRCODE)); db_printf("\n"); break; case EXIT_REASON_EPT_FAULT: case EXIT_REASON_EPT_MISCONFIG: db_printf("Guest Physical Address: %#lx\n", vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)); break; } db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error()); } #endif diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index 29e0263fb9f2..8aa7b1e8fc08 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -1,411 +1,424 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMCS_H_ #define _VMCS_H_ #ifdef _KERNEL + +struct vm_snapshot_meta; + struct vmcs { uint32_t identifier; uint32_t abort_code; char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; }; CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); /* MSR save region is composed of an array of 'struct msr_entry' */ struct msr_entry { uint32_t index; uint32_t reserved; uint64_t val; }; int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); int vmcs_init(struct vmcs *vmcs); int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); int vmcs_getdesc(struct vmcs *vmcs, int running, int ident, struct seg_desc *desc); int vmcs_setdesc(struct vmcs *vmcs, int running, int ident, struct seg_desc *desc); +#ifdef BHYVE_SNAPSHOT +int vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val); +int vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val); +int vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta); +int vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg, + struct vm_snapshot_meta *meta); +int vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta); +#endif /* * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h */ #ifdef _VMX_CPUFUNC_H_ static __inline uint64_t vmcs_read(uint32_t encoding) { int error; uint64_t val; error = vmread(encoding, &val); KASSERT(error == 0, ("vmcs_read(%u) error %d", encoding, error)); return (val); } static __inline void vmcs_write(uint32_t encoding, uint64_t val) { int error; error = vmwrite(encoding, val); KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error)); } #endif /* _VMX_CPUFUNC_H_ */ #define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) #define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) #define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) #define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) #define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) #define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) #define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) #define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) #define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) #endif /* _KERNEL */ #define VMCS_INITIAL 0xffffffffffffffff #define VMCS_IDENT(encoding) ((encoding) | 0x80000000) /* * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
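 *
 * The groups below follow the layout of the encoding itself: bits 14:13
 * select the field width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural
 * width) and bits 11:10 the field type (0 = control, 1 = read-only data,
 * 2 = guest state, 3 = host state).  For example, VMCS_HOST_RIP (0x6c16)
 * is a natural-width host-state field with index 11.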
*/ #define VMCS_INVALID_ENCODING 0xffffffff /* 16-bit control fields */ #define VMCS_VPID 0x00000000 #define VMCS_PIR_VECTOR 0x00000002 /* 16-bit guest-state fields */ #define VMCS_GUEST_ES_SELECTOR 0x00000800 #define VMCS_GUEST_CS_SELECTOR 0x00000802 #define VMCS_GUEST_SS_SELECTOR 0x00000804 #define VMCS_GUEST_DS_SELECTOR 0x00000806 #define VMCS_GUEST_FS_SELECTOR 0x00000808 #define VMCS_GUEST_GS_SELECTOR 0x0000080A #define VMCS_GUEST_LDTR_SELECTOR 0x0000080C #define VMCS_GUEST_TR_SELECTOR 0x0000080E #define VMCS_GUEST_INTR_STATUS 0x00000810 /* 16-bit host-state fields */ #define VMCS_HOST_ES_SELECTOR 0x00000C00 #define VMCS_HOST_CS_SELECTOR 0x00000C02 #define VMCS_HOST_SS_SELECTOR 0x00000C04 #define VMCS_HOST_DS_SELECTOR 0x00000C06 #define VMCS_HOST_FS_SELECTOR 0x00000C08 #define VMCS_HOST_GS_SELECTOR 0x00000C0A #define VMCS_HOST_TR_SELECTOR 0x00000C0C /* 64-bit control fields */ #define VMCS_IO_BITMAP_A 0x00002000 #define VMCS_IO_BITMAP_B 0x00002002 #define VMCS_MSR_BITMAP 0x00002004 #define VMCS_EXIT_MSR_STORE 0x00002006 #define VMCS_EXIT_MSR_LOAD 0x00002008 #define VMCS_ENTRY_MSR_LOAD 0x0000200A #define VMCS_EXECUTIVE_VMCS 0x0000200C #define VMCS_TSC_OFFSET 0x00002010 #define VMCS_VIRTUAL_APIC 0x00002012 #define VMCS_APIC_ACCESS 0x00002014 #define VMCS_PIR_DESC 0x00002016 #define VMCS_EPTP 0x0000201A #define VMCS_EOI_EXIT0 0x0000201C #define VMCS_EOI_EXIT1 0x0000201E #define VMCS_EOI_EXIT2 0x00002020 #define VMCS_EOI_EXIT3 0x00002022 #define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) /* 64-bit read-only fields */ #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 /* 64-bit guest-state fields */ #define VMCS_LINK_POINTER 0x00002800 #define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 #define VMCS_GUEST_IA32_PAT 0x00002804 #define VMCS_GUEST_IA32_EFER 0x00002806 #define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 #define VMCS_GUEST_PDPTE0 0x0000280A #define VMCS_GUEST_PDPTE1 0x0000280C #define VMCS_GUEST_PDPTE2 0x0000280E #define VMCS_GUEST_PDPTE3 0x00002810 /* 64-bit host-state fields */ #define VMCS_HOST_IA32_PAT 0x00002C00 #define VMCS_HOST_IA32_EFER 0x00002C02 #define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 /* 32-bit control fields */ #define VMCS_PIN_BASED_CTLS 0x00004000 #define VMCS_PRI_PROC_BASED_CTLS 0x00004002 #define VMCS_EXCEPTION_BITMAP 0x00004004 #define VMCS_PF_ERROR_MASK 0x00004006 #define VMCS_PF_ERROR_MATCH 0x00004008 #define VMCS_CR3_TARGET_COUNT 0x0000400A #define VMCS_EXIT_CTLS 0x0000400C #define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E #define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 #define VMCS_ENTRY_CTLS 0x00004012 #define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 #define VMCS_ENTRY_INTR_INFO 0x00004016 #define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 #define VMCS_ENTRY_INST_LENGTH 0x0000401A #define VMCS_TPR_THRESHOLD 0x0000401C #define VMCS_SEC_PROC_BASED_CTLS 0x0000401E #define VMCS_PLE_GAP 0x00004020 #define VMCS_PLE_WINDOW 0x00004022 /* 32-bit read-only data fields */ #define VMCS_INSTRUCTION_ERROR 0x00004400 #define VMCS_EXIT_REASON 0x00004402 #define VMCS_EXIT_INTR_INFO 0x00004404 #define VMCS_EXIT_INTR_ERRCODE 0x00004406 #define VMCS_IDT_VECTORING_INFO 0x00004408 #define VMCS_IDT_VECTORING_ERROR 0x0000440A #define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C #define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E /* 32-bit guest-state fields */ #define VMCS_GUEST_ES_LIMIT 0x00004800 #define VMCS_GUEST_CS_LIMIT 0x00004802 #define VMCS_GUEST_SS_LIMIT 0x00004804 #define VMCS_GUEST_DS_LIMIT 0x00004806 #define VMCS_GUEST_FS_LIMIT 0x00004808 #define VMCS_GUEST_GS_LIMIT 0x0000480A 
#define VMCS_GUEST_LDTR_LIMIT 0x0000480C #define VMCS_GUEST_TR_LIMIT 0x0000480E #define VMCS_GUEST_GDTR_LIMIT 0x00004810 #define VMCS_GUEST_IDTR_LIMIT 0x00004812 #define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 #define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 #define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 #define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A #define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C #define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E #define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 #define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 #define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 #define VMCS_GUEST_ACTIVITY 0x00004826 #define VMCS_GUEST_SMBASE 0x00004828 #define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A #define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E /* 32-bit host state fields */ #define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 /* Natural Width control fields */ #define VMCS_CR0_MASK 0x00006000 #define VMCS_CR4_MASK 0x00006002 #define VMCS_CR0_SHADOW 0x00006004 #define VMCS_CR4_SHADOW 0x00006006 #define VMCS_CR3_TARGET0 0x00006008 #define VMCS_CR3_TARGET1 0x0000600A #define VMCS_CR3_TARGET2 0x0000600C #define VMCS_CR3_TARGET3 0x0000600E /* Natural Width read-only fields */ #define VMCS_EXIT_QUALIFICATION 0x00006400 #define VMCS_IO_RCX 0x00006402 #define VMCS_IO_RSI 0x00006404 #define VMCS_IO_RDI 0x00006406 #define VMCS_IO_RIP 0x00006408 #define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A /* Natural Width guest-state fields */ #define VMCS_GUEST_CR0 0x00006800 #define VMCS_GUEST_CR3 0x00006802 #define VMCS_GUEST_CR4 0x00006804 #define VMCS_GUEST_ES_BASE 0x00006806 #define VMCS_GUEST_CS_BASE 0x00006808 #define VMCS_GUEST_SS_BASE 0x0000680A #define VMCS_GUEST_DS_BASE 0x0000680C #define VMCS_GUEST_FS_BASE 0x0000680E #define VMCS_GUEST_GS_BASE 0x00006810 #define VMCS_GUEST_LDTR_BASE 0x00006812 #define VMCS_GUEST_TR_BASE 0x00006814 #define VMCS_GUEST_GDTR_BASE 0x00006816 #define VMCS_GUEST_IDTR_BASE 0x00006818 #define VMCS_GUEST_DR7 0x0000681A #define VMCS_GUEST_RSP 0x0000681C #define VMCS_GUEST_RIP 0x0000681E #define VMCS_GUEST_RFLAGS 0x00006820 #define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 #define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 #define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 /* Natural Width host-state fields */ #define VMCS_HOST_CR0 0x00006C00 #define VMCS_HOST_CR3 0x00006C02 #define VMCS_HOST_CR4 0x00006C04 #define VMCS_HOST_FS_BASE 0x00006C06 #define VMCS_HOST_GS_BASE 0x00006C08 #define VMCS_HOST_TR_BASE 0x00006C0A #define VMCS_HOST_GDTR_BASE 0x00006C0C #define VMCS_HOST_IDTR_BASE 0x00006C0E #define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 #define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 #define VMCS_HOST_RSP 0x00006C14 #define VMCS_HOST_RIP 0x00006c16 /* * VM instruction error numbers */ #define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 /* * VMCS exit reasons */ #define EXIT_REASON_EXCEPTION 0 #define EXIT_REASON_EXT_INTR 1 #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT 3 #define EXIT_REASON_SIPI 4 #define EXIT_REASON_IO_SMI 5 #define EXIT_REASON_SMI 6 #define EXIT_REASON_INTR_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 #define EXIT_REASON_GETSEC 11 #define EXIT_REASON_HLT 12 #define EXIT_REASON_INVD 13 #define EXIT_REASON_INVLPG 14 #define EXIT_REASON_RDPMC 15 #define EXIT_REASON_RDTSC 16 #define EXIT_REASON_RSM 17 #define EXIT_REASON_VMCALL 18 #define EXIT_REASON_VMCLEAR 19 #define EXIT_REASON_VMLAUNCH 20 #define EXIT_REASON_VMPTRLD 21 #define EXIT_REASON_VMPTRST 22 #define EXIT_REASON_VMREAD 23 #define EXIT_REASON_VMRESUME 
24 #define EXIT_REASON_VMWRITE 25 #define EXIT_REASON_VMXOFF 26 #define EXIT_REASON_VMXON 27 #define EXIT_REASON_CR_ACCESS 28 #define EXIT_REASON_DR_ACCESS 29 #define EXIT_REASON_INOUT 30 #define EXIT_REASON_RDMSR 31 #define EXIT_REASON_WRMSR 32 #define EXIT_REASON_INVAL_VMCS 33 #define EXIT_REASON_INVAL_MSR 34 #define EXIT_REASON_MWAIT 36 #define EXIT_REASON_MTF 37 #define EXIT_REASON_MONITOR 39 #define EXIT_REASON_PAUSE 40 #define EXIT_REASON_MCE_DURING_ENTRY 41 #define EXIT_REASON_TPR 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_VIRTUALIZED_EOI 45 #define EXIT_REASON_GDTR_IDTR 46 #define EXIT_REASON_LDTR_TR 47 #define EXIT_REASON_EPT_FAULT 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 #define EXIT_REASON_RDTSCP 51 #define EXIT_REASON_VMX_PREEMPT 52 #define EXIT_REASON_INVVPID 53 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_RDRAND 57 #define EXIT_REASON_INVPCID 58 #define EXIT_REASON_VMFUNC 59 #define EXIT_REASON_ENCLS 60 #define EXIT_REASON_RDSEED 61 #define EXIT_REASON_PM_LOG_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 /* * NMI unblocking due to IRET. * * Applies to VM-exits due to hardware exception or EPT fault. */ #define EXIT_QUAL_NMIUDTI (1 << 12) /* * VMCS interrupt information fields */ #define VMCS_INTR_VALID (1U << 31) #define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */ #define VMCS_INTR_T_HWINTR (0 << 8) #define VMCS_INTR_T_NMI (2 << 8) #define VMCS_INTR_T_HWEXCEPTION (3 << 8) #define VMCS_INTR_T_SWINTR (4 << 8) #define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) #define VMCS_INTR_T_SWEXCEPTION (6 << 8) #define VMCS_INTR_DEL_ERRCODE (1 << 11) /* * VMCS IDT-Vectoring information fields */ #define VMCS_IDT_VEC_VALID (1U << 31) #define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11) /* * VMCS Guest interruptibility field */ #define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) #define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) #define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) #define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) /* * Exit qualification for EXIT_REASON_INVAL_VMCS */ #define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 /* * Exit qualification for EPT violation */ #define EPT_VIOLATION_DATA_READ (1UL << 0) #define EPT_VIOLATION_DATA_WRITE (1UL << 1) #define EPT_VIOLATION_INST_FETCH (1UL << 2) #define EPT_VIOLATION_GPA_READABLE (1UL << 3) #define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) #define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) #define EPT_VIOLATION_GLA_VALID (1UL << 7) #define EPT_VIOLATION_XLAT_VALID (1UL << 8) /* * Exit qualification for APIC-access VM exit */ #define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) #define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) /* * Exit qualification for APIC-write VM exit */ #define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) #endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 9f610ea50852..21a1b9fdefc4 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1,3896 +1,4058 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include + #include "vmm_lapic.h" #include "vmm_host.h" #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "ept.h" #include "vmx_cpufunc.h" #include "vmx.h" #include "vmx_msr.h" #include "x86.h" #include "vmx_controls.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ PINBASED_NMI_EXITING | \ PINBASED_VIRTUAL_NMI) #define PINBASED_CTLS_ZERO_SETTING 0 #define PROCBASED_CTLS_WINDOW_SETTING \ (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ PROCBASED_MWAIT_EXITING | \ PROCBASED_MONITOR_EXITING | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ PROCBASED_IO_BITMAPS) #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ VM_EXIT_ACKNOWLEDGE_INTERRUPT) #define VM_EXIT_CTLS_ZERO_SETTING 0 #define VM_ENTRY_CTLS_ONE_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define HANDLED 1 #define UNHANDLED 0 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; static uint64_t cr0_ones_mask, cr0_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, &cr0_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, &cr0_zeros_mask, 0, NULL); static uint64_t cr4_ones_mask, cr4_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, &cr4_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, &cr4_zeros_mask, 0, NULL); 
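/*
 * For reference: the ones/zeros masks above are derived in vmx_init() from
 * the MSR_VMX_CR0_FIXED{0,1} and MSR_VMX_CR4_FIXED{0,1} MSRs (and then
 * adjusted for unrestricted guests and CR0_NW/CR0_CD).  A bit that reads as
 * 1 in both FIXED0 and FIXED1 must be 1 while in VMX operation and a bit
 * that reads as 0 in both must be 0:
 *
 *	cr0_ones_mask  = fixed0 & fixed1;
 *	cr0_zeros_mask = ~fixed0 & ~fixed1;
 *
 * vmx_fix_cr0()/vmx_fix_cr4() then apply them as
 * (val | ones_mask) & ~zeros_mask whenever a guest CR0/CR4 value is written.
 */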
static int vmx_initialized; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, &vmx_initialized, 0, "Intel VMX initialized"); /* * Optional capabilities */ static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); static int cap_halt_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, "HLT triggers a VM-exit"); static int cap_pause_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 0, "PAUSE triggers a VM-exit"); static int cap_unrestricted_guest; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, &cap_unrestricted_guest, 0, "Unrestricted guests"); static int cap_monitor_trap; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, &cap_monitor_trap, 0, "Monitor trap flag"); static int cap_invpcid; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 0, "Guests are allowed to use INVPCID"); static int tpr_shadowing; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, CTLFLAG_RD, &tpr_shadowing, 0, "TPR shadowing support"); static int virtual_interrupt_delivery; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); static int pirvec = -1; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); static struct unrhdr *vpid_unr; static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); int guest_l1d_flush; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, &guest_l1d_flush, 0, NULL); int guest_l1d_flush_sw; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, &guest_l1d_flush_sw, 0, NULL); static struct msr_entry msr_load_list[1] __aligned(16); /* * The definitions of SDT probes for VMX. 
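 *
 * Each SDT_PROBE_DEFINEn(vmm, vmx, exit, <name>, ...) below is visible to
 * DTrace as vmm:vmx:exit:<name>; the first three arguments are the struct
 * vmx, the vcpu index and the struct vm_exit describing the exit.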
*/ SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, return, "struct vmx *", "int", "struct vm_exit *", "int"); /* * Use the last page below 4GB as the APIC access address. This address is * occupied by the boot firmware so it is guaranteed that it will not conflict * with a page in system memory. 
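 * (0xFFFFF000 is the base of the last 4KB page below 4GB, i.e. 4GB minus
 * PAGE_SIZE.)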
*/ #define APIC_ACCESS_ADDRESS 0xFFFFF000 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); +#ifdef BHYVE_SNAPSHOT +static int vmx_restore_tsc(void *arg, int vcpu, uint64_t now); +#endif #ifdef KTR static const char * exit_reason_to_str(int reason) { static char reasonbuf[32]; switch (reason) { case EXIT_REASON_EXCEPTION: return "exception"; case EXIT_REASON_EXT_INTR: return "extint"; case EXIT_REASON_TRIPLE_FAULT: return "triplefault"; case EXIT_REASON_INIT: return "init"; case EXIT_REASON_SIPI: return "sipi"; case EXIT_REASON_IO_SMI: return "iosmi"; case EXIT_REASON_SMI: return "smi"; case EXIT_REASON_INTR_WINDOW: return "intrwindow"; case EXIT_REASON_NMI_WINDOW: return "nmiwindow"; case EXIT_REASON_TASK_SWITCH: return "taskswitch"; case EXIT_REASON_CPUID: return "cpuid"; case EXIT_REASON_GETSEC: return "getsec"; case EXIT_REASON_HLT: return "hlt"; case EXIT_REASON_INVD: return "invd"; case EXIT_REASON_INVLPG: return "invlpg"; case EXIT_REASON_RDPMC: return "rdpmc"; case EXIT_REASON_RDTSC: return "rdtsc"; case EXIT_REASON_RSM: return "rsm"; case EXIT_REASON_VMCALL: return "vmcall"; case EXIT_REASON_VMCLEAR: return "vmclear"; case EXIT_REASON_VMLAUNCH: return "vmlaunch"; case EXIT_REASON_VMPTRLD: return "vmptrld"; case EXIT_REASON_VMPTRST: return "vmptrst"; case EXIT_REASON_VMREAD: return "vmread"; case EXIT_REASON_VMRESUME: return "vmresume"; case EXIT_REASON_VMWRITE: return "vmwrite"; case EXIT_REASON_VMXOFF: return "vmxoff"; case EXIT_REASON_VMXON: return "vmxon"; case EXIT_REASON_CR_ACCESS: return "craccess"; case EXIT_REASON_DR_ACCESS: return "draccess"; case EXIT_REASON_INOUT: return "inout"; case EXIT_REASON_RDMSR: return "rdmsr"; case EXIT_REASON_WRMSR: return "wrmsr"; case EXIT_REASON_INVAL_VMCS: return "invalvmcs"; case EXIT_REASON_INVAL_MSR: return "invalmsr"; case EXIT_REASON_MWAIT: return "mwait"; case EXIT_REASON_MTF: return "mtf"; case EXIT_REASON_MONITOR: return "monitor"; case EXIT_REASON_PAUSE: return "pause"; case EXIT_REASON_MCE_DURING_ENTRY: return "mce-during-entry"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: return "apic-access"; case EXIT_REASON_GDTR_IDTR: return "gdtridtr"; case EXIT_REASON_LDTR_TR: return "ldtrtr"; case EXIT_REASON_EPT_FAULT: return "eptfault"; case EXIT_REASON_EPT_MISCONFIG: return "eptmisconfig"; case EXIT_REASON_INVEPT: return "invept"; case EXIT_REASON_RDTSCP: return "rdtscp"; case EXIT_REASON_VMX_PREEMPT: return "vmxpreempt"; case EXIT_REASON_INVVPID: return "invvpid"; case EXIT_REASON_WBINVD: return "wbinvd"; case EXIT_REASON_XSETBV: return "xsetbv"; case EXIT_REASON_APIC_WRITE: return "apic-write"; default: snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); return (reasonbuf); } } #endif /* KTR */ static int vmx_allow_x2apic_msrs(struct vmx *vmx) { int i, error; error = 0; /* * Allow readonly access to the following x2APIC MSRs from the guest. 
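 * Write accesses to these MSRs are still intercepted; only the TPR, EOI
 * and SELF_IPI MSRs are opened up read/write via guest_msr_rw() below.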
*/ error += guest_msr_ro(vmx, MSR_APIC_ID); error += guest_msr_ro(vmx, MSR_APIC_VERSION); error += guest_msr_ro(vmx, MSR_APIC_LDR); error += guest_msr_ro(vmx, MSR_APIC_SVR); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); error += guest_msr_ro(vmx, MSR_APIC_ESR); error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_ICR); /* * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. * * These registers get special treatment described in the section * "Virtualizing MSR-Based APIC Accesses". */ error += guest_msr_rw(vmx, MSR_APIC_TPR); error += guest_msr_rw(vmx, MSR_APIC_EOI); error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); return (error); } u_long vmx_fix_cr0(u_long cr0) { return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); } u_long vmx_fix_cr4(u_long cr4) { return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); } static void vpid_free(int vpid) { if (vpid < 0 || vpid > 0xffff) panic("vpid_free: invalid vpid %d", vpid); /* * VPIDs [0,VM_MAXCPU] are special and are not allocated from * the unit number allocator. */ if (vpid > VM_MAXCPU) free_unr(vpid_unr, vpid); } static void vpid_alloc(uint16_t *vpid, int num) { int i, x; if (num <= 0 || num > VM_MAXCPU) panic("invalid number of vpids requested: %d", num); /* * If the "enable vpid" execution control is not enabled then the * VPID is required to be 0 for all vcpus. */ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { for (i = 0; i < num; i++) vpid[i] = 0; return; } /* * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { x = alloc_unr(vpid_unr); if (x == -1) break; else vpid[i] = x; } if (i < num) { atomic_add_int(&vpid_alloc_failed, 1); /* * If the unit number allocator does not have enough unique * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. * * These VPIDs are not be unique across VMs but this does not * affect correctness because the combined mappings are also * tagged with the EP4TA which is unique for each VM. * * It is still sub-optimal because the invvpid will invalidate * combined mappings for a particular VPID across all EP4TAs. */ while (i-- > 0) vpid_free(vpid[i]); for (i = 0; i < num; i++) vpid[i] = i + 1; } } static void vpid_init(void) { /* * VPID 0 is required when the "enable VPID" execution control is * disabled. * * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the * unit number allocator does not have sufficient unique VPIDs to * satisfy the allocation. * * The remaining VPIDs are managed by the unit number allocator. */ vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } static void vmx_disable(void *arg __unused) { struct invvpid_desc invvpid_desc = { 0 }; struct invept_desc invept_desc = { 0 }; if (vmxon_enabled[curcpu]) { /* * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. * * VMXON or VMXOFF are not required to invalidate any TLB * caching structures. This prevents potential retention of * cached information in the TLB between distinct VMX episodes. 
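 * This is why the all-context INVVPID and INVEPT below are issued
 * explicitly before executing VMXOFF.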
*/ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); vmxoff(); } load_cr4(rcr4() & ~CR4_VMXE); } static int vmx_cleanup(void) { if (pirvec >= 0) lapic_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } if (nmi_flush_l1d_sw == 1) nmi_flush_l1d_sw = 0; smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); } static void vmx_enable(void *arg __unused) { int error; uint64_t feature_control; feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | IA32_FEATURE_CONTROL_VMX_EN | IA32_FEATURE_CONTROL_LOCK); } load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); error = vmxon(vmxon_region[curcpu]); if (error == 0) vmxon_enabled[curcpu] = 1; } static void vmx_restore(void) { if (vmxon_enabled[curcpu]) vmxon(vmxon_region[curcpu]); } static int vmx_init(int ipinum) { int error; uint64_t basic, fixed0, fixed1, feature_control; uint32_t tmp, procbased2_vid_bits; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits * are set (bits 0 and 2 respectively). */ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { printf("vmx_init: VMX operation disabled by BIOS\n"); return (ENXIO); } /* * Verify capabilities MSR_VMX_BASIC: * - bit 54 indicates support for INS/OUTS decoding */ basic = rdmsr(MSR_VMX_BASIC); if ((basic & (1UL << 54)) == 0) { printf("vmx_init: processor does not support desired basic " "capabilities\n"); return (EINVAL); } /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); if (error) { printf("vmx_init: processor does not support desired primary " "processor-based controls\n"); return (error); } /* Clear the processor-based ctl bits that are set on demand */ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; /* Check support for secondary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); if (error) { printf("vmx_init: processor does not support desired secondary " "processor-based controls\n"); return (error); } /* Check support for VPID */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_VPID, 0, &tmp); if (error == 0) procbased_ctls2 |= PROCBASED2_ENABLE_VPID; /* Check support for pin-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); if (error) { printf("vmx_init: processor does not support desired " "pin-based controls\n"); return (error); } /* Check support for VM-exit controls */ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &exit_ctls); if (error) { printf("vmx_init: processor does not support desired " "exit controls\n"); return (error); } /* Check support for VM-entry 
controls */ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &entry_ctls); if (error) { printf("vmx_init: processor does not support desired " "entry controls\n"); return (error); } /* * Check support for optional features by testing them * as individual bits */ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_HLT_EXITING, 0, &tmp) == 0); cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_PROCBASED_CTLS, PROCBASED_MTF, 0, &tmp) == 0); cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_PAUSE_EXITING, 0, &tmp) == 0); cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, &tmp) == 0); /* * Check support for TPR shadow. */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, &tmp); if (error == 0) { tpr_shadowing = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing", &tpr_shadowing); } if (tpr_shadowing) { procbased_ctls |= PROCBASED_USE_TPR_SHADOW; procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; } /* * Check support for virtual interrupt delivery. */ procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | PROCBASED2_VIRTUALIZE_X2APIC_MODE | PROCBASED2_APIC_REGISTER_VIRTUALIZATION | PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, procbased2_vid_bits, 0, &tmp); if (error == 0 && tpr_shadowing) { virtual_interrupt_delivery = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", &virtual_interrupt_delivery); } if (virtual_interrupt_delivery) { procbased_ctls |= PROCBASED_USE_TPR_SHADOW; procbased_ctls2 |= procbased2_vid_bits; procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; /* * Check for Posted Interrupts only if Virtual Interrupt * Delivery is enabled. */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, &tmp); if (error == 0) { pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (pirvec < 0) { if (bootverbose) { printf("vmx_init: unable to allocate " "posted interrupt vector\n"); } } else { posted_interrupts = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", &posted_interrupts); } } } if (posted_interrupts) pinbased_ctls |= PINBASED_POSTED_INTERRUPT; /* Initialize EPT */ error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); /* * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when * available. Otherwise fall back to the software flush * method which loads enough data from the kernel text to * flush existing L1D content, both on VMX entry and on NMI * return. 
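 *
 * Both knobs are loader tunables: hw.vmm.l1d_flush (fetched above) and
 * hw.vmm.l1d_flush_sw (fetched below).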
*/ if (guest_l1d_flush) { if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { guest_l1d_flush_sw = 1; TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", &guest_l1d_flush_sw); } if (guest_l1d_flush_sw) { if (nmi_flush_l1d_sw <= 1) nmi_flush_l1d_sw = 1; } else { msr_load_list[0].index = MSR_IA32_FLUSH_CMD; msr_load_list[0].val = IA32_FLUSH_CMD_L1D; } } /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); cr0_ones_mask = fixed0 & fixed1; cr0_zeros_mask = ~fixed0 & ~fixed1; /* * CR0_PE and CR0_PG can be set to zero in VMX non-root operation * if unrestricted guest execution is allowed. */ if (cap_unrestricted_guest) cr0_ones_mask &= ~(CR0_PG | CR0_PE); /* * Do not allow the guest to set CR0_NW or CR0_CD. */ cr0_zeros_mask |= (CR0_NW | CR0_CD); fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; vpid_init(); vmx_msr_init(); /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); vmx_initialized = 1; return (0); } static void vmx_trigger_hostintr(int vector) { uintptr_t func; struct gate_descriptor *gd; gd = &idt[vector]; KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " "invalid vector %d", vector)); KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", vector)); KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " "has invalid type %d", vector, gd->gd_type)); KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " "has invalid dpl %d", vector, gd->gd_dpl)); KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " "for vector %d has invalid selector %d", vector, gd->gd_selector)); KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " "IST %d", vector, gd->gd_ist)); func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); vmx_call_isr(func); } static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { int error, mask_ident, shadow_ident; uint64_t mask_value; if (which != 0 && which != 4) panic("vmx_setup_cr_shadow: unknown cr%d", which); if (which == 0) { mask_ident = VMCS_CR0_MASK; mask_value = cr0_ones_mask | cr0_zeros_mask; shadow_ident = VMCS_CR0_SHADOW; } else { mask_ident = VMCS_CR4_MASK; mask_value = cr4_ones_mask | cr4_zeros_mask; shadow_ident = VMCS_CR4_SHADOW; } error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); if (error) return (error); error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); if (error) return (error); return (0); } #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; int i, error; struct vmx *vmx; struct vmcs *vmcs; uint32_t exc_bitmap; uint16_t maxcpus; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { panic("malloc of struct vmx not aligned on %d byte boundary", PAGE_SIZE); } vmx->vm = vm; vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); /* * Clean up EPTP-tagged guest physical and combined mappings * * VMX transitions are not required to invalidate any guest physical * mappings. So, it may be possible for stale guest physical mappings * to be present in the processor TLBs. * * Combined mappings for this EP4TA are also invalidated for all VPIDs. 
*/ ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); /* * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. * The guest FSBASE and GSBASE are saved and restored during * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are * always restored from the vmcs host state area on vm-exit. * * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in * how they are saved/restored so can be directly accessed by the * guest. * * MSR_EFER is saved and restored in the guest VMCS area on a * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * * The TSC MSR is exposed read-only. Writes are disallowed as * that will impact the host TSC. If the guest does a write * the "use TSC offsetting" execution control is enabled and the * difference between the host TSC and the guest TSC is written * into the TSC offset in the VMCS. */ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); vpid_alloc(vpid, VM_MAXCPU); if (virtual_interrupt_delivery) { error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, APIC_ACCESS_ADDRESS); /* XXX this should really return an error to the caller */ KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); } maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", error, i); } vmx_msr_guest_init(vmx, i); error = vmcs_init(vmcs); KASSERT(error == 0, ("vmcs_init error %d", error)); VMPTRLD(vmcs); error = 0; error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); error += vmwrite(VMCS_EPTP, vmx->eptp); error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); error += vmwrite(VMCS_VPID, vpid[i]); if (guest_l1d_flush && !guest_l1d_flush_sw) { vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( (vm_offset_t)&msr_load_list[0])); vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, nitems(msr_load_list)); vmcs_write(VMCS_EXIT_MSR_STORE, 0); vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); } /* exception bitmap */ if (vcpu_trace_exceptions(vm, i)) exc_bitmap = 0xffffffff; else exc_bitmap = 1 << IDT_MC; error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); if (tpr_shadowing) { error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(&vmx->apic_page[i])); } if (virtual_interrupt_delivery) { error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); error += vmwrite(VMCS_EOI_EXIT0, 0); error += vmwrite(VMCS_EOI_EXIT1, 0); error += vmwrite(VMCS_EOI_EXIT2, 0); error += vmwrite(VMCS_EOI_EXIT3, 0); } if (posted_interrupts) { error += vmwrite(VMCS_PIR_VECTOR, pirvec); error += vmwrite(VMCS_PIR_DESC, vtophys(&vmx->pir_desc[i])); } VMCLEAR(vmcs); KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; vmx->cap[i].proc_ctls2 = procbased_ctls2; vmx->cap[i].exc_bitmap = exc_bitmap; vmx->state[i].nextrip = ~0; 
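/*
 * Starting 'lastcpu' at NOCPU makes the vcpu's first run look like a
 * cross-cpu migration to vmx_set_pcpu_defaults(), which then reloads the
 * per-cpu host TR/GDTR/GS bases into the VMCS and invalidates any stale
 * VPID-tagged TLB entries via vmx_invvpid().
 */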
vmx->state[i].lastcpu = NOCPU; vmx->state[i].vpid = vpid[i]; /* * Set up the CR0/4 shadows, and init the read shadow * to the power-on register value from the Intel Sys Arch. * CR0 - 0x60000010 * CR4 - 0 */ error = vmx_setup_cr0_shadow(vmcs, 0x60000010); if (error != 0) panic("vmx_setup_cr0_shadow %d", error); error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); vmx->ctx[i].pmap = pmap; } return (vmx); } static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { int handled, func; func = vmxctx->guest_rax; handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), (uint32_t*)(&vmxctx->guest_rdx)); return (handled); } static __inline void vmx_run_trace(struct vmx *vmx, int vcpu) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); #endif } static __inline void vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, int handled) { #ifdef KTR VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", handled ? "handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif } static __inline void vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); #endif } static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); /* * Invalidate guest mappings identified by its vpid from the TLB. */ static __inline void vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; if (vmxstate->vpid == 0) return; if (!running) { /* * Set the 'lastcpu' to an invalid host cpu. * * This will invalidate TLB entries tagged with the vcpu's * vpid the next time it runs via vmx_set_pcpu_defaults(). */ vmxstate->lastcpu = NOCPU; return; } KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " "critical section", __func__, vcpu)); /* * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated * mappings associated with its 'vpid' during that run. So we must * assume that the mappings associated with 'vpid' on 'curcpu' are * stale and invalidate them. * * Note that we incur this penalty only when the scheduler chooses to * move the thread associated with this vcpu between host cpus. * * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { invvpid_desc._res1 = 0; invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); } else { /* * The invvpid can be skipped if an invept is going to * be performed before entering the guest. The invept * will invalidate combined mappings tagged with * 'vmx->eptp' for all vpids. 
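 *
 * That is the case here: 'pmap->pm_eptgen' no longer matches the
 * generation cached in 'vmx->eptgen[curcpu]', so the EPT mappings have
 * changed since this cpu last entered the guest and an invept will be
 * performed on the way back in.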
*/ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } static void vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { struct vmxstate *vmxstate; vmxstate = &vmx->state[vcpu]; if (vmxstate->lastcpu == curcpu) return; vmxstate->lastcpu = curcpu; vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); vmx_invvpid(vmx, vcpu, pmap, 1); } /* * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. */ CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); static void __inline vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); } } static void __inline vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } static void __inline vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); } } static void __inline vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) { int error; if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); } error = vmwrite(VMCS_TSC_OFFSET, offset); - +#ifdef BHYVE_SNAPSHOT + if (error == 0) + error = vm_set_tsc_offset(vmx->vm, vcpu, offset); +#endif return (error); } #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) static void vmx_inject_nmi(struct vmx *vmx, int vcpu) { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); /* * Inject the virtual NMI. The vector must be the NMI IDT entry * or the VMCS entry check will fail. 
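 *
 * The interruption type must be NMI as well so that the processor
 * establishes virtual-NMI blocking once the injection is delivered.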
*/ info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; vmcs_write(VMCS_ENTRY_INTR_INFO, info); VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); } static void vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, uint64_t guestrip) { int vector, need_nmi_exiting, extint_pending; uint64_t rflags, entryinfo; uint32_t gi, info; if (vmx->state[vcpu].nextrip != guestrip) { gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vmx->state[vcpu].nextrip, guestrip); gi &= ~HWINTR_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); info = entryinfo; vector = info & 0xff; if (vector == IDT_BP || vector == IDT_OF) { /* * VT-x requires #BP and #OF to be injected as software * exceptions. */ info &= ~VMCS_INTR_T_MASK; info |= VMCS_INTR_T_SWEXCEPTION; } if (info & VMCS_INTR_DEL_ERRCODE) vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); vmcs_write(VMCS_ENTRY_INTR_INFO, info); } if (vm_nmi_pending(vmx->vm, vcpu)) { /* * If there are no conditions blocking NMI injection then * inject it directly here otherwise enable "NMI window * exiting" to inject it as soon as we can. * * We also check for STI_BLOCKING because some implementations * don't allow NMI injection in this case. If we are running * on a processor that doesn't have this restriction it will * immediately exit and the NMI will be injected in the * "NMI window exiting" handler. */ need_nmi_exiting = 1; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { info = vmcs_read(VMCS_ENTRY_INTR_INFO); if ((info & VMCS_INTR_VALID) == 0) { vmx_inject_nmi(vmx, vcpu); need_nmi_exiting = 0; } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " "due to VM-entry intr info %#x", info); } } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " "Guest Interruptibility-state %#x", gi); } if (need_nmi_exiting) vmx_set_nmi_window_exiting(vmx, vcpu); } extint_pending = vm_extint_pending(vmx->vm, vcpu); if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } /* * If interrupt-window exiting is already in effect then don't bother * checking for pending interrupts. This is just an optimization and * not needed for correctness. */ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " "pending int_window_exiting"); return; } if (!extint_pending) { /* Ask the local apic for a vector to inject */ if (!vlapic_pending_intr(vlapic, &vector)) return; /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [16,255] can be delivered * through the local APIC. */ KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(vmx->vm, &vector); /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [0,255] can be delivered * through the INTR pin. 
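 *
 * The emulated 8259 returns the vector base programmed by the guest
 * plus the IRQ number, so any value in [0,255] can show up here.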
*/ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* Check RFLAGS.IF and the interruptibility state of the guest */ rflags = vmcs_read(VMCS_GUEST_RFLAGS); if ((rflags & PSL_I) == 0) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, rflags); goto cantinject; } gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "Guest Interruptibility-state %#x", vector, gi); goto cantinject; } info = vmcs_read(VMCS_ENTRY_INTR_INFO); if (info & VMCS_INTR_VALID) { /* * This is expected and could happen for multiple reasons: * - A vectoring VM-entry was aborted due to astpending * - A VM-exit happened during event injection. * - An exception was injected above. * - An NMI was injected above or after "NMI window exiting" */ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "VM-entry intr info %#x", vector, info); goto cantinject; } /* Inject the interrupt */ info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; info |= vector; vmcs_write(VMCS_ENTRY_INTR_INFO, info); if (!extint_pending) { /* Update the Local APIC ISR */ vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(vmx->vm, vcpu); vatpic_intr_accepted(vmx->vm, vector); /* * After we accepted the current ExtINT the PIC may * have posted another one. If that is the case, set * the Interrupt Window Exiting execution control so * we can inject that one too. * * Also, interrupt window exiting allows us to inject any * pending APIC vector that was preempted by the ExtINT * as soon as possible. This applies both for the software * emulated vlapic and the hardware assisted virtual APIC. */ vmx_set_int_window_exiting(vmx, vcpu); } VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); return; cantinject: /* * Set the Interrupt Window Exiting execution control so we can inject * the interrupt as soon as blocking condition goes away. */ vmx_set_int_window_exiting(vmx, vcpu); } /* * If the Virtual NMIs execution control is '1' then the logical processor * tracks virtual-NMI blocking in the Guest Interruptibility-state field of * the VMCS. An IRET instruction in VMX non-root operation will remove any * virtual-NMI blocking. * * This unblocking occurs even if the IRET causes a fault. In this case the * hypervisor needs to restore virtual-NMI blocking before resuming the guest. */ static void vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, ("NMI blocking is not in effect %#x", gi)); } static int vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { struct vmxctx *vmxctx; uint64_t xcrval; const struct xsave_limits *limits; vmxctx = &vmx->ctx[vcpu]; limits = vmm_get_xsave_limits(); /* * Note that the processor raises a GP# fault on its own if * xsetbv is executed for CPL != 0, so we do not have to * emulate that fault here. 
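 *
 * XSETBV causes an unconditional VM-exit from VMX non-root operation,
 * so every guest execution of it that passes the CPL check lands here.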
*/ /* Only xcr0 is supported. */ if (vmxctx->guest_rcx != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { vm_inject_ud(vmx->vm, vcpu); return (HANDLED); } xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); if ((xcrval & ~limits->xcr0_allowed) != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } if (!(xcrval & XFEATURE_ENABLED_X87)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* AVX (YMM_Hi128) requires SSE. */ if (xcrval & XFEATURE_ENABLED_AVX && (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, * ZMM_Hi256, and Hi16_ZMM. */ if (xcrval & XFEATURE_AVX512 && (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != (XFEATURE_AVX512 | XFEATURE_AVX)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * Intel MPX requires both bound register state flags to be * set. */ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * This runs "inside" vmrun() with the guest's FPU state, so * modifying xcr0 directly modifies the guest's xcr0, not the * host's. */ load_xcr(0, xcrval); return (HANDLED); } static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { const struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: return (vmxctx->guest_rax); case 1: return (vmxctx->guest_rcx); case 2: return (vmxctx->guest_rdx); case 3: return (vmxctx->guest_rbx); case 4: return (vmcs_read(VMCS_GUEST_RSP)); case 5: return (vmxctx->guest_rbp); case 6: return (vmxctx->guest_rsi); case 7: return (vmxctx->guest_rdi); case 8: return (vmxctx->guest_r8); case 9: return (vmxctx->guest_r9); case 10: return (vmxctx->guest_r10); case 11: return (vmxctx->guest_r11); case 12: return (vmxctx->guest_r12); case 13: return (vmxctx->guest_r13); case 14: return (vmxctx->guest_r14); case 15: return (vmxctx->guest_r15); default: panic("invalid vmx register %d", ident); } } static void vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) { struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: vmxctx->guest_rax = regval; break; case 1: vmxctx->guest_rcx = regval; break; case 2: vmxctx->guest_rdx = regval; break; case 3: vmxctx->guest_rbx = regval; break; case 4: vmcs_write(VMCS_GUEST_RSP, regval); break; case 5: vmxctx->guest_rbp = regval; break; case 6: vmxctx->guest_rsi = regval; break; case 7: vmxctx->guest_rdi = regval; break; case 8: vmxctx->guest_r8 = regval; break; case 9: vmxctx->guest_r9 = regval; break; case 10: vmxctx->guest_r10 = regval; break; case 11: vmxctx->guest_r11 = regval; break; case 12: vmxctx->guest_r12 = regval; break; case 13: vmxctx->guest_r13 = regval; break; case 14: vmxctx->guest_r14 = regval; break; case 15: vmxctx->guest_r15 = regval; break; default: panic("invalid vmx register %d", ident); } } static int vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr0 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR0_SHADOW, regval); crval = regval | cr0_ones_mask; crval &= ~cr0_zeros_mask; vmcs_write(VMCS_GUEST_CR0, crval); if (regval & CR0_PG) { uint64_t efer, entry_ctls; /* * If CR0.PG is 1 and EFER.LME is 1 
then EFER.LMA and * the "IA-32e mode guest" bit in VM-entry control must be * equal. */ efer = vmcs_read(VMCS_GUEST_IA32_EFER); if (efer & EFER_LME) { efer |= EFER_LMA; vmcs_write(VMCS_GUEST_IA32_EFER, efer); entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); entry_ctls |= VM_ENTRY_GUEST_LMA; vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); } } return (HANDLED); } static int vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr4 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR4_SHADOW, regval); crval = regval | cr4_ones_mask; crval &= ~cr4_zeros_mask; vmcs_write(VMCS_GUEST_CR4, crval); return (HANDLED); } static int vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { struct vlapic *vlapic; uint64_t cr8; int regnum; /* We only handle mov %cr8 to/from a register at this time. */ if ((exitqual & 0xe0) != 0x00) { return (UNHANDLED); } vlapic = vm_lapic(vmx->vm, vcpu); regnum = (exitqual >> 8) & 0xf; if (exitqual & 0x10) { cr8 = vlapic_get_cr8(vlapic); vmx_set_guest_reg(vmx, vcpu, regnum, cr8); } else { cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); vlapic_set_cr8(vlapic, cr8); } return (HANDLED); } /* * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL */ static int vmx_cpl(void) { uint32_t ssar; ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); return ((ssar >> 5) & 0x3); } static enum vm_cpu_mode vmx_cpu_mode(void) { uint32_t csar; if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); if (csar & 0x2000) return (CPU_MODE_64BIT); /* CS.L = 1 */ else return (CPU_MODE_COMPATIBILITY); } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode vmx_paging_mode(void) { if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) return (PAGING_MODE_FLAT); if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) return (PAGING_MODE_32); if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } static uint64_t inout_str_index(struct vmx *vmx, int vcpuid, int in) { uint64_t val; int error; enum vm_reg_name reg; reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; error = vmx_getreg(vmx, vcpuid, reg, &val); KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); return (val); } static uint64_t inout_str_count(struct vmx *vmx, int vcpuid, int rep) { uint64_t val; int error; if (rep) { error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); } else { val = 1; } return (val); } static int inout_str_addrsize(uint32_t inst_info) { uint32_t size; size = (inst_info >> 7) & 0x7; switch (size) { case 0: return (2); /* 16 bit */ case 1: return (4); /* 32 bit */ case 2: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { s = (inst_info >> 15) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); } static void vmx_paging_info(struct vm_guest_paging *paging) { paging->cr3 = vmcs_guest_cr3(); paging->cpl = vmx_cpl(); paging->cpu_mode = vmx_cpu_mode(); paging->paging_mode = vmx_paging_mode(); } static void vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) { struct vm_guest_paging *paging; uint32_t csar; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->inst_length = 0; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); switch (paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } vie_init(&vmexit->u.inst_emul.vie, NULL, 0); } static int ept_fault_type(uint64_t ept_qual) { int fault_type; if (ept_qual & EPT_VIOLATION_DATA_WRITE) fault_type = VM_PROT_WRITE; else if (ept_qual & EPT_VIOLATION_INST_FETCH) fault_type = VM_PROT_EXECUTE; else fault_type= VM_PROT_READ; return (fault_type); } static bool ept_emulation_fault(uint64_t ept_qual) { int read, write; /* EPT fault on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) return (false); /* EPT fault must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; if ((read | write) == 0) return (false); /* * The EPT violation must have been caused by accessing a * guest-physical address that is a translation of a guest-linear * address. */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { return (false); } return (true); } static __inline int apic_access_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); } static __inline int x2apic_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 
1 : 0); } static int vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, uint64_t qual) { int error, handled, offset; uint32_t *apic_regs, vector; bool retu; handled = HANDLED; offset = APIC_WRITE_OFFSET(qual); if (!apic_access_virtualization(vmx, vcpuid)) { /* * In general there should not be any APIC write VM-exits * unless APIC-access virtualization is enabled. * * However self-IPI virtualization can legitimately trigger * an APIC-write VM-exit so treat it specially. */ if (x2apic_virtualization(vmx, vcpuid) && offset == APIC_OFFSET_SELF_IPI) { apic_regs = (uint32_t *)(vlapic->apic_page); vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; vlapic_self_ipi_handler(vlapic, vector); return (HANDLED); } else return (UNHANDLED); } switch (offset) { case APIC_OFFSET_ID: vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_LDR: vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: retu = false; error = vlapic_icrlo_write_handler(vlapic, &retu); if (error != 0 || retu) handled = UNHANDLED; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: vlapic_dcr_write_handler(vlapic); break; default: handled = UNHANDLED; break; } return (handled); } static bool apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) { if (apic_access_virtualization(vmx, vcpuid) && (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) return (true); else return (false); } static int vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint64_t qual; int access_type, offset, allowed; if (!apic_access_virtualization(vmx, vcpuid)) return (UNHANDLED); qual = vmexit->u.vmx.exit_qualification; access_type = APIC_ACCESS_TYPE(qual); offset = APIC_ACCESS_OFFSET(qual); allowed = 0; if (access_type == 0) { /* * Read data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } else if (access_type == 1) { /* * Write data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } if (allowed) { vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, VIE_INVALID_GLA); } /* * Regardless of whether the APIC-access is allowed this handler * always returns UNHANDLED: * - if the access is allowed then it is handled by emulating the * instruction that caused the VM-exit (outside the critical section) * - if the access is not allowed then it will be converted to an * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
*/ return (UNHANDLED); } static enum task_switch_reason vmx_task_switch_reason(uint64_t qual) { int reason; reason = (qual >> 30) & 0x3; switch (reason) { case 0: return (TSR_CALL); case 1: return (TSR_IRET); case 2: return (TSR_JMP); case 3: return (TSR_IDT_GATE); default: panic("%s: invalid reason %d", __func__, reason); } } static int emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); else error = vmx_wrmsr(vmx, vcpuid, num, val, retu); return (error); } static int emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) { struct vmxctx *vmxctx; uint64_t result; uint32_t eax, edx; int error; if (lapic_msr(num)) error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); else error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); if (error == 0) { eax = result; vmxctx = &vmx->ctx[vcpuid]; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); edx = result >> 32; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); } return (error); } static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; struct vlapic *vlapic; struct vm_inout_str *vis; struct vm_task_switch *ts; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; uint32_t intr_type, intr_vec, reason; uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; vmxctx = &vmx->ctx[vcpu]; qual = vmexit->u.vmx.exit_qualification; reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); /* * VM-entry failures during or after loading guest state. * * These VM-exits are uncommon but must be handled specially * as most VM-exit fields are not populated as usual. */ if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); __asm __volatile("int $18"); return (1); } /* * VM exits that can be triggered during event delivery need to * be handled specially by re-injecting the event if the IDT * vectoring information field's valid bit is set. * * See "Information for VM Exits During Event Delivery" in Intel SDM * for details. */ idtvec_info = vmcs_idt_vectoring_info(); if (idtvec_info & VMCS_IDT_VEC_VALID) { idtvec_info &= ~(1 << 12); /* clear undefined bit */ exitintinfo = idtvec_info; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { idtvec_err = vmcs_idt_vectoring_err(); exitintinfo |= (uint64_t)idtvec_err << 32; } error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); KASSERT(error == 0, ("%s: vm_set_intinfo error %d", __func__, error)); /* * If 'virtual NMIs' are being used and the VM-exit * happened while injecting an NMI during the previous * VM-entry, then clear "blocking by NMI" in the * Guest Interruptibility-State so the NMI can be * reinjected on the subsequent VM-entry. * * However, if the NMI was being delivered through a task * gate, then the new task must start execution with NMIs * blocked so don't clear NMI blocking in this case. 
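 *
 * (vmx_assert_nmi_blocking() below verifies that invariant for the
 * task switch case.)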
*/ intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type == VMCS_INTR_T_NMI) { if (reason != EXIT_REASON_TASK_SWITCH) vmx_clear_nmi_blocking(vmx, vcpu); else vmx_assert_nmi_blocking(vmx, vcpu); } /* * Update VM-entry instruction length if the event being * delivered was a software interrupt or software exception. */ if (intr_type == VMCS_INTR_T_SWINTR || intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || intr_type == VMCS_INTR_T_SWEXCEPTION) { vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); } } switch (reason) { case EXIT_REASON_TASK_SWITCH: ts = &vmexit->u.task_switch; ts->tsssel = qual & 0xffff; ts->reason = vmx_task_switch_reason(qual); ts->ext = 0; ts->errcode_valid = 0; vmx_paging_info(&ts->paging); /* * If the task switch was due to a CALL, JMP, IRET, software * interrupt (INT n) or software exception (INT3, INTO), * then the saved %rip references the instruction that caused * the task switch. The instruction length field in the VMCS * is valid in this case. * * In all other cases (e.g., NMI, hardware exception) the * saved %rip is one that would have been saved in the old TSS * had the task switch completed normally so the instruction * length field is not needed in this case and is explicitly * set to 0. */ if (ts->reason == TSR_IDT_GATE) { KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, ("invalid idtvec_info %#x for IDT task switch", idtvec_info)); intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type != VMCS_INTR_T_SWINTR && intr_type != VMCS_INTR_T_SWEXCEPTION && intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { /* Task switch triggered by external event */ ts->ext = 1; vmexit->inst_length = 0; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { ts->errcode_valid = 1; ts->errcode = vmcs_idt_vectoring_err(); } } } vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " "%s errcode 0x%016lx", ts->reason, ts->tsssel, ts->ext ? 
"external" : "internal", ((uint64_t)ts->errcode << 32) | ts->errcode_valid); break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); break; case 4: handled = vmx_emulate_cr4_access(vmx, vcpu, qual); break; case 8: handled = vmx_emulate_cr8_access(vmx, vcpu, qual); break; } break; case EXIT_REASON_RDMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); error = emulate_rdmsr(vmx, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } break; case EXIT_REASON_WRMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); retu = false; eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); if (virtual_interrupt_delivery) vmexit->u.hlt.intr_status = vmcs_read(VMCS_GUEST_INTR_STATUS); else vmexit->u.hlt.intr_status = 0; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MTRAP; vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: /* * External interrupts serve only to cause VM exits and allow * the host interrupt handler to run. * * If this external interrupt triggers a virtual interrupt * to a VM, then that state will be recorded by the * host interrupt handler in the VM's softc. We will inject * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); SDT_PROBE4(vmm, vmx, exit, interrupt, vmx, vcpu, vmexit, intr_info); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. * This appears to be a bug in VMware Fusion? */ if (!(intr_info & VMCS_INTR_VALID)) return (1); KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); vmx_trigger_hostintr(intr_info & 0xff); /* * This is special. 
We want to treat this as an 'handled' * VM-exit but not increment the instruction pointer. */ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); vmx_clear_nmi_window_exiting(vmx, vcpu); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); return (1); case EXIT_REASON_INOUT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.bytes = (qual & 0x7) + 1; vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; vmexit->u.inout.port = (uint16_t)(qual >> 16); vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); if (vmexit->u.inout.string) { inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; vmx_paging_info(&vis->paging); vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); vis->cr0 = vmcs_read(VMCS_GUEST_CR0); vis->index = inout_str_index(vmx, vcpu, in); vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); intr_vec = intr_info & 0xff; intr_type = intr_info & VMCS_INTR_T_MASK; /* * If Virtual NMIs control is 1 and the VM-exit is due to a * fault encountered during the execution of IRET then we must * restore the state of "virtual-NMI blocking" before resuming * the guest. * * See "Resuming Guest Software after Handling an Exception". * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (intr_vec != IDT_DF) && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ if (intr_type == VMCS_INTR_T_NMI) return (1); /* * Call the machine check handler by hand. Also don't reflect * the machine check back into the guest. */ if (intr_vec == IDT_MC) { VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); return (1); } /* * If the hypervisor has requested user exits for * debug exceptions, bounce them out to userland. */ if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP && (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) { vmexit->exitcode = VM_EXITCODE_BPT; vmexit->u.bpt.inst_length = vmexit->inst_length; vmexit->inst_length = 0; break; } if (intr_vec == IDT_PF) { error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", __func__, error)); } /* * Software exceptions exhibit trap-like behavior. This in * turn requires populating the VM-entry instruction length * so that the %rip in the trap frame is past the INT3/INTO * instruction. 
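 *
 * 'inst_length' here is the value captured from the VM-exit
 * instruction-length field in vmx_run().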
*/ if (intr_type == VMCS_INTR_T_SWEXCEPTION) vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); /* Reflect all other exceptions back into the guest */ errcode_valid = errcode = 0; if (intr_info & VMCS_INTR_DEL_ERRCODE) { errcode_valid = 1; errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); } VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " "the guest", intr_vec, errcode); SDT_PROBE5(vmm, vmx, exit, exception, vmx, vcpu, vmexit, intr_vec, errcode); error = vm_inject_exception(vmx->vm, vcpu, intr_vec, errcode_valid, errcode, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); return (1); case EXIT_REASON_EPT_FAULT: /* * If 'gpa' lies within the address space allocated to * memory then this must be a nested page fault otherwise * this must be an instruction that accesses MMIO space. */ gpa = vmcs_gpa(); if (vm_mem_allocated(vmx->vm, vcpu, gpa) || apic_access_fault(vmx, vcpu, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->inst_length = 0; vmexit->u.paging.gpa = gpa; vmexit->u.paging.fault_type = ept_fault_type(qual); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); SDT_PROBE5(vmm, vmx, exit, nestedfault, vmx, vcpu, vmexit, gpa, qual); } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); SDT_PROBE4(vmm, vmx, exit, mmiofault, vmx, vcpu, vmexit, gpa); } /* * If Virtual NMIs control is 1 and the VM-exit is due to an * EPT fault during the execution of IRET then we must restore * the state of "virtual-NMI blocking" before resuming. * * See description of "NMI unblocking due to IRET" in * "Exit Qualification for EPT Violations". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (qual & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); break; case EXIT_REASON_VIRTUALIZED_EOI: vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; vmexit->u.ioapic_eoi.vector = qual & 0xFF; SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); vmexit->inst_length = 0; /* trap-like */ break; case EXIT_REASON_APIC_ACCESS: SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); handled = vmx_handle_apic_access(vmx, vcpu, vmexit); break; case EXIT_REASON_APIC_WRITE: /* * APIC-write VM exit is trap-like so the %rip is already * pointing to the next instruction. 
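 *
 * Zeroing 'inst_length' below keeps the common "handled" path from
 * advancing %rip a second time.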
*/ vmexit->inst_length = 0; vlapic = vm_lapic(vmx->vm, vcpu); SDT_PROBE4(vmm, vmx, exit, apicwrite, vmx, vcpu, vmexit, vlapic); handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); break; case EXIT_REASON_XSETBV: SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); break; case EXIT_REASON_MONITOR: SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MONITOR; break; case EXIT_REASON_MWAIT: SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MWAIT; break; case EXIT_REASON_TPR: vlapic = vm_lapic(vmx->vm, vcpu); vlapic_sync_tpr(vlapic); vmexit->inst_length = 0; handled = HANDLED; break; case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_VMINSN; break; default: SDT_PROBE4(vmm, vmx, exit, unknown, vmx, vcpu, vmexit, reason); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } if (handled) { /* * It is possible that control is returned to userland * even though we were able to handle the VM exit in the * kernel. * * In such a case we want to make sure that the userland * restarts guest execution at the instruction *after* * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. */ vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic VMX exit. */ vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = VM_SUCCESS; vmexit->u.vmx.inst_type = 0; vmexit->u.vmx.inst_error = 0; } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } SDT_PROBE4(vmm, vmx, exit, return, vmx, vcpu, vmexit, handled); return (handled); } static __inline void vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) { KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, ("vmx_exit_inst_error: invalid inst_fail_status %d", vmxctx->inst_fail_status)); vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = vmxctx->inst_fail_status; vmexit->u.vmx.inst_error = vmcs_instruction_error(); vmexit->u.vmx.exit_reason = ~0; vmexit->u.vmx.exit_qualification = ~0; switch (rc) { case VMX_VMRESUME_ERROR: case VMX_VMLAUNCH_ERROR: case VMX_INVEPT_ERROR: vmexit->u.vmx.inst_type = rc; break; default: panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); } } /* * If the NMI-exiting VM execution control is set to '1' then an NMI in * non-root operation causes a VM-exit. NMI blocking is in effect so it is * sufficient to simply vector to the NMI handler via a software interrupt. * However, this must be done before maskable interrupts are enabled * otherwise the "iret" issued by an interrupt handler will incorrectly * clear NMI blocking. 
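 *
 * Vectoring with "int $2" runs the host NMI handler directly; the
 * handler's own "iret" then removes the NMI blocking that the VM-exit
 * established.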
*/ static __inline void vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint32_t intr_info; KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) return; intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " "to NMI has invalid vector: %#x", intr_info)); VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); __asm __volatile("int $2"); } } static __inline void vmx_dr_enter_guest(struct vmxctx *vmxctx) { register_t rflags; /* Save host control debug registers. */ vmxctx->host_dr7 = rdr7(); vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); /* * Disable debugging in DR7 and DEBUGCTL to avoid triggering * exceptions in the host based on the guest DRx values. The * guest DR7 and DEBUGCTL are saved/restored in the VMCS. */ load_dr7(0); wrmsr(MSR_DEBUGCTLMSR, 0); /* * Disable single stepping the kernel to avoid corrupting the * guest DR6. A debugger might still be able to corrupt the * guest DR6 by setting a breakpoint after this point and then * single stepping. */ rflags = read_rflags(); vmxctx->host_tf = rflags & PSL_T; write_rflags(rflags & ~PSL_T); /* Save host debug registers. */ vmxctx->host_dr0 = rdr0(); vmxctx->host_dr1 = rdr1(); vmxctx->host_dr2 = rdr2(); vmxctx->host_dr3 = rdr3(); vmxctx->host_dr6 = rdr6(); /* Restore guest debug registers. */ load_dr0(vmxctx->guest_dr0); load_dr1(vmxctx->guest_dr1); load_dr2(vmxctx->guest_dr2); load_dr3(vmxctx->guest_dr3); load_dr6(vmxctx->guest_dr6); } static __inline void vmx_dr_leave_guest(struct vmxctx *vmxctx) { /* Save guest debug registers. */ vmxctx->guest_dr0 = rdr0(); vmxctx->guest_dr1 = rdr1(); vmxctx->guest_dr2 = rdr2(); vmxctx->guest_dr3 = rdr3(); vmxctx->guest_dr6 = rdr6(); /* * Restore host debug registers. Restore DR7, DEBUGCTL, and * PSL_T last. */ load_dr0(vmxctx->host_dr0); load_dr1(vmxctx->host_dr1); load_dr2(vmxctx->host_dr2); load_dr3(vmxctx->host_dr3); load_dr6(vmxctx->host_dr6); wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); load_dr7(vmxctx->host_dr7); write_rflags(read_rflags() | vmxctx->host_tf); } static int vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; uint32_t exit_reason; struct region_descriptor gdtr, idtr; uint16_t ldt_sel; vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); vmexit = vm_exitinfo(vm, vcpu); launched = 0; KASSERT(vmxctx->pmap == pmap, ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); vmx_msr_guest_enter(vmx, vcpu); VMPTRLD(vmcs); /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); handled = UNHANDLED; /* * Interrupts are disabled from this point on until the * guest starts executing. 
This is done for the following * reasons: * * If an AST is asserted on this thread after the check below, * then the IPI_AST notification will not be lost, because it * will cause a VM exit due to external interrupt as soon as * the guest state is loaded. * * A posted interrupt after 'vmx_inject_interrupts()' will * not be "lost" because it will be held pending in the host * APIC because interrupts are disabled. The pending interrupt * will be recognized as soon as the guest state is loaded. * * The same reasoning applies to the IPI generated by * pmap_invalidate_ept(). */ disable_intr(); vmx_inject_interrupts(vmx, vcpu, vlapic, rip); /* * Check for vcpu suspension after injecting events because * vmx_inject_interrupts() can suspend the vcpu due to a * triple fault. */ if (vcpu_suspended(evinfo)) { enable_intr(); vm_exit_suspended(vmx->vm, vcpu, rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_intr(); vm_exit_rendezvous(vmx->vm, vcpu, rip); break; } if (vcpu_reqidle(evinfo)) { enable_intr(); vm_exit_reqidle(vmx->vm, vcpu, rip); break; } if (vcpu_should_yield(vm, vcpu)) { enable_intr(); vm_exit_astpending(vmx->vm, vcpu, rip); vmx_astpending_trace(vmx, vcpu, rip); handled = HANDLED; break; } if (vcpu_debugged(vm, vcpu)) { enable_intr(); vm_exit_debug(vmx->vm, vcpu, rip); break; } /* * If TPR Shadowing is enabled, the TPR Threshold * must be updated right before entering the guest. */ if (tpr_shadowing && !virtual_interrupt_delivery) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) { vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic)); } } /* * VM exits restore the base address but not the * limits of GDTR and IDTR. The VMCS only stores the * base address, so VM exits set the limits to 0xffff. * Save and restore the full GDTR and IDTR to restore * the limits. * * The VMCS does not save the LDTR at all, and VM * exits clear LDTR as if a NULL selector were loaded. * The userspace hypervisor probably doesn't use a * LDT, but save and restore it to be safe. 
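 *
 * All three are restored immediately after vmx_enter_guest() returns,
 * while interrupts are still disabled.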
*/ sgdt(&gdtr); sidt(&idtr); ldt_sel = sldt(); vmx_run_trace(vmx, vcpu); vmx_dr_enter_guest(vmxctx); rc = vmx_enter_guest(vmxctx, vmx, launched); vmx_dr_leave_guest(vmxctx); bare_lgdt(&gdtr); lidt(&idtr); lldt(ldt_sel); /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; if (rc == VMX_GUEST_VMEXIT) { vmx_exit_handle_nmi(vmx, vcpu, vmexit); enable_intr(); handled = vmx_exit_process(vmx, vcpu, vmexit); } else { enable_intr(); vmx_exit_inst_error(vmxctx, rc, vmexit); } launched = 1; vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); rip = vmexit->rip; } while (handled); /* * If a VM exit has been handled then the exitcode must be BOGUS * If a VM exit is not handled then the exitcode must not be BOGUS */ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { panic("Mismatch between handled (%d) and exitcode (%d)", handled, vmexit->exitcode); } if (!handled) vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); vmx_msr_guest_exit(vmx, vcpu); return (0); } static void vmx_vmcleanup(void *arg) { int i; struct vmx *vmx = arg; uint16_t maxcpus; if (apic_access_virtualization(vmx, 0)) vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); maxcpus = vm_get_maxcpus(vmx->vm); for (i = 0; i < maxcpus; i++) vpid_free(vmx->state[i].vpid); free(vmx, M_VMX); return; } static register_t * vmxctx_regptr(struct vmxctx *vmxctx, int reg) { switch (reg) { case VM_REG_GUEST_RAX: return (&vmxctx->guest_rax); case VM_REG_GUEST_RBX: return (&vmxctx->guest_rbx); case VM_REG_GUEST_RCX: return (&vmxctx->guest_rcx); case VM_REG_GUEST_RDX: return (&vmxctx->guest_rdx); case VM_REG_GUEST_RSI: return (&vmxctx->guest_rsi); case VM_REG_GUEST_RDI: return (&vmxctx->guest_rdi); case VM_REG_GUEST_RBP: return (&vmxctx->guest_rbp); case VM_REG_GUEST_R8: return (&vmxctx->guest_r8); case VM_REG_GUEST_R9: return (&vmxctx->guest_r9); case VM_REG_GUEST_R10: return (&vmxctx->guest_r10); case VM_REG_GUEST_R11: return (&vmxctx->guest_r11); case VM_REG_GUEST_R12: return (&vmxctx->guest_r12); case VM_REG_GUEST_R13: return (&vmxctx->guest_r13); case VM_REG_GUEST_R14: return (&vmxctx->guest_r14); case VM_REG_GUEST_R15: return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); case VM_REG_GUEST_DR0: return (&vmxctx->guest_dr0); case VM_REG_GUEST_DR1: return (&vmxctx->guest_dr1); case VM_REG_GUEST_DR2: return (&vmxctx->guest_dr2); case VM_REG_GUEST_DR3: return (&vmxctx->guest_dr3); case VM_REG_GUEST_DR6: return (&vmxctx->guest_dr6); default: break; } return (NULL); } static int vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *retval = *regp; return (0); } else return (EINVAL); } static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *regp = val; return (0); } else return (EINVAL); } static int vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) { uint64_t gi; int error; error = vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); *retval = (gi & HWINTR_BLOCKING) ? 
1 : 0; return (error); } static int vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) { struct vmcs *vmcs; uint64_t gi; int error, ident; /* * Forcing the vcpu into an interrupt shadow is not supported. */ if (val) { error = EINVAL; goto done; } vmcs = &vmx->vmcs[vcpu]; ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); error = vmcs_getreg(vmcs, running, ident, &gi); if (error == 0) { gi &= ~HWINTR_BLOCKING; error = vmcs_setreg(vmcs, running, ident, gi); } done: VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, error ? "failed" : "succeeded"); return (error); } static int vmx_shadow_reg(int reg) { int shreg; shreg = -1; switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; break; case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: break; } return (shreg); } static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) { int running, hostcpu; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); } static int vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); if (error == 0) { /* * If the "load EFER" VM-entry control is 1 then the * value of EFER.LMA must be identical to "IA-32e mode guest" * bit in the VM-entry control. */ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && (reg == VM_REG_GUEST_EFER)) { vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); if (val & EFER_LMA) ctls |= VM_ENTRY_GUEST_LMA; else ctls &= ~VM_ENTRY_GUEST_LMA; vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); } shadow = vmx_shadow_reg(reg); if (shadow > 0) { /* * Store the unmodified value in the shadow */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } if (reg == VM_REG_GUEST_CR3) { /* * Invalidate the guest vcpu's TLB mappings to emulate * the behavior of updating %cr3. * * XXX the processor retains global mappings when %cr3 * is updated but vmx_invvpid() does not. 
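 *
 * (vmx_invvpid() uses the single-context INVVPID type, which flushes
 * global translations too; the "retain globals" variant would be needed
 * to match bare-metal %cr3 behavior.)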
*/ pmap = vmx->ctx[vcpu].pmap; vmx_invvpid(vmx, vcpu, pmap, running); } } return (error); } static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { struct vmx *vmx = arg; int vcap; int ret; ret = ENOENT; vcap = vmx->cap[vcpu].set; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) ret = 0; break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) ret = 0; break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) ret = 0; break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) ret = 0; break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) ret = 0; break; case VM_CAP_BPT_EXIT: ret = 0; break; default: break; } if (ret == 0) *retval = (vcap & (1 << type)) ? 1 : 0; return (ret); } static int vmx_setcap(void *arg, int vcpu, int type, int val) { struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; uint32_t baseval; uint32_t *pptr; int error; int flag; int reg; int retval; retval = ENOENT; pptr = NULL; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_HLT_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_MTF; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_PAUSE_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_ENABLE_INVPCID; reg = VMCS_SEC_PROC_BASED_CTLS; } break; case VM_CAP_BPT_EXIT: retval = 0; /* Don't change the bitmap if we are tracing all exceptions. */ if (vmx->cap[vcpu].exc_bitmap != 0xffffffff) { pptr = &vmx->cap[vcpu].exc_bitmap; baseval = *pptr; flag = (1 << IDT_BP); reg = VMCS_EXCEPTION_BITMAP; } break; default: break; } if (retval) return (retval); if (pptr != NULL) { if (val) { baseval |= flag; } else { baseval &= ~flag; } VMPTRLD(vmcs); error = vmwrite(reg, baseval); VMCLEAR(vmcs); if (error) return (error); /* * Update optional stored flags, and record * setting */ *pptr = baseval; } if (val) { vmx->cap[vcpu].set |= (1 << type); } else { vmx->cap[vcpu].set &= ~(1 << type); } return (0); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; u_int pending_prio; }; #define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ level ? 
"level" : "edge", vector); \ VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ } while (0) /* * vlapic->ops handlers that utilize the APICv hardware assist described in * Chapter 29 of the Intel SDM. */ static int vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; int idx, notify = 0; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; /* * Keep track of interrupt requests in the PIR descriptor. This is * because the virtual APIC page pointed to by the VMCS cannot be * modified if the vcpu is running. */ idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); /* * A notification is required whenever the 'pending' bit makes a * transition from 0->1. * * Even if the 'pending' bit is already asserted, notification about * the incoming interrupt may still be necessary. For example, if a * vCPU is HLTed with a high PPR, a low priority interrupt would cause * the 0->1 'pending' transition with a notification, but the vCPU * would ignore the interrupt for the time being. The same vCPU would * need to then be notified if a high-priority interrupt arrived which * satisfied the PPR. * * The priorities of interrupts injected while 'pending' is asserted * are tracked in a custom bitfield 'pending_prio'. Should the * to-be-injected interrupt exceed the priorities already present, the * notification is sent. The priorities recorded in 'pending_prio' are * cleared whenever the 'pending' bit makes another 0->1 transition. */ if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { notify = 1; vlapic_vtx->pending_prio = 0; } else { const u_int old_prio = vlapic_vtx->pending_prio; const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); notify = 1; } } VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); return (notify); } static int vmx_pending_intr(struct vlapic *vlapic, int *vecptr) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t pending, pirval; uint32_t ppr, vpr; int i; /* * This function is only expected to be called from the 'HLT' exit * handler which does not care about the vector that is pending. */ KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); if (!pending) { /* * While a virtual interrupt may have already been * processed the actual delivery maybe pending the * interruptibility of the guest. Recognize a pending * interrupt by reevaluating virtual interrupts * following Section 29.2.1 in the Intel SDM Volume 3. 
*/ struct vm_exit *vmexit; uint8_t rvi, ppr; vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, ("vmx_pending_intr: exitcode not 'HLT'")); rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (rvi > ppr) { return (1); } return (0); } /* * If there is an interrupt pending then it will be recognized only * if its priority is greater than the processor priority. * * Special case: if the processor priority is zero then any pending * interrupt will be recognized. */ lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); vpr = 0; for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; break; } } /* * If the highest-priority pending interrupt falls short of the * processor priority of this vCPU, ensure that 'pending_prio' does not * have any stale bits which would preclude a higher-priority interrupt * from incurring a notification later. */ if (vpr <= ppr) { const u_int prio_bit = VPR_PRIO_BIT(vpr); const u_int old = vlapic_vtx->pending_prio; if (old > prio_bit && (old & prio_bit) == 0) { vlapic_vtx->pending_prio = prio_bit; } return (0); } return (1); } static void vmx_intr_accepted(struct vlapic *vlapic, int vector) { panic("vmx_intr_accepted: not expected to be called"); } static void vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; uint64_t mask, val; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), ("vmx_set_tmr: vcpu cannot be running")); vlapic_vtx = (struct vlapic_vtx *)vlapic; vmx = vlapic_vtx->vmx; vmcs = &vmx->vmcs[vlapic->vcpuid]; mask = 1UL << (vector % 64); VMPTRLD(vmcs); val = vmcs_read(VMCS_EOI_EXIT(vector)); if (level) val |= mask; else val &= ~mask; vmcs_write(VMCS_EOI_EXIT(vector), val); VMCLEAR(vmcs); } static void vmx_enable_x2apic_mode_ts(struct vlapic *vlapic) { struct vmx *vmx; struct vmcs *vmcs; uint32_t proc_ctls; int vcpuid; vcpuid = vlapic->vcpuid; vmx = ((struct vlapic_vtx *)vlapic)->vmx; vmcs = &vmx->vmcs[vcpuid]; proc_ctls = vmx->cap[vcpuid].proc_ctls; proc_ctls &= ~PROCBASED_USE_TPR_SHADOW; proc_ctls |= PROCBASED_CR8_LOAD_EXITING; proc_ctls |= PROCBASED_CR8_STORE_EXITING; vmx->cap[vcpuid].proc_ctls = proc_ctls; VMPTRLD(vmcs); vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls); VMCLEAR(vmcs); } static void vmx_enable_x2apic_mode_vid(struct vlapic *vlapic) { struct vmx *vmx; struct vmcs *vmcs; uint32_t proc_ctls2; int vcpuid, error; vcpuid = vlapic->vcpuid; vmx = ((struct vlapic_vtx *)vlapic)->vmx; vmcs = &vmx->vmcs[vcpuid]; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); if (vlapic->vcpuid == 0) { /* * The nested page table mappings are shared by all vcpus * so unmap the APIC access page just once. 
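 *
 * (The access page is the single page at DEFAULT_APIC_BASE, so the
 * one PAGE_SIZE unmap below covers every vcpu's mappings.)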
*/ error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", __func__, error)); /* * The MSR bitmap is shared by all vcpus so modify it only * once in the context of vcpu 0. */ error = vmx_allow_x2apic_msrs(vmx); KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", __func__, error)); } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { ipi_cpu(hostcpu, pirvec); } /* * Transfer the pending interrupts in the PIR descriptor to the IRR * in the virtual APIC page. */ static void vmx_inject_pir(struct vlapic *vlapic) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t val, pirval; int rvi, pirbase = -1; uint16_t intr_status_old, intr_status_new; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "no posted interrupt pending"); return; } pirval = 0; pirbase = -1; lapic = vlapic->apic_page; val = atomic_readandclear_long(&pir_desc->pir[0]); if (val != 0) { lapic->irr0 |= val; lapic->irr1 |= val >> 32; pirbase = 0; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[1]); if (val != 0) { lapic->irr2 |= val; lapic->irr3 |= val >> 32; pirbase = 64; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[2]); if (val != 0) { lapic->irr4 |= val; lapic->irr5 |= val >> 32; pirbase = 128; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[3]); if (val != 0) { lapic->irr6 |= val; lapic->irr7 |= val >> 32; pirbase = 192; pirval = val; } VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); /* * Update RVI so the processor can evaluate pending virtual * interrupts on VM-entry. * * It is possible for pirval to be 0 here, even though the * pending bit has been set. The scenario is: * CPU-Y is sending a posted interrupt to CPU-X, which * is running a guest and processing posted interrupts in h/w. * CPU-X will eventually exit and the state seen in s/w is * the pending bit set, but no PIR bits set. 
* * CPU-X CPU-Y * (vm running) (host running) * rx posted interrupt * CLEAR pending bit * SET PIR bit * READ/CLEAR PIR bits * SET pending bit * (vm exit) * pending bit set, PIR 0 */ if (pirval != 0) { rvi = pirbase + flsl(pirval) - 1; intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); intr_status_new = (intr_status_old & 0xFF00) | rvi; if (intr_status_new > intr_status_old) { vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "guest_intr_status changed from 0x%04x to 0x%04x", intr_status_old, intr_status_new); } } } static struct vlapic * vmx_vlapic_init(void *arg, int vcpuid) { struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vmx->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; vlapic_vtx = (struct vlapic_vtx *)vlapic; vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; vlapic_vtx->vmx = vmx; if (tpr_shadowing) { vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts; } if (virtual_interrupt_delivery) { vlapic->ops.set_intr_ready = vmx_set_intr_ready; vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid; } if (posted_interrupts) vlapic->ops.post_intr = vmx_post_intr; vlapic_init(vlapic); return (vlapic); } static void vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_VLAPIC); } +#ifdef BHYVE_SNAPSHOT +static int +vmx_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta) +{ + struct vmx *vmx; + struct vmxctx *vmxctx; + int i; + int ret; + + vmx = arg; + + KASSERT(vmx != NULL, ("%s: arg was NULL", __func__)); + + for (i = 0; i < VM_MAXCPU; i++) { + SNAPSHOT_BUF_OR_LEAVE(vmx->guest_msrs[i], + sizeof(vmx->guest_msrs[i]), meta, ret, done); + + vmxctx = &vmx->ctx[i]; + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, ret, done); + } + +done: + return (ret); +} + +static int +vmx_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu) +{ + struct vmcs *vmcs; + struct vmx *vmx; + int err, run, hostcpu; + + vmx = (struct vmx *)arg; + err = 0; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + 
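+	/*
+	 * The snapshot is taken against this vcpu's VMCS; bail out below
+	 * if the vcpu is currently live on another host cpu.
+	 */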
vmcs = &vmx->vmcs[vcpu]; + + run = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (run && hostcpu != curcpu) { + printf("%s: %s%d is running", __func__, vm_name(vmx->vm), vcpu); + return (EINVAL); + } + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR0, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR3, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR4, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DR7, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RSP, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RIP, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RFLAGS, meta); + + /* Guest segments */ + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_ES, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_ES, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_CS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_SS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_SS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_DS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_FS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_FS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_GS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_TR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_TR, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_LDTR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_LDTR, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_EFER, meta); + + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_IDTR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GDTR, meta); + + /* Guest page tables */ + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE0, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE1, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE2, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE3, meta); + + /* Other guest state */ + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_CS, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_ESP, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_EIP, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_INTERRUPTIBILITY, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_ACTIVITY, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_ENTRY_CTLS, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_EXIT_CTLS, meta); + + return (err); +} + +static int +vmx_restore_tsc(void *arg, int vcpu, uint64_t offset) +{ + struct vmcs *vmcs; + struct vmx *vmx = (struct vmx *)arg; + int error, running, hostcpu; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + vmcs = &vmx->vmcs[vcpu]; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) { + printf("%s: %s%d is running", __func__, vm_name(vmx->vm), vcpu); + return (EINVAL); + } + + if (!running) + VMPTRLD(vmcs); + + error = vmx_set_tsc_offset(vmx, vcpu, offset); + + if (!running) + VMCLEAR(vmcs); + return (error); +} +#endif + struct vmm_ops vmm_ops_intel = { .init = vmx_init, .cleanup = vmx_cleanup, .resume = vmx_restore, .vminit = vmx_vminit, .vmrun = vmx_run, .vmcleanup = vmx_vmcleanup, .vmgetreg = vmx_getreg, .vmsetreg = vmx_setreg, .vmgetdesc = vmx_getdesc, .vmsetdesc = 
vmx_setdesc, .vmgetcap = vmx_getcap, .vmsetcap = vmx_setcap, .vmspace_alloc = ept_vmspace_alloc, .vmspace_free = ept_vmspace_free, .vlapic_init = vmx_vlapic_init, .vlapic_cleanup = vmx_vlapic_cleanup, +#ifdef BHYVE_SNAPSHOT + .vmsnapshot = vmx_snapshot_vmi, + .vmcx_snapshot = vmx_snapshot_vmcx, + .vm_restore_tsc = vmx_restore_tsc, +#endif }; diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c index ba4cd7785e7d..1e053d26c182 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -1,810 +1,853 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include +#include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vioapic.h" #include "vatpic.h" static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); #define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) #define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) #define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) enum irqstate { IRQSTATE_ASSERT, IRQSTATE_DEASSERT, IRQSTATE_PULSE }; struct atpic { bool ready; int icw_num; int rd_cmd_reg; bool aeoi; bool poll; bool rotate; bool sfn; /* special fully-nested mode */ int irq_base; uint8_t request; /* Interrupt Request Register (IIR) */ uint8_t service; /* Interrupt Service (ISR) */ uint8_t mask; /* Interrupt Mask Register (IMR) */ uint8_t smm; /* special mask mode */ int acnt[8]; /* sum of pin asserts and deasserts */ int lowprio; /* lowest priority irq */ bool intr_raised; }; struct vatpic { struct vm *vm; struct mtx mtx; struct atpic atpic[2]; uint8_t elc[2]; }; #define VATPIC_CTR0(vatpic, fmt) \ VM_CTR0((vatpic)->vm, fmt) #define VATPIC_CTR1(vatpic, fmt, a1) \ VM_CTR1((vatpic)->vm, fmt, a1) #define VATPIC_CTR2(vatpic, fmt, a1, a2) \ VM_CTR2((vatpic)->vm, fmt, a1, a2) #define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) #define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) /* * Loop over all the pins in priority order from highest to lowest. 
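 *
 * For example, if a rotating EOI has just set lowprio to 5, the scan
 * visits pins 6, 7, 0, 1, 2, 3, 4, 5 in that order; with the reset
 * value of 7 it is simply pin 0 through pin 7.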
*/ #define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ tmpvar < 8; \ tmpvar++, pinvar = (pinvar + 1) & 0x7) static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); static __inline bool master_atpic(struct vatpic *vatpic, struct atpic *atpic) { if (atpic == &vatpic->atpic[0]) return (true); else return (false); } static __inline int vatpic_get_highest_isrpin(struct atpic *atpic) { int bit, pin; int i; ATPIC_PIN_FOREACH(pin, atpic, i) { bit = (1 << pin); if (atpic->service & bit) { /* * An IS bit that is masked by an IMR bit will not be * cleared by a non-specific EOI in Special Mask Mode. */ if (atpic->smm && (atpic->mask & bit) != 0) continue; else return (pin); } } return (-1); } static __inline int vatpic_get_highest_irrpin(struct atpic *atpic) { int serviced; int bit, pin, tmp; /* * In 'Special Fully-Nested Mode' when an interrupt request from * a slave is in service, the slave is not locked out from the * master's priority logic. */ serviced = atpic->service; if (atpic->sfn) serviced &= ~(1 << 2); /* * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits * further interrupts at that level and enables interrupts from all * other levels that are not masked. In other words the ISR has no * bearing on the levels that can generate interrupts. */ if (atpic->smm) serviced = 0; ATPIC_PIN_FOREACH(pin, atpic, tmp) { bit = 1 << pin; /* * If there is already an interrupt in service at the same * or higher priority then bail. */ if ((serviced & bit) != 0) break; /* * If an interrupt is asserted and not masked then return * the corresponding 'pin' to the caller. */ if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) return (pin); } return (-1); } static void vatpic_notify_intr(struct vatpic *vatpic) { struct atpic *atpic; int pin; KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); /* * First check the slave. */ atpic = &vatpic->atpic[1]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * Cascade the request from the slave to the master. */ atpic->intr_raised = true; vatpic_set_pinstate(vatpic, 2, true); vatpic_set_pinstate(vatpic, 2, false); } else { VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } /* * Then check the master. */ atpic = &vatpic->atpic[0]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic master notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * From Section 3.6.2, "Interrupt Modes", in the * MPtable Specification, Version 1.4 * * PIC interrupts are routed to both the Local APIC * and the I/O APIC to support operation in 1 of 3 * modes. * * 1. Legacy PIC Mode: the PIC effectively bypasses * all APIC components. In this mode the local APIC is * disabled and LINT0 is reconfigured as INTR to * deliver the PIC interrupt directly to the CPU. * * 2. Virtual Wire Mode: the APIC is treated as a * virtual wire which delivers interrupts from the PIC * to the CPU. In this mode LINT0 is programmed as * ExtINT to indicate that the PIC is the source of * the interrupt. * * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are * fielded by the I/O APIC and delivered to the appropriate * CPU. 
In this mode the I/O APIC input 0 is programmed * as ExtINT to indicate that the PIC is the source of the * interrupt. */ atpic->intr_raised = true; lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); vioapic_pulse_irq(vatpic->vm, 0); } else { VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } } static int vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); atpic->ready = false; atpic->icw_num = 1; atpic->request = 0; atpic->mask = 0; atpic->lowprio = 7; atpic->rd_cmd_reg = 0; atpic->poll = 0; atpic->smm = 0; if ((val & ICW1_SNGL) != 0) { VATPIC_CTR0(vatpic, "vatpic cascade mode required"); return (-1); } if ((val & ICW1_IC4) == 0) { VATPIC_CTR0(vatpic, "vatpic icw4 required"); return (-1); } atpic->icw_num++; return (0); } static int vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); atpic->irq_base = val & 0xf8; atpic->icw_num++; return (0); } static int vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); atpic->icw_num++; return (0); } static int vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); if ((val & ICW4_8086) == 0) { VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); return (-1); } if ((val & ICW4_AEOI) != 0) atpic->aeoi = true; if ((val & ICW4_SFNM) != 0) { if (master_atpic(vatpic, atpic)) { atpic->sfn = true; } else { VATPIC_CTR1(vatpic, "Ignoring special fully nested " "mode on slave atpic: %#x", val); } } atpic->icw_num = 0; atpic->ready = true; return (0); } static int vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); atpic->mask = val & 0xff; return (0); } static int vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); atpic->rotate = ((val & OCW2_R) != 0); if ((val & OCW2_EOI) != 0) { int isr_bit; if ((val & OCW2_SL) != 0) { /* specific EOI */ isr_bit = val & 0x7; } else { /* non-specific EOI */ isr_bit = vatpic_get_highest_isrpin(atpic); } if (isr_bit != -1) { atpic->service &= ~(1 << isr_bit); if (atpic->rotate) atpic->lowprio = isr_bit; } } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { /* specific priority */ atpic->lowprio = val & 0x7; } return (0); } static int vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); if (val & OCW3_ESMM) { atpic->smm = val & OCW3_SMM ? 1 : 0; VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", master_atpic(vatpic, atpic) ? "master" : "slave", atpic->smm ? 
"enabled" : "disabled"); } if (val & OCW3_RR) { /* read register command */ atpic->rd_cmd_reg = val & OCW3_RIS; /* Polling mode */ atpic->poll = ((val & OCW3_P) != 0); } return (0); } static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) { struct atpic *atpic; int oldcnt, newcnt; bool level; KASSERT(pin >= 0 && pin < 16, ("vatpic_set_pinstate: invalid pin number %d", pin)); KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_set_pinstate: vatpic is not locked")); atpic = &vatpic->atpic[pin >> 3]; oldcnt = atpic->acnt[pin & 0x7]; if (newstate) atpic->acnt[pin & 0x7]++; else atpic->acnt[pin & 0x7]--; newcnt = atpic->acnt[pin & 0x7]; if (newcnt < 0) { VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); } level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { /* rising edge or level */ VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); atpic->request |= (1 << (pin & 0x7)); } else if (oldcnt == 1 && newcnt == 0) { /* falling edge */ VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); if (level) atpic->request &= ~(1 << (pin & 0x7)); } else { VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", pin, newstate ? "asserted" : "deasserted", newcnt); } vatpic_notify_intr(vatpic); } static int vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) { struct vatpic *vatpic; struct atpic *atpic; if (irq < 0 || irq > 15) return (EINVAL); vatpic = vm_atpic(vm); atpic = &vatpic->atpic[irq >> 3]; if (atpic->ready == false) return (0); VATPIC_LOCK(vatpic); switch (irqstate) { case IRQSTATE_ASSERT: vatpic_set_pinstate(vatpic, irq, true); break; case IRQSTATE_DEASSERT: vatpic_set_pinstate(vatpic, irq, false); break; case IRQSTATE_PULSE: vatpic_set_pinstate(vatpic, irq, true); vatpic_set_pinstate(vatpic, irq, false); break; default: panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); } VATPIC_UNLOCK(vatpic); return (0); } int vatpic_assert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); } int vatpic_deassert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); } int vatpic_pulse_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) { struct vatpic *vatpic; if (irq < 0 || irq > 15) return (EINVAL); /* * See comment in vatpic_elc_handler. These IRQs must be * edge triggered. */ if (trigger == LEVEL_TRIGGER) { switch (irq) { case 0: case 1: case 2: case 8: case 13: return (EINVAL); } } vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); if (trigger == LEVEL_TRIGGER) vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); else vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); VATPIC_UNLOCK(vatpic); return (0); } void vatpic_pending_intr(struct vm *vm, int *vecptr) { struct vatpic *vatpic; struct atpic *atpic; int pin; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; VATPIC_LOCK(vatpic); pin = vatpic_get_highest_irrpin(atpic); if (pin == 2) { atpic = &vatpic->atpic[1]; pin = vatpic_get_highest_irrpin(atpic); } /* * If there are no pins active at this moment then return the spurious * interrupt vector instead. 
*/ if (pin == -1) pin = 7; KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); *vecptr = atpic->irq_base + pin; VATPIC_UNLOCK(vatpic); } static void vatpic_pin_accepted(struct atpic *atpic, int pin) { atpic->intr_raised = false; if (atpic->acnt[pin] == 0) atpic->request &= ~(1 << pin); if (atpic->aeoi == true) { if (atpic->rotate == true) atpic->lowprio = pin; } else { atpic->service |= (1 << pin); } } void vatpic_intr_accepted(struct vm *vm, int vector) { struct vatpic *vatpic; int pin; vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); pin = vector & 0x7; if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { vatpic_pin_accepted(&vatpic->atpic[1], pin); /* * If this vector originated from the slave, * accept the cascaded interrupt too. */ vatpic_pin_accepted(&vatpic->atpic[0], 2); } else { vatpic_pin_accepted(&vatpic->atpic[0], pin); } vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); } static int vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int pin; VATPIC_LOCK(vatpic); if (atpic->poll) { atpic->poll = 0; pin = vatpic_get_highest_irrpin(atpic); if (pin >= 0) { vatpic_pin_accepted(atpic, pin); *eax = 0x80 | pin; } else { *eax = 0; } } else { if (port & ICU_IMR_OFFSET) { /* read interrrupt mask register */ *eax = atpic->mask; } else { if (atpic->rd_cmd_reg == OCW3_RIS) { /* read interrupt service register */ *eax = atpic->service; } else { /* read interrupt request register */ *eax = atpic->request; } } } VATPIC_UNLOCK(vatpic); return (0); } static int vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int error; uint8_t val; error = 0; val = *eax; VATPIC_LOCK(vatpic); if (port & ICU_IMR_OFFSET) { switch (atpic->icw_num) { case 2: error = vatpic_icw2(vatpic, atpic, val); break; case 3: error = vatpic_icw3(vatpic, atpic, val); break; case 4: error = vatpic_icw4(vatpic, atpic, val); break; default: error = vatpic_ocw1(vatpic, atpic, val); break; } } else { if (val & (1 << 4)) error = vatpic_icw1(vatpic, atpic, val); if (atpic->ready) { if (val & (1 << 3)) error = vatpic_ocw3(vatpic, atpic, val); else error = vatpic_ocw2(vatpic, atpic, val); } } if (atpic->ready) vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); return (error); } int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[1]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; bool is_master; vatpic = vm_atpic(vm); is_master = (port == IO_ELCR1); if (bytes != 1) return (-1); VATPIC_LOCK(vatpic); if (in) { if (is_master) *eax = vatpic->elc[0]; else *eax = vatpic->elc[1]; } else { /* * For the master PIC the cascade channel (IRQ2), the * heart beat timer (IRQ0), and the keyboard * controller (IRQ1) cannot be programmed for level * mode. 
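 *
 * (This is what the 0xf8 write mask applied to ELCR1 below encodes:
 * bits 0-2 are forced to zero, i.e. edge triggered.)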
* * For the slave PIC the real time clock (IRQ8) and * the floating point error interrupt (IRQ13) cannot * be programmed for level mode. */ if (is_master) vatpic->elc[0] = (*eax & 0xf8); else vatpic->elc[1] = (*eax & 0xde); } VATPIC_UNLOCK(vatpic); return (0); } struct vatpic * vatpic_init(struct vm *vm) { struct vatpic *vatpic; vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); vatpic->vm = vm; mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); return (vatpic); } void vatpic_cleanup(struct vatpic *vatpic) { free(vatpic, M_VATPIC); } + +#ifdef BHYVE_SNAPSHOT +int +vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct atpic *atpic; + + for (i = 0; i < nitems(vatpic->atpic); i++) { + atpic = &vatpic->atpic[i]; + + SNAPSHOT_VAR_OR_LEAVE(atpic->ready, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->icw_num, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->rd_cmd_reg, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atpic->aeoi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->poll, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->rotate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->sfn, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->irq_base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->request, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->service, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->mask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->smm, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(atpic->acnt, sizeof(atpic->acnt), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->lowprio, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->intr_raised, meta, ret, done); + + } + + SNAPSHOT_BUF_OR_LEAVE(vatpic->elc, sizeof(vatpic->elc), + meta, ret, done); + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vatpic.h b/sys/amd64/vmm/io/vatpic.h index d4a1be18208d..8990a2a5fcb0 100644 --- a/sys/amd64/vmm/io/vatpic.h +++ b/sys/amd64/vmm/io/vatpic.h @@ -1,57 +1,63 @@ /*- * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _VATPIC_H_ #define _VATPIC_H_ #include #define ICU_IMR_OFFSET 1 #define IO_ELCR1 0x4d0 #define IO_ELCR2 0x4d1 +struct vm_snapshot_meta; + struct vatpic *vatpic_init(struct vm *vm); void vatpic_cleanup(struct vatpic *vatpic); int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); int vatpic_assert_irq(struct vm *vm, int irq); int vatpic_deassert_irq(struct vm *vm, int irq); int vatpic_pulse_irq(struct vm *vm, int irq); int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger); void vatpic_pending_intr(struct vm *vm, int *vecptr); void vatpic_intr_accepted(struct vm *vm, int vector); +#ifdef BHYVE_SNAPSHOT +int vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta); +#endif + #endif /* _VATPIC_H_ */ diff --git a/sys/amd64/vmm/io/vatpit.c b/sys/amd64/vmm/io/vatpit.c index 91d64af21233..4718a0557065 100644 --- a/sys/amd64/vmm/io/vatpit.c +++ b/sys/amd64/vmm/io/vatpit.c @@ -1,474 +1,516 @@ /*- * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include +#include #include "vmm_ktr.h" #include "vatpic.h" #include "vioapic.h" #include "vatpit.h" static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)"); #define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx)) #define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx)) #define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx)) #define TIMER_SEL_MASK 0xc0 #define TIMER_RW_MASK 0x30 #define TIMER_MODE_MASK 0x0f #define TIMER_SEL_READBACK 0xc0 #define TIMER_STS_OUT 0x80 #define TIMER_STS_NULLCNT 0x40 #define TIMER_RB_LCTR 0x20 #define TIMER_RB_LSTATUS 0x10 #define TIMER_RB_CTR_2 0x08 #define TIMER_RB_CTR_1 0x04 #define TIMER_RB_CTR_0 0x02 #define TMR2_OUT_STS 0x20 #define PIT_8254_FREQ 1193182 #define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) struct vatpit_callout_arg { struct vatpit *vatpit; int channel_num; }; struct channel { int mode; uint16_t initial; /* initial counter value */ struct bintime now_bt; /* uptime when counter was loaded */ uint8_t cr[2]; uint8_t ol[2]; bool slatched; /* status latched */ uint8_t status; int crbyte; int olbyte; int frbyte; struct callout callout; struct bintime callout_bt; /* target time */ struct vatpit_callout_arg callout_arg; }; struct vatpit { struct vm *vm; struct mtx mtx; struct bintime freq_bt; struct channel channel[3]; }; static void pit_timer_start_cntr0(struct vatpit *vatpit); static uint64_t vatpit_delta_ticks(struct vatpit *vatpit, struct channel *c) { struct bintime delta; uint64_t result; binuptime(&delta); bintime_sub(&delta, &c->now_bt); result = delta.sec * PIT_8254_FREQ; result += delta.frac / vatpit->freq_bt.frac; return (result); } static int vatpit_get_out(struct vatpit *vatpit, int channel) { struct channel *c; uint64_t delta_ticks; int out; c = &vatpit->channel[channel]; switch (c->mode) { case TIMER_INTTC: delta_ticks = vatpit_delta_ticks(vatpit, c); out = (delta_ticks >= c->initial); break; default: out = 0; break; } return (out); } static void vatpit_callout_handler(void *a) { struct vatpit_callout_arg *arg = a; struct vatpit *vatpit; struct callout *callout; struct channel *c; vatpit = arg->vatpit; c = &vatpit->channel[arg->channel_num]; callout = &c->callout; VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); VATPIT_LOCK(vatpit); if (callout_pending(callout)) /* callout was reset */ goto done; if (!callout_active(callout)) /* callout was stopped */ goto done; callout_deactivate(callout); if (c->mode == TIMER_RATEGEN) { pit_timer_start_cntr0(vatpit); } vatpic_pulse_irq(vatpit->vm, 0); vioapic_pulse_irq(vatpit->vm, 2); done: VATPIT_UNLOCK(vatpit); return; } static void pit_timer_start_cntr0(struct vatpit *vatpit) { struct channel *c; struct bintime now, delta; sbintime_t precision; c = &vatpit->channel[0]; if (c->initial != 0) { delta.sec = 0; delta.frac = vatpit->freq_bt.frac * c->initial; bintime_add(&c->callout_bt, &delta); precision = bttosbt(delta) >> tc_precexp; /* * Reset 'callout_bt' if the time that the callout * was supposed to fire is more than 'c->initial' * ticks in the past. 
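 *
 * For example (the rate is illustrative): a guest programming a
 * 100 Hz tick loads c->initial = TIMER_DIV(1193182, 100) = 11932
 * counts, i.e. ~10 ms. If the deadline has slipped several periods
 * behind 'now', rescheduling one period after 'now' avoids a burst
 * of immediate catch-up callouts.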
*/ binuptime(&now); if (bintime_cmp(&c->callout_bt, &now, <)) { c->callout_bt = now; bintime_add(&c->callout_bt, &delta); } callout_reset_sbt(&c->callout, bttosbt(c->callout_bt), precision, vatpit_callout_handler, &c->callout_arg, C_ABSOLUTE); } } static uint16_t pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) { uint16_t lval; uint64_t delta_ticks; /* cannot latch a new value until the old one has been consumed */ if (latch && c->olbyte != 0) return (0); if (c->initial == 0) { /* * This is possibly an o/s bug - reading the value of * the timer without having set up the initial value. * * The original user-space version of this code set * the timer to 100hz in this condition; do the same * here. */ c->initial = TIMER_DIV(PIT_8254_FREQ, 100); binuptime(&c->now_bt); c->status &= ~TIMER_STS_NULLCNT; } delta_ticks = vatpit_delta_ticks(vatpit, c); lval = c->initial - delta_ticks % c->initial; if (latch) { c->olbyte = 2; c->ol[1] = lval; /* LSB */ c->ol[0] = lval >> 8; /* MSB */ } return (lval); } static int pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) { struct channel *c; c = &vatpit->channel[channel]; /* * Latch the count/status of the timer if not already latched. * N.B. that the count/status latch-select bits are active-low. */ if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) { (void) pit_update_counter(vatpit, c, true); } if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) { c->slatched = true; /* * For mode 0, see if the elapsed time is greater * than the initial value - this results in the * output pin being set to 1 in the status byte. */ if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) c->status |= TIMER_STS_OUT; else c->status &= ~TIMER_STS_OUT; } return (0); } static int pit_readback(struct vatpit *vatpit, uint8_t cmd) { int error; /* * The readback command can apply to all timers. */ error = 0; if (cmd & TIMER_RB_CTR_0) error = pit_readback1(vatpit, 0, cmd); if (!error && cmd & TIMER_RB_CTR_1) error = pit_readback1(vatpit, 1, cmd); if (!error && cmd & TIMER_RB_CTR_2) error = pit_readback1(vatpit, 2, cmd); return (error); } static int vatpit_update_mode(struct vatpit *vatpit, uint8_t val) { struct channel *c; int sel, rw, mode; sel = val & TIMER_SEL_MASK; rw = val & TIMER_RW_MASK; mode = val & TIMER_MODE_MASK; if (sel == TIMER_SEL_READBACK) return (pit_readback(vatpit, val)); if (rw != TIMER_LATCH && rw != TIMER_16BIT) return (-1); if (rw != TIMER_LATCH) { /* * Counter mode is not affected when issuing a * latch command. 
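 *
 * For example, writing 0x00 to the mode port (counter 0 selected,
 * read/write bits 5:4 both zero) is a latch command: it only latches
 * the current count for reading and skips the mode checks below.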
*/ if (mode != TIMER_INTTC && mode != TIMER_RATEGEN && mode != TIMER_SQWAVE && mode != TIMER_SWSTROBE) return (-1); } c = &vatpit->channel[sel >> 6]; if (rw == TIMER_LATCH) pit_update_counter(vatpit, c, true); else { c->mode = mode; c->olbyte = 0; /* reset latch after reprogramming */ c->status |= TIMER_STS_NULLCNT; } return (0); } int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpit *vatpit; struct channel *c; uint8_t val; int error; vatpit = vm_atpit(vm); if (bytes != 1) return (-1); val = *eax; if (port == TIMER_MODE) { if (in) { VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); return (-1); } VATPIT_LOCK(vatpit); error = vatpit_update_mode(vatpit, val); VATPIT_UNLOCK(vatpit); return (error); } /* counter ports */ KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, ("invalid port 0x%x", port)); c = &vatpit->channel[port - TIMER_CNTR0]; VATPIT_LOCK(vatpit); if (in && c->slatched) { /* * Return the status byte if latched */ *eax = c->status; c->slatched = false; c->status = 0; } else if (in) { /* * The spec says that once the output latch is completely * read it should revert to "following" the counter. Use * the free running counter for this case (i.e. Linux * TSC calibration). Assuming the access mode is 16-bit, * toggle the MSB/LSB bit on each read. */ if (c->olbyte == 0) { uint16_t tmp; tmp = pit_update_counter(vatpit, c, false); if (c->frbyte) tmp >>= 8; tmp &= 0xff; *eax = tmp; c->frbyte ^= 1; } else *eax = c->ol[--c->olbyte]; } else { c->cr[c->crbyte++] = *eax; if (c->crbyte == 2) { c->status &= ~TIMER_STS_NULLCNT; c->frbyte = 0; c->crbyte = 0; c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; binuptime(&c->now_bt); /* Start an interval timer for channel 0 */ if (port == TIMER_CNTR0) { c->callout_bt = c->now_bt; pit_timer_start_cntr0(vatpit); } if (c->initial == 0) c->initial = 0xffff; } } VATPIT_UNLOCK(vatpit); return (0); } int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpit *vatpit; vatpit = vm_atpit(vm); if (in) { VATPIT_LOCK(vatpit); if (vatpit_get_out(vatpit, 2)) *eax = TMR2_OUT_STS; else *eax = 0; VATPIT_UNLOCK(vatpit); } return (0); } struct vatpit * vatpit_init(struct vm *vm) { struct vatpit *vatpit; struct vatpit_callout_arg *arg; int i; vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO); vatpit->vm = vm; mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); FREQ2BT(PIT_8254_FREQ, &vatpit->freq_bt); for (i = 0; i < 3; i++) { callout_init(&vatpit->channel[i].callout, 1); arg = &vatpit->channel[i].callout_arg; arg->vatpit = vatpit; arg->channel_num = i; } return (vatpit); } void vatpit_cleanup(struct vatpit *vatpit) { int i; for (i = 0; i < 3; i++) callout_drain(&vatpit->channel[i].callout); free(vatpit, M_VATPIT); } + +#ifdef BHYVE_SNAPSHOT +int +vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct channel *channel; + + SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.frac, meta, ret, done); + + /* properly restore timers; they will NOT work currently */ + printf("%s: snapshot restore does not reset timers!\r\n", __func__); + + for (i = 0; i < nitems(vatpit->channel); i++) { + channel = &vatpit->channel[i]; + + SNAPSHOT_VAR_OR_LEAVE(channel->mode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->initial, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.frac, meta, ret, 
done); + SNAPSHOT_BUF_OR_LEAVE(channel->cr, sizeof(channel->cr), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(channel->ol, sizeof(channel->ol), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->slatched, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->crbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->frbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.frac, meta, ret, + done); + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vatpit.h b/sys/amd64/vmm/io/vatpit.h index 090d1a6172a7..65e06ec9bf58 100644 --- a/sys/amd64/vmm/io/vatpit.h +++ b/sys/amd64/vmm/io/vatpit.h @@ -1,47 +1,52 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VATPIT_H_ #define _VATPIT_H_ #include #define NMISC_PORT 0x61 +struct vm_snapshot_meta; + struct vatpit *vatpit_init(struct vm *vm); void vatpit_cleanup(struct vatpit *vatpit); int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); +#ifdef BHYVE_SNAPSHOT +int vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta); +#endif #endif /* _VATPIT_H_ */ diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c index 8f91f9fe6d78..530f5d49f8f1 100644 --- a/sys/amd64/vmm/io/vhpet.c +++ b/sys/amd64/vmm/io/vhpet.c @@ -1,763 +1,812 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include +#include #include "vmm_lapic.h" #include "vatpic.h" #include "vioapic.h" #include "vhpet.h" #include "vmm_ktr.h" static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); #define HPET_FREQ 16777216 /* 16.7 (2^24) Mhz */ #define FS_PER_S 1000000000000000ul /* Timer N Configuration and Capabilities Register */ #define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \ HPET_TCAP_FSB_INT_DEL | \ HPET_TCAP_SIZE | \ HPET_TCAP_PER_INT) /* * HPET requires at least 3 timers and up to 32 timers per block. */ #define VHPET_NUM_TIMERS 8 CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32); struct vhpet_callout_arg { struct vhpet *vhpet; int timer_num; }; struct vhpet { struct vm *vm; struct mtx mtx; sbintime_t freq_sbt; uint64_t config; /* Configuration */ uint64_t isr; /* Interrupt Status */ uint32_t countbase; /* HPET counter base value */ sbintime_t countbase_sbt; /* uptime corresponding to base value */ struct { uint64_t cap_config; /* Configuration */ uint64_t msireg; /* FSB interrupt routing */ uint32_t compval; /* Comparator */ uint32_t comprate; struct callout callout; sbintime_t callout_sbt; /* time when counter==compval */ struct vhpet_callout_arg arg; } timer[VHPET_NUM_TIMERS]; }; #define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx)) #define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx)) static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now); static uint64_t vhpet_capabilities(void) { uint64_t cap = 0; cap |= 0x8086 << 16; /* vendor id */ cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ cap |= 1; /* revision */ cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ cap &= 0xffffffff; cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */ return (cap); } static __inline bool vhpet_counter_enabled(struct vhpet *vhpet) { return ((vhpet->config & HPET_CNF_ENABLE) ? true : false); } static __inline bool vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) { const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) return (true); else return (false); } static __inline int vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) { /* * If the timer is configured to use MSI then treat it as if the * timer is not connected to the ioapic. 
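 *
 * (A return value of 0 doubles as the "not routed" marker: e.g.
 * vhpet_timer_interrupt() drops the interrupt when the pin is 0.)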
*/ if (vhpet_timer_msi_enabled(vhpet, n)) return (0); return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); } static uint32_t vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) { uint32_t val; sbintime_t now, delta; val = vhpet->countbase; if (vhpet_counter_enabled(vhpet)) { now = sbinuptime(); delta = now - vhpet->countbase_sbt; KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: " "%#lx to %#lx", vhpet->countbase_sbt, now)); val += delta / vhpet->freq_sbt; if (nowptr != NULL) *nowptr = now; } else { /* * The sbinuptime corresponding to the 'countbase' is * meaningless when the counter is disabled. Make sure * that the caller doesn't want to use it. */ KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); } return (val); } static void vhpet_timer_clear_isr(struct vhpet *vhpet, int n) { int pin; if (vhpet->isr & (1 << n)) { pin = vhpet_timer_ioapic_pin(vhpet, n); KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); vioapic_deassert_irq(vhpet->vm, pin); vhpet->isr &= ~(1 << n); } } static __inline bool vhpet_periodic_timer(struct vhpet *vhpet, int n) { return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0); } static __inline bool vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n) { return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0); } static __inline bool vhpet_timer_edge_trig(struct vhpet *vhpet, int n) { KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " "timer %d is using MSI", n)); if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) return (true); else return (false); } static void vhpet_timer_interrupt(struct vhpet *vhpet, int n) { int pin; /* If interrupts are not enabled for this timer then just return. */ if (!vhpet_timer_interrupt_enabled(vhpet, n)) return; /* * If a level triggered interrupt is already asserted then just return. */ if ((vhpet->isr & (1 << n)) != 0) { VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n); return; } if (vhpet_timer_msi_enabled(vhpet, n)) { lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, vhpet->timer[n].msireg & 0xffffffff); return; } pin = vhpet_timer_ioapic_pin(vhpet, n); if (pin == 0) { VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n); return; } if (vhpet_timer_edge_trig(vhpet, n)) { vioapic_pulse_irq(vhpet->vm, pin); } else { vhpet->isr |= 1 << n; vioapic_assert_irq(vhpet->vm, pin); } } static void vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter) { uint32_t compval, comprate, compnext; KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n)); compval = vhpet->timer[n].compval; comprate = vhpet->timer[n].comprate; /* * Calculate the comparator value to be used for the next periodic * interrupt. * * This function is commonly called from the callout handler. * In this scenario the 'counter' is ahead of 'compval'. To find * the next value to program into the accumulator we divide the * number space between 'compval' and 'counter' into 'comprate' * sized units. The 'compval' is rounded up such that it is "ahead" * of 'counter'.
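 *
 * A worked example with illustrative values: compval = 100,
 * comprate = 50 and counter = 233 give
 *
 *	compnext = 100 + ((233 - 100) / 50 + 1) * 50 = 250
 *
 * the first comparator value strictly past 'counter'; the unsigned
 * 32-bit arithmetic gives the same result across a main counter wrap.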
*/ compnext = compval + ((counter - compval) / comprate + 1) * comprate; vhpet->timer[n].compval = compnext; } static void vhpet_handler(void *a) { int n; uint32_t counter; sbintime_t now; struct vhpet *vhpet; struct callout *callout; struct vhpet_callout_arg *arg; arg = a; vhpet = arg->vhpet; n = arg->timer_num; callout = &vhpet->timer[n].callout; VM_CTR1(vhpet->vm, "hpet t%d fired", n); VHPET_LOCK(vhpet); if (callout_pending(callout)) /* callout was reset */ goto done; if (!callout_active(callout)) /* callout was stopped */ goto done; callout_deactivate(callout); if (!vhpet_counter_enabled(vhpet)) panic("vhpet(%p) callout with counter disabled", vhpet); counter = vhpet_counter(vhpet, &now); vhpet_start_timer(vhpet, n, counter, now); vhpet_timer_interrupt(vhpet, n); done: VHPET_UNLOCK(vhpet); return; } static void vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now) { VM_CTR1(vhpet->vm, "hpet t%d stopped", n); callout_stop(&vhpet->timer[n].callout); /* * If the callout was scheduled to expire in the past but hasn't * had a chance to execute yet then trigger the timer interrupt * here. Failing to do so will result in a missed timer interrupt * in the guest. This is especially bad in one-shot mode because * the next interrupt has to wait for the counter to wrap around. */ if (vhpet->timer[n].callout_sbt < now) { VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " "stopping timer", n); vhpet_timer_interrupt(vhpet, n); } } static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) { sbintime_t delta, precision; if (vhpet->timer[n].comprate != 0) vhpet_adjust_compval(vhpet, n, counter); else { /* * In one-shot mode it is the guest's responsibility to make * sure that the comparator value is not in the "past". The * hardware doesn't have any belt-and-suspenders to deal with * this so we don't either. */ } delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt; precision = delta >> tc_precexp; vhpet->timer[n].callout_sbt = now + delta; callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt, precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE); } static void vhpet_start_counting(struct vhpet *vhpet) { int i; vhpet->countbase_sbt = sbinuptime(); for (i = 0; i < VHPET_NUM_TIMERS; i++) { /* * Restart the timers based on the value of the main counter * when it stopped counting. 
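 *
 * Note that 'countbase_sbt' was refreshed just above, so every
 * timer is re-armed against the same uptime snapshot that
 * vhpet_counter() will use as its base from now on.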
*/ vhpet_start_timer(vhpet, i, vhpet->countbase, vhpet->countbase_sbt); } } static void vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now) { int i; vhpet->countbase = counter; for (i = 0; i < VHPET_NUM_TIMERS; i++) vhpet_stop_timer(vhpet, i, now); } static __inline void update_register(uint64_t *regptr, uint64_t data, uint64_t mask) { *regptr &= ~mask; *regptr |= (data & mask); } static void vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, uint64_t mask) { bool clear_isr; int old_pin, new_pin; uint32_t allowed_irqs; uint64_t oldval, newval; if (vhpet_timer_msi_enabled(vhpet, n) || vhpet_timer_edge_trig(vhpet, n)) { if (vhpet->isr & (1 << n)) panic("vhpet timer %d isr should not be asserted", n); } old_pin = vhpet_timer_ioapic_pin(vhpet, n); oldval = vhpet->timer[n].cap_config; newval = oldval; update_register(&newval, data, mask); newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); newval |= oldval & HPET_TCAP_RO_MASK; if (newval == oldval) return; vhpet->timer[n].cap_config = newval; VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval); /* * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set * it to the default value of 0. */ allowed_irqs = vhpet->timer[n].cap_config >> 32; new_pin = vhpet_timer_ioapic_pin(vhpet, n); if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); new_pin = 0; vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE; } if (!vhpet_periodic_timer(vhpet, n)) vhpet->timer[n].comprate = 0; /* * If the timer's ISR bit is set then clear it in the following cases: * - interrupt is disabled * - interrupt type is changed from level to edge or fsb. * - interrupt routing is changed * * This is to ensure that this timer's level triggered interrupt does * not remain asserted forever. */ if (vhpet->isr & (1 << n)) { KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", n, old_pin)); if (!vhpet_timer_interrupt_enabled(vhpet, n)) clear_isr = true; else if (vhpet_timer_msi_enabled(vhpet, n)) clear_isr = true; else if (vhpet_timer_edge_trig(vhpet, n)) clear_isr = true; else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) clear_isr = true; else clear_isr = false; if (clear_isr) { VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " "configuration change", n); vioapic_deassert_irq(vhpet->vm, old_pin); vhpet->isr &= ~(1 << n); } } } int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, void *arg) { struct vhpet *vhpet; uint64_t data, mask, oldval, val64; uint32_t isr_clear_mask, old_compval, old_comprate, counter; sbintime_t now, *nowptr; int i, offset; vhpet = vm_hpet(vm); offset = gpa - VHPET_BASE; VHPET_LOCK(vhpet); /* Accesses to the HPET should be 4 or 8 bytes wide */ switch (size) { case 8: mask = 0xffffffffffffffff; data = val; break; case 4: mask = 0xffffffff; data = val; if ((offset & 0x4) != 0) { mask <<= 32; data <<= 32; } break; default: VM_CTR2(vhpet->vm, "hpet invalid mmio write: " "offset 0x%08x, size %d", offset, size); goto done; } /* Access to the HPET should be naturally aligned to its width */ if (offset & (size - 1)) { VM_CTR2(vhpet->vm, "hpet invalid mmio write: " "offset 0x%08x, size %d", offset, size); goto done; } if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { /* * Get the most recent value of the counter before updating * the 'config' register. 
If the HPET is going to be disabled * then we need to update 'countbase' with the value right * before it is disabled. */ nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL; counter = vhpet_counter(vhpet, nowptr); oldval = vhpet->config; update_register(&vhpet->config, data, mask); /* * LegacyReplacement Routing is not supported so clear the * bit explicitly. */ vhpet->config &= ~HPET_CNF_LEG_RT; if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { if (vhpet_counter_enabled(vhpet)) { vhpet_start_counting(vhpet); VM_CTR0(vhpet->vm, "hpet enabled"); } else { vhpet_stop_counting(vhpet, counter, now); VM_CTR0(vhpet->vm, "hpet disabled"); } } goto done; } if (offset == HPET_ISR || offset == HPET_ISR + 4) { isr_clear_mask = vhpet->isr & data; for (i = 0; i < VHPET_NUM_TIMERS; i++) { if ((isr_clear_mask & (1 << i)) != 0) { VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); vhpet_timer_clear_isr(vhpet, i); } } goto done; } if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { /* Zero-extend the counter to 64-bits before updating it */ val64 = vhpet_counter(vhpet, NULL); update_register(&val64, data, mask); vhpet->countbase = val64; if (vhpet_counter_enabled(vhpet)) vhpet_start_counting(vhpet); goto done; } for (i = 0; i < VHPET_NUM_TIMERS; i++) { if (offset == HPET_TIMER_CAP_CNF(i) || offset == HPET_TIMER_CAP_CNF(i) + 4) { vhpet_timer_update_config(vhpet, i, data, mask); break; } if (offset == HPET_TIMER_COMPARATOR(i) || offset == HPET_TIMER_COMPARATOR(i) + 4) { old_compval = vhpet->timer[i].compval; old_comprate = vhpet->timer[i].comprate; if (vhpet_periodic_timer(vhpet, i)) { /* * In periodic mode writes to the comparator * change the 'compval' register only if the * HPET_TCNF_VAL_SET bit is set in the config * register. */ val64 = vhpet->timer[i].comprate; update_register(&val64, data, mask); vhpet->timer[i].comprate = val64; if ((vhpet->timer[i].cap_config & HPET_TCNF_VAL_SET) != 0) { vhpet->timer[i].compval = val64; } } else { KASSERT(vhpet->timer[i].comprate == 0, ("vhpet one-shot timer %d has invalid " "rate %u", i, vhpet->timer[i].comprate)); val64 = vhpet->timer[i].compval; update_register(&val64, data, mask); vhpet->timer[i].compval = val64; } vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET; if (vhpet->timer[i].compval != old_compval || vhpet->timer[i].comprate != old_comprate) { if (vhpet_counter_enabled(vhpet)) { counter = vhpet_counter(vhpet, &now); vhpet_start_timer(vhpet, i, counter, now); } } break; } if (offset == HPET_TIMER_FSB_VAL(i) || offset == HPET_TIMER_FSB_ADDR(i)) { update_register(&vhpet->timer[i].msireg, data, mask); break; } } done: VHPET_UNLOCK(vhpet); return (0); } int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, void *arg) { int i, offset; struct vhpet *vhpet; uint64_t data; vhpet = vm_hpet(vm); offset = gpa - VHPET_BASE; VHPET_LOCK(vhpet); /* Accesses to the HPET should be 4 or 8 bytes wide */ if (size != 4 && size != 8) { VM_CTR2(vhpet->vm, "hpet invalid mmio read: " "offset 0x%08x, size %d", offset, size); data = 0; goto done; } /* Access to the HPET should be naturally aligned to its width */ if (offset & (size - 1)) { VM_CTR2(vhpet->vm, "hpet invalid mmio read: " "offset 0x%08x, size %d", offset, size); data = 0; goto done; } if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { data = vhpet_capabilities(); goto done; } if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { data = vhpet->config; goto done; } if (offset == HPET_ISR || offset == HPET_ISR + 4) { data = vhpet->isr; goto done; } if 
(offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { data = vhpet_counter(vhpet, NULL); goto done; } for (i = 0; i < VHPET_NUM_TIMERS; i++) { if (offset == HPET_TIMER_CAP_CNF(i) || offset == HPET_TIMER_CAP_CNF(i) + 4) { data = vhpet->timer[i].cap_config; break; } if (offset == HPET_TIMER_COMPARATOR(i) || offset == HPET_TIMER_COMPARATOR(i) + 4) { data = vhpet->timer[i].compval; break; } if (offset == HPET_TIMER_FSB_VAL(i) || offset == HPET_TIMER_FSB_ADDR(i)) { data = vhpet->timer[i].msireg; break; } } if (i >= VHPET_NUM_TIMERS) data = 0; done: VHPET_UNLOCK(vhpet); if (size == 4) { if (offset & 0x4) data >>= 32; } *rval = data; return (0); } struct vhpet * vhpet_init(struct vm *vm) { int i, pincount; struct vhpet *vhpet; uint64_t allowed_irqs; struct vhpet_callout_arg *arg; struct bintime bt; vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO); vhpet->vm = vm; mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF); FREQ2BT(HPET_FREQ, &bt); vhpet->freq_sbt = bttosbt(bt); pincount = vioapic_pincount(vm); if (pincount >= 32) allowed_irqs = 0xff000000; /* irqs 24-31 */ else if (pincount >= 20) allowed_irqs = 0xf << (pincount - 4); /* 4 upper irqs */ else allowed_irqs = 0; /* * Initialize HPET timer hardware state. */ for (i = 0; i < VHPET_NUM_TIMERS; i++) { vhpet->timer[i].cap_config = allowed_irqs << 32; vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; vhpet->timer[i].compval = 0xffffffff; callout_init(&vhpet->timer[i].callout, 1); arg = &vhpet->timer[i].arg; arg->vhpet = vhpet; arg->timer_num = i; } return (vhpet); } void vhpet_cleanup(struct vhpet *vhpet) { int i; for (i = 0; i < VHPET_NUM_TIMERS; i++) callout_drain(&vhpet->timer[i].callout); free(vhpet, M_VHPET); } int vhpet_getcap(struct vm_hpet_cap *cap) { cap->capabilities = vhpet_capabilities(); return (0); } + +#ifdef BHYVE_SNAPSHOT +int +vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta) +{ + int i, ret; + uint32_t countbase; + + SNAPSHOT_VAR_OR_LEAVE(vhpet->freq_sbt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->isr, meta, ret, done); + + /* at restore time the countbase should have the value it had when the + * snapshot was created; since the value is not directly kept in + * vhpet->countbase, but rather computed relative to the current system + * uptime using countbase_sbt, save the value retured by vhpet_counter + */ + if (meta->op == VM_SNAPSHOT_SAVE) + countbase = vhpet_counter(vhpet, NULL); + SNAPSHOT_VAR_OR_LEAVE(countbase, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + vhpet->countbase = countbase; + + for (i = 0; i < nitems(vhpet->timer); i++) { + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].cap_config, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].msireg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].compval, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].comprate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].callout_sbt, + meta, ret, done); + } + +done: + return (ret); +} + +int +vhpet_restore_time(struct vhpet *vhpet) +{ + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + + return (0); +} +#endif diff --git a/sys/amd64/vmm/io/vhpet.h b/sys/amd64/vmm/io/vhpet.h index 3d6b653055c9..113683c09b33 100644 --- a/sys/amd64/vmm/io/vhpet.h +++ b/sys/amd64/vmm/io/vhpet.h @@ -1,46 +1,52 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * 
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VHPET_H_ #define _VHPET_H_ #define VHPET_BASE 0xfed00000 #define VHPET_SIZE 1024 +struct vm_snapshot_meta; + struct vhpet *vhpet_init(struct vm *vm); void vhpet_cleanup(struct vhpet *vhpet); int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, void *arg); int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, int size, void *arg); int vhpet_getcap(struct vm_hpet_cap *cap); +#ifdef BHYVE_SNAPSHOT +int vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta); +int vhpet_restore_time(struct vhpet *vhpet); +#endif #endif /* _VHPET_H_ */ diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c index 31c1cabab094..a8117da4b879 100644 --- a/sys/amd64/vmm/io/vioapic.c +++ b/sys/amd64/vmm/io/vioapic.c @@ -1,501 +1,523 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include +#include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" #include "vioapic.h" #define IOREGSEL 0x00 #define IOWIN 0x10 #define REDIR_ENTRIES 32 #define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) struct vioapic { struct vm *vm; struct mtx mtx; uint32_t id; uint32_t ioregsel; struct { uint64_t reg; int acnt; /* sum of pin asserts (+1) and deasserts (-1) */ } rtbl[REDIR_ENTRIES]; }; #define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx)) #define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx)) #define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx)) static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic"); #define VIOAPIC_CTR1(vioapic, fmt, a1) \ VM_CTR1((vioapic)->vm, fmt, a1) #define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ VM_CTR2((vioapic)->vm, fmt, a1, a2) #define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) #define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \ VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4) #ifdef KTR static const char * pinstate_str(bool asserted) { if (asserted) return ("asserted"); else return ("deasserted"); } #endif static void vioapic_send_intr(struct vioapic *vioapic, int pin) { int vector, delmode; uint32_t low, high, dest; bool level, phys; KASSERT(pin >= 0 && pin < REDIR_ENTRIES, ("vioapic_set_pinstate: invalid pin number %d", pin)); KASSERT(VIOAPIC_LOCKED(vioapic), ("vioapic_set_pinstate: vioapic is not locked")); low = vioapic->rtbl[pin].reg; high = vioapic->rtbl[pin].reg >> 32; if ((low & IOART_INTMASK) == IOART_INTMSET) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); return; } phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); delmode = low & IOART_DELMOD; level = low & IOART_TRGRLVL ? 
true : false; if (level) vioapic->rtbl[pin].reg |= IOART_REM_IRR; vector = low & IOART_INTVEC; dest = high >> APIC_ID_SHIFT; vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); } static void vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) { int oldcnt, newcnt; bool needintr; KASSERT(pin >= 0 && pin < REDIR_ENTRIES, ("vioapic_set_pinstate: invalid pin number %d", pin)); KASSERT(VIOAPIC_LOCKED(vioapic), ("vioapic_set_pinstate: vioapic is not locked")); oldcnt = vioapic->rtbl[pin].acnt; if (newstate) vioapic->rtbl[pin].acnt++; else vioapic->rtbl[pin].acnt--; newcnt = vioapic->rtbl[pin].acnt; if (newcnt < 0) { VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d", pin, newcnt); } needintr = false; if (oldcnt == 0 && newcnt == 1) { needintr = true; VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin); } else if (oldcnt == 1 && newcnt == 0) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin); } else { VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d", pin, pinstate_str(newstate), newcnt); } if (needintr) vioapic_send_intr(vioapic, pin); } enum irqstate { IRQSTATE_ASSERT, IRQSTATE_DEASSERT, IRQSTATE_PULSE }; static int vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) { struct vioapic *vioapic; if (irq < 0 || irq >= REDIR_ENTRIES) return (EINVAL); vioapic = vm_ioapic(vm); VIOAPIC_LOCK(vioapic); switch (irqstate) { case IRQSTATE_ASSERT: vioapic_set_pinstate(vioapic, irq, true); break; case IRQSTATE_DEASSERT: vioapic_set_pinstate(vioapic, irq, false); break; case IRQSTATE_PULSE: vioapic_set_pinstate(vioapic, irq, true); vioapic_set_pinstate(vioapic, irq, false); break; default: panic("vioapic_set_irqstate: invalid irqstate %d", irqstate); } VIOAPIC_UNLOCK(vioapic); return (0); } int vioapic_assert_irq(struct vm *vm, int irq) { return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); } int vioapic_deassert_irq(struct vm *vm, int irq) { return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); } int vioapic_pulse_irq(struct vm *vm, int irq) { return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } /* * Reset the vlapic's trigger-mode register to reflect the ioapic pin * configuration. */ static void vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) { struct vioapic *vioapic; struct vlapic *vlapic; uint32_t low, high, dest; int delmode, pin, vector; bool level, phys; vlapic = vm_lapic(vm, vcpuid); vioapic = vm_ioapic(vm); VIOAPIC_LOCK(vioapic); /* * Reset all vectors to be edge-triggered. */ vlapic_reset_tmr(vlapic); for (pin = 0; pin < REDIR_ENTRIES; pin++) { low = vioapic->rtbl[pin].reg; high = vioapic->rtbl[pin].reg >> 32; level = low & IOART_TRGRLVL ? true : false; if (!level) continue; /* * For a level-triggered 'pin' let the vlapic figure out if * an assertion on this 'pin' would result in an interrupt * being delivered to it. If yes, then it will modify the * TMR bit associated with this vector to level-triggered. 
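 *
 * That TMR bit is what later causes vlapic_process_eoi() to hand
 * the EOI on to vioapic_process_eoi(), so a still-asserted
 * level-triggered pin can be re-sampled and delivered again.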
*/ phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); delmode = low & IOART_DELMOD; vector = low & IOART_INTVEC; dest = high >> APIC_ID_SHIFT; vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); } VIOAPIC_UNLOCK(vioapic); } static uint32_t vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr) { int regnum, pin, rshift; regnum = addr & 0xff; switch (regnum) { case IOAPIC_ID: return (vioapic->id); break; case IOAPIC_VER: return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); break; case IOAPIC_ARB: return (vioapic->id); break; default: break; } /* redirection table entries */ if (regnum >= IOAPIC_REDTBL && regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { pin = (regnum - IOAPIC_REDTBL) / 2; if ((regnum - IOAPIC_REDTBL) % 2) rshift = 32; else rshift = 0; return (vioapic->rtbl[pin].reg >> rshift); } return (0); } static void vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) { uint64_t data64, mask64; uint64_t last, changed; int regnum, pin, lshift; cpuset_t allvcpus; regnum = addr & 0xff; switch (regnum) { case IOAPIC_ID: vioapic->id = data & APIC_ID_MASK; break; case IOAPIC_VER: case IOAPIC_ARB: /* readonly */ break; default: break; } /* redirection table entries */ if (regnum >= IOAPIC_REDTBL && regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { pin = (regnum - IOAPIC_REDTBL) / 2; if ((regnum - IOAPIC_REDTBL) % 2) lshift = 32; else lshift = 0; last = vioapic->rtbl[pin].reg; data64 = (uint64_t)data << lshift; mask64 = (uint64_t)0xffffffff << lshift; vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx", pin, vioapic->rtbl[pin].reg); /* * If any fields in the redirection table entry (except mask * or polarity) have changed then rendezvous all the vcpus * to update their vlapic trigger-mode registers. */ changed = last ^ vioapic->rtbl[pin].reg; if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " "vlapic trigger-mode register", pin); VIOAPIC_UNLOCK(vioapic); allvcpus = vm_active_cpus(vioapic->vm); (void)vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, vioapic_update_tmr, NULL); VIOAPIC_LOCK(vioapic); } /* * Generate an interrupt if the following conditions are met: * - pin is not masked * - previous interrupt has been EOIed * - pin level is asserted */ if ((vioapic->rtbl[pin].reg & IOART_INTMASK) == IOART_INTMCLR && (vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0 && (vioapic->rtbl[pin].acnt > 0)) { VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " "write, acnt %d", pin, vioapic->rtbl[pin].acnt); vioapic_send_intr(vioapic, pin); } } } static int vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa, uint64_t *data, int size, bool doread) { uint64_t offset; offset = gpa - VIOAPIC_BASE; /* * The IOAPIC specification allows 32-bit wide accesses to the * IOREGSEL (offset 0) and IOWIN (offset 16) registers. 
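 *
 * Access is indirect: the guest writes a register index to
 * IOREGSEL and then reads or writes the selected 32-bit register
 * through the IOWIN data window, which is how the two offsets are
 * dispatched below.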
*/ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { if (doread) *data = 0; return (0); } VIOAPIC_LOCK(vioapic); if (offset == IOREGSEL) { if (doread) *data = vioapic->ioregsel; else vioapic->ioregsel = *data; } else { if (doread) { *data = vioapic_read(vioapic, vcpuid, vioapic->ioregsel); } else { vioapic_write(vioapic, vcpuid, vioapic->ioregsel, *data); } } VIOAPIC_UNLOCK(vioapic); return (0); } int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, void *arg) { int error; struct vioapic *vioapic; vioapic = vm_ioapic(vm); error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true); return (error); } int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, int size, void *arg) { int error; struct vioapic *vioapic; vioapic = vm_ioapic(vm); error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false); return (error); } void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector) { struct vioapic *vioapic; int pin; KASSERT(vector >= 0 && vector < 256, ("vioapic_process_eoi: invalid vector %d", vector)); vioapic = vm_ioapic(vm); VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); /* * XXX keep track of the pins associated with this vector instead * of iterating on every single pin each time. */ VIOAPIC_LOCK(vioapic); for (pin = 0; pin < REDIR_ENTRIES; pin++) { if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) continue; if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) continue; vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; if (vioapic->rtbl[pin].acnt > 0) { VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " "acnt %d", pin, vioapic->rtbl[pin].acnt); vioapic_send_intr(vioapic, pin); } } VIOAPIC_UNLOCK(vioapic); } struct vioapic * vioapic_init(struct vm *vm) { int i; struct vioapic *vioapic; vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO); vioapic->vm = vm; mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN); /* Initialize all redirection entries to mask all interrupts */ for (i = 0; i < REDIR_ENTRIES; i++) vioapic->rtbl[i].reg = 0x0001000000010000UL; return (vioapic); } void vioapic_cleanup(struct vioapic *vioapic) { free(vioapic, M_VIOAPIC); } int vioapic_pincount(struct vm *vm) { return (REDIR_ENTRIES); } + +#ifdef BHYVE_SNAPSHOT +int +vioapic_snapshot(struct vioapic *vioapic, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + + SNAPSHOT_VAR_OR_LEAVE(vioapic->ioregsel, meta, ret, done); + + for (i = 0; i < nitems(vioapic->rtbl); i++) { + SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].reg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].acnt, meta, ret, done); + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vioapic.h b/sys/amd64/vmm/io/vioapic.h index 730c4b3f2ad9..19dbffe3ec24 100644 --- a/sys/amd64/vmm/io/vioapic.h +++ b/sys/amd64/vmm/io/vioapic.h @@ -1,52 +1,59 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VIOAPIC_H_ #define _VIOAPIC_H_ +struct vm_snapshot_meta; + #define VIOAPIC_BASE 0xFEC00000 #define VIOAPIC_SIZE 4096 struct vioapic *vioapic_init(struct vm *vm); void vioapic_cleanup(struct vioapic *vioapic); int vioapic_assert_irq(struct vm *vm, int irq); int vioapic_deassert_irq(struct vm *vm, int irq); int vioapic_pulse_irq(struct vm *vm, int irq); int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, int size, void *arg); int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, void *arg); int vioapic_pincount(struct vm *vm); void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector); +#ifdef BHYVE_SNAPSHOT +int vioapic_snapshot(struct vioapic *vioapic, + struct vm_snapshot_meta *meta); +#endif + #endif diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 069989f12386..be944bf097d2 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -1,1652 +1,1758 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vlapic.h" #include "vlapic_priv.h" #include "vioapic.h" #define PRIO(x) ((x) >> 4) #define VLAPIC_VERSION (16) #define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) /* * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the * vlapic_callout_handler() and vcpu accesses to: * - timer_freq_bt, timer_period_bt, timer_fire_bt * - timer LVT register */ #define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) /* * APIC timer frequency: * - arbitrary but chosen to be in the ballpark of contemporary hardware. * - power-of-two to avoid loss of precision when converted to a bintime. */ #define VLAPIC_BUS_FREQ (128 * 1024 * 1024) static void vlapic_set_error(struct vlapic *, uint32_t, bool); static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) { if (x2apic(vlapic)) return (vlapic->vcpuid); else return (vlapic->vcpuid << 24); } static uint32_t x2apic_ldr(struct vlapic *vlapic) { int apicid; uint32_t ldr; apicid = vlapic_get_id(vlapic); ldr = 1 << (apicid & 0xf); ldr |= (apicid & 0xffff0) << 12; return (ldr); } void vlapic_dfr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; if (x2apic(vlapic)) { VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", lapic->dfr); lapic->dfr = 0; return; } lapic->dfr &= APIC_DFR_MODEL_MASK; lapic->dfr |= APIC_DFR_RESERVED; if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); else VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); } void vlapic_ldr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; /* LDR is read-only in x2apic mode */ if (x2apic(vlapic)) { VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", lapic->ldr); lapic->ldr = x2apic_ldr(vlapic); } else { lapic->ldr &= ~APIC_LDR_RESERVED; VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); } } void vlapic_id_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; /* * We don't allow the ID register to be modified so reset it back to * its default value. 
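 *
 * The default value comes from vlapic_get_id(): the raw vcpuid in
 * x2APIC mode, or the vcpuid shifted into bits 31:24 in xAPIC mode.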
*/ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); } static int vlapic_timer_divisor(uint32_t dcr) { switch (dcr & 0xB) { case APIC_TDCR_1: return (1); case APIC_TDCR_2: return (2); case APIC_TDCR_4: return (4); case APIC_TDCR_8: return (8); case APIC_TDCR_16: return (16); case APIC_TDCR_32: return (32); case APIC_TDCR_64: return (64); case APIC_TDCR_128: return (128); default: panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); } } #if 0 static inline void vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) { printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, *lvt & APIC_LVTT_M); } #endif static uint32_t vlapic_get_ccr(struct vlapic *vlapic) { struct bintime bt_now, bt_rem; struct LAPIC *lapic; uint32_t ccr; ccr = 0; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); if (callout_active(&vlapic->callout)) { /* * If the timer is scheduled to expire in the future then * compute the value of 'ccr' based on the remaining time. */ binuptime(&bt_now); if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { bt_rem = vlapic->timer_fire_bt; bintime_sub(&bt_rem, &bt_now); ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; } } KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " "icr_timer is %#x", ccr, lapic->icr_timer)); VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", ccr, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); return (ccr); } void vlapic_dcr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; int divisor; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); divisor = vlapic_timer_divisor(lapic->dcr_timer); VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", lapic->dcr_timer, divisor); /* * Update the timer frequency and the timer period. * * XXX changes to the frequency divider will not take effect until * the timer is reloaded. */ FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_esr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->esr = vlapic->esr_pending; vlapic->esr_pending = 0; } int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *irrptr, *tmrptr, mask; int idx; KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); lapic = vlapic->apic_page; if (!(lapic->svr & APIC_SVR_ENABLE)) { VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " "interrupt %d", vector); return (0); } if (vector < 16) { vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); } if (vlapic->ops.set_intr_ready) return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); idx = (vector / 32) * 4; mask = 1 << (vector % 32); irrptr = &lapic->irr0; atomic_set_int(&irrptr[idx], mask); /* * Verify that the trigger-mode of the interrupt matches with * the vlapic TMR registers. */ tmrptr = &lapic->tmr0; if ((tmrptr[idx] & mask) != (level ? mask : 0)) { VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " "interrupt is %s-triggered", idx / 4, tmrptr[idx], level ? 
"level" : "edge"); } VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); return (1); } static __inline uint32_t * vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { struct LAPIC *lapic = vlapic->apic_page; int i; switch (offset) { case APIC_OFFSET_CMCI_LVT: return (&lapic->lvt_cmci); case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; return ((&lapic->lvt_timer) + i); default: panic("vlapic_get_lvt: invalid LVT\n"); } } static __inline int lvt_off_to_idx(uint32_t offset) { int index; switch (offset) { case APIC_OFFSET_CMCI_LVT: index = APIC_LVT_CMCI; break; case APIC_OFFSET_TIMER_LVT: index = APIC_LVT_TIMER; break; case APIC_OFFSET_THERM_LVT: index = APIC_LVT_THERMAL; break; case APIC_OFFSET_PERF_LVT: index = APIC_LVT_PMC; break; case APIC_OFFSET_LINT0_LVT: index = APIC_LVT_LINT0; break; case APIC_OFFSET_LINT1_LVT: index = APIC_LVT_LINT1; break; case APIC_OFFSET_ERROR_LVT: index = APIC_LVT_ERROR; break; default: index = -1; break; } KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " "invalid lvt index %d for offset %#x", index, offset)); return (index); } static __inline uint32_t vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) { int idx; uint32_t val; idx = lvt_off_to_idx(offset); val = atomic_load_acq_32(&vlapic->lvt_last[idx]); return (val); } void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) { uint32_t *lvtptr, mask, val; struct LAPIC *lapic; int idx; lapic = vlapic->apic_page; lvtptr = vlapic_get_lvtptr(vlapic, offset); val = *lvtptr; idx = lvt_off_to_idx(offset); if (!(lapic->svr & APIC_SVR_ENABLE)) val |= APIC_LVT_M; mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; switch (offset) { case APIC_OFFSET_TIMER_LVT: mask |= APIC_LVTT_TM; break; case APIC_OFFSET_ERROR_LVT: break; case APIC_OFFSET_LINT0_LVT: case APIC_OFFSET_LINT1_LVT: mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; /* FALLTHROUGH */ default: mask |= APIC_LVT_DM; break; } val &= mask; *lvtptr = val; atomic_store_rel_32(&vlapic->lvt_last[idx], val); } static void vlapic_mask_lvts(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; lapic->lvt_cmci |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); lapic->lvt_timer |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); lapic->lvt_thermal |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); lapic->lvt_pcint |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); lapic->lvt_lint0 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); lapic->lvt_lint1 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); lapic->lvt_error |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); } static int vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) { uint32_t mode, reg, vec; reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); if (reg & APIC_LVT_M) return (0); vec = reg & APIC_LVT_VECTOR; mode = reg & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, lvt == APIC_LVT_ERROR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vm, vlapic->vcpuid); break; case APIC_LVT_DM_EXTINT: vm_inject_extint(vlapic->vm, vlapic->vcpuid); break; default: // Other modes ignored return (0); } return (1); } #if 1 static void dump_isrvec_stk(struct vlapic *vlapic) { int i; uint32_t *isrptr; 
isrptr = &vlapic->apic_page->isr0; for (i = 0; i < 8; i++) printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); for (i = 0; i <= vlapic->isrvec_stk_top; i++) printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); } #endif /* * Algorithm adopted from section "Interrupt, Task and Processor Priority" * in Intel Architecture Manual Vol 3a. */ static void vlapic_update_ppr(struct vlapic *vlapic) { int isrvec, tpr, ppr; /* * Note that the value on the stack at index 0 is always 0. * * This is a placeholder for the value of ISRV when none of the * bits is set in the ISRx registers. */ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; tpr = vlapic->apic_page->tpr; #if 1 { int i, lastprio, curprio, vector, idx; uint32_t *isrptr; if (vlapic->isrvec_stk_top == 0 && isrvec != 0) panic("isrvec_stk is corrupted: %d", isrvec); /* * Make sure that the priority of the nested interrupts is * always increasing. */ lastprio = -1; for (i = 1; i <= vlapic->isrvec_stk_top; i++) { curprio = PRIO(vlapic->isrvec_stk[i]); if (curprio <= lastprio) { dump_isrvec_stk(vlapic); panic("isrvec_stk does not satisfy invariant"); } lastprio = curprio; } /* * Make sure that each bit set in the ISRx registers has a * corresponding entry on the isrvec stack. */ i = 1; isrptr = &vlapic->apic_page->isr0; for (vector = 0; vector < 256; vector++) { idx = (vector / 32) * 4; if (isrptr[idx] & (1 << (vector % 32))) { if (i > vlapic->isrvec_stk_top || vlapic->isrvec_stk[i] != vector) { dump_isrvec_stk(vlapic); panic("ISR and isrvec_stk out of sync"); } i++; } } } #endif if (PRIO(tpr) >= PRIO(isrvec)) ppr = tpr; else ppr = isrvec & 0xf0; vlapic->apic_page->ppr = ppr; VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } void vlapic_sync_tpr(struct vlapic *vlapic) { vlapic_update_ppr(vlapic); } static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); static void vlapic_process_eoi(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *isrptr, *tmrptr; int i, idx, bitpos, vector; isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { if (vlapic->isrvec_stk_top <= 0) { panic("invalid vlapic isrvec_stk_top %d", vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); vector = i * 32 + bitpos; VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, vector); } return; } } VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) { return (lvt & mask); } static __inline int vlapic_periodic_timer(struct vlapic *vlapic) { uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); } static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); static void vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) { vlapic->esr_pending |= mask; /* * Avoid infinite recursion if the error LVT itself is configured with * an illegal vector. 
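 *
 * vlapic_fire_lvt() calls back into this function with 'lvt_error'
 * set when the error LVT's own vector is below 16, so returning
 * early here is what breaks that cycle.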
*/ if (lvt_error) return; if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); } } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { VLAPIC_CTR0(vlapic, "vlapic timer fired"); vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); } } static VMM_STAT(VLAPIC_INTR_CMC, "corrected machine check interrupts generated by vlapic"); void vlapic_fire_cmci(struct vlapic *vlapic) { if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); } } static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, "lvts triggered"); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { if (vlapic_enabled(vlapic) == false) { /* * When the local APIC is global/hardware disabled, * LINT[1:0] pins are configured as INTR and NMI pins, * respectively. */ switch (vector) { case APIC_LVT_LINT0: vm_inject_extint(vlapic->vm, vlapic->vcpuid); break; case APIC_LVT_LINT1: vm_inject_nmi(vlapic->vm, vlapic->vcpuid); break; default: break; } return (0); } switch (vector) { case APIC_LVT_LINT0: case APIC_LVT_LINT1: case APIC_LVT_TIMER: case APIC_LVT_ERROR: case APIC_LVT_PMC: case APIC_LVT_THERMAL: case APIC_LVT_CMCI: if (vlapic_fire_lvt(vlapic, vector)) { vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, LVTS_TRIGGERRED, vector, 1); } break; default: return (EINVAL); } return (0); } static void vlapic_callout_handler(void *arg) { struct vlapic *vlapic; struct bintime bt, btnow; sbintime_t rem_sbt; vlapic = arg; VLAPIC_TIMER_LOCK(vlapic); if (callout_pending(&vlapic->callout)) /* callout was reset */ goto done; if (!callout_active(&vlapic->callout)) /* callout was stopped */ goto done; callout_deactivate(&vlapic->callout); vlapic_fire_timer(vlapic); if (vlapic_periodic_timer(vlapic)) { binuptime(&btnow); KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, vlapic->timer_fire_bt.frac)); /* * Compute the delta between when the timer was supposed to * fire and the present time. */ bt = btnow; bintime_sub(&bt, &vlapic->timer_fire_bt); rem_sbt = bttosbt(vlapic->timer_period_bt); if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { /* * Adjust the time until the next countdown downward * to account for the lost time. */ rem_sbt -= bttosbt(bt); } else { /* * If the delta is greater than the timer period then * just reset our time base instead of trying to catch * up. 
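 *
 * Illustrative example (assumed values): with a 1 ms period and a
 * callout that ran 3 ms late, the expired intermediate ticks are
 * simply dropped and the next interrupt is scheduled one full
 * period from now instead of being replayed back to back.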
*/ vlapic->timer_fire_bt = btnow; VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " "usecs, period is %lu usecs - resetting time base", bttosbt(bt) / SBT_1US, bttosbt(vlapic->timer_period_bt) / SBT_1US); } bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); callout_reset_sbt(&vlapic->callout, rem_sbt, 0, vlapic_callout_handler, vlapic, 0); } done: VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_icrtmr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; sbintime_t sbt; uint32_t icr_timer; VLAPIC_TIMER_LOCK(vlapic); lapic = vlapic->apic_page; icr_timer = lapic->icr_timer; vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, icr_timer); if (icr_timer != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); callout_reset_sbt(&vlapic->callout, sbt, 0, vlapic_callout_handler, vlapic, 0); } else callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); } /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ static void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { struct vlapic *vlapic; uint32_t dfr, ldr, ldest, cluster; uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; cpuset_t amask; int vcpuid; if ((x2apic_dest && dest == 0xffffffff) || (!x2apic_dest && dest == 0xff)) { /* * Broadcast in both logical and physical modes. */ *dmask = vm_active_cpus(vm); return; } if (phys) { /* * Physical mode: destination is APIC ID. */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); amask = vm_active_cpus(vm); if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask)) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide * bitmask. This model is only available in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; /* * In the "Cluster Model" the MDA is used to identify a * specific cluster and a set of APICs in that cluster. */ if (x2apic_dest) { mda_cluster_id = dest >> 16; mda_cluster_ldest = dest & 0xffff; } else { mda_cluster_id = (dest >> 4) & 0xf; mda_cluster_ldest = dest & 0xf; } /* * Logical mode: match each APIC that has a bit set * in its LDR that matches a bit in the ldest. */ CPU_ZERO(dmask); amask = vm_active_cpus(vm); while ((vcpuid = CPU_FFS(&amask)) != 0) { vcpuid--; CPU_CLR(vcpuid, &amask); vlapic = vm_lapic(vm, vcpuid); dfr = vlapic->apic_page->dfr; ldr = vlapic->apic_page->ldr; if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) { ldest = ldr >> 24; mda_ldest = mda_flat_ldest; } else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) { if (x2apic(vlapic)) { cluster = ldr >> 16; ldest = ldr & 0xffff; } else { cluster = ldr >> 28; ldest = (ldr >> 24) & 0xf; } if (cluster != mda_cluster_id) continue; mda_ldest = mda_cluster_ldest; } else { /* * Guest has configured a bad logical * model for this vcpu - skip it. 
*/ VLAPIC_CTR1(vlapic, "vlapic has bad logical " "model %x - cannot deliver interrupt", dfr); continue; } if ((mda_ldest & ldest) != 0) { CPU_SET(vcpuid, dmask); if (lowprio) break; } } } } static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); static void vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) { struct LAPIC *lapic = vlapic->apic_page; if (lapic->tpr != val) { VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed " "from %#x to %#x", lapic->tpr, val); lapic->tpr = val; vlapic_update_ppr(vlapic); } } static uint8_t vlapic_get_tpr(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; return (lapic->tpr); } void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) { uint8_t tpr; if (val & ~0xf) { vm_inject_gp(vlapic->vm, vlapic->vcpuid); return; } tpr = val << 4; vlapic_set_tpr(vlapic, tpr); } uint64_t vlapic_get_cr8(struct vlapic *vlapic) { uint8_t tpr; tpr = vlapic_get_tpr(vlapic); return (tpr >> 4); } int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) { int i; bool phys; cpuset_t dmask; uint64_t icrval; uint32_t dest, vec, mode; struct vlapic *vlapic2; struct vm_exit *vmexit; struct LAPIC *lapic; uint16_t maxcpus; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; if (x2apic(vlapic)) dest = icrval >> 32; else dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; if (mode == APIC_DELMODE_FIXED && vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { switch (icrval & APIC_DEST_MASK) { case APIC_DEST_DESTFLD: phys = ((icrval & APIC_DESTMODE_LOG) == 0); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic)); break; case APIC_DEST_SELF: CPU_SETOF(vlapic->vcpuid, &dmask); break; case APIC_DEST_ALLISELF: dmask = vm_active_cpus(vlapic->vm); break; case APIC_DEST_ALLESELF: dmask = vm_active_cpus(vlapic->vm); CPU_CLR(vlapic->vcpuid, &dmask); break; default: CPU_ZERO(&dmask); /* satisfy gcc */ break; } while ((i = CPU_FFS(&dmask)) != 0) { i--; CPU_CLR(i, &dmask); if (mode == APIC_DELMODE_FIXED) { lapic_intr_edge(vlapic->vm, i, vec); vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, i, 1); VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " "to vcpuid %d", vec, i); } else { vm_inject_nmi(vlapic->vm, i); VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " "to vcpuid %d", i); } } return (0); /* handled completely in the kernel */ } maxcpus = vm_get_maxcpus(vlapic->vm); if (mode == APIC_DELMODE_INIT) { if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) return (0); if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* move from INIT to waiting-for-SIPI state */ if (vlapic2->boot_state == BS_INIT) { vlapic2->boot_state = BS_SIPI; } return (0); } } if (mode == APIC_DELMODE_STARTUP) { if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* * Ignore SIPIs in any state other than wait-for-SIPI */ if (vlapic2->boot_state != BS_SIPI) return (0); vlapic2->boot_state = BS_RUNNING; *retu = true; vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); vmexit->exitcode = VM_EXITCODE_SPINUP_AP; vmexit->u.spinup_ap.vcpu = dest; vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; return (0); } } /* * This will cause a return to userland. 
*/ return (1); } void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) { int vec; KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); vec = val & 0xff; lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec); vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, vlapic->vcpuid, 1); VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); } int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) { struct LAPIC *lapic = vlapic->apic_page; int idx, i, bitpos, vector; uint32_t *irrptr, val; vlapic_update_ppr(vlapic); if (vlapic->ops.pending_intr) return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); irrptr = &lapic->irr0; for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); if (bitpos != 0) { vector = i * 32 + (bitpos - 1); if (PRIO(vector) > PRIO(lapic->ppr)) { VLAPIC_CTR1(vlapic, "pending intr %d", vector); if (vecptr != NULL) *vecptr = vector; return (1); } else break; } } return (0); } void vlapic_intr_accepted(struct vlapic *vlapic, int vector) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *irrptr, *isrptr; int idx, stk_top; if (vlapic->ops.intr_accepted) return ((*vlapic->ops.intr_accepted)(vlapic, vector)); /* * clear the ready bit for vector being accepted in irr * and set the vector as in service in isr. */ idx = (vector / 32) * 4; irrptr = &lapic->irr0; atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); isrptr = &lapic->isr0; isrptr[idx] |= 1 << (vector % 32); VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); /* * Update the PPR */ vlapic->isrvec_stk_top++; stk_top = vlapic->isrvec_stk_top; if (stk_top >= ISRVEC_STK_SIZE) panic("isrvec_stk_top overflow %d", stk_top); vlapic->isrvec_stk[stk_top] = vector; } void vlapic_svr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; uint32_t old, new, changed; lapic = vlapic->apic_page; new = lapic->svr; old = vlapic->svr_last; vlapic->svr_last = new; changed = old ^ new; if ((changed & APIC_SVR_ENABLE) != 0) { if ((new & APIC_SVR_ENABLE) == 0) { /* * The apic is now disabled so stop the apic timer * and mask all the LVT entries. */ VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); VLAPIC_TIMER_LOCK(vlapic); callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); vlapic_mask_lvts(vlapic); } else { /* * The apic is now enabled so restart the apic timer * if it is configured in periodic mode. 
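 *
 * Re-arming goes through vlapic_icrtmr_write_handler(), which
 * recomputes the period from icr_timer and restarts the callout.
 * A one-shot countdown that was stopped when the APIC was disabled
 * is not re-armed here and would have to be reprogrammed by the
 * guest.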
*/ VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); if (vlapic_periodic_timer(vlapic)) vlapic_icrtmr_write_handler(vlapic); } } } int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t *data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *reg; int i; /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", offset); *data = 0; goto done; } if (!x2apic(vlapic) && !mmio_access) { /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " "xAPIC mode", offset); *data = 0; goto done; } if (offset > sizeof(*lapic)) { *data = 0; goto done; } offset &= ~3; switch(offset) { case APIC_OFFSET_ID: *data = lapic->id; break; case APIC_OFFSET_VER: *data = lapic->version; break; case APIC_OFFSET_TPR: *data = vlapic_get_tpr(vlapic); break; case APIC_OFFSET_APR: *data = lapic->apr; break; case APIC_OFFSET_PPR: *data = lapic->ppr; break; case APIC_OFFSET_EOI: *data = lapic->eoi; break; case APIC_OFFSET_LDR: *data = lapic->ldr; break; case APIC_OFFSET_DFR: *data = lapic->dfr; break; case APIC_OFFSET_SVR: *data = lapic->svr; break; case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: i = (offset - APIC_OFFSET_ISR0) >> 2; reg = &lapic->isr0; *data = *(reg + i); break; case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: i = (offset - APIC_OFFSET_TMR0) >> 2; reg = &lapic->tmr0; *data = *(reg + i); break; case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: i = (offset - APIC_OFFSET_IRR0) >> 2; reg = &lapic->irr0; *data = atomic_load_acq_int(reg + i); break; case APIC_OFFSET_ESR: *data = lapic->esr; break; case APIC_OFFSET_ICR_LOW: *data = lapic->icr_lo; if (x2apic(vlapic)) *data |= (uint64_t)lapic->icr_hi << 32; break; case APIC_OFFSET_ICR_HI: *data = lapic->icr_hi; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: *data = vlapic_get_lvt(vlapic, offset); #ifdef INVARIANTS reg = vlapic_get_lvtptr(vlapic, offset); KASSERT(*data == *reg, ("inconsistent lvt value at " "offset %#lx: %#lx/%#x", offset, *data, *reg)); #endif break; case APIC_OFFSET_TIMER_ICR: *data = lapic->icr_timer; break; case APIC_OFFSET_TIMER_CCR: *data = vlapic_get_ccr(vlapic); break; case APIC_OFFSET_TIMER_DCR: *data = lapic->dcr_timer; break; case APIC_OFFSET_SELF_IPI: /* * XXX generate a GP fault if vlapic is in x2apic mode */ *data = 0; break; case APIC_OFFSET_RRR: default: *data = 0; break; } done: VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); return 0; } int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *regptr; int retval; KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, ("vlapic_write: invalid offset %#lx", offset)); VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", offset, data); if (offset > sizeof(*lapic)) return (0); /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " "in x2APIC mode", data, offset); return (0); } /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ if (!x2apic(vlapic) && !mmio_access) { VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " "in xAPIC mode", data, offset); return (0); } retval = 0; switch(offset) { case APIC_OFFSET_ID: lapic->id = data; vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_TPR: vlapic_set_tpr(vlapic, data & 0xff); break; case APIC_OFFSET_EOI: vlapic_process_eoi(vlapic); break; case APIC_OFFSET_LDR: lapic->ldr = data; vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: lapic->dfr = data; vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: lapic->svr = data; vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: lapic->icr_lo = data; if (x2apic(vlapic)) lapic->icr_hi = data >> 32; retval = vlapic_icrlo_write_handler(vlapic, retu); break; case APIC_OFFSET_ICR_HI: lapic->icr_hi = data; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: regptr = vlapic_get_lvtptr(vlapic, offset); *regptr = data; vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: lapic->icr_timer = data; vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: lapic->dcr_timer = data; vlapic_dcr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_SELF_IPI: if (x2apic(vlapic)) vlapic_self_ipi_handler(vlapic, data); break; case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_TIMER_CCR: default: // Read only. 
break; } return (retval); } static void vlapic_reset(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; bzero(lapic, sizeof(struct LAPIC)); lapic->id = vlapic_get_id(vlapic); lapic->version = VLAPIC_VERSION; lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); vlapic_reset_tmr(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ else vlapic->boot_state = BS_INIT; /* AP */ vlapic->svr_last = lapic->svr; } void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); /* * If the vlapic is configured in x2apic mode then it will be * accessed in the critical section via the MSR emulation code. * * Therefore the timer mutex must be a spinlock because blockable * mutexes cannot be acquired in a critical section. */ mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); callout_init(&vlapic->callout, 1); vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vlapic->vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; vlapic_reset(vlapic); } void vlapic_cleanup(struct vlapic *vlapic) { callout_drain(&vlapic->callout); } uint64_t vlapic_get_apicbase(struct vlapic *vlapic) { return (vlapic->msr_apicbase); } int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) { if (vlapic->msr_apicbase != new) { VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " "not supported", vlapic->msr_apicbase, new); return (-1); } return (0); } void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { struct vlapic *vlapic; struct LAPIC *lapic; vlapic = vm_lapic(vm, vcpuid); if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; else vlapic->msr_apicbase |= APICBASE_X2APIC; /* * Reset the local APIC registers whose values are mode-dependent. * * XXX this works because the APIC mode can be changed only at vcpu * initialization time. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); if (x2apic(vlapic)) { lapic->ldr = x2apic_ldr(vlapic); lapic->dfr = 0; } else { lapic->ldr = 0; lapic->dfr = 0xffffffff; } if (state == X2APIC_ENABLED) { if (vlapic->ops.enable_x2apic_mode) (*vlapic->ops.enable_x2apic_mode)(vlapic); } } void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec) { bool lowprio; int vcpuid; cpuset_t dmask; if (delmode != IOART_DELFIXED && delmode != IOART_DELLOPRI && delmode != IOART_DELEXINT) { VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); return; } lowprio = (delmode == IOART_DELLOPRI); /* * We don't provide any virtual interrupt redirection hardware so * all interrupts originating from the ioapic or MSI specify the * 'dest' in the legacy xAPIC format. */ vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); while ((vcpuid = CPU_FFS(&dmask)) != 0) { vcpuid--; CPU_CLR(vcpuid, &dmask); if (delmode == IOART_DELEXINT) { vm_inject_extint(vm, vcpuid); } else { lapic_set_intr(vm, vcpuid, vec, level); } } } void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) { /* * Post an interrupt to the vcpu currently running on 'hostcpu'. 
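/*
 * A small userspace model of the destination walk in vlapic_deliver_intr()
 * above, assuming a 64-bit mask stands in for cpuset_t and the compiler's
 * __builtin_ffsll() for CPU_FFS(): the find-first-set result is 1-based, so
 * each pass delivers to one vcpu and clears its bit until the mask is empty.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t dmask = (1ull << 1) | (1ull << 3) | (1ull << 6);
	int vcpuid;

	while ((vcpuid = __builtin_ffsll((long long)dmask)) != 0) {
		vcpuid--;			/* ffs is 1-based */
		dmask &= ~(1ull << vcpuid);
		printf("deliver fixed interrupt to vcpu %d\n", vcpuid);
	}
	return (0);
}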
* * This is done by leveraging features like Posted Interrupts (Intel) * Doorbell MSR (AMD AVIC) that avoid a VM exit. * * If neither of these features are available then fallback to * sending an IPI to 'hostcpu'. */ if (vlapic->ops.post_intr) (*vlapic->ops.post_intr)(vlapic, hostcpu); else ipi_cpu(hostcpu, ipinum); } bool vlapic_enabled(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && (lapic->svr & APIC_SVR_ENABLE) != 0) return (true); else return (false); } static void vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *tmrptr, mask; int idx; lapic = vlapic->apic_page; tmrptr = &lapic->tmr0; idx = (vector / 32) * 4; mask = 1 << (vector % 32); if (level) tmrptr[idx] |= mask; else tmrptr[idx] &= ~mask; if (vlapic->ops.set_tmr != NULL) (*vlapic->ops.set_tmr)(vlapic, vector, level); } void vlapic_reset_tmr(struct vlapic *vlapic) { int vector; VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); for (vector = 0; vector <= 255; vector++) vlapic_set_tmr(vlapic, vector, false); } void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector) { cpuset_t dmask; bool lowprio; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); /* * A level trigger is valid only for fixed and lowprio delivery modes. */ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " "delivery-mode %d", delmode); return; } lowprio = (delmode == APIC_DELMODE_LOWPRIO); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); if (!CPU_ISSET(vlapic->vcpuid, &dmask)) return; VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); vlapic_set_tmr(vlapic, vector, true); } + +#ifdef BHYVE_SNAPSHOT +static void +vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr) +{ + /* The implementation is similar to the one in the + * `vlapic_icrtmr_write_handler` function + */ + sbintime_t sbt; + struct bintime bt; + + VLAPIC_TIMER_LOCK(vlapic); + + bt = vlapic->timer_freq_bt; + bintime_mul(&bt, ccr); + + if (ccr != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &bt); + + sbt = bttosbt(bt); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } else { + /* even if the CCR was 0, periodic timers should be reset */ + if (vlapic_periodic_timer(vlapic)) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, + &vlapic->timer_period_bt); + sbt = bttosbt(vlapic->timer_period_bt); + + callout_stop(&vlapic->callout); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } + } + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +int +vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int i, ret; + struct vlapic *vlapic; + struct LAPIC *lapic; + uint32_t ccr; + + KASSERT(vm != NULL, ("%s: arg was NULL", __func__)); + + ret = 0; + + for (i = 0; i < VM_MAXCPU; i++) { + vlapic = vm_lapic(vm, i); + + /* snapshot the page first; timer period depends on icr_timer */ + lapic = vlapic->apic_page; + SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac, + meta, ret, done); + + /* + * Timer period is equal to 'icr_timer' ticks at a frequency of + * 'timer_freq_bt'. 
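/*
 * A userspace sketch of the arithmetic behind vlapic_reset_callout() above:
 * on restore, the time left on a one-shot timer is the saved current-count
 * register (CCR) multiplied by the tick period recorded in 'timer_freq_bt'.
 * The bus frequency and divider below are illustrative assumptions only,
 * not values taken from this patch.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t bus_hz = 100000000;	/* assumed 100 MHz APIC bus  */
	const uint32_t divisor = 16;		/* assumed DCR divide-by-16  */
	uint32_t ccr = 250000;			/* CCR saved in the snapshot */
	uint64_t tick_ns, remaining_ns;

	tick_ns = (1000000000ull * divisor) / bus_hz;	/* ns per timer tick */
	remaining_ns = (uint64_t)ccr * tick_ns;
	printf("re-arm callout to fire in %ju ns\n", (uintmax_t)remaining_ns);
	return (0);
}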
+ */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + } + + SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk, + sizeof(vlapic->isrvec_stk), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->boot_state, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last, + sizeof(vlapic->lvt_last), + meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) + ccr = vlapic_get_ccr(vlapic); + + SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done); + + if (meta->op == VM_SNAPSHOT_RESTORE) { + /* Reset the value of the 'timer_fire_bt' and the vlapic + * callout based on the value of the current count + * register saved when the VM snapshot was created + */ + vlapic_reset_callout(vlapic, ccr); + } + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index bd650efa8cc1..b87657c8bb51 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -1,112 +1,118 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VLAPIC_H_ #define _VLAPIC_H_ struct vm; +struct vm_snapshot_meta; enum x2apic_state; int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data, bool *retu); int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t *data, bool *retu); /* * Returns 0 if there is no eligible vector that can be delivered to the * guest at this time and non-zero otherwise. * * If an eligible vector number is found and 'vecptr' is not NULL then it will * be stored in the location pointed to by 'vecptr'. * * Note that the vector does not automatically transition to the ISR as a * result of calling this function. */ int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); /* * Transition 'vector' from IRR to ISR. This function is called with the * vector returned by 'vlapic_pending_intr()' when the guest is able to * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that * block interrupt delivery). 
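/*
 * A self-contained model of the SNAPSHOT_*_OR_LEAVE pattern used in
 * vlapic_snapshot() above.  The types and names here are hypothetical
 * stand-ins, not the kernel's vm_snapshot_meta API: one macro either copies
 * a field out to the snapshot buffer or copies it back in, depending on the
 * operation, and bails out to the 'done' label on error.
 */
#include <stdio.h>
#include <string.h>

enum snap_op { SNAP_SAVE, SNAP_RESTORE };

struct snap_meta {
	enum snap_op	op;
	char		*buf;
	size_t		off;
	size_t		len;
};

static int
snap_field(void *field, size_t size, struct snap_meta *meta)
{
	if (meta->off + size > meta->len)
		return (-1);			/* buffer exhausted */
	if (meta->op == SNAP_SAVE)
		memcpy(meta->buf + meta->off, field, size);
	else
		memcpy(field, meta->buf + meta->off, size);
	meta->off += size;
	return (0);
}

#define SNAP_FIELD_OR_LEAVE(var, meta, ret, label)		\
	do {							\
		(ret) = snap_field(&(var), sizeof(var), (meta));\
		if ((ret) != 0)					\
			goto label;				\
	} while (0)

int
main(void)
{
	char storage[64];
	struct snap_meta meta = { SNAP_SAVE, storage, 0, sizeof(storage) };
	unsigned int esr_pending = 0x40;
	int isrvec_top = 2;
	int ret;

	SNAP_FIELD_OR_LEAVE(esr_pending, &meta, ret, done);
	SNAP_FIELD_OR_LEAVE(isrvec_top, &meta, ret, done);
	printf("saved %zu bytes\n", meta.off);
done:
	return (ret);
}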
*/ void vlapic_intr_accepted(struct vlapic *vlapic, int vector); /* * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. */ int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); /* * Post an interrupt to the vcpu running on 'hostcpu'. This will use a * hardware assist if available (e.g. Posted Interrupt) or fall back to * sending an 'ipinum' to interrupt the 'hostcpu'. */ void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); void vlapic_fire_cmci(struct vlapic *vlapic); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); void vlapic_sync_tpr(struct vlapic *vlapic); uint64_t vlapic_get_apicbase(struct vlapic *vlapic); int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); bool vlapic_enabled(struct vlapic *vlapic); void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec); /* Reset the trigger-mode bits for all vectors to be edge-triggered */ void vlapic_reset_tmr(struct vlapic *vlapic); /* * Set the trigger-mode bit associated with 'vector' to level-triggered if * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to * this 'vlapic'. */ void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector); void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); uint64_t vlapic_get_cr8(struct vlapic *vlapic); /* APIC write handlers */ void vlapic_id_write_handler(struct vlapic *vlapic); void vlapic_ldr_write_handler(struct vlapic *vlapic); void vlapic_dfr_write_handler(struct vlapic *vlapic); void vlapic_svr_write_handler(struct vlapic *vlapic); void vlapic_esr_write_handler(struct vlapic *vlapic); int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); void vlapic_icrtmr_write_handler(struct vlapic *vlapic); void vlapic_dcr_write_handler(struct vlapic *vlapic); void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); + +#ifdef BHYVE_SNAPSHOT +int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta); +#endif + #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/io/vpmtmr.c b/sys/amd64/vmm/io/vpmtmr.c index 4df909777d88..f79e94f6d0fe 100644 --- a/sys/amd64/vmm/io/vpmtmr.c +++ b/sys/amd64/vmm/io/vpmtmr.c @@ -1,105 +1,121 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include +#include #include "vpmtmr.h" /* * The ACPI Power Management timer is a free-running 24- or 32-bit * timer with a frequency of 3.579545MHz * * This implementation will be 32-bits */ #define PMTMR_FREQ 3579545 /* 3.579545MHz */ struct vpmtmr { sbintime_t freq_sbt; sbintime_t baseuptime; uint32_t baseval; }; static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); struct vpmtmr * vpmtmr_init(struct vm *vm) { struct vpmtmr *vpmtmr; struct bintime bt; vpmtmr = malloc(sizeof(struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); vpmtmr->baseuptime = sbinuptime(); vpmtmr->baseval = 0; FREQ2BT(PMTMR_FREQ, &bt); vpmtmr->freq_sbt = bttosbt(bt); return (vpmtmr); } void vpmtmr_cleanup(struct vpmtmr *vpmtmr) { free(vpmtmr, M_VPMTMR); } int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vpmtmr *vpmtmr; sbintime_t now, delta; if (!in || bytes != 4) return (-1); vpmtmr = vm_pmtmr(vm); /* * No locking needed because 'baseuptime' and 'baseval' are * written only during initialization. */ now = sbinuptime(); delta = now - vpmtmr->baseuptime; KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " "%#lx to %#lx", vpmtmr->baseuptime, now)); *val = vpmtmr->baseval + delta / vpmtmr->freq_sbt; return (0); } + +#ifdef BHYVE_SNAPSHOT +int +vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(vpmtmr->baseval, meta, ret, done); + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vpmtmr.h b/sys/amd64/vmm/io/vpmtmr.h index e6562da5c02e..a10c0b4e8309 100644 --- a/sys/amd64/vmm/io/vpmtmr.h +++ b/sys/amd64/vmm/io/vpmtmr.h @@ -1,44 +1,49 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
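/*
 * A userspace sketch of the ACPI PM timer read in vpmtmr_handler() above:
 * the counter free-runs at 3.579545 MHz, so the value returned is the base
 * count plus the number of whole periods elapsed since 'baseuptime'.  Here
 * elapsed time is taken in plain nanoseconds instead of sbintime_t.
 */
#include <stdint.h>
#include <stdio.h>

#define PMTMR_FREQ	3579545u		/* Hz */

static uint32_t
pmtmr_read(uint32_t baseval, uint64_t elapsed_ns)
{
	/* counter ticks elapsed = elapsed_ns * freq / 1e9 */
	return (baseval + (uint32_t)((elapsed_ns * PMTMR_FREQ) / 1000000000ull));
}

int
main(void)
{
	/* 2 ms after the base point the counter has advanced ~7159 ticks. */
	printf("pmtmr = %u\n", pmtmr_read(0, 2000000));
	return (0);
}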
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VPMTMR_H_ #define _VPMTMR_H_ #define IO_PMTMR 0x408 struct vpmtmr; +struct vm_snapshot_meta; struct vpmtmr *vpmtmr_init(struct vm *vm); void vpmtmr_cleanup(struct vpmtmr *pmtmr); int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); +#ifdef BHYVE_SNAPSHOT +int vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta); +#endif + #endif diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c index 954a78efb588..5d6968e3583e 100644 --- a/sys/amd64/vmm/io/vrtc.c +++ b/sys/amd64/vmm/io/vrtc.c @@ -1,1022 +1,1067 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include +#include #include #include "vmm_ktr.h" #include "vatpic.h" #include "vioapic.h" #include "vrtc.h" /* Register layout of the RTC */ struct rtcdev { uint8_t sec; uint8_t alarm_sec; uint8_t min; uint8_t alarm_min; uint8_t hour; uint8_t alarm_hour; uint8_t day_of_week; uint8_t day_of_month; uint8_t month; uint8_t year; uint8_t reg_a; uint8_t reg_b; uint8_t reg_c; uint8_t reg_d; uint8_t nvram[36]; uint8_t century; uint8_t nvram2[128 - 51]; } __packed; CTASSERT(sizeof(struct rtcdev) == 128); CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); struct vrtc { struct vm *vm; struct mtx mtx; struct callout callout; u_int addr; /* RTC register to read or write */ sbintime_t base_uptime; time_t base_rtctime; struct rtcdev rtcdev; }; #define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) #define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) #define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) /* * RTC time is considered "broken" if: * - RTC updates are halted by the guest * - RTC date/time fields have invalid values */ #define VRTC_BROKEN_TIME ((time_t)-1) #define RTC_IRQ 8 #define RTCSB_BIN 0x04 #define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) #define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) #define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) #define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) #define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) static void vrtc_callout_handler(void *arg); static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); static int rtc_flag_broken_time = 1; SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); static __inline bool divider_enabled(int reg_a) { /* * The RTC is counting only when dividers are not held in reset. */ return ((reg_a & 0x70) == 0x20); } static __inline bool update_enabled(struct vrtc *vrtc) { /* * RTC date/time can be updated only if: * - divider is not held in reset * - guest has not disabled updates * - the date/time fields have valid contents */ if (!divider_enabled(vrtc->rtcdev.reg_a)) return (false); if (rtc_halted(vrtc)) return (false); if (vrtc->base_rtctime == VRTC_BROKEN_TIME) return (false); return (true); } static time_t vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) { sbintime_t now, delta; time_t t, secs; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); t = vrtc->base_rtctime; *basetime = vrtc->base_uptime; if (update_enabled(vrtc)) { now = sbinuptime(); delta = now - vrtc->base_uptime; KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " "%#lx to %#lx", vrtc->base_uptime, now)); secs = delta / SBT_1S; t += secs; *basetime += secs * SBT_1S; } return (t); } static __inline uint8_t rtcset(struct rtcdev *rtc, int val) { KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", __func__, val)); return ((rtc->reg_b & RTCSB_BIN) ? 
val : bin2bcd_data[val]); } static void secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; int hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (rtctime < 0) { KASSERT(rtctime == VRTC_BROKEN_TIME, ("%s: invalid vrtc time %#lx", __func__, rtctime)); return; } /* * If the RTC is halted then the guest has "ownership" of the * date/time fields. Don't update the RTC date/time fields in * this case (unless forced). */ if (rtc_halted(vrtc) && !force_update) return; ts.tv_sec = rtctime; ts.tv_nsec = 0; clock_ts_to_ct(&ts, &ct); KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", ct.sec)); KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", ct.min)); KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", ct.hour)); KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", ct.dow)); KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", ct.day)); KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", ct.mon)); KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", ct.year)); rtc = &vrtc->rtcdev; rtc->sec = rtcset(rtc, ct.sec); rtc->min = rtcset(rtc, ct.min); if (rtc->reg_b & RTCSB_24HR) { hour = ct.hour; } else { /* * Convert to the 12-hour format. */ switch (ct.hour) { case 0: /* 12 AM */ case 12: /* 12 PM */ hour = 12; break; default: /* * The remaining 'ct.hour' values are interpreted as: * [1 - 11] -> 1 - 11 AM * [13 - 23] -> 1 - 11 PM */ hour = ct.hour % 12; break; } } rtc->hour = rtcset(rtc, hour); if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) rtc->hour |= 0x80; /* set MSB to indicate PM */ rtc->day_of_week = rtcset(rtc, ct.dow + 1); rtc->day_of_month = rtcset(rtc, ct.day); rtc->month = rtcset(rtc, ct.mon); rtc->year = rtcset(rtc, ct.year % 100); rtc->century = rtcset(rtc, ct.year / 100); } static int rtcget(struct rtcdev *rtc, int val, int *retval) { uint8_t upper, lower; if (rtc->reg_b & RTCSB_BIN) { *retval = val; return (0); } lower = val & 0xf; upper = (val >> 4) & 0xf; if (lower > 9 || upper > 9) return (-1); *retval = upper * 10 + lower; return (0); } static time_t rtc_to_secs(struct vrtc *vrtc) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; struct vm *vm; int century, error, hour, pm, year; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); vm = vrtc->vm; rtc = &vrtc->rtcdev; bzero(&ct, sizeof(struct clocktime)); error = rtcget(rtc, rtc->sec, &ct.sec); if (error || ct.sec < 0 || ct.sec > 59) { VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); goto fail; } error = rtcget(rtc, rtc->min, &ct.min); if (error || ct.min < 0 || ct.min > 59) { VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); goto fail; } pm = 0; hour = rtc->hour; if ((rtc->reg_b & RTCSB_24HR) == 0) { if (hour & 0x80) { hour &= ~0x80; pm = 1; } } error = rtcget(rtc, hour, &ct.hour); if ((rtc->reg_b & RTCSB_24HR) == 0) { if (ct.hour >= 1 && ct.hour <= 12) { /* * Convert from 12-hour format to internal 24-hour * representation as follows: * * 12-hour format ct.hour * 12 AM 0 * 1 - 11 AM 1 - 11 * 12 PM 12 * 1 - 11 PM 13 - 23 */ if (ct.hour == 12) ct.hour = 0; if (pm) ct.hour += 12; } else { VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", rtc->hour, ct.hour); goto fail; } } if (error || ct.hour < 0 || ct.hour > 23) { VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); goto fail; } /* * Ignore 'rtc->dow' because some guests like Linux don't bother * setting it at all while others like 
OpenBSD/i386 set it incorrectly. * * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. */ ct.dow = -1; error = rtcget(rtc, rtc->day_of_month, &ct.day); if (error || ct.day < 1 || ct.day > 31) { VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, ct.day); goto fail; } error = rtcget(rtc, rtc->month, &ct.mon); if (error || ct.mon < 1 || ct.mon > 12) { VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); goto fail; } error = rtcget(rtc, rtc->year, &year); if (error || year < 0 || year > 99) { VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); goto fail; } error = rtcget(rtc, rtc->century, ¢ury); ct.year = century * 100 + year; if (error || ct.year < POSIX_BASE_YEAR) { VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, ct.year); goto fail; } error = clock_ct_to_ts(&ct, &ts); if (error || ts.tv_sec < 0) { VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", ct.year, ct.mon, ct.day); VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", ct.hour, ct.min, ct.sec); goto fail; } return (ts.tv_sec); /* success */ fail: /* * Stop updating the RTC if the date/time fields programmed by * the guest are invalid. */ VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); return (VRTC_BROKEN_TIME); } static int vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) { struct rtcdev *rtc; sbintime_t oldbase; time_t oldtime; uint8_t alarm_sec, alarm_min, alarm_hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; alarm_sec = rtc->alarm_sec; alarm_min = rtc->alarm_min; alarm_hour = rtc->alarm_hour; oldtime = vrtc->base_rtctime; VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", oldtime, newtime); oldbase = vrtc->base_uptime; VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", oldbase, newbase); vrtc->base_uptime = newbase; if (newtime == oldtime) return (0); /* * If 'newtime' indicates that RTC updates are disabled then just * record that and return. There is no need to do alarm interrupt * processing in this case. */ if (newtime == VRTC_BROKEN_TIME) { vrtc->base_rtctime = VRTC_BROKEN_TIME; return (0); } /* * Return an error if RTC updates are halted by the guest. */ if (rtc_halted(vrtc)) { VM_CTR0(vrtc->vm, "RTC update halted by guest"); return (EBUSY); } do { /* * If the alarm interrupt is enabled and 'oldtime' is valid * then visit all the seconds between 'oldtime' and 'newtime' * to check for the alarm condition. * * Otherwise move the RTC time forward directly to 'newtime'. */ if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) vrtc->base_rtctime++; else vrtc->base_rtctime = newtime; if (aintr_enabled(vrtc)) { /* * Update the RTC date/time fields before checking * if the alarm conditions are satisfied. 
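/*
 * A stand-alone illustration of the BCD handling done by rtcset()/rtcget()
 * above: when RTCSB_BIN is clear the RTC date/time bytes are binary-coded
 * decimal, so 59 seconds is stored as 0x59, and a nibble above 9 marks the
 * value as invalid.
 */
#include <stdio.h>
#include <stdint.h>

static uint8_t
bcd_encode(int val)			/* assumes 0 <= val <= 99 */
{
	return ((uint8_t)(((val / 10) << 4) | (val % 10)));
}

static int
bcd_decode(uint8_t bcd, int *out)
{
	uint8_t lo = bcd & 0xf, hi = (bcd >> 4) & 0xf;

	if (lo > 9 || hi > 9)
		return (-1);		/* not valid BCD */
	*out = hi * 10 + lo;
	return (0);
}

int
main(void)
{
	int v;

	printf("59 -> %#x\n", bcd_encode(59));		/* 0x59 */
	if (bcd_decode(0x23, &v) == 0)
		printf("0x23 -> %d\n", v);		/* 23 */
	return (0);
}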
*/ secs_to_rtc(vrtc->base_rtctime, vrtc, 0); if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && (alarm_min >= 0xC0 || alarm_min == rtc->min) && (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); } } } while (vrtc->base_rtctime != newtime); if (uintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); return (0); } static sbintime_t vrtc_freq(struct vrtc *vrtc) { int ratesel; static sbintime_t pf[16] = { 0, SBT_1S / 256, SBT_1S / 128, SBT_1S / 8192, SBT_1S / 4096, SBT_1S / 2048, SBT_1S / 1024, SBT_1S / 512, SBT_1S / 256, SBT_1S / 128, SBT_1S / 64, SBT_1S / 32, SBT_1S / 16, SBT_1S / 8, SBT_1S / 4, SBT_1S / 2, }; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); /* * If both periodic and alarm interrupts are enabled then use the * periodic frequency to drive the callout. The minimum periodic * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so * piggyback the alarm on top of it. The same argument applies to * the update interrupt. */ if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { ratesel = vrtc->rtcdev.reg_a & 0xf; return (pf[ratesel]); } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else { return (0); } } static void vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) { KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (freqsbt == 0) { if (callout_active(&vrtc->callout)) { VM_CTR0(vrtc->vm, "RTC callout stopped"); callout_stop(&vrtc->callout); } return; } VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, vrtc, 0); } static void vrtc_callout_handler(void *arg) { struct vrtc *vrtc = arg; sbintime_t freqsbt, basetime; time_t rtctime; int error; VM_CTR0(vrtc->vm, "vrtc callout fired"); VRTC_LOCK(vrtc); if (callout_pending(&vrtc->callout)) /* callout was reset */ goto done; if (!callout_active(&vrtc->callout)) /* callout was stopped */ goto done; callout_deactivate(&vrtc->callout); KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, ("gratuitous vrtc callout")); if (pintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { rtctime = vrtc_curtime(vrtc, &basetime); error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("%s: vrtc_time_update error %d", __func__, error)); } freqsbt = vrtc_freq(vrtc); KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); vrtc_callout_reset(vrtc, freqsbt); done: VRTC_UNLOCK(vrtc); } static __inline void vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) { int active; active = callout_active(&vrtc->callout) ? 1 : 0; KASSERT((freq == 0 && !active) || (freq != 0 && active), ("vrtc callout %s with frequency %#lx", active ? 
"active" : "inactive", freq)); } static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; int oldirqf, newirqf; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; oldirqf = rtc->reg_c & RTCIR_INT; if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { newirqf = RTCIR_INT; } else { newirqf = 0; } oldval = rtc->reg_c; rtc->reg_c = newirqf | newval; changed = oldval ^ rtc->reg_c; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", oldval, rtc->reg_c); } if (!oldirqf && newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); vatpic_pulse_irq(vrtc->vm, RTC_IRQ); vioapic_pulse_irq(vrtc->vm, RTC_IRQ); } else if (oldirqf && !newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); } } static int vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; sbintime_t oldfreq, newfreq, basetime; time_t curtime, rtctime; int error; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; oldval = rtc->reg_b; oldfreq = vrtc_freq(vrtc); rtc->reg_b = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", oldval, newval); } if (changed & RTCSB_HALT) { if ((newval & RTCSB_HALT) == 0) { rtctime = rtc_to_secs(vrtc); basetime = sbinuptime(); if (rtctime == VRTC_BROKEN_TIME) { if (rtc_flag_broken_time) return (-1); } } else { curtime = vrtc_curtime(vrtc, &basetime); KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " "between vrtc basetime (%#lx) and curtime (%#lx)", __func__, vrtc->base_rtctime, curtime)); /* * Force a refresh of the RTC date/time fields so * they reflect the time right before the guest set * the HALT bit. */ secs_to_rtc(curtime, vrtc, 1); /* * Updates are halted so mark 'base_rtctime' to denote * that the RTC date/time is in flux. */ rtctime = VRTC_BROKEN_TIME; rtc->reg_b &= ~RTCSB_UINTR; } error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("vrtc_time_update error %d", error)); } /* * Side effect of changes to the interrupt enable bits. */ if (changed & RTCSB_ALL_INTRS) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); /* * Change the callout frequency if it has changed. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); /* * The side effect of bits that control the RTC date/time format * is handled lazily when those fields are actually read. */ return (0); } static void vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) { sbintime_t oldfreq, newfreq; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); newval &= ~RTCSA_TUP; oldval = vrtc->rtcdev.reg_a; oldfreq = vrtc_freq(vrtc); if (divider_enabled(oldval) && !divider_enabled(newval)) { VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else if (!divider_enabled(oldval) && divider_enabled(newval)) { /* * If the dividers are coming out of reset then update * 'base_uptime' before this happens. This is done to * maintain the illusion that the RTC date/time was frozen * while the dividers were disabled. 
*/ vrtc->base_uptime = sbinuptime(); VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else { /* NOTHING */ } vrtc->rtcdev.reg_a = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", oldval, newval); } /* * Side effect of changes to rate select and divider enable bits. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); } int vrtc_set_time(struct vm *vm, time_t secs) { struct vrtc *vrtc; int error; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); error = vrtc_time_update(vrtc, secs, sbinuptime()); VRTC_UNLOCK(vrtc); if (error) { VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, secs); } else { VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); } return (error); } time_t vrtc_get_time(struct vm *vm) { struct vrtc *vrtc; sbintime_t basetime; time_t t; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); t = vrtc_curtime(vrtc, &basetime); VRTC_UNLOCK(vrtc); return (t); } int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) { struct vrtc *vrtc; uint8_t *ptr; vrtc = vm_rtc(vm); /* * Don't allow writes to RTC control registers or the date/time fields. */ if (offset < offsetof(struct rtcdev, nvram[0]) || offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", offset); return (EINVAL); } VRTC_LOCK(vrtc); ptr = (uint8_t *)(&vrtc->rtcdev); ptr[offset] = value; VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); VRTC_UNLOCK(vrtc); return (0); } int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) { struct vrtc *vrtc; sbintime_t basetime; time_t curtime; uint8_t *ptr; /* * Allow all offsets in the RTC to be read. */ if (offset < 0 || offset >= sizeof(struct rtcdev)) return (EINVAL); vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); /* * Update RTC date/time fields if necessary. */ if (offset < 10 || offset == RTC_CENTURY) { curtime = vrtc_curtime(vrtc, &basetime); secs_to_rtc(curtime, vrtc, 0); } ptr = (uint8_t *)(&vrtc->rtcdev); *retval = ptr[offset]; VRTC_UNLOCK(vrtc); return (0); } int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; vrtc = vm_rtc(vm); if (bytes != 1) return (-1); if (in) { *val = 0xff; return (0); } VRTC_LOCK(vrtc); vrtc->addr = *val & 0x7f; VRTC_UNLOCK(vrtc); return (0); } int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; struct rtcdev *rtc; sbintime_t basetime; time_t curtime; int error, offset; vrtc = vm_rtc(vm); rtc = &vrtc->rtcdev; if (bytes != 1) return (-1); VRTC_LOCK(vrtc); offset = vrtc->addr; if (offset >= sizeof(struct rtcdev)) { VRTC_UNLOCK(vrtc); return (-1); } error = 0; curtime = vrtc_curtime(vrtc, &basetime); vrtc_time_update(vrtc, curtime, basetime); /* * Update RTC date/time fields if necessary. * * This is not just for reads of the RTC. The side-effect of writing * the century byte requires other RTC date/time fields (e.g. sec) * to be updated here. */ if (offset < 10 || offset == RTC_CENTURY) secs_to_rtc(curtime, vrtc, 0); if (in) { if (offset == 12) { /* * XXX * reg_c interrupt flags are updated only if the * corresponding interrupt enable bit in reg_b is set. 
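/*
 * A userspace model of the index/data pair served by vrtc_addr_handler()
 * and vrtc_data_handler() above: the guest writes a register index to the
 * traditional PC RTC index port (the top bit is the NMI-disable latch and
 * is masked off, as in the handler) and then reads or writes that register
 * through the data port.  The port numbers are the conventional 0x70/0x71
 * and are not spelled out in this hunk.
 */
#include <stdio.h>
#include <stdint.h>

static uint8_t rtc_regs[128];	/* stand-in for struct rtcdev */
static uint8_t rtc_addr;	/* currently selected register */

static void
rtc_port70_write(uint8_t val)
{
	rtc_addr = val & 0x7f;		/* drop the NMI-disable bit */
}

static uint8_t
rtc_port71_read(void)
{
	return (rtc_regs[rtc_addr]);
}

static void
rtc_port71_write(uint8_t val)
{
	rtc_regs[rtc_addr] = val;
}

int
main(void)
{
	rtc_port70_write(0x8e);		/* select offset 0x0e, NMI bit set */
	rtc_port71_write(0x55);
	rtc_port70_write(0x0e);
	printf("reg 0x0e = %#x\n", rtc_port71_read());
	return (0);
}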
*/ *val = vrtc->rtcdev.reg_c; vrtc_set_reg_c(vrtc, 0); } else { *val = *((uint8_t *)rtc + offset); } VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", *val, offset); } else { switch (offset) { case 10: VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); vrtc_set_reg_a(vrtc, *val); break; case 11: VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); error = vrtc_set_reg_b(vrtc, *val); break; case 12: VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", *val); break; case 13: VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", *val); break; case 0: /* * High order bit of 'seconds' is readonly. */ *val &= 0x7f; /* FALLTHRU */ default: VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", offset, *val); *((uint8_t *)rtc + offset) = *val; break; } /* * XXX some guests (e.g. OpenBSD) write the century byte * outside of RTCSB_HALT so re-calculate the RTC date/time. */ if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { curtime = rtc_to_secs(vrtc); error = vrtc_time_update(vrtc, curtime, sbinuptime()); KASSERT(!error, ("vrtc_time_update error %d", error)); if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) error = -1; } } VRTC_UNLOCK(vrtc); return (error); } void vrtc_reset(struct vrtc *vrtc) { struct rtcdev *rtc; VRTC_LOCK(vrtc); rtc = &vrtc->rtcdev; vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); vrtc_set_reg_c(vrtc, 0); KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); VRTC_UNLOCK(vrtc); } struct vrtc * vrtc_init(struct vm *vm) { struct vrtc *vrtc; struct rtcdev *rtc; time_t curtime; vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); vrtc->vm = vm; mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); callout_init(&vrtc->callout, 1); /* Allow dividers to keep time but disable everything else */ rtc = &vrtc->rtcdev; rtc->reg_a = 0x20; rtc->reg_b = RTCSB_24HR; rtc->reg_c = 0; rtc->reg_d = RTCSD_PWR; /* Reset the index register to a safe value. */ vrtc->addr = RTC_STATUSD; /* * Initialize RTC time to 00:00:00 Jan 1, 1970. 
*/ curtime = 0; VRTC_LOCK(vrtc); vrtc->base_rtctime = VRTC_BROKEN_TIME; vrtc_time_update(vrtc, curtime, sbinuptime()); secs_to_rtc(curtime, vrtc, 0); VRTC_UNLOCK(vrtc); return (vrtc); } void vrtc_cleanup(struct vrtc *vrtc) { callout_drain(&vrtc->callout); free(vrtc, M_VRTC); } + +#ifdef BHYVE_SNAPSHOT +int +vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta) +{ + int ret; + + VRTC_LOCK(vrtc); + + SNAPSHOT_VAR_OR_LEAVE(vrtc->addr, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + vrtc->base_uptime = sbinuptime(); + SNAPSHOT_VAR_OR_LEAVE(vrtc->base_rtctime, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.min, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_min, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.hour, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_hour, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_week, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_month, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.month, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.year, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_a, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_b, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_c, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_d, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram, sizeof(vrtc->rtcdev.nvram), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.century, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram2, sizeof(vrtc->rtcdev.nvram2), + meta, ret, done); + + vrtc_callout_reset(vrtc, vrtc_freq(vrtc)); + + VRTC_UNLOCK(vrtc); + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vrtc.h b/sys/amd64/vmm/io/vrtc.h index 836561c7b93b..791fb7db3e26 100644 --- a/sys/amd64/vmm/io/vrtc.h +++ b/sys/amd64/vmm/io/vrtc.h @@ -1,52 +1,57 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
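/*
 * A minimal model of the restore-side adjustment in vrtc_snapshot() above:
 * the guest-visible RTC seconds ('base_rtctime') come from the snapshot,
 * while the host-side reference point ('base_uptime') is re-anchored to the
 * current uptime, so time the VM spent suspended does not leak into the
 * guest clock.  time() stands in here for the kernel's sbinuptime().
 */
#include <stdio.h>
#include <time.h>

struct vrtc_model {
	time_t base_rtctime;	/* guest RTC seconds */
	time_t base_uptime;	/* host reference for elapsed time */
};

static time_t
vrtc_model_curtime(const struct vrtc_model *v, time_t now)
{
	return (v->base_rtctime + (now - v->base_uptime));
}

int
main(void)
{
	struct vrtc_model v;
	time_t now = time(NULL);

	/* Pretend the snapshot recorded the guest clock at 1000000000. */
	v.base_rtctime = 1000000000;
	v.base_uptime = now;		/* re-anchored at restore time */

	printf("guest RTC right after restore: %lld\n",
	    (long long)vrtc_model_curtime(&v, now));
	printf("guest RTC 5s later:            %lld\n",
	    (long long)vrtc_model_curtime(&v, now + 5));
	return (0);
}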
* * $FreeBSD$ */ #ifndef _VRTC_H_ #define _VRTC_H_ #include struct vrtc; +struct vm_snapshot_meta; struct vrtc *vrtc_init(struct vm *vm); void vrtc_cleanup(struct vrtc *vrtc); void vrtc_reset(struct vrtc *vrtc); time_t vrtc_get_time(struct vm *vm); int vrtc_set_time(struct vm *vm, time_t secs); int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); +#ifdef BHYVE_SNAPSHOT +int vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta); +#endif + #endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 0f6b803098d6..b2f5fa62efe5 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1,2732 +1,2924 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include +#include +#include +#include +#include +#include #include #include #include #include #include #include #include #include #include +#include #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int hostcpu; /* (o) vcpu's host cpu */ int reqidle; /* (i) request vcpu to idle */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ + uint64_t tsc_offset; /* (o) TSC offsetting */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { size_t len; bool sysmem; struct vm_object *object; }; #define VM_MAX_MEMSEGS 3 struct mem_map { vm_paddr_t gpa; size_t len; vm_ooffset_t segoff; int segid; int prot; int flags; }; #define VM_MAX_MEMMAPS 8 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ void *rendezvous_arg; /* (x) rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest 
address space */ struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ }; static int vmm_initialized; static struct vmm_ops *ops; #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) #define VMRUN(vmi, vcpu, rip, pmap, evinfo) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMSPACE_ALLOC(min, max) \ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) #define VMSPACE_FREE(vmspace) \ (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) #define VMGETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMSETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) #define VLAPIC_INIT(vmi, vcpu) \ (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) #define VLAPIC_CLEANUP(vmi, vlapic) \ (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) +#ifdef BHYVE_SNAPSHOT +#define VM_SNAPSHOT_VMI(vmi, meta) \ + (ops != NULL ? (*ops->vmsnapshot)(vmi, meta) : ENXIO) +#define VM_SNAPSHOT_VMCX(vmi, meta, vcpuid) \ + (ops != NULL ? (*ops->vmcx_snapshot)(vmi, meta, vcpuid) : ENXIO) +#define VM_RESTORE_TSC(vmi, vcpuid, offset) \ + (ops != NULL ? (*ops->vm_restore_tsc)(vmi, vcpuid, offset) : ENXIO) +#endif #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() SDT_PROVIDER_DEFINE(vmm); static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. 
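/*
 * A self-contained model of the dispatch pattern behind the new
 * VM_SNAPSHOT_VMI/VM_SNAPSHOT_VMCX/VM_RESTORE_TSC macros above: each call
 * is routed through a backend ops table when one is registered and fails
 * with ENXIO otherwise.  The ops structure and backend here are
 * hypothetical stand-ins, not the kernel's vmm_ops.
 */
#include <errno.h>
#include <stdio.h>
#include <stddef.h>

struct demo_ops {
	int (*vmsnapshot)(void *cookie, void *meta);
};

static struct demo_ops *ops;	/* NULL until a backend registers */

#define DEMO_SNAPSHOT_VMI(vmi, meta) \
	(ops != NULL ? (*ops->vmsnapshot)((vmi), (meta)) : ENXIO)

static int
demo_backend_snapshot(void *cookie, void *meta)
{
	(void)cookie;
	(void)meta;
	printf("backend snapshotted CPU-specific state\n");
	return (0);
}

static struct demo_ops demo_backend = { demo_backend_snapshot };

int
main(void)
{
	printf("no backend: %d (ENXIO=%d)\n",
	    DEMO_SNAPSHOT_VMI(NULL, NULL), ENXIO);
	ops = &demo_backend;
	printf("with backend: %d\n", DEMO_SNAPSHOT_VMI(NULL, NULL));
	return (0);
}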
*/ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) { switch (state) { case VCPU_IDLE: return ("idle"); case VCPU_FROZEN: return ("frozen"); case VCPU_RUNNING: return ("running"); case VCPU_SLEEPING: return ("sleeping"); default: return ("unknown"); } } #endif static void vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } } static void vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = &vm->vcpu[vcpu_id]; if (create) { KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " "initialized", vcpu_id)); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); + vcpu->tsc_offset = 0; } vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vm *vm, int vcpuid) { return (trace_guest_exceptions); } struct vm_exit * vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; if (cpuid < 0 || cpuid >= vm->maxcpus) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; return (&vcpu->exitinfo); } static void vmm_resume(void) { VMM_RESUME(); } static int vmm_init(void) { int error; vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_svm()) ops = &vmm_ops_amd; else return (ENXIO); vmm_resume_p = vmm_resume; return (VMM_INIT(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) lapic_ipi_free(vmm_ipinum); error = VMM_CLEANUP(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). 
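/*
 * A small illustration of per-vcpu TSC offsetting, the quantity the new
 * 'tsc_offset' field initialized in vcpu_init() tracks: the guest observes
 * host_tsc + offset, so after a restore the offset can be chosen so the
 * guest TSC resumes from its saved value instead of inheriting the host's
 * counter.  This is a sketch of the general technique under that
 * assumption; the fake reader below is a plain function, not rdtsc.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t fake_host_tsc = 900000000000ull;	/* host counter "now" */

static uint64_t
read_host_tsc(void)
{
	return (fake_host_tsc);
}

int
main(void)
{
	uint64_t guest_tsc_saved = 123456789ull;	/* from the snapshot */
	int64_t tsc_offset;

	/* Choose the offset so host_tsc + offset equals the saved value. */
	tsc_offset = (int64_t)(guest_tsc_saved - read_host_tsc());
	printf("guest TSC after restore: %ju\n",
	    (uintmax_t)(read_host_tsc() + (uint64_t)tsc_offset));
	return (0);
}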
*/ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); for (i = 0; i < vm->maxcpus; i++) vcpu_init(vm, i, create); } /* * The default CPU topology is a single thread per package. */ u_int cores_per_package = 1; u_int threads_per_core = 1; int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ vm->threads = threads_per_core; /* XXX backwards compatibility */ vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { if (maxcpus != 0) return (EINVAL); /* XXX remove when supported */ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); /* XXX need to check sockets * cores * threads == vCPU, how? */ vm->sockets = sockets; vm->cores = cores; vm->threads = threads; vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { struct mem_map *mm; int i; ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (i = 0; i < vm->maxcpus; i++) vcpu_cleanup(vm, i, destroy); VMCLEANUP(vm->cookie); /* * System memory is removed from the guest address space only when * the VM is destroyed. This is because the mapping remains the same * across VM reset. * * Device memory can be relocated by the guest (e.g. using PCI BARs) * so those mappings are removed on a VM reset. */ for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (destroy || !sysmem_mapping(vm, mm)) vm_free_memmap(vm, i); } if (destroy) { for (i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); VMSPACE_FREE(vm->vmspace); vm->vmspace = NULL; } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. 
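 * Otherwise EBUSY is returned and the reinitialization is not attempted.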
*/ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } /* * Return 'true' if 'gpa' is allocated in the guest address space. * * This function is called in the context of a running vcpu which acts as * an implicit lock on 'vm->mem_maps[]'. */ bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) { struct mem_map *mm; int i; #ifdef INVARIANTS int hostcpu, state; state = vcpu_get_state(vm, vcpuid, &hostcpu); KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); #endif for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) return (true); /* 'gpa' is sysmem or devmem */ } if (ppt_is_mmio(vm, gpa)) return (true); /* 'gpa' is pci passthru mmio */ return (false); } int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { struct mem_seg *seg; vm_object_t obj; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); if (len == 0 || (len & PAGE_MASK)) return (EINVAL); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { if (seg->len == len && seg->sysmem == sysmem) return (EEXIST); else return (EINVAL); } obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); if (obj == NULL) return (ENOMEM); seg->len = len; seg->object = obj; seg->sysmem = sysmem; return (0); } int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, vm_object_t *objptr) { struct mem_seg *seg; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[ident]; if (len) *len = seg->len; if (sysmem) *sysmem = seg->sysmem; if (objptr) *objptr = seg->object; return (0); } void vm_free_memseg(struct vm *vm, int ident) { struct mem_seg *seg; KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, ("%s: invalid memseg ident %d", __func__, ident)); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { vm_object_deallocate(seg->object); bzero(seg, sizeof(struct mem_seg)); } } int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, size_t len, int prot, int flags) { struct mem_seg *seg; struct mem_map *m, *map; vm_ooffset_t last; int i, error; if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) return (EINVAL); if (flags & ~VM_MEMMAP_F_WIRED) return (EINVAL); if (segid < 0 || segid >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[segid]; if (seg->object == NULL) return (EINVAL); last = first + len; if (first < 0 || first >= last || last > seg->len) return (EINVAL); if ((gpa | first | last) & PAGE_MASK) return (EINVAL); map = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->len == 0) { map = m; break; } } if (map == NULL) return (ENOSPC); error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, len, 0, VMFS_NO_SPACE, prot, prot, 0); if (error != KERN_SUCCESS) return (EFAULT); vm_object_reference(seg->object); if (flags & VM_MEMMAP_F_WIRED) { error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); return (error 
== KERN_RESOURCE_SHORTAGE ? ENOMEM : EFAULT); } } map->gpa = gpa; map->len = len; map->segoff = first; map->segid = segid; map->prot = prot; map->flags = flags; return (0); } int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct mem_map *mm, *mmnext; int i; mmnext = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len == 0 || mm->gpa < *gpa) continue; if (mmnext == NULL || mm->gpa < mmnext->gpa) mmnext = mm; } if (mmnext != NULL) { *gpa = mmnext->gpa; if (segid) *segid = mmnext->segid; if (segoff) *segoff = mmnext->segoff; if (len) *len = mmnext->len; if (prot) *prot = mmnext->prot; if (flags) *flags = mmnext->flags; return (0); } else { return (ENOENT); } } static void vm_free_memmap(struct vm *vm, int ident) { struct mem_map *mm; int error; mm = &vm->mem_maps[ident]; if (mm->len) { error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, mm->gpa + mm->len); KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", __func__, error)); bzero(mm, sizeof(struct mem_map)); } } static __inline bool sysmem_mapping(struct vm *vm, struct mem_map *mm) { if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) return (true); else return (false); } vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm) { struct mem_map *mm; vm_paddr_t maxaddr; int i; maxaddr = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm)) { if (maxaddr < mm->gpa + mm->len) maxaddr = mm->gpa + mm->len; } } return (maxaddr); } static void vm_iommu_modify(struct vm *vm, bool map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_map *mm; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (!sysmem_mapping(vm, mm)) continue; if (map) { KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, ("iommu map found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) continue; mm->flags |= VM_MEMMAP_F_IOMMU; } else { if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) continue; mm->flags &= ~VM_MEMMAP_F_IOMMU; KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, ("iommu unmap found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); } gpa = mm->gpa; while (gpa < mm->gpa + mm->len) { vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); iommu_remove_mapping(host_domain, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); iommu_create_mapping(host_domain, hpa, hpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. 
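 * When mapping, pages were removed from the host domain; when unmapping,
 * they were removed from the guest's IOMMU domain.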
*/ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false) #define vm_iommu_map(vm) vm_iommu_modify((vm), true) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) vm_iommu_unmap(vm); return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vmm_sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } void * vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int i, count, pageoff; struct mem_map *mm; vm_page_t m; #ifdef INVARIANTS /* * All vcpus are frozen by ioctls that modify the memory map * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is * guaranteed if at least one vcpu is in the VCPU_FROZEN state. */ int state; KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", __func__, vcpuid)); for (i = 0; i < vm->maxcpus; i++) { if (vcpuid != -1 && vcpuid != i) continue; state = vcpu_get_state(vm, i, NULL); KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", __func__, state)); } #endif pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && gpa < mm->gpa + mm->len) { count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); break; } } if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_unwire(m, PQ_ACTIVE); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMGETREG(vm->cookie, vcpu, reg, retval)); } int vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) { struct vcpu *vcpu; int error; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); error = VMSETREG(vm->cookie, vcpuid, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); vcpu = &vm->vcpu[vcpuid]; vcpu->nextrip = val; return (0); } static bool is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (true); default: return (false); } } static bool is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (true); default: return (false); } } int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMGETDESC(vm->cookie, vcpu, reg, desc)); } 
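/*
 * As with vm_get_seg_desc() above, only segment registers and descriptor
 * table registers are accepted before the update is handed to the backend.
 */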
int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { struct vcpu *vcpu; int error; vcpu = &vm->vcpu[vcpuid]; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; vcpu_notify_event_locked(vcpu, false); VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ do { \ if (vcpuid >= 0) \ VCPU_CTR0(vm, vcpuid, fmt); \ else \ VM_CTR0(vm, fmt); \ } while (0) static int vm_handle_rendezvous(struct vm *vm, int vcpuid) { struct thread *td; int error; KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus), 
("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); error = 0; td = curthread; mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); if (vcpuid != -1 && CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); vm->rendezvous_func = NULL; wakeup(&vm->rendezvous_func); break; } RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) return (error); mtx_lock(&vm->rendezvous_mtx); } } mtx_unlock(&vm->rendezvous_mtx); return (0); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { struct vcpu *vcpu; const char *wmesg; struct thread *td; int error, t, vcpu_halted, vm_halted; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu = &vm->vcpu[vcpuid]; vcpu_halted = 0; vm_halted = 0; error = 0; td = curthread; vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from VMRUN() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vm, vcpuid)) break; if (!intr_disabled) { if (vm_extint_pending(vm, vcpuid) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vm, vcpuid)) break; if (vcpu_debugged(vm, vcpuid)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VCPU_CTR0(vm, vcpuid, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. 
*/ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); if (error != 0) return (error); vcpu_lock(vcpu); } } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) { int rv, ftype; struct vm_map *map; struct vcpu *vcpu; struct vm_exit *vme; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? "accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) { struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, fault; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ error = fault = 0; } if (error || fault) return (error); if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", vme->rip + cs_base); *retu = true; /* dump instruction bytes in userspace */ return (0); } /* * Update 'nextrip' based on the length of the emulated instruction. 
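 * Both the exit's inst_length and the vcpu's nextrip advance by the number
 * of bytes that were decoded.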
*/ vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " "decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) { int error, i; struct vcpu *vcpu; struct thread *td; error = 0; vcpu = &vm->vcpu[vcpuid]; td = curthread; CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } else { VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); vcpu_unlock(vcpu); error = vm_handle_rendezvous(vm, vcpuid); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm, i, false); } } *retu = true; return (error); } static int vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) { struct vcpu *vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); vcpu->reqidle = 0; vcpu_unlock(vcpu); *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. 
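 * Each notified vcpu will eventually exit to userspace with
 * VM_EXITCODE_SUSPENDED (see vm_exit_suspended() below).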
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); } return (0); } void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_DEBUG; } void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); } void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_REQIDLE; vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); } void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); } int vm_run(struct vm *vm, struct vm_run *vmrun) { struct vm_eventinfo evinfo; int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; vcpuid = vmrun->cpuid; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_func; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); vcpu_require_state(vm, vcpuid, VCPU_RUNNING); error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vm, vcpuid, &retu); break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vm, vcpuid, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: error = vm_handle_rendezvous(vm, vcpuid); break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vm, vcpuid, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: case VM_EXITCODE_VMINSN: vm_inject_ud(vm, 
vcpuid); break; default: retu = true; /* handled in userland */ break; } } if (error == 0 && retu == false) goto restart; VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); /* copy the exit information */ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); return (error); } int vm_restart_instruction(void *arg, int vcpuid) { struct vm *vm; struct vcpu *vcpu; enum vcpu_state state; uint64_t rip; int error; vm = arg; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; state = vcpu_get_state(vm, vcpuid, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around VMRUN() and 'nextrip' points to the next instruction. * Thus instruction restart is achieved by setting 'nextrip' * to the vcpu's %rip. */ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) { struct vcpu *vcpu; int type, vector; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. 
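 * The vector number alone is used for the classification below.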
*/ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. */ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) { struct vcpu *vcpu; uint64_t info1, info2; int valid; KASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); vcpu = &vm->vcpu[vcpuid]; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vm, vcpuid, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vcpu *vcpu; uint64_t regval; int error; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. 
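 * Double faults are instead synthesized by nested_fault() above when
 * exceptions of the appropriate classes collide.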
*/ if (vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } if (errcode_valid) { /* * Exceptions don't deliver an error code in real mode. */ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); if (!(regval & CR0_PE)) errcode_valid = 0; } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception. */ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vm, vcpuid); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); return (0); } void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { struct vm *vm; int error, restart_instruction; vm = vmarg; restart_instruction = 1; error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) { struct vm *vm; int error; vm = vmarg; VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->nmi_pending); } void vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_extint_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->extint_pending); } void vm_extint_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vm 
*vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMGETCAP(vm->cookie, vcpu, type, retval)); } int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMSETCAP(vm->cookie, vcpu, type, val)); } struct vlapic * vm_lapic(struct vm *vm, int cpu) { return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } bool vmm_is_pptdev(int bus, int slot, int func) { int b, f, i, n, s; char *val, *cp, *cp2; bool found; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = false; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = true; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { int error; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) { struct vcpu *vcpu; enum vcpu_state state; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vm *vm, int vcpuid) { if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->active_cpus)) return (EBUSY); VCPU_CTR0(vm, vcpuid, "activated"); CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, int vcpuid) { int i; if (vcpuid < -1 || vcpuid >= vm->maxcpus) return (EINVAL); if (vcpuid == -1) { vm->debug_cpus = vm->active_cpus; for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); } } else { if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); vcpu_notify_event(vm, vcpuid, false); } return (0); } int vm_resume_cpu(struct vm *vm, int vcpuid) { if (vcpuid < -1 || vcpuid >= vm->maxcpus) return (EINVAL); if (vcpuid == -1) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vm *vm, int vcpuid) { return (CPU_ISSET(vcpuid, &vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) 
{ return (vm->suspended_cpus); } void * vcpu_stats(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid].stats); } int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; return (0); } int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (state >= X2APIC_STATE_LAST) return (EINVAL); vm->vcpu[vcpuid].x2apic_state = state; vlapic_set_x2apic_state(vm, vcpuid, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) { struct vcpu *vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } int vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { int error, i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus), ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpuid' is one * of the targets of the rendezvous. */ RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); error = vm_handle_rendezvous(vm, vcpuid); if (error != 0) return (error); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm->rendezvous_func = func; mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. 
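 * The initiating vcpu then participates in the rendezvous itself via the
 * vm_handle_rendezvous() call below.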
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm, i, false); } return (vm_handle_rendezvous(vm, vcpuid)); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); return (EFAULT); } else { *fault = 0; return (0); } } void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. 
Since * these are global stats, only return the values with for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vm->vmspace)); } } static void vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); + +#ifdef BHYVE_SNAPSHOT +static int +vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct vcpu *vcpu; + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu = &vm->vcpu[i]; + + SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); + /* XXX we're cheating here, since the value of tsc_offset as + * saved here is actually the value of the guest's TSC value. + * + * It will be turned turned back into an actual offset when the + * TSC restore function is called + */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->tsc_offset, meta, ret, done); + } + +done: + return (ret); +} + +static int +vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + uint64_t now; + + ret = 0; + now = rdtsc(); + + if (meta->op == VM_SNAPSHOT_SAVE) { + /* XXX make tsc_offset take the value TSC proper as seen by the + * guest + */ + for (i = 0; i < VM_MAXCPU; i++) + vm->vcpu[i].tsc_offset += now; + } + + ret = vm_snapshot_vcpus(vm, meta); + if (ret != 0) { + printf("%s: failed to copy vm data to user buffer", __func__); + goto done; + } + + if (meta->op == VM_SNAPSHOT_SAVE) { + /* XXX turn tsc_offset back into an offset; actual value is only + * required for restore; using it otherwise would be wrong + */ + for (i = 0; i < VM_MAXCPU; i++) + vm->vcpu[i].tsc_offset -= now; + } + +done: + return (ret); +} + +static int +vm_snapshot_vmcx(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int i, error; + + error = 0; + + for (i = 0; i < VM_MAXCPU; i++) { + error = VM_SNAPSHOT_VMCX(vm->cookie, meta, i); + if (error != 0) { + printf("%s: failed to snapshot vmcs/vmcb data for " + "vCPU: %d; error: %d\n", __func__, i, error); + goto done; + } + } + +done: + return (error); +} + +/* + * Save kernel-side structures to user-space for snapshotting. 
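+ * The 'dev_req' field of the snapshot metadata selects which device or
+ * structure is copied.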
+ */ +int +vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret = 0; + + switch (meta->dev_req) { + case STRUCT_VMX: + ret = VM_SNAPSHOT_VMI(vm->cookie, meta); + break; + case STRUCT_VMCX: + ret = vm_snapshot_vmcx(vm, meta); + break; + case STRUCT_VM: + ret = vm_snapshot_vm(vm, meta); + break; + case STRUCT_VIOAPIC: + ret = vioapic_snapshot(vm_ioapic(vm), meta); + break; + case STRUCT_VLAPIC: + ret = vlapic_snapshot(vm, meta); + break; + case STRUCT_VHPET: + ret = vhpet_snapshot(vm_hpet(vm), meta); + break; + case STRUCT_VATPIC: + ret = vatpic_snapshot(vm_atpic(vm), meta); + break; + case STRUCT_VATPIT: + ret = vatpit_snapshot(vm_atpit(vm), meta); + break; + case STRUCT_VPMTMR: + ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); + break; + case STRUCT_VRTC: + ret = vrtc_snapshot(vm_rtc(vm), meta); + break; + default: + printf("%s: failed to find the requested type %#x\n", + __func__, meta->dev_req); + ret = (EINVAL); + } + return (ret); +} + +int +vm_set_tsc_offset(struct vm *vm, int vcpuid, uint64_t offset) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vcpu->tsc_offset = offset; + + return (0); +} + +int +vm_restore_time(struct vm *vm) +{ + int error, i; + uint64_t now; + struct vcpu *vcpu; + + now = rdtsc(); + + error = vhpet_restore_time(vm_hpet(vm)); + if (error) + return (error); + + for (i = 0; i < nitems(vm->vcpu); i++) { + vcpu = &vm->vcpu[i]; + + error = VM_RESTORE_TSC(vm->cookie, i, vcpu->tsc_offset - now); + if (error) + return (error); + } + + return (0); +} +#endif diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 9818f300efec..e47b7081b795 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -1,1167 +1,1182 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include +#include +#include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" #include "io/ppt.h" #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vhpet.h" #include "io/vrtc.h" struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; SLIST_ENTRY(vmmdev_softc) link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; #define VSC_LINKED 0x01 static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; static struct mtx vmmdev_mtx; static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static int vmm_priv_check(struct ucred *ucred); static int devmem_create_cdev(const char *vmname, int id, char *devmem); static void devmem_destroy(void *arg); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && !(ucred->cr_prison->pr_allow & pr_allow_flag)) return (EPERM); return (0); } static int vcpu_lock_one(struct vmmdev_softc *sc, int vcpu) { int error; if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm)) return (EINVAL); error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); return (error); } static void vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu) { enum vcpu_state state; state = vcpu_get_state(sc->vm, vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm), vcpu, state); } vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); } static int vcpu_lock_all(struct vmmdev_softc *sc) { int error, vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) { error = vcpu_lock_one(sc, vcpu); if (error) break; } if (error) { while (--vcpu >= 0) vcpu_unlock_one(sc, vcpu); } return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { int vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) vcpu_unlock_one(sc, vcpu); } static struct vmmdev_softc * vmmdev_lookup(const char *name) { struct vmmdev_softc *sc; #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; uint16_t lastcpu; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. 
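	 * Reads of such ranges below the system memory maxaddr therefore return
	 * zeroes (from the zero_region); other accesses fail with EFAULT.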
* * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vcpu_unlock_one(sc, lastcpu); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, sizeof(mseg->name), NULL); } else { bzero(mseg->name, sizeof(mseg->name)); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * by stripped off when devfs processes the full string. */ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(sizeof(mseg->name), M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, sizeof(mseg->name), NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } static int vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vm, vcpu, regnum[i], ®val[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vm, vcpu, regnum[i], regval[i]); if (error) break; } return (error); } static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error, vcpu, state_changed, size; cpuset_t *cpuset; struct vmmdev_softc *sc; struct vm_register *vmreg; struct vm_seg_desc *vmsegdesc; struct vm_register_set *vmregset; struct vm_run *vmrun; struct vm_exception *vmexc; struct vm_lapic_irq *vmirq; struct vm_lapic_msi *vmmsi; struct vm_ioapic_irq *ioapic_irq; struct vm_isa_irq *isa_irq; struct vm_isa_irq_trigger *isa_irq_trigger; struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; struct vm_gpa_pte *gpapte; struct vm_suspend *vmsuspend; struct vm_gla2gpa *gg; struct vm_activate_cpu *vac; struct vm_cpuset *vm_cpuset; struct vm_intinfo *vmii; struct vm_rtc_time *rtctime; struct vm_rtc_data *rtcdata; struct vm_memmap *mm; struct vm_cpu_topology *topology; uint64_t *regvals; int *regnums; +#ifdef BHYVE_SNAPSHOT + struct vm_snapshot_meta *snapshot_meta; +#endif error = vmm_priv_check(curthread->td_ucred); if (error) return 
(error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); vcpu = -1; state_changed = 0; /* * Some VMM ioctls can operate only on vcpus that are not running. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: case VM_GET_REGISTER_SET: case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: case VM_SET_INTINFO: case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. */ vcpu = *(int *)data; error = vcpu_lock_one(sc, vcpu); if (error) goto done; state_changed = 1; break; case VM_MAP_PPTDEV_MMIO: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: case VM_ALLOC_MEMSEG: case VM_MMAP_MEMSEG: case VM_REINIT: /* * ioctls that operate on the entire virtual machine must * prevent all vcpus from running. */ error = vcpu_lock_all(sc); if (error) goto done; state_changed = 2; break; case VM_GET_MEMSEG: case VM_MMAP_GETNEXT: /* * Lock a vcpu to make sure that the memory map cannot be * modified while it is being inspected. */ vcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, vcpu); if (error) goto done; state_changed = 1; break; default: break; } switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; error = vm_run(sc->vm, vmrun); break; case VM_SUSPEND: vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } case VM_STATS: { CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(sc->vm, vmstats->cpuid, &vmstats->num_entries, vmstats->statbuf); break; } case VM_PPTDEV_MSI: pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->vcpu, pptmsi->bus, pptmsi->slot, pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; case VM_PPTDEV_MSIX: pptmsix = (struct vm_pptdev_msix *)data; error = ppt_setup_msix(sc->vm, pptmsix->vcpu, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, pptmsix->vector_control); break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_INJECT_EXCEPTION: vmexc = (struct vm_exception *)data; error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc->vector, vmexc->error_code_valid, vmexc->error_code, vmexc->restart_instruction); break; case VM_INJECT_NMI: vmnmi = (struct vm_nmi *)data; error = vm_inject_nmi(sc->vm, vmnmi->cpuid); break; case VM_LAPIC_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); break; case VM_LAPIC_LOCAL_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_set_local_intr(sc->vm, vmirq->cpuid, vmirq->vector); break; case VM_LAPIC_MSI: vmmsi = (struct 
vm_lapic_msi *)data; error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); break; case VM_IOAPIC_ASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_DEASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PULSE_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PINCOUNT: *(int *)data = vioapic_pincount(sc->vm); break; case VM_ISA_ASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_assert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_DEASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_deassert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_PULSE_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_SET_IRQ_TRIGGER: isa_irq_trigger = (struct vm_isa_irq_trigger *)data; error = vatpic_set_irq_trigger(sc->vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; case VM_MMAP_GETNEXT: mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; case VM_MMAP_MEMSEG: mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; case VM_ALLOC_MEMSEG: error = alloc_memseg(sc, (struct vm_memseg *)data); break; case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data); break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, &vmreg->regval); break; case VM_SET_REGISTER: vmreg = (struct vm_register *)data; error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, vmreg->regval); break; case VM_SET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(sc->vm, vmregset->cpuid, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_SET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, 
sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(sc->vm, vmregset->cpuid, vmregset->count, regnums, regvals); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_GET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_get_capability(sc->vm, vmcap->cpuid, vmcap->captype, &vmcap->capval); break; case VM_SET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_set_capability(sc->vm, vmcap->cpuid, vmcap->captype, vmcap->capval); break; case VM_SET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_set_x2apic_state(sc->vm, x2apic->cpuid, x2apic->state); break; case VM_GET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(sc->vm, x2apic->cpuid, &x2apic->state); break; case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), gpapte->gpa, gpapte->pte, &gpapte->ptenum); error = 0; break; case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; case VM_GLA2GPA: { CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; } case VM_GLA2GPA_NOFAULT: gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; case VM_ACTIVATE_CPU: vac = (struct vm_activate_cpu *)data; error = vm_activate_cpu(sc->vm, vac->vcpuid); break; case VM_GET_CPUS: error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; case VM_SUSPEND_CPU: vac = (struct vm_activate_cpu *)data; error = vm_suspend_cpu(sc->vm, vac->vcpuid); break; case VM_RESUME_CPU: vac = (struct vm_activate_cpu *)data; error = vm_resume_cpu(sc->vm, vac->vcpuid); break; case VM_SET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1); break; case VM_GET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, &vmii->info2); break; case VM_RTC_WRITE: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_write(sc->vm, rtcdata->offset, rtcdata->value); break; case VM_RTC_READ: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_read(sc->vm, rtcdata->offset, &rtcdata->value); break; case VM_RTC_SETTIME: rtctime = (struct vm_rtc_time *)data; error = vrtc_set_time(sc->vm, rtctime->secs); break; case VM_RTC_GETTIME: error = 0; rtctime = (struct vm_rtc_time *)data; rtctime->secs = vrtc_get_time(sc->vm); break; case VM_RESTART_INSTRUCTION: error = vm_restart_instruction(sc->vm, vcpu); break; case VM_SET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, 
topology->maxcpus); break; case VM_GET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; +#ifdef BHYVE_SNAPSHOT + case VM_SNAPSHOT_REQ: + snapshot_meta = (struct vm_snapshot_meta *)data; + error = vm_snapshot_req(sc->vm, snapshot_meta); + break; + case VM_RESTORE_TIME: + error = vm_restore_time(sc->vm); + break; +#endif default: error = ENOTTY; break; } if (state_changed == 1) vcpu_unlock_one(sc, vcpu); else if (state_changed == 2) vcpu_unlock_all(sc); done: /* * Make sure that no handler returns a kernel-internal * error value to userspace. */ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; uint16_t lastcpu; bool sysmem; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vcpu_unlock_one(sc, lastcpu); return (error); } static void vmmdev_destroy(void *arg) { struct vmmdev_softc *sc = arg; struct devmem_softc *dsc; int error; error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } if (sc->cdev != NULL) destroy_dev(sc->cdev); if (sc->vm != NULL) vm_destroy(sc->vm); if ((sc->flags & VSC_LINKED) != 0) { mtx_lock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); mtx_unlock(&vmmdev_mtx); } free(sc, M_VMMDEV); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL || sc->cdev == NULL) { mtx_unlock(&vmmdev_mtx); error = EINVAL; goto out; } /* * The 'cdev' will be destroyed asynchronously when 'si_threadcount' * goes down to 0 so we should not do it again in the callback. * * Setting 'sc->cdev' to NULL is also used to indicate that the VM * is scheduled for destruction. 
*/ cdev = sc->cdev; sc->cdev = NULL; mtx_unlock(&vmmdev_mtx); /* * Schedule all cdevs to be destroyed: * * - any new operations on the 'cdev' will return an error (ENXIO). * * - when the 'si_threadcount' dwindles down to zero the 'cdev' will * be destroyed and the callback will be invoked in a taskqueue * context. * * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc); } destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); error = 0; out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", NULL); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { struct vm *vm; struct cdev *cdev; struct vmmdev_softc *sc, *sc2; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); mtx_unlock(&vmmdev_mtx); if (sc != NULL) { error = EEXIST; goto out; } error = vm_create(buf, &vm); if (error != 0) goto out; sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->vm = vm; SLIST_INIT(&sc->devmem); /* * Lookup the name again just in case somebody sneaked in when we * dropped the lock. 
*/ mtx_lock(&vmmdev_mtx); sc2 = vmmdev_lookup(buf); if (sc2 == NULL) { SLIST_INSERT_HEAD(&head, sc, link); sc->flags |= VSC_LINKED; } mtx_unlock(&vmmdev_mtx); if (sc2 != NULL) { vmmdev_destroy(sc); error = EEXIST; goto out; } error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); if (error != 0) { vmmdev_destroy(sc); goto out; } mtx_lock(&vmmdev_mtx); sc->cdev = cdev; sc->cdev->si_drv1 = sc; mtx_unlock(&vmmdev_mtx); out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", NULL); void vmmdev_init(void) { mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail."); } int vmmdev_cleanup(void) { int error; if (SLIST_EMPTY(&head)) error = 0; else error = EBUSY; return (error); } static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { struct devmem_softc *dsc; vm_ooffset_t first, last; size_t seglen; int error; uint16_t lastcpu; bool sysmem; dsc = cdev->si_drv1; if (dsc == NULL) { /* 'cdev' has been created but is not ready for use */ return (ENXIO); } first = *offset; last = *offset + len; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1; error = vcpu_lock_one(dsc->sc, lastcpu); if (error) return (error); error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); KASSERT(error == 0 && !sysmem && *objp != NULL, ("%s: invalid devmem segment %d", __func__, dsc->segid)); vcpu_unlock_one(dsc->sc, lastcpu); if (seglen >= last) { vm_object_reference(*objp); return (0); } else { return (EINVAL); } } static struct cdevsw devmemsw = { .d_name = "devmem", .d_version = D_VERSION, .d_mmap_single = devmem_mmap_single, }; static int devmem_create_cdev(const char *vmname, int segid, char *devname) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; int error; error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); if (error) return (error); dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(vmname); KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); if (sc->cdev == NULL) { /* virtual machine is being created or destroyed */ mtx_unlock(&vmmdev_mtx); free(dsc, M_VMMDEV); destroy_dev_sched_cb(cdev, NULL, 0); return (ENODEV); } dsc->segid = segid; dsc->name = devname; dsc->cdev = cdev; dsc->sc = sc; SLIST_INSERT_HEAD(&sc->devmem, dsc, link); mtx_unlock(&vmmdev_mtx); /* The 'cdev' is ready for use after 'si_drv1' is initialized */ cdev->si_drv1 = dsc; return (0); } static void devmem_destroy(void *arg) { struct devmem_softc *dsc = arg; KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); dsc->cdev = NULL; dsc->sc = NULL; } diff --git a/sys/amd64/vmm/vmm_snapshot.c b/sys/amd64/vmm/vmm_snapshot.c new file mode 100644 index 000000000000..c77bb05f76b7 --- /dev/null +++ b/sys/amd64/vmm/vmm_snapshot.c @@ -0,0 +1,141 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Flavius Anton + * Copyright (c) 2016 Mihai Tiganus + * Copyright (c) 2016-2019 Mihai Carabas + * Copyright (c) 2017-2019 Darius Mihai + * Copyright (c) 2017-2019 Elena Mihailescu + * Copyright (c) 2018-2019 Sergiu Weisz + * All rights reserved. 
+ * The bhyve-snapshot feature was developed under sponsorships + * from Matthew Grooms. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +void +vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) +{ + const char *opstr; + + if (op == VM_SNAPSHOT_SAVE) + opstr = "save"; + else if (op == VM_SNAPSHOT_RESTORE) + opstr = "restore"; + else + opstr = "unknown"; + + printf("%s: snapshot-%s failed for %s\r\n", __func__, opstr, bufname); +} + +int +vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + void *nv_data; + + nv_data = __DEVOLATILE(void *, data); + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + printf("%s: buffer too small\r\n", __func__); + return (E2BIG); + } + + if (op == VM_SNAPSHOT_SAVE) + copyout(nv_data, buffer->buf, data_size); + else if (op == VM_SNAPSHOT_RESTORE) + copyin(buffer->buf, nv_data, data_size); + else + return (EINVAL); + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + + return (0); +} + +size_t +vm_get_snapshot_size(struct vm_snapshot_meta *meta) +{ + size_t length; + struct vm_snapshot_buffer *buffer; + + buffer = &meta->buffer; + + if (buffer->buf_size < buffer->buf_rem) { + printf("%s: Invalid buffer: size = %zu, rem = %zu\r\n", + __func__, buffer->buf_size, buffer->buf_rem); + length = 0; + } else { + length = buffer->buf_size - buffer->buf_rem; + } + + return (length); +} + +int +vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + int ret; + void *_data = *(void **)(void *)&data; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + printf("%s: buffer too small\r\n", __func__); + ret = E2BIG; + goto done; + } + + if (op == VM_SNAPSHOT_SAVE) { + ret = 0; + copyout(_data, buffer->buf, data_size); + } else if (op == VM_SNAPSHOT_RESTORE) { + ret = memcmp(_data, buffer->buf, data_size); + } else { + ret = EINVAL; + goto done; + } + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + +done: + return (ret); +} diff --git a/sys/conf/config.mk b/sys/conf/config.mk index 6b405890458e..50188eee923b 
100644 --- a/sys/conf/config.mk +++ b/sys/conf/config.mk @@ -1,65 +1,72 @@ # $FreeBSD$ # # Common code to marry kernel config(8) goo and module building goo. # # Generate options files that otherwise would be built # in substantially similar ways through the tree. Move # the code here when they all produce identical results # (or should) .if !defined(KERNBUILDDIR) opt_global.h: touch ${.TARGET} .if ${MACHINE} != "mips" @echo "#define SMP 1" >> ${.TARGET} @echo "#define MAC 1" >> ${.TARGET} @echo "#define VIMAGE 1" >> ${.TARGET} .endif +.if ${MK_BHYVE_SNAPSHOT} != "no" +opt_bhyve_snapshot.h: + @echo "#define BHYVE_SNAPSHOT 1" > ${.TARGET} +.endif opt_bpf.h: echo "#define DEV_BPF 1" > ${.TARGET} .if ${MK_INET_SUPPORT} != "no" opt_inet.h: @echo "#define INET 1" > ${.TARGET} @echo "#define TCP_OFFLOAD 1" >> ${.TARGET} .endif .if ${MK_INET6_SUPPORT} != "no" opt_inet6.h: @echo "#define INET6 1" > ${.TARGET} .endif .if ${MK_RATELIMIT} != "no" opt_ratelimit.h: @echo "#define RATELIMIT 1" > ${.TARGET} .endif opt_mrouting.h: echo "#define MROUTING 1" > ${.TARGET} opt_printf.h: echo "#define PRINTF_BUFR_SIZE 128" > ${.TARGET} opt_scsi.h: echo "#define SCSI_DELAY 15000" > ${.TARGET} opt_wlan.h: echo "#define IEEE80211_DEBUG 1" > ${.TARGET} echo "#define IEEE80211_SUPPORT_MESH 1" >> ${.TARGET} KERN_OPTS.i386=NEW_PCIB DEV_PCI KERN_OPTS.amd64=NEW_PCIB DEV_PCI KERN_OPTS.powerpc=NEW_PCIB DEV_PCI KERN_OPTS=MROUTING IEEE80211_DEBUG \ IEEE80211_SUPPORT_MESH DEV_BPF \ ${KERN_OPTS.${MACHINE}} ${KERN_OPTS_EXTRA} +.if ${MK_BHYVE_SNAPSHOT} != "no" +KERN_OPTS+= BHYVE_SNAPSHOT +.endif .if ${MK_INET_SUPPORT} != "no" KERN_OPTS+= INET TCP_OFFLOAD .endif .if ${MK_INET6_SUPPORT} != "no" KERN_OPTS+= INET6 .endif .elif !defined(KERN_OPTS) # Add all the options that are mentioned in any opt_*.h file when we # have a kernel build directory to pull them from. KERN_OPTS!=cat ${KERNBUILDDIR}/opt*.h | awk '{print $$2;}' | sort -u .export KERN_OPTS .endif .if !defined(NO_MODULES) && !defined(__MPATH) && !make(install) && \ (empty(.MAKEFLAGS:M-V) || defined(NO_SKIP_MPATH)) __MPATH!=find ${SYSDIR:tA}/ -name \*_if.m .export __MPATH .endif diff --git a/sys/conf/kern.opts.mk b/sys/conf/kern.opts.mk index 078d79913634..bc7ddfd06e7c 100644 --- a/sys/conf/kern.opts.mk +++ b/sys/conf/kern.opts.mk @@ -1,177 +1,178 @@ # $FreeBSD$ # Options set in the build system that affect the kernel somehow. # # Define MK_* variables (which are either "yes" or "no") for users # to set via WITH_*/WITHOUT_* in /etc/src.conf and override in the # make(1) environment. # These should be tested with `== "no"' or `!= "no"' in makefiles. # The NO_* variables should only be set by makefiles for variables # that haven't been converted over. # # Note: bsd.own.mk must be included before the rest of kern.opts.mk to make # building on 10.x and earlier work. This should be removed when that's no # longer supported since it confounds the defaults (since it uses the host's # notion of defaults rather than what's default in current when building # within sys/modules). .include # These options are used by the kernel build process (kern.mk and kmod.mk) # They have to be listed here so we can build modules outside of the # src tree. 
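[Editorial note, a minimal sketch of how the new knob is selected; the file location is the standard src.conf path already mentioned in the kern.opts.mk comment above, not something introduced by this patch.] The hunk that follows adds BHYVE_SNAPSHOT to __DEFAULT_NO_OPTIONS, so the snapshot code stays disabled unless the option is turned on through the usual WITH_*/WITHOUT_* mechanism:

    WITH_BHYVE_SNAPSHOT=yes    # e.g. in /etc/src.conf; yields MK_BHYVE_SNAPSHOT=yes

With the option enabled, the config.mk hunk above generates opt_bhyve_snapshot.h containing "#define BHYVE_SNAPSHOT 1" and appends BHYVE_SNAPSHOT to KERN_OPTS; further down in this diff, the vmm module Makefile then compiles vmm_snapshot.c, and the bhyve userland Makefile adds snapshot.c, the ucl/xo libraries, and -DBHYVE_SNAPSHOT.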
KLDXREF_CMD?= kldxref __DEFAULT_YES_OPTIONS = \ AUTOFS \ BHYVE \ BLUETOOTH \ CCD \ CDDL \ CRYPT \ CUSE \ EFI \ FORMAT_EXTENSIONS \ INET \ INET6 \ IPFILTER \ IPSEC_SUPPORT \ ISCSI \ KERNEL_SYMBOLS \ NETGRAPH \ PF \ SOURCELESS_HOST \ SOURCELESS_UCODE \ TESTS \ USB_GADGET_EXAMPLES \ ZFS __DEFAULT_NO_OPTIONS = \ + BHYVE_SNAPSHOT \ EXTRA_TCP_STACKS \ KERNEL_RETPOLINE \ OFED \ RATELIMIT \ REPRODUCIBLE_BUILD # Some options are totally broken on some architectures. We disable # them. If you need to enable them on an experimental basis, you # must change this code. # Note: These only apply to the list of modules we build by default # and sometimes what is in the opt_*.h files by default. # Kernel config files are unaffected, though some targets can be # affected by KERNEL_SYMBOLS, FORMAT_EXTENSIONS, CTF and SSP. # Things that don't work based on the CPU .if ${MACHINE_CPUARCH} == "arm" . if ${MACHINE_ARCH:Marmv[67]*} == "" BROKEN_OPTIONS+= CDDL ZFS . endif .endif .if ${MACHINE_CPUARCH} == "mips" BROKEN_OPTIONS+= CDDL ZFS SSP .endif .if ${MACHINE_CPUARCH} == "powerpc" && ${MACHINE_ARCH} == "powerpc" BROKEN_OPTIONS+= ZFS .endif .if ${MACHINE_CPUARCH} == "riscv" BROKEN_OPTIONS+= FORMAT_EXTENSIONS .endif # Things that don't work because the kernel doesn't have the support # for them. .if ${MACHINE} != "i386" && ${MACHINE} != "amd64" BROKEN_OPTIONS+= OFED .endif # Things that don't work based on toolchain support. .if ${MACHINE} != "i386" && ${MACHINE} != "amd64" BROKEN_OPTIONS+= KERNEL_RETPOLINE .endif # EFI doesn't exist on mips, powerpc, or riscv. .if ${MACHINE:Mmips} || ${MACHINE:Mpowerpc} || ${MACHINE:Mriscv} BROKEN_OPTIONS+=EFI .endif # expanded inline from bsd.mkopt.mk to avoid share/mk dependency # Those that default to yes .for var in ${__DEFAULT_YES_OPTIONS} .if !defined(MK_${var}) .if defined(WITHOUT_${var}) # WITHOUT always wins MK_${var}:= no .else MK_${var}:= yes .endif .else .if ${MK_${var}} != "yes" && ${MK_${var}} != "no" .error "Illegal value for MK_${var}: ${MK_${var}}" .endif .endif # !defined(MK_${var}) .endfor .undef __DEFAULT_YES_OPTIONS # Those that default to no .for var in ${__DEFAULT_NO_OPTIONS} .if !defined(MK_${var}) .if defined(WITH_${var}) && !defined(WITHOUT_${var}) # WITHOUT always wins MK_${var}:= yes .else MK_${var}:= no .endif .else .if ${MK_${var}} != "yes" && ${MK_${var}} != "no" .error "Illegal value for MK_${var}: ${MK_${var}}" .endif .endif # !defined(MK_${var}) .endfor .undef __DEFAULT_NO_OPTIONS # # MK_* options which are always no, usually because they are # unsupported/badly broken on this architecture. # .for var in ${BROKEN_OPTIONS} MK_${var}:= no .endfor .undef BROKEN_OPTIONS #end of bsd.mkopt.mk expanded inline. # # MK_*_SUPPORT options which default to "yes" unless their corresponding # MK_* variable is set to "no". # .for var in \ INET \ INET6 .if defined(WITHOUT_${var}_SUPPORT) || ${MK_${var}} == "no" MK_${var}_SUPPORT:= no .else .if defined(KERNBUILDDIR) # See if there's an opt_foo.h .if !defined(OPT_${var}) OPT_${var}!= cat ${KERNBUILDDIR}/opt_${var:tl}.h; echo .export OPT_${var} .endif .if ${OPT_${var}} == "" # nothing -> no MK_${var}_SUPPORT:= no .else MK_${var}_SUPPORT:= yes .endif .else # otherwise, yes MK_${var}_SUPPORT:= yes .endif .endif .endfor # Some modules only compile successfully if option FDT is set, due to #ifdef FDT # wrapped around declarations. 
Module makefiles can optionally compile such # things using .if !empty(OPT_FDT) .if !defined(OPT_FDT) && defined(KERNBUILDDIR) OPT_FDT!= sed -n '/FDT/p' ${KERNBUILDDIR}/opt_platform.h .export OPT_FDT .endif diff --git a/sys/conf/options.amd64 b/sys/conf/options.amd64 index 8939ddaf6246..cd90747ba732 100644 --- a/sys/conf/options.amd64 +++ b/sys/conf/options.amd64 @@ -1,70 +1,71 @@ # $FreeBSD$ # Options specific to AMD64 platform kernels AUTO_EOI_1 opt_auto_eoi.h AUTO_EOI_2 opt_auto_eoi.h +BHYVE_SNAPSHOT COUNT_XINVLTLB_HITS opt_smp.h COUNT_IPIS opt_smp.h MAXMEM MPTABLE_FORCE_HTT MP_WATCHDOG NKPT opt_pmap.h PV_STATS opt_pmap.h # Options for emulators. These should only be used at config time, so # they are handled like options for static filesystems # (see src/sys/conf/options), except for broken debugging options. COMPAT_AOUT opt_dontuse.h COMPAT_FREEBSD32 opt_global.h #COMPAT_LINUX opt_dontuse.h COMPAT_LINUX32 opt_compat.h LINPROCFS opt_dontuse.h LINSYSFS opt_dontuse.h NDISAPI opt_dontuse.h TIMER_FREQ opt_clock.h # options for serial support COM_ESP opt_sio.h COM_MULTIPORT opt_sio.h CONSPEED opt_sio.h GDBSPEED opt_sio.h COM_NO_ACPI opt_sio.h VGA_ALT_SEQACCESS opt_vga.h VGA_DEBUG opt_vga.h VGA_NO_FONT_LOADING opt_vga.h VGA_NO_MODE_CHANGE opt_vga.h VGA_SLOW_IOACCESS opt_vga.h VGA_WIDTH90 opt_vga.h VESA VESA_DEBUG opt_vesa.h # AGP debugging support AGP_DEBUG opt_agp.h ATKBD_DFLT_KEYMAP opt_atkbd.h # iWARP client interface support in ixl IXL_IW opt_ixl.h # ------------------------------- # EOF # ------------------------------- HAMMER opt_cpu.h PSM_HOOKRESUME opt_psm.h PSM_RESETAFTERSUSPEND opt_psm.h PSM_DEBUG opt_psm.h DEV_ATPIC opt_atpic.h # BPF just-in-time compiler BPF_JITTER opt_bpf.h XENHVM opt_global.h # options for the Intel C600 SAS driver (isci) ISCI_LOGGING opt_isci.h # EFI Runtime services support EFIRT opt_efirt.h diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile index 9471fc9074dc..b5d62c358272 100644 --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -1,83 +1,90 @@ # $FreeBSD$ +.include + KMOD= vmm -SRCS= opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h +SRCS= opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h +SRCS+= device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h DPSRCS+= vmx_assym.h svm_assym.h DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc CFLAGS+= -DVMM_KEEP_STATS CFLAGS+= -I${SRCTOP}/sys/amd64/vmm CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/io CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd # generic vmm support .PATH: ${SRCTOP}/sys/amd64/vmm SRCS+= vmm.c \ vmm_dev.c \ vmm_host.c \ vmm_instruction_emul.c \ vmm_ioport.c \ vmm_lapic.c \ vmm_mem.c \ vmm_stat.c \ vmm_util.c \ x86.c .PATH: ${SRCTOP}/sys/amd64/vmm/io SRCS+= iommu.c \ ppt.c \ vatpic.c \ vatpit.c \ vhpet.c \ vioapic.c \ vlapic.c \ vpmtmr.c \ vrtc.c # intel-specific files .PATH: ${SRCTOP}/sys/amd64/vmm/intel SRCS+= ept.c \ vmcs.c \ vmx_msr.c \ vmx_support.S \ vmx.c \ vtd.c # amd-specific files .PATH: ${SRCTOP}/sys/amd64/vmm/amd SRCS+= vmcb.c \ svm.c \ svm_support.S \ npt.c \ ivrs_drv.c \ amdvi_hw.c \ svm_msr.c +.if ${KERN_OPTS:MBHYVE_SNAPSHOT} != "" +SRCS+= vmm_snapshot.c +.endif + CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h OBJS_DEPEND_GUESS.svm_support.o+= svm_assym.h vmx_assym.h: vmx_genassym.o sh ${SYSDIR}/kern/genassym.sh vmx_genassym.o > ${.TARGET} svm_assym.h: svm_genassym.o sh ${SYSDIR}/kern/genassym.sh svm_genassym.o > ${.TARGET} vmx_support.o: ${CC} -c -x 
assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} svm_support.o: ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} vmx_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} svm_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} .include diff --git a/tools/build/options/WITH_BHYVE_SNAPSHOT b/tools/build/options/WITH_BHYVE_SNAPSHOT new file mode 100644 index 000000000000..7e673f51c8bb --- /dev/null +++ b/tools/build/options/WITH_BHYVE_SNAPSHOT @@ -0,0 +1,7 @@ +.\" $FreeBSD$ +Set to include support for save and restore (snapshots) in +.Xr bhyve 8 +and +.Xr bhyvectl 8 . +.Pp +This option only affects amd64/amd64. diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index 12bd477825bf..9a4460a1b90f 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -1,102 +1,117 @@ # # $FreeBSD$ # .include CFLAGS+=-I${SRCTOP}/sys .PATH: ${SRCTOP}/sys/cam/ctl PROG= bhyve PACKAGE= bhyve MAN= bhyve.8 BHYVE_SYSDIR?=${SRCTOP} SRCS= \ atkbdc.c \ acpi.c \ audio.c \ bhyvegc.c \ bhyverun.c \ block_if.c \ bootrom.c \ console.c \ consport.c \ ctl_util.c \ ctl_scsi_all.c \ dbgport.c \ fwctl.c \ gdb.c \ hda_codec.c \ inout.c \ ioapic.c \ mem.c \ mevent.c \ mptbl.c \ net_backends.c \ net_utils.c \ pci_ahci.c \ pci_e82545.c \ pci_emul.c \ pci_hda.c \ pci_fbuf.c \ pci_hostbridge.c \ pci_irq.c \ pci_lpc.c \ pci_nvme.c \ pci_passthru.c \ pci_virtio_block.c \ pci_virtio_console.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ pci_virtio_scsi.c \ pci_uart.c \ pci_xhci.c \ pm.c \ post.c \ ps2kbd.c \ ps2mouse.c \ rfb.c \ rtc.c \ smbiostbl.c \ sockstream.c \ task_switch.c \ uart_emul.c \ usb_emul.c \ usb_mouse.c \ virtio.c \ vga.c \ vmgenc.c \ xmsr.c \ spinup_ap.c \ iov.c +.if ${MK_BHYVE_SNAPSHOT} != "no" +SRCS+= snapshot.c +.endif + .PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm SRCS+= vmm_instruction_emul.c LIBADD= vmmapi md pthread z util sbuf cam +.if ${MK_BHYVE_SNAPSHOT} != "no" +LIBADD+= ucl xo +.endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET .endif .if ${MK_INET6_SUPPORT} != "no" CFLAGS+=-DINET6 .endif .if ${MK_OPENSSL} == "no" CFLAGS+=-DNO_OPENSSL .else LIBADD+= crypto .endif CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller +.if ${MK_BHYVE_SNAPSHOT} != "no" +CFLAGS+= -I${SRCTOP}/contrib/libucl/include + +# Temporary disable capsicum, until we integrate checkpoint code with it. +CFLAGS+= -DWITHOUT_CAPSICUM + +CFLAGS+= -DBHYVE_SNAPSHOT +.endif .ifdef GDB_LOG CFLAGS+=-DGDB_LOG .endif WARNS?= 2 .include diff --git a/usr.sbin/bhyve/Makefile.depend b/usr.sbin/bhyve/Makefile.depend index 8d3ff079d277..8222ceb6ad25 100644 --- a/usr.sbin/bhyve/Makefile.depend +++ b/usr.sbin/bhyve/Makefile.depend @@ -1,25 +1,27 @@ # $FreeBSD$ # Autogenerated - do NOT edit! DIRDEPS = \ include \ include/arpa \ include/xlocale \ lib/${CSU_DIR} \ lib/libc \ lib/libcam \ lib/libcapsicum \ lib/libcasper/libcasper \ lib/libcompiler_rt \ lib/libsbuf \ lib/libthr \ + lib/libucl \ lib/libutil \ lib/libvmmapi \ + lib/libxo \ lib/libz \ .include .if ${DEP_RELDIR} == ${_DEP_RELDIR} # local dependencies - needed for -jN in clean tree .endif diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c index 1c1838c2e80c..a08f58f84b22 100644 --- a/usr.sbin/bhyve/atkbdc.c +++ b/usr.sbin/bhyve/atkbdc.c @@ -1,586 +1,632 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include #include #include #include #include #include #include #include "acpi.h" #include "atkbdc.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "ps2kbd.h" #include "ps2mouse.h" #define KBD_DATA_PORT 0x60 #define KBD_STS_CTL_PORT 0x64 #define KBDC_RESET 0xfe #define KBD_DEV_IRQ 1 #define AUX_DEV_IRQ 12 /* controller commands */ #define KBDC_SET_COMMAND_BYTE 0x60 #define KBDC_GET_COMMAND_BYTE 0x20 #define KBDC_DISABLE_AUX_PORT 0xa7 #define KBDC_ENABLE_AUX_PORT 0xa8 #define KBDC_TEST_AUX_PORT 0xa9 #define KBDC_TEST_CTRL 0xaa #define KBDC_TEST_KBD_PORT 0xab #define KBDC_DISABLE_KBD_PORT 0xad #define KBDC_ENABLE_KBD_PORT 0xae #define KBDC_READ_INPORT 0xc0 #define KBDC_READ_OUTPORT 0xd0 #define KBDC_WRITE_OUTPORT 0xd1 #define KBDC_WRITE_KBD_OUTBUF 0xd2 #define KBDC_WRITE_AUX_OUTBUF 0xd3 #define KBDC_WRITE_TO_AUX 0xd4 /* controller command byte (set by KBDC_SET_COMMAND_BYTE) */ #define KBD_TRANSLATION 0x40 #define KBD_SYS_FLAG_BIT 0x04 #define KBD_DISABLE_KBD_PORT 0x10 #define KBD_DISABLE_AUX_PORT 0x20 #define KBD_ENABLE_AUX_INT 0x02 #define KBD_ENABLE_KBD_INT 0x01 #define KBD_KBD_CONTROL_BITS (KBD_DISABLE_KBD_PORT | KBD_ENABLE_KBD_INT) #define KBD_AUX_CONTROL_BITS (KBD_DISABLE_AUX_PORT | KBD_ENABLE_AUX_INT) /* controller status bits */ #define KBDS_KBD_BUFFER_FULL 0x01 #define KBDS_SYS_FLAG 0x04 #define KBDS_CTRL_FLAG 0x08 #define KBDS_AUX_BUFFER_FULL 0x20 /* controller output port */ #define KBDO_KBD_OUTFULL 0x10 #define KBDO_AUX_OUTFULL 0x20 #define RAMSZ 32 #define FIFOSZ 15 #define CTRL_CMD_FLAG 0x8000 struct kbd_dev { bool irq_active; int irq; uint8_t buffer[FIFOSZ]; int brd, bwr; int bcnt; }; struct aux_dev { bool irq_active; int irq; }; struct atkbdc_softc { struct vmctx *ctx; pthread_mutex_t mtx; struct ps2kbd_softc *ps2kbd_sc; struct ps2mouse_softc *ps2mouse_sc; uint8_t status; /* status register */ uint8_t outport; /* controller output port */ uint8_t ram[RAMSZ]; /* byte0 = controller config */ uint32_t curcmd; /* current command for next byte */ uint32_t ctrlbyte; struct kbd_dev kbd; struct aux_dev aux; }; +#ifdef BHYVE_SNAPSHOT +static struct atkbdc_softc *atkbdc_sc = NULL; +#endif + static void 
atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) { if ((sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { sc->kbd.irq_active = true; vm_isa_pulse_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); } } static void atkbdc_assert_aux_intr(struct atkbdc_softc *sc) { if ((sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { sc->aux.irq_active = true; vm_isa_pulse_irq(sc->ctx, sc->aux.irq, sc->aux.irq); } } static int atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val) { assert(pthread_mutex_isowned_np(&sc->mtx)); if (sc->kbd.bcnt < FIFOSZ) { sc->kbd.buffer[sc->kbd.bwr] = val; sc->kbd.bwr = (sc->kbd.bwr + 1) % FIFOSZ; sc->kbd.bcnt++; sc->status |= KBDS_KBD_BUFFER_FULL; sc->outport |= KBDO_KBD_OUTFULL; } else { printf("atkbd data buffer full\n"); } return (sc->kbd.bcnt < FIFOSZ); } static void atkbdc_kbd_read(struct atkbdc_softc *sc) { const uint8_t translation[256] = { 0xff, 0x43, 0x41, 0x3f, 0x3d, 0x3b, 0x3c, 0x58, 0x64, 0x44, 0x42, 0x40, 0x3e, 0x0f, 0x29, 0x59, 0x65, 0x38, 0x2a, 0x70, 0x1d, 0x10, 0x02, 0x5a, 0x66, 0x71, 0x2c, 0x1f, 0x1e, 0x11, 0x03, 0x5b, 0x67, 0x2e, 0x2d, 0x20, 0x12, 0x05, 0x04, 0x5c, 0x68, 0x39, 0x2f, 0x21, 0x14, 0x13, 0x06, 0x5d, 0x69, 0x31, 0x30, 0x23, 0x22, 0x15, 0x07, 0x5e, 0x6a, 0x72, 0x32, 0x24, 0x16, 0x08, 0x09, 0x5f, 0x6b, 0x33, 0x25, 0x17, 0x18, 0x0b, 0x0a, 0x60, 0x6c, 0x34, 0x35, 0x26, 0x27, 0x19, 0x0c, 0x61, 0x6d, 0x73, 0x28, 0x74, 0x1a, 0x0d, 0x62, 0x6e, 0x3a, 0x36, 0x1c, 0x1b, 0x75, 0x2b, 0x63, 0x76, 0x55, 0x56, 0x77, 0x78, 0x79, 0x7a, 0x0e, 0x7b, 0x7c, 0x4f, 0x7d, 0x4b, 0x47, 0x7e, 0x7f, 0x6f, 0x52, 0x53, 0x50, 0x4c, 0x4d, 0x48, 0x01, 0x45, 0x57, 0x4e, 0x51, 0x4a, 0x37, 0x49, 0x46, 0x54, 0x80, 0x81, 0x82, 0x41, 0x54, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; uint8_t val; uint8_t release = 0; assert(pthread_mutex_isowned_np(&sc->mtx)); if (sc->ram[0] & KBD_TRANSLATION) { while (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) { if (val == 0xf0) { release = 0x80; continue; } else { val = translation[val] | release; } atkbdc_kbd_queue_data(sc, val); break; } } else { while (sc->kbd.bcnt < FIFOSZ) { if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) atkbdc_kbd_queue_data(sc, val); else break; } } if (((sc->ram[0] & KBD_DISABLE_AUX_PORT) || ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) && sc->kbd.bcnt > 0) atkbdc_assert_kbd_intr(sc); } static void atkbdc_aux_poll(struct atkbdc_softc *sc) { if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) { sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; sc->outport |= KBDO_AUX_OUTFULL; atkbdc_assert_aux_intr(sc); } } static void atkbdc_kbd_poll(struct atkbdc_softc *sc) { assert(pthread_mutex_isowned_np(&sc->mtx)); atkbdc_kbd_read(sc); } static void atkbdc_poll(struct atkbdc_softc *sc) { atkbdc_aux_poll(sc); atkbdc_kbd_poll(sc); } static void atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf) { assert(pthread_mutex_isowned_np(&sc->mtx)); if (ps2mouse_read(sc->ps2mouse_sc, buf) == 0) 
{ if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) { if (sc->kbd.bcnt == 0) sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); else sc->status &= ~(KBDS_AUX_BUFFER_FULL); sc->outport &= ~KBDO_AUX_OUTFULL; } atkbdc_poll(sc); return; } if (sc->kbd.bcnt > 0) { *buf = sc->kbd.buffer[sc->kbd.brd]; sc->kbd.brd = (sc->kbd.brd + 1) % FIFOSZ; sc->kbd.bcnt--; if (sc->kbd.bcnt == 0) { sc->status &= ~KBDS_KBD_BUFFER_FULL; sc->outport &= ~KBDO_KBD_OUTFULL; } atkbdc_poll(sc); } if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0 && sc->kbd.bcnt == 0) { sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); } } static int atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct atkbdc_softc *sc; uint8_t buf; int retval; if (bytes != 1) return (-1); sc = arg; retval = 0; pthread_mutex_lock(&sc->mtx); if (in) { sc->curcmd = 0; if (sc->ctrlbyte != 0) { *eax = sc->ctrlbyte & 0xff; sc->ctrlbyte = 0; } else { /* read device buffer; includes kbd cmd responses */ atkbdc_dequeue_data(sc, &buf); *eax = buf; } sc->status &= ~KBDS_CTRL_FLAG; pthread_mutex_unlock(&sc->mtx); return (retval); } if (sc->status & KBDS_CTRL_FLAG) { /* * Command byte for the controller. */ switch (sc->curcmd) { case KBDC_SET_COMMAND_BYTE: sc->ram[0] = *eax; if (sc->ram[0] & KBD_SYS_FLAG_BIT) sc->status |= KBDS_SYS_FLAG; else sc->status &= ~KBDS_SYS_FLAG; break; case KBDC_WRITE_OUTPORT: sc->outport = *eax; break; case KBDC_WRITE_TO_AUX: ps2mouse_write(sc->ps2mouse_sc, *eax, 0); atkbdc_poll(sc); break; case KBDC_WRITE_KBD_OUTBUF: atkbdc_kbd_queue_data(sc, *eax); break; case KBDC_WRITE_AUX_OUTBUF: ps2mouse_write(sc->ps2mouse_sc, *eax, 1); sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); atkbdc_aux_poll(sc); break; default: /* write to particular RAM byte */ if (sc->curcmd >= 0x61 && sc->curcmd <= 0x7f) { int byten; byten = (sc->curcmd - 0x60) & 0x1f; sc->ram[byten] = *eax & 0xff; } break; } sc->curcmd = 0; sc->status &= ~KBDS_CTRL_FLAG; pthread_mutex_unlock(&sc->mtx); return (retval); } /* * Data byte for the device. 
*/ ps2kbd_write(sc->ps2kbd_sc, *eax); atkbdc_poll(sc); pthread_mutex_unlock(&sc->mtx); return (retval); } static int atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct atkbdc_softc *sc; int error, retval; if (bytes != 1) return (-1); sc = arg; retval = 0; pthread_mutex_lock(&sc->mtx); if (in) { /* read status register */ *eax = sc->status; pthread_mutex_unlock(&sc->mtx); return (retval); } sc->curcmd = 0; sc->status |= KBDS_CTRL_FLAG; sc->ctrlbyte = 0; switch (*eax) { case KBDC_GET_COMMAND_BYTE: sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[0]; break; case KBDC_TEST_CTRL: sc->ctrlbyte = CTRL_CMD_FLAG | 0x55; break; case KBDC_TEST_AUX_PORT: case KBDC_TEST_KBD_PORT: sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_INPORT: sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_OUTPORT: sc->ctrlbyte = CTRL_CMD_FLAG | sc->outport; break; case KBDC_SET_COMMAND_BYTE: case KBDC_WRITE_OUTPORT: case KBDC_WRITE_KBD_OUTBUF: case KBDC_WRITE_AUX_OUTBUF: sc->curcmd = *eax; break; case KBDC_DISABLE_KBD_PORT: sc->ram[0] |= KBD_DISABLE_KBD_PORT; break; case KBDC_ENABLE_KBD_PORT: sc->ram[0] &= ~KBD_DISABLE_KBD_PORT; if (sc->kbd.bcnt > 0) sc->status |= KBDS_KBD_BUFFER_FULL; atkbdc_poll(sc); break; case KBDC_WRITE_TO_AUX: sc->curcmd = *eax; break; case KBDC_DISABLE_AUX_PORT: sc->ram[0] |= KBD_DISABLE_AUX_PORT; ps2mouse_toggle(sc->ps2mouse_sc, 0); sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); sc->outport &= ~KBDS_AUX_BUFFER_FULL; break; case KBDC_ENABLE_AUX_PORT: sc->ram[0] &= ~KBD_DISABLE_AUX_PORT; ps2mouse_toggle(sc->ps2mouse_sc, 1); if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; break; case KBDC_RESET: /* Pulse "reset" line */ error = vm_suspend(ctx, VM_SUSPEND_RESET); assert(error == 0 || errno == EALREADY); break; default: if (*eax >= 0x21 && *eax <= 0x3f) { /* read "byte N" from RAM */ int byten; byten = (*eax - 0x20) & 0x1f; sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[byten]; } break; } pthread_mutex_unlock(&sc->mtx); if (sc->ctrlbyte != 0) { sc->status |= KBDS_KBD_BUFFER_FULL; sc->status &= ~KBDS_AUX_BUFFER_FULL; atkbdc_assert_kbd_intr(sc); } else if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0 && (sc->ram[0] & KBD_DISABLE_AUX_PORT) == 0) { sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; atkbdc_assert_aux_intr(sc); } else if (sc->kbd.bcnt > 0 && (sc->ram[0] & KBD_DISABLE_KBD_PORT) == 0) { sc->status |= KBDS_KBD_BUFFER_FULL; atkbdc_assert_kbd_intr(sc); } return (retval); } void atkbdc_event(struct atkbdc_softc *sc, int iskbd) { pthread_mutex_lock(&sc->mtx); if (iskbd) atkbdc_kbd_poll(sc); else atkbdc_aux_poll(sc); pthread_mutex_unlock(&sc->mtx); } void atkbdc_init(struct vmctx *ctx) { struct inout_port iop; struct atkbdc_softc *sc; int error; sc = calloc(1, sizeof(struct atkbdc_softc)); sc->ctx = ctx; pthread_mutex_init(&sc->mtx, NULL); bzero(&iop, sizeof(struct inout_port)); iop.name = "atkdbc"; iop.port = KBD_STS_CTL_PORT; iop.size = 1; iop.flags = IOPORT_F_INOUT; iop.handler = atkbdc_sts_ctl_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); bzero(&iop, sizeof(struct inout_port)); iop.name = "atkdbc"; iop.port = KBD_DATA_PORT; iop.size = 1; iop.flags = IOPORT_F_INOUT; iop.handler = atkbdc_data_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); pci_irq_reserve(KBD_DEV_IRQ); sc->kbd.irq = KBD_DEV_IRQ; pci_irq_reserve(AUX_DEV_IRQ); sc->aux.irq = AUX_DEV_IRQ; sc->ps2kbd_sc = ps2kbd_init(sc); sc->ps2mouse_sc = ps2mouse_init(sc); 
+ +#ifdef BHYVE_SNAPSHOT + assert(atkbdc_sc == NULL); + atkbdc_sc = sc; +#endif +} + +#ifdef BHYVE_SNAPSHOT +int +atkbdc_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->outport, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->ram, + sizeof(atkbdc_sc->ram), meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->curcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->ctrlbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->kbd.buffer, + sizeof(atkbdc_sc->kbd.buffer), meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.brd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bwr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bcnt, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq, meta, ret, done); + + ret = ps2kbd_snapshot(atkbdc_sc->ps2kbd_sc, meta); + if (ret != 0) + goto done; + + ret = ps2mouse_snapshot(atkbdc_sc->ps2mouse_sc, meta); + +done: + return (ret); } +#endif static void atkbdc_dsdt(void) { dsdt_line(""); dsdt_line("Device (KBD)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0303\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(KBD_DATA_PORT, 1); dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); dsdt_fixed_irq(1); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_line(""); dsdt_line("Device (MOU)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0F13\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(KBD_DATA_PORT, 1); dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); dsdt_fixed_irq(12); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } LPC_DSDT(atkbdc_dsdt); diff --git a/usr.sbin/bhyve/atkbdc.h b/usr.sbin/bhyve/atkbdc.h index 85c8a7141eb2..14c00ed9ae88 100644 --- a/usr.sbin/bhyve/atkbdc.h +++ b/usr.sbin/bhyve/atkbdc.h @@ -1,38 +1,43 @@ /*- * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _ATKBDC_H_ #define _ATKBDC_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct vmctx; void atkbdc_init(struct vmctx *ctx); void atkbdc_event(struct atkbdc_softc *sc, int iskbd); +#ifdef BHYVE_SNAPSHOT +int atkbdc_snapshot(struct vm_snapshot_meta *meta); +#endif + #endif /* _ATKBDC_H_ */ diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 5d329a54491e..85e5f0256fbf 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1,669 +1,685 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd April 22, 2020 +.Dd May 04, 2020 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl AabCeHhPSuWwxY .Oo .Sm off .Fl c\~ .Oo .Op Cm cpus= .Ar numcpus .Oc .Op Cm ,sockets= Ar n .Op Cm ,cores= Ar n .Op Cm ,threads= Ar n .Oc .Sm on .Op Fl G Ar port .Op Fl g Ar gdbport .Oo Fl l .Sm off .Cm help | Ar lpcdev Op Cm \&, Ar conf .Sm on .Oc .Oo Fl m .Sm off .Ar memsize .Oo .Cm K No | Cm k No | Cm M No | Cm m No | Cm G No | Cm g No | Cm T No | Cm t .Oc .Sm on .Oc .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu +.Op Fl r Ar file .Oo Fl s .Sm off .Cm help | Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf .Sm on .Oc .Op Fl U Ar uuid .Ar vmname .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise, it is enough to run .Nm with a boot ROM of choice. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl a The guest's local APIC is configured in xAPIC mode. The xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl A Generate ACPI tables. Required for .Fx Ns /amd64 guests. .It Fl b Enable a low-level console device supported by .Fx kernels compiled with .Cd "device bvmconsole" . This option will be deprecated in a future version. 
.It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. The current maximum number of guest virtual CPUs is 16. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .It Fl C Include guest memory in core file. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. .It Fl g Ar gdbport For .Fx kernels compiled with .Cd "device bvmdebug" , allow a remote kernel kgdb to be relayed to the guest kernel gdb stub via a local IPv4 address and this port. This option will be deprecated in a future version. .It Fl G Ar port Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If .Ar port begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl h Print help message and exit. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. .It Fl l Op Ar help|lpcdev Ns Op , Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Ar com1 and .Ar com2 and the boot ROM device .Ar bootrom . .Pp .Ar help print a list of supported LPC devices. .It Fl m Ar memsize Ns Op Ar K|k|M|m|G|g|T|t Guest physical memory size in bytes. This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of K, M, G or T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. .Pp .Ar memsize defaults to 256M. .It Fl p Ar vcpu:hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. +.It Fl r Ar file +Resume a guest from a snapshot. +The guest memory contents are restored from +.Ar file , +and the guest device and vCPU state are restored from the file +.Dq Ar file Ns .kern . +.Pp +Note that the current snapshot file format requires that the configuration of +devices in the new VM match the VM from which the snapshot was taken by specifying the +same +.Op Fl s +and +.Op Fl l +options. +The count of vCPUs and memory configuration are read from the snapshot. .It Fl s Op Ar help|slot,emulation Ns Op , Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Bl -tag -width 10n .It Ar help print a list of supported PCI devices. .It Ar slot .Ar pcislot[:function] .Ar bus:pcislot:function .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. 
.It Ar emulation .Bl -tag -width 10n .It Li hostbridge | Li amd_hostbridge .Pp Provide a simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. The .Li amd_hostbridge emulation is identical but uses a PCI vendor ID of .Li AMD . .It Li passthru PCI pass-through device. .It Li virtio-net Virtio network interface. .It Li virtio-blk Virtio block storage interface. .It Li virtio-scsi Virtio SCSI interface. .It Li virtio-rnd Virtio RNG interface. .It Li virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Li ahci AHCI controller attached to arbitrary devices. .It Li ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Li ahci-hd AHCI controller attached to a SATA hard-drive. .It Li e1000 Intel e82545 network interface. .It Li uart PCI 16550 serial device. .It Li lpc LPC PCI-ISA bridge with COM1 and COM2 16550 serial ports and a boot ROM. The LPC bridge emulation can only be configured on bus 0. .It Li fbuf Raw framebuffer device attached to VNC server. .It Li xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Li nvme NVM Express (NVMe) controller. .It Li hda High Definition Audio Controller. .El .It Op Ar conf This optional parameter describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Pp Network devices: .Bl -tag -width 10n .It Ar tapN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx .It Ar vmnetN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx .Pp If .Ar mac is not specified, the MAC address is derived from a fixed OUI and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .El .Pp Block storage devices: .Bl -tag -width 10n .It Pa /filename Ns Oo , Ns Ar block-device-options Oc .It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc .El .Pp The .Ar block-device-options are: .Bl -tag -width 8n .It Li nocache Open the file with .Dv O_DIRECT . .It Li direct Open the file using .Dv O_SYNC . .It Li ro Force the file to be opened read-only. .It Li sectorsize= Ns Ar logical Ns Oo / Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .El .Pp SCSI devices: .Bl -tag -width 10n .It Pa /dev/cam/ctl Ns Oo Ar pp . Ns Ar vp Oc Ns Oo , Ns Ar scsi-device-options Oc .El .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Li iid= Ns Ar IID Initiator ID to use when sending requests to specified CTL port. The default value is 0. .El .Pp TTY devices: .Bl -tag -width 10n .It Li stdio Connect the serial port to the standard input and output of the .Nm process. .It Pa /dev/xxx Use the host TTY device for serial port I/O. .El .Pp Boot ROM device: .Bl -tag -width 10n .It Pa romfile Map .Ar romfile in the guest address space reserved for boot firmware. .El .Pp Pass-through devices: .Bl -tag -width 10n .It Ns Ar slot Ns / Ns Ar bus Ns / Ns Ar function Connect to a PCI device on the host at the selector described by .Ar slot , .Ar bus , and .Ar function numbers. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . 
.Pp Virtio console devices: .Bl -tag -width 10n .It Li port1= Ns Pa /path/to/port1.sock Ns ,anotherport= Ns Pa ... A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet -offset 2n .It Due to lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is no way to use the "console port" feature, nor the console port resize at present. .It Emergency write is advertised, but no-op at present. .El .El .Pp Framebuffer devices: .Bl -tag -width 10n .It Xo .Oo rfb= Ns Oo Ar IP\&: Oc Ns Ar port Oc Ns Oo ,w= Ns Ar width Oc Ns Oo ,h= Ns .Ar height Oc Ns Oo ,vga= Ns Ar vgaconf Oc Ns Oo Ns ,wait Oc Ns Oo ,password= Ns .Ar password Oc .Xc .Bl -tag -width 8n .It Ar IPv4:port No or Ar [IPv6%zone]:port An .Ar IP address and a .Ar port VNC should listen on. The default is to listen on localhost IPv4 address and default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. .It Ar width No and Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. Minimal supported resolution is 640x480 pixels, and maximum is 1920x1200 pixels. .It Ar vgaconf Possible values for this option are .Dq io (default), .Dq on , and .Dq off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Dq io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Dq on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Dq off option should be used for the UEFI guests that assume that VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .El .Pp xHCI USB devices: .Bl -tag -width 10n .It Li tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Pp NVMe devices: .Bl -tag -width 10n .It Li devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Ar ram=size_in_MiB . .It Li maxq Max number of queues. .It Li qsz Max elements in each queue. .It Li ioslots Max number of concurrent I/O requests. .It Li sectsz Sector size (defaults to blockif sector size). .It Li ser Serial number with maximum 20 characters. .El .Pp HD Audio devices: .Bl -tag -width 10n .It Li play Playback device, typically .Ar /dev/dsp0 . .It Li rec Recording device, typically .Ar /dev/dsp0 . .El .El .It Fl S Wire guest memory. .It Fl u RTC keeps UTC time. 
.It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are supported on Intel CPUs that support single stepping. Note that continuing from a breakpoint while interrupts are enabled in the guest may not work as expected due to timer interrupts firing while single stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width indent -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. Otherwise, the boot loader is not needed. .Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -A -H -P -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -A -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -A -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at: 0.0.0.0:5900. 
.Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer’s Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 324e40d2cda2..8d73bd38cae4 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1254 +1,1428 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include +#ifdef BHYVE_SNAPSHOT +#include +#include +#endif #include +#ifdef BHYVE_SNAPSHOT +#include +#endif #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include +#ifdef BHYVE_SNAPSHOT +#include +#endif #include #include #include #include #include #include #include #include +#ifdef BHYVE_SNAPSHOT +#include +#include + +#include +#endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" #include "inout.h" #include "dbgport.h" #include "debug.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" +#ifdef BHYVE_SNAPSHOT +#include "snapshot.h" +#endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", 
[EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); -char *vmname; +const char *vmname; int guest_ncpus; uint16_t cores, maxcpus, sockets, threads; char *guest_uuid_str; int raw_stdio = 0; static int gdb_port = 0; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; static int virtio_msix = 1; static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; static int acpi; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; static void usage(int code) { fprintf(stderr, "Usage: %s [-abehuwxACHPSWY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-g ] [-l ]\n" " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" " -e: exit on unhandled I/O access\n" " -g: gdb port\n" " -h: help\n" " -H: vmexit from the guest on hlt\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" +#ifdef BHYVE_SNAPSHOT + " -r: path to checkpoint file\n" +#endif " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" " -S: guest memory cannot be swapped\n" " -u: RTC keeps UTC time\n" " -U: uuid\n" " -w: ignore unimplemented MSRs\n" " -W: force virtio to use single-vector MSI\n" " -x: local apic is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",,". * 2. It accepts whitespace after = and before value. * 3. Values out of range of INT are silently wrapped. * 4. It doesn't check non-final values. * 5. The apparently bogus limits of UINT16_MAX are for future expansion. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. 
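* * For example, '-c 4,sockets=2,cores=2' is parsed as cpus=4, sockets=2, * cores=2 and threads=1, and is accepted because 4 equals 2 * 2 * 1.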
*/ static int topology_parse(const char *opt) { uint64_t ncpus; int c, chk, n, s, t, tmp; char *cp, *str; bool ns, scts; c = 1, n = 1, s = 1, t = 1; ns = false, scts = false; str = strdup(opt); if (str == NULL) goto out; while ((cp = strsep(&str, ",")) != NULL) { if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { n = tmp; ns = true; } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { n = tmp; ns = true; } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { s = tmp; scts = true; } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { c = tmp; scts = true; } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { t = tmp; scts = true; #ifdef notyet /* Do not expose this until vmm.ko implements it */ } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { m = tmp; #endif /* Skip the empty argument case from -c "" */ } else if (cp[0] == '\0') continue; else goto out; /* Any trailing garbage causes an error */ if (cp[chk] != '\0') goto out; } free(str); str = NULL; /* * Range check 1 <= n <= UINT16_MAX all values */ if (n < 1 || s < 1 || c < 1 || t < 1 || n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || t > UINT16_MAX) return (-1); /* If only the cpus was specified, use that as sockets */ if (!scts) s = n; /* * Compute sockets * cores * threads avoiding overflow * The range check above insures these are 16 bit values * If n was specified check it against computed ncpus */ ncpus = (uint64_t)s * c * t; if (ncpus > UINT16_MAX || (ns && n != ncpus)) return (-1); guest_ncpus = ncpus; sockets = s; cores = c; threads = t; return(0); out: free(str); return (-1); } static int pincpu_parse(const char *opt) { int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0 || vcpu >= VM_MAXCPU) { fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", vcpu, VM_MAXCPU - 1); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } if (vcpumap[vcpu] == NULL) { if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { perror("malloc"); return (-1); } CPU_ZERO(vcpumap[vcpu]); } CPU_SET(pcpu, vcpumap[vcpu]); return (0); } void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } +#ifdef BHYVE_SNAPSHOT +uintptr_t +paddr_host2guest(struct vmctx *ctx, void *addr) +{ + return (vm_rev_map_gpa(ctx, addr)); +} +#endif + int fbsdrun_vmexit_on_pause(void) { return (guest_vmexit_on_pause); } int fbsdrun_vmexit_on_hlt(void) { return (guest_vmexit_on_hlt); } int fbsdrun_virtio_msix(void) { return (virtio_msix); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_add(vcpu); +#endif if (gdb_port != 0) gdb_cpu_add(vcpu); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) { int error; assert(fromcpu == BSP); /* * The 'newcpu' must be activated in the context of 
'fromcpu'. If * vm_activate_cpu() is delayed until newcpu's pthread starts running * then vmm.ko is out-of-sync with bhyve and this can create a race * with vm_suspend(). */ error = vm_activate_cpu(ctx, newcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); /* * Set up the vmexit struct to allow execution to start * at the given RIP */ vmexit[newcpu].rip = rip; vmexit[newcpu].inst_length = 0; mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme, strictio); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vmexit->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { (void)spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", 
vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason, vmexit_vmx_desc(vmexit->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. 
*/ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_mtrap++; - if (gdb_port == 0) { - fprintf(stderr, "vm_loop: unexpected VMEXIT_MTRAP\n"); - exit(4); - } - gdb_cpu_mtrap(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_suspend(*pvcpu); +#endif + if (gdb_port != 0) + gdb_cpu_mtrap(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_resume(*pvcpu); +#endif + return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { int err, i; struct vie *vie; stats.vmexit_inst_emul++; vie = &vmexit->u.inst_emul.vie; err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, vie, &vmexit->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vmexit->u.inst_emul.gpa); } fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vmexit->rip); return (VMEXIT_ABORT); } return (VMEXIT_CONTINUE); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { enum vm_suspend_how how; how = vmexit->u.suspended.how; fbsdrun_deletecpu(ctx, *pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { - if (gdb_port == 0) { - fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); - exit(4); - } - gdb_cpu_suspend(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_suspend(*pvcpu); +#endif + if (gdb_port != 0) + gdb_cpu_suspend(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_resume(*pvcpu); +#endif return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { if (gdb_port == 0) { fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); exit(4); } gdb_cpu_breakpoint(*pvcpu, vmexit); return (VMEXIT_CONTINUE); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) { int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; if (vcpumap[vcpu] != NULL) { error = 
pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); while (1) { error = vm_run(ctx, vcpu, &vmexit[vcpu]); if (error != 0) break; exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ if (error == 0) return (VM_MAXCPU); else return (1); } void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (fbsdrun_vmexit_on_hlt()) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (fbsdrun_vmexit_on_pause()) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (x2apic_mode) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; const cap_ioctl_t *cmds; size_t ncmds; #endif reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. 
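* Without a bootrom the VM is expected to already exist, having been set up * by bhyveload(8); in that case vm_create() above fails with EEXIST and the * existing VM is reused.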
*/ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); vm_get_ioctls(&ncmds); cmds = vm_get_ioctls(NULL); if (cmds == NULL) errx(EX_OSERR, "out of memory"); if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); free((cap_ioctl_t *)cmds); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } +void +spinup_vcpu(struct vmctx *ctx, int vcpu) +{ + int error; + uint64_t rip; + + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, vcpu); + error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + assert(error == 0); + + fbsdrun_addcpu(ctx, BSP, vcpu, rip); +} + int main(int argc, char *argv[]) { int c, error, dbg_port, err, bvmcons; int max_vcpus, mptgen, memflags; int rtc_localtime; bool gdb_stop; struct vmctx *ctx; uint64_t rip; size_t memsize; char *optstr; +#ifdef BHYVE_SNAPSHOT + char *restore_file; + struct restore_state rstate; + int vcpu; + + restore_file = NULL; +#endif bvmcons = 0; progname = basename(argv[0]); dbg_port = 0; gdb_stop = false; guest_ncpus = 1; sockets = cores = threads = 1; maxcpus = 0; memsize = 256 * MB; mptgen = 1; rtc_localtime = 1; memflags = 0; +#ifdef BHYVE_SNAPSHOT + optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:r:"; +#else optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:"; +#endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; case 'A': acpi = 1; break; case 'b': bvmcons = 1; break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': memflags |= VM_MEM_F_INCORE; break; case 'g': dbg_port = atoi(optarg); break; case 'G': if (optarg[0] == 'w') { gdb_stop = true; optarg++; } gdb_port = atoi(optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; +#ifdef BHYVE_SNAPSHOT + case 'r': + restore_file = optarg; + break; +#endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': memflags |= VM_MEM_F_WIRED; break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", optarg); break; case 'H': guest_vmexit_on_hlt = 1; break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. 
*/ break; case 'P': guest_vmexit_on_pause = 1; break; case 'e': strictio = 1; break; case 'u': rtc_localtime = 0; break; case 'U': guest_uuid_str = optarg; break; case 'w': strictmsr = 0; break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; case 'Y': mptgen = 0; break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; +#ifdef BHYVE_SNAPSHOT + if (argc > 1 || (argc == 0 && restore_file == NULL)) + usage(1); + + if (restore_file != NULL) { + error = load_restore_file(restore_file, &rstate); + if (error) { + fprintf(stderr, "Failed to read checkpoint info from " + "file: '%s'.\n", restore_file); + exit(1); + } + } + + if (argc == 1) { + vmname = argv[0]; + } else { + vmname = lookup_vmname(&rstate); + if (vmname == NULL) { + fprintf(stderr, "Cannot find VM name in restore file. " + "Please specify one.\n"); + exit(1); + } + } +#else if (argc != 1) usage(1); vmname = argv[0]; +#endif ctx = do_open(vmname); +#ifdef BHYVE_SNAPSHOT + if (restore_file != NULL) { + guest_ncpus = lookup_guest_ncpus(&rstate); + memflags = lookup_memflags(&rstate); + memsize = lookup_memsize(&rstate); + } + + if (guest_ncpus < 1) { + fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); + exit(1); + } +#endif + max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(ctx, BSP); vm_set_memflags(ctx, memflags); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(); init_inout(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx, rtc_localtime); sci_init(ctx); /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (acpi) vmgenc_init(ctx); if (dbg_port != 0) init_dbgport(dbg_port); if (gdb_port != 0) init_gdb(ctx, gdb_port, gdb_stop); if (bvmcons) init_bvmcons(); if (lpc_bootrom()) { if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(ctx, BSP); assert(error == 0); } +#ifdef BHYVE_SNAPSHOT + if (restore_file != NULL) { + fprintf(stdout, "Pausing pci devs...\r\n"); + if (vm_pause_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to pause PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Restoring vm mem...\r\n"); + if (restore_vm_mem(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore VM memory.\n"); + exit(1); + } + + fprintf(stdout, "Restoring pci devs...\r\n"); + if (vm_restore_user_devs(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Restoring kernel structs...\r\n"); + if (vm_restore_kern_structs(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore kernel structs.\n"); + exit(1); + } + + fprintf(stdout, "Resuming pci devs...\r\n"); + if (vm_resume_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to resume PCI device state.\n"); + exit(1); + } + } +#endif + error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. 
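* (the MP table unless -Y was given, the SMBIOS tables, and the ACPI tables * when -A was given).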
*/ if (mptgen) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); assert(error == 0); if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom()) fwctl_init(); /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif +#ifdef BHYVE_SNAPSHOT + if (restore_file != NULL) + destroy_restore_state(&rstate); + + /* + * checkpointing thread for communication with bhyvectl + */ + if (init_checkpoint_thread(ctx) < 0) + printf("Failed to start checkpoint thread!\r\n"); + + if (restore_file != NULL) + vm_restore_time(ctx); +#endif + /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, BSP, rip); +#ifdef BHYVE_SNAPSHOT + /* + * If we restore a VM, start all vCPUs now (including APs); otherwise, + * let the guest OS spin them up later via vmexits. + */ + if (restore_file != NULL) { + for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { + if (vcpu == BSP) + continue; + + fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu); + spinup_vcpu(ctx, vcpu); + } + } +#endif + /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index 0b23a6a5c3ae..0177baca14e9 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -1,52 +1,55 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * $FreeBSD$ */ #ifndef _FBSDRUN_H_ #define _FBSDRUN_H_ #define VMEXIT_CONTINUE (0) #define VMEXIT_ABORT (-1) struct vmctx; extern int guest_ncpus; extern uint16_t cores, sockets, threads; extern char *guest_uuid_str; -extern char *vmname; +extern const char *vmname; void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); +#ifdef BHYVE_SNAPSHOT +uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr); +#endif void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); int fbsdrun_muxed(void); int fbsdrun_vmexit_on_hlt(void); int fbsdrun_vmexit_on_pause(void); int fbsdrun_disable_x2apic(void); int fbsdrun_virtio_msix(void); #endif diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c index 50e1eed12f90..4c91038ca765 100644 --- a/usr.sbin/bhyve/block_if.c +++ b/usr.sbin/bhyve/block_if.c @@ -1,857 +1,988 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Peter Grehan * All rights reserved. * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include +#include #include "bhyverun.h" #include "debug.h" #include "mevent.h" #include "block_if.h" #define BLOCKIF_SIG 0xb109b109 #define BLOCKIF_NUMTHR 8 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, BOP_FLUSH, BOP_DELETE }; enum blockstat { BST_FREE, BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE }; struct blockif_elem { TAILQ_ENTRY(blockif_elem) be_link; struct blockif_req *be_req; enum blockop be_op; enum blockstat be_status; pthread_t be_tid; off_t be_block; }; struct blockif_ctxt { int bc_magic; int bc_fd; int bc_ischr; int bc_isgeom; int bc_candelete; int bc_rdonly; off_t bc_size; int bc_sectsz; int bc_psectsz; int bc_psectoff; int bc_closing; + int bc_paused; + int bc_work_count; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; + pthread_cond_t bc_paused_cond; + pthread_cond_t bc_work_done_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; }; static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; struct blockif_sig_elem { pthread_mutex_t bse_mtx; pthread_cond_t bse_cond; int bse_pending; struct blockif_sig_elem *bse_next; }; static struct blockif_sig_elem *blockif_bse_head; static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be, *tbe; off_t off; int i; be = TAILQ_FIRST(&bc->bc_freeq); assert(be != NULL); assert(be->be_status == BST_FREE); TAILQ_REMOVE(&bc->bc_freeq, be, be_link); be->be_req = breq; be->be_op = op; switch (op) { case BOP_READ: case BOP_WRITE: case BOP_DELETE: off = breq->br_offset; for (i = 0; i < breq->br_iovcnt; i++) off += breq->br_iov[i].iov_len; break; default: off = OFF_MAX; } be->be_block = off; TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_block == breq->br_offset) break; } if (tbe == NULL) { TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { if (tbe->be_block == breq->br_offset) break; } } if (tbe == NULL) be->be_status = BST_PEND; else be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); return (be->be_status == BST_PEND); } static int blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_status == BST_PEND) break; assert(be->be_status == BST_BLOCK); } if (be == NULL) return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); *bep = be; return (1); } static void blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { struct blockif_elem *tbe; if (be->be_status == BST_DONE || be->be_status == BST_BUSY) TAILQ_REMOVE(&bc->bc_busyq, be, be_link); else TAILQ_REMOVE(&bc->bc_pendq, be, be_link); TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_req->br_offset == be->be_block) tbe->be_status = BST_PEND; } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); } +static int +blockif_flush_bc(struct blockif_ctxt *bc) +{ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + return (errno); + } 
else if (fsync(bc->bc_fd)) + return (errno); + + return (0); +} + static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; off_t arg[2]; ssize_t clen, len, off, boff, voff; int i, err; br = be->be_req; if (br->br_iovcnt <= 1) buf = NULL; err = 0; switch (be->be_op) { case BOP_READ: if (buf == NULL) { if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); if (pread(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(br->br_iov[i].iov_base + voff, buf + boff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); off += len; br->br_resid -= len; } break; case BOP_WRITE: if (bc->bc_rdonly) { err = EROFS; break; } if (buf == NULL) { if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(buf + boff, br->br_iov[i].iov_base + voff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); if (pwrite(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } off += len; br->br_resid -= len; } break; case BOP_FLUSH: - if (bc->bc_ischr) { - if (ioctl(bc->bc_fd, DIOCGFLUSH)) - err = errno; - } else if (fsync(bc->bc_fd)) - err = errno; + err = blockif_flush_bc(bc); break; case BOP_DELETE: if (!bc->bc_candelete) err = EOPNOTSUPP; else if (bc->bc_rdonly) err = EROFS; else if (bc->bc_ischr) { arg[0] = br->br_offset; arg[1] = br->br_resid; if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) err = errno; else br->br_resid = 0; } else err = EOPNOTSUPP; break; default: err = EINVAL; break; } be->be_status = BST_DONE; (*br->br_callback)(br, err); } static void * blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; pthread_t t; uint8_t *buf; bc = arg; if (bc->bc_isgeom) buf = malloc(MAXPHYS); else buf = NULL; t = pthread_self(); pthread_mutex_lock(&bc->bc_mtx); for (;;) { - while (blockif_dequeue(bc, t, &be)) { + bc->bc_work_count++; + + /* We cannot process work if the interface is paused */ + while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } + + bc->bc_work_count--; + + /* If none of the workers are busy, notify the main thread */ + if (bc->bc_work_count == 0) + pthread_cond_broadcast(&bc->bc_work_done_cond); + /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) break; + + /* Make all worker threads wait here if the device is paused */ + while (bc->bc_paused) + pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx); + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } pthread_mutex_unlock(&bc->bc_mtx); if (buf) free(buf); pthread_exit(NULL); return (NULL); } static void blockif_sigcont_handler(int signal, enum ev_type type, void *arg) { struct blockif_sig_elem *bse; for (;;) { /* * Process the entire list even if not intended for * this thread. 
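* Each entry is detached from the list with atomic_cmpset_ptr() so that * concurrent handler invocations pop distinct entries instead of racing on * the list head.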
*/ do { bse = blockif_bse_head; if (bse == NULL) return; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)bse, (uintptr_t)bse->bse_next)); pthread_mutex_lock(&bse->bse_mtx); bse->bse_pending = 0; pthread_cond_signal(&bse->bse_cond); pthread_mutex_unlock(&bse->bse_mtx); } } static void blockif_init(void) { mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); (void) signal(SIGCONT, SIG_IGN); } struct blockif_ctxt * blockif_open(const char *optstr, const char *ident) { char tname[MAXCOMLEN + 1]; char name[MAXPATHLEN]; char *nopt, *xopts, *cp; struct blockif_ctxt *bc; struct stat sbuf; struct diocgattr_arg arg; off_t size, psectsz, psectoff; int extra, fd, i, sectsz; int nocache, sync, ro, candelete, geom, ssopt, pssopt; int nodelete; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; #endif pthread_once(&blockif_once, blockif_init); fd = -1; ssopt = 0; nocache = 0; sync = 0; ro = 0; nodelete = 0; /* * The first element in the optstring is always a pathname. * Optional elements follow */ nopt = xopts = strdup(optstr); while (xopts != NULL) { cp = strsep(&xopts, ","); if (cp == nopt) /* file or device pathname */ continue; else if (!strcmp(cp, "nocache")) nocache = 1; else if (!strcmp(cp, "nodelete")) nodelete = 1; else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) sync = 1; else if (!strcmp(cp, "ro")) ro = 1; else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) ; else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) pssopt = ssopt; else { EPRINTLN("Invalid device option \"%s\"", cp); goto err; } } extra = 0; if (nocache) extra |= O_DIRECT; if (sync) extra |= O_SYNC; fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); if (fd < 0 && !ro) { /* Attempt a r/w fail with a r/o open */ fd = open(nopt, O_RDONLY | extra); ro = 1; } if (fd < 0) { warn("Could not open backing file: %s", nopt); goto err; } if (fstat(fd, &sbuf) < 0) { warn("Could not stat backing file %s", nopt); goto err; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, CAP_WRITE); if (ro) cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; psectsz = psectoff = 0; candelete = geom = 0; if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, §sz)) { perror("Could not fetch dev blk/sector size"); goto err; } assert(size != 0); assert(sectsz != 0); if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) candelete = arg.value.i; if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) geom = 1; } else psectsz = sbuf.st_blksize; #ifndef WITHOUT_CAPSICUM if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (ssopt != 0) { if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || ssopt > pssopt) { EPRINTLN("Invalid sector size %d/%d", ssopt, pssopt); goto err; } /* * Some backend drivers (e.g. cd0, ada0) require that the I/O * size be a multiple of the device's sector size. * * Validate that the emulated sector size complies with this * requirement. 
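 * For example, on a character device that reports 4096-byte native sectors
 * (sectsz == 4096), "sectorsize=512" is rejected because 512 < 4096, while
 * "sectorsize=4096" or "sectorsize=8192" passes the ssopt % sectsz check.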
*/ if (S_ISCHR(sbuf.st_mode)) { if (ssopt < sectsz || (ssopt % sectsz) != 0) { EPRINTLN("Sector size %d incompatible " "with underlying device sector size %d", ssopt, sectsz); goto err; } } sectsz = ssopt; psectsz = pssopt; psectoff = 0; } bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { perror("calloc"); goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; bc->bc_ischr = S_ISCHR(sbuf.st_mode); bc->bc_isgeom = geom; bc->bc_candelete = candelete; bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; bc->bc_psectsz = psectsz; bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); + bc->bc_paused = 0; + bc->bc_work_count = 0; + pthread_cond_init(&bc->bc_paused_cond, NULL); + pthread_cond_init(&bc->bc_work_done_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } for (i = 0; i < BLOCKIF_NUMTHR; i++) { pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); pthread_set_name_np(bc->bc_btid[i], tname); } return (bc); err: if (fd >= 0) close(fd); free(nopt); return (NULL); } static int blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { int err; err = 0; pthread_mutex_lock(&bc->bc_mtx); if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ if (blockif_enqueue(bc, breq, op)) pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than * the specified blockif queue limit. Return an * error to indicate that the queue length has been * exceeded. */ err = E2BIG; } pthread_mutex_unlock(&bc->bc_mtx); return (err); } int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_READ)); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_WRITE)); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_FLUSH)); } int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_DELETE)); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); + /* XXX: not waiting while paused */ + /* * Check pending requests. */ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_req == breq) break; } if (be != NULL) { /* * Found it. */ blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); } /* * Check in-flight requests. */ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { if (be->be_req == breq) break; } if (be == NULL) { /* * Didn't find it. */ pthread_mutex_unlock(&bc->bc_mtx); return (EINVAL); } /* * Interrupt the processing thread to force it return * prematurely via it's normal callback path. 
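As a usage note (not part of the patch): a device emulation drives this API by filling in a struct blockif_req and handing it to blockif_read()/blockif_write(); the completion callback later runs on one of the worker threads above. A minimal sketch with illustrative names:

#include <sys/types.h>
#include <string.h>

#include "block_if.h"

/* Completion callback: invoked on a blockif worker thread. */
static void
example_done(struct blockif_req *br, int err)
{

	/* 'err' is 0 or an errno value; br->br_param identifies the request. */
	(void)br;
	(void)err;
}

/*
 * Submit a single-segment read of 'len' bytes at byte offset 'off'.
 * Returns 0 on success, or E2BIG once the internal request queue is full.
 */
static int
example_submit_read(struct blockif_ctxt *bc, void *buf, size_t len, off_t off)
{
	static struct blockif_req br;	/* sketch: a single outstanding request */

	memset(&br, 0, sizeof(br));
	br.br_callback = example_done;
	br.br_param = NULL;
	br.br_offset = off;
	br.br_resid = len;
	br.br_iovcnt = 1;
	br.br_iov[0].iov_base = buf;
	br.br_iov[0].iov_len = len;
	return (blockif_read(bc, &br));
}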
*/ while (be->be_status == BST_BUSY) { struct blockif_sig_elem bse, *old_head; pthread_mutex_init(&bse.bse_mtx, NULL); pthread_cond_init(&bse.bse_cond, NULL); bse.bse_pending = 1; do { old_head = blockif_bse_head; bse.bse_next = old_head; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)old_head, (uintptr_t)&bse)); pthread_kill(be->be_tid, SIGCONT); pthread_mutex_lock(&bse.bse_mtx); while (bse.bse_pending) pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); pthread_mutex_unlock(&bse.bse_mtx); } pthread_mutex_unlock(&bc->bc_mtx); /* * The processing thread has been interrupted. Since it's not * clear if the callback has been invoked yet, return EBUSY. */ return (EBUSY); } int blockif_close(struct blockif_ctxt *bc) { void *jval; int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; pthread_mutex_unlock(&bc->bc_mtx); pthread_cond_broadcast(&bc->bc_cond); for (i = 0; i < BLOCKIF_NUMTHR; i++) pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ /* * Release resources */ bc->bc_magic = 0; close(bc->bc_fd); free(bc); return (0); } /* * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ uint16_t secpt; /* sectors per track */ uint8_t heads; assert(bc->bc_magic == BLOCKIF_SIG); sectors = bc->bc_size / bc->bc_sectsz; /* Clamp the size to the largest possible with CHS */ if (sectors > 65535UL*16*255) sectors = 65535UL*16*255; if (sectors >= 65536UL*16*63) { secpt = 255; heads = 16; hcyl = sectors / secpt; } else { secpt = 17; hcyl = sectors / secpt; heads = (hcyl + 1023) / 1024; if (heads < 4) heads = 4; if (hcyl >= (heads * 1024) || heads > 16) { secpt = 31; heads = 16; hcyl = sectors / secpt; } if (hcyl >= (heads * 1024)) { secpt = 63; heads = 16; hcyl = sectors / secpt; } } *c = hcyl / heads; *h = heads; *s = secpt; } /* * Accessors */ off_t blockif_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_size); } int blockif_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_sectsz); } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { assert(bc->bc_magic == BLOCKIF_SIG); *size = bc->bc_psectsz; *off = bc->bc_psectoff; } int blockif_queuesz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (BLOCKIF_MAXREQ - 1); } int blockif_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } int blockif_candelete(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } + +#ifdef BHYVE_SNAPSHOT +void +blockif_pause(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 1; + + /* The interface is paused. 
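The check in blockif_snapshot() below implies the save-side ordering pause -> snapshot -> resume. A minimal sketch of that sequence (illustrative function name; assumes BHYVE_SNAPSHOT is defined and that 'meta' comes from the snapshot framework):

#include "block_if.h"

static int
example_snapshot_blockif(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
{
	int error;

	blockif_pause(bc);	/* quiesce the workers, flush the backing store */
	error = blockif_snapshot(bc, meta);
	blockif_resume(bc);	/* reopen the gate and kick the queue */
	return (error);
}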
Wait for workers to finish their work */ + while (bc->bc_work_count) + pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx); + pthread_mutex_unlock(&bc->bc_mtx); + + if (blockif_flush_bc(bc)) + fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n", + __func__); +} + +void +blockif_resume(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 0; + /* resume the threads waiting for paused */ + pthread_cond_broadcast(&bc->bc_paused_cond); + /* kick the threads after restore */ + pthread_cond_broadcast(&bc->bc_cond); + pthread_mutex_unlock(&bc->bc_mtx); +} + +int +blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta) +{ + int i; + struct iovec *iov; + int ret; + + SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done); + + /* + * XXX: The callback and parameter must be filled by the virtualized + * device that uses the interface, during its init; we're not touching + * them here. + */ + + /* Snapshot the iovecs. */ + for (i = 0; i < br->br_iovcnt; i++) { + iov = &br->br_iov[i]; + + SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done); + + /* We assume the iov is a guest-mapped address. */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len, + false, meta, ret, done); + } + +done: + return (ret); +} + +int +blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta) +{ + int ret; + + if (bc->bc_paused == 0) { + fprintf(stderr, "%s: Snapshot failed: " + "interface not paused.\r\n", __func__); + return (ENXIO); + } + + pthread_mutex_lock(&bc->bc_mtx); + + SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done); + +done: + pthread_mutex_unlock(&bc->bc_mtx); + return (ret); +} +#endif diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h index 75c016447ac2..f3b5b6938ef1 100644 --- a/usr.sbin/bhyve/block_if.h +++ b/usr.sbin/bhyve/block_if.h @@ -1,78 +1,89 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * The block API to be used by bhyve block-device emulations. The routines * are thread safe, with no assumptions about the context of the completion * callback - it may occur in the caller's context, or asynchronously in * another thread. */ #ifndef _BLOCK_IF_H_ #define _BLOCK_IF_H_ #include #include +struct vm_snapshot_meta; + + /* * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in * a single request. BLOCKIF_RING_MAX is the maxmimum number of * pending requests that can be queued. */ #define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ #define BLOCKIF_RING_MAX 128 struct blockif_req { int br_iovcnt; off_t br_offset; ssize_t br_resid; void (*br_callback)(struct blockif_req *req, int err); void *br_param; struct iovec br_iov[BLOCKIF_IOV_MAX]; }; struct blockif_ctxt; struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); off_t blockif_size(struct blockif_ctxt *bc); void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s); int blockif_sectsz(struct blockif_ctxt *bc); void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); int blockif_queuesz(struct blockif_ctxt *bc); int blockif_is_ro(struct blockif_ctxt *bc); int blockif_candelete(struct blockif_ctxt *bc); int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); +#ifdef BHYVE_SNAPSHOT +void blockif_pause(struct blockif_ctxt *bc); +void blockif_resume(struct blockif_ctxt *bc); +int blockif_snapshot_req(struct blockif_req *br, + struct vm_snapshot_meta *meta); +int blockif_snapshot(struct blockif_ctxt *bc, + struct vm_snapshot_meta *meta); +#endif #endif /* _BLOCK_IF_H_ */ diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c index c0c69d37f311..649a6b09cb34 100644 --- a/usr.sbin/bhyve/mevent.c +++ b/usr.sbin/bhyve/mevent.c @@ -1,492 +1,492 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Micro event library for FreeBSD, designed for a single i/o thread * using kqueue, and having events be persistent by default. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include "mevent.h" #define MEVENT_MAX 64 -extern char *vmname; +extern const char *vmname; static pthread_t mevent_tid; static int mevent_timid = 43; static int mevent_pipefd[2]; static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; struct mevent { void (*me_func)(int, enum ev_type, void *); #define me_msecs me_fd int me_fd; int me_timid; enum ev_type me_type; void *me_param; int me_cq; int me_state; /* Desired kevent flags. */ int me_closefd; LIST_ENTRY(mevent) me_list; }; static LIST_HEAD(listhead, mevent) global_head, change_head; static void mevent_qlock(void) { pthread_mutex_lock(&mevent_lmutex); } static void mevent_qunlock(void) { pthread_mutex_unlock(&mevent_lmutex); } static void mevent_pipe_read(int fd, enum ev_type type, void *param) { char buf[MEVENT_MAX]; int status; /* * Drain the pipe read side. The fd is non-blocking so this is * safe to do. */ do { status = read(fd, buf, sizeof(buf)); } while (status == MEVENT_MAX); } static void mevent_notify(void) { char c = '\0'; /* * If calling from outside the i/o thread, write a byte on the * pipe to force the i/o thread to exit the blocking kevent call. */ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { write(mevent_pipefd[1], &c, 1); } } static int mevent_kq_filter(struct mevent *mevp) { int retval; retval = 0; if (mevp->me_type == EVF_READ) retval = EVFILT_READ; if (mevp->me_type == EVF_WRITE) retval = EVFILT_WRITE; if (mevp->me_type == EVF_TIMER) retval = EVFILT_TIMER; if (mevp->me_type == EVF_SIGNAL) retval = EVFILT_SIGNAL; return (retval); } static int mevent_kq_flags(struct mevent *mevp) { return (mevp->me_state); } static int mevent_kq_fflags(struct mevent *mevp) { /* XXX nothing yet, perhaps EV_EOF for reads ? */ return (0); } static int mevent_build(int mfd, struct kevent *kev) { struct mevent *mevp, *tmpp; int i; i = 0; mevent_qlock(); LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { if (mevp->me_closefd) { /* * A close of the file descriptor will remove the * event */ close(mevp->me_fd); } else { if (mevp->me_type == EVF_TIMER) { kev[i].ident = mevp->me_timid; kev[i].data = mevp->me_msecs; } else { kev[i].ident = mevp->me_fd; kev[i].data = 0; } kev[i].filter = mevent_kq_filter(mevp); kev[i].flags = mevent_kq_flags(mevp); kev[i].fflags = mevent_kq_fflags(mevp); kev[i].udata = mevp; i++; } mevp->me_cq = 0; LIST_REMOVE(mevp, me_list); if (mevp->me_state & EV_DELETE) { free(mevp); } else { /* * We need to add the event only once, so we can * reset the EV_ADD bit after it has been propagated * to the kevent() arguments the first time. 
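mevent_notify() below relies on the classic self-pipe trick to break the dispatch thread out of its blocking kevent() call whenever another thread changes the event lists. A stripped-down illustration of the same pattern (illustrative names, not the mevent code itself):

#include <sys/types.h>
#include <sys/event.h>

#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>

static int wake_pipe[2];	/* [0] watched by kqueue, [1] written to */
static pthread_t loop_tid;

/* Called from any thread to make the event loop re-evaluate its state. */
static void
loop_wakeup(void)
{
	char c = '\0';

	if (pthread_self() != loop_tid)
		(void)write(wake_pipe[1], &c, 1);
}

static void
loop_run(int kq)
{
	struct kevent kev, ev;
	char buf[64];

	loop_tid = pthread_self();
	(void)pipe(wake_pipe);
	(void)fcntl(wake_pipe[0], F_SETFL, O_NONBLOCK);

	/* Watch the read side of the pipe alongside the real events. */
	EV_SET(&kev, wake_pipe[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);

	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) < 1)
			continue;
		if ((int)ev.ident == wake_pipe[0]) {
			/* Drain the pipe; it is non-blocking, so this cannot hang. */
			while (read(wake_pipe[0], buf, sizeof(buf)) > 0)
				;
			/* ... apply queued changes to the kqueue here ... */
			continue;
		}
		/* ... dispatch the handler recorded in ev.udata here ... */
	}
}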
*/ mevp->me_state &= ~EV_ADD; LIST_INSERT_HEAD(&global_head, mevp, me_list); } assert(i < MEVENT_MAX); } mevent_qunlock(); return (i); } static void mevent_handle(struct kevent *kev, int numev) { struct mevent *mevp; int i; for (i = 0; i < numev; i++) { mevp = kev[i].udata; /* XXX check for EV_ERROR ? */ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); } } static struct mevent * mevent_add_state(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param, int state) { struct mevent *lp, *mevp; if (tfd < 0 || func == NULL) { return (NULL); } mevp = NULL; mevent_qlock(); /* * Verify that the fd/type tuple is not present in any list */ LIST_FOREACH(lp, &global_head, me_list) { if (type != EVF_TIMER && lp->me_fd == tfd && lp->me_type == type) { goto exit; } } LIST_FOREACH(lp, &change_head, me_list) { if (type != EVF_TIMER && lp->me_fd == tfd && lp->me_type == type) { goto exit; } } /* * Allocate an entry, populate it, and add it to the change list. */ mevp = calloc(1, sizeof(struct mevent)); if (mevp == NULL) { goto exit; } if (type == EVF_TIMER) { mevp->me_msecs = tfd; mevp->me_timid = mevent_timid++; } else mevp->me_fd = tfd; mevp->me_type = type; mevp->me_func = func; mevp->me_param = param; LIST_INSERT_HEAD(&change_head, mevp, me_list); mevp->me_cq = 1; mevp->me_state = state; mevent_notify(); exit: mevent_qunlock(); return (mevp); } struct mevent * mevent_add(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param) { return (mevent_add_state(tfd, type, func, param, EV_ADD)); } struct mevent * mevent_add_disabled(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param) { return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE)); } static int mevent_update(struct mevent *evp, bool enable) { int newstate; mevent_qlock(); /* * It's not possible to enable/disable a deleted event */ assert((evp->me_state & EV_DELETE) == 0); newstate = evp->me_state; if (enable) { newstate |= EV_ENABLE; newstate &= ~EV_DISABLE; } else { newstate |= EV_DISABLE; newstate &= ~EV_ENABLE; } /* * No update needed if state isn't changing */ if (evp->me_state != newstate) { evp->me_state = newstate; /* * Place the entry onto the changed list if not * already there. */ if (evp->me_cq == 0) { evp->me_cq = 1; LIST_REMOVE(evp, me_list); LIST_INSERT_HEAD(&change_head, evp, me_list); mevent_notify(); } } mevent_qunlock(); return (0); } int mevent_enable(struct mevent *evp) { return (mevent_update(evp, true)); } int mevent_disable(struct mevent *evp) { return (mevent_update(evp, false)); } static int mevent_delete_event(struct mevent *evp, int closefd) { mevent_qlock(); /* * Place the entry onto the changed list if not already there, and * mark as to be deleted. 
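Taken together, the add/enable/disable/delete entry points give consumers a small persistent-event API. A hypothetical consumer (only the mevent_* calls are real; everything else is illustrative):

#include "mevent.h"

static struct mevent *rx_event;

static void
rx_ready(int fd, enum ev_type type, void *param)
{

	/* Read from 'fd' until EAGAIN; the event stays armed (persistent). */
	(void)fd;
	(void)type;
	(void)param;
}

static void
example_setup(int fd)
{

	/* Register a read event, but keep it disabled until the device is ready. */
	rx_event = mevent_add_disabled(fd, EVF_READ, rx_ready, NULL);
	mevent_enable(rx_event);
}

static void
example_teardown(void)
{

	/* Remove the event and let mevent close the descriptor as well. */
	mevent_delete_close(rx_event);
	rx_event = NULL;
}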
*/ if (evp->me_cq == 0) { evp->me_cq = 1; LIST_REMOVE(evp, me_list); LIST_INSERT_HEAD(&change_head, evp, me_list); mevent_notify(); } evp->me_state = EV_DELETE; if (closefd) evp->me_closefd = 1; mevent_qunlock(); return (0); } int mevent_delete(struct mevent *evp) { return (mevent_delete_event(evp, 0)); } int mevent_delete_close(struct mevent *evp) { return (mevent_delete_event(evp, 1)); } static void mevent_set_name(void) { pthread_set_name_np(mevent_tid, "mevent"); } void mevent_dispatch(void) { struct kevent changelist[MEVENT_MAX]; struct kevent eventlist[MEVENT_MAX]; struct mevent *pipev; int mfd; int numev; int ret; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif mevent_tid = pthread_self(); mevent_set_name(); mfd = kqueue(); assert(mfd > 0); #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_KQUEUE); if (caph_rights_limit(mfd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Open the pipe that will be used for other threads to force * the blocking kqueue call to exit by writing to it. Set the * descriptor to non-blocking. */ ret = pipe(mevent_pipefd); if (ret < 0) { perror("pipe"); exit(0); } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Add internal event handler for the pipe write fd */ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); assert(pipev != NULL); for (;;) { /* * Build changelist if required. * XXX the changelist can be put into the blocking call * to eliminate the extra syscall. Currently better for * debug. */ numev = mevent_build(mfd, changelist); if (numev) { ret = kevent(mfd, changelist, numev, NULL, 0, NULL); if (ret == -1) { perror("Error return from kevent change"); } } /* * Block awaiting events */ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); if (ret == -1 && errno != EINTR) { perror("Error return from kevent monitor"); } /* * Handle reported events */ mevent_handle(eventlist, ret); } } diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 23aee0fdac6b..49e6452a355d 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -1,2474 +1,2770 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Zhixiang Yu * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include +#include + #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) #endif #define WPRINTF(format, arg...) 
printf(format, ##arg) #define AHCI_PORT_IDENT 20 + 1 struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; + int readop; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; uint8_t *cmd_lst; uint8_t *rfis; char ident[AHCI_PORT_IDENT]; int port; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * Generate HBA interrupts on global IS register write. */ static void ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { struct pci_devinst *pi = sc->asc_pi; struct ahci_port *p; int i, nmsg; uint32_t mmask; /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { p = &sc->port[i]; if (p->is & p->ie) sc->is |= (1 << i); } DPRINTF("%s(%08x) %08x", __func__, mask, sc->is); /* If there is nothing enabled -- clear legacy interrupt and exit. */ if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } return; } /* If there is anything and no MSI -- assert legacy interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (nmsg == 0) { if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } return; } /* Assert respective MSIs for ports that were touched. */ for (i = 0; i < nmsg; i++) { if (sc->ports <= nmsg || i < nmsg - 1) mmask = 1 << i; else mmask = 0xffffffff << i; if (sc->is & mask && mmask & mask) pci_generate_msi(pi, i); } } /* * Generate HBA interrupt on specific port event. */ static void ahci_port_intr(struct ahci_port *p) { struct pci_ahci_softc *sc = p->pr_sc; struct pci_devinst *pi = sc->asc_pi; int nmsg; DPRINTF("%s(%d) %08x/%08x %08x", __func__, p->port, p->is, p->ie, sc->is); /* If there is nothing enabled -- we are done. */ if ((p->is & p->ie) == 0) return; /* In case of non-shared MSI always generate interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (sc->ports <= nmsg || p->port < nmsg - 1) { sc->is |= (1 << p->port); if ((sc->ghc & AHCI_GHC_IE) == 0) return; pci_generate_msi(pi, p->port); return; } /* If IS for this port is already set -- do nothing. */ if (sc->is & (1 << p->port)) return; sc->is |= (1 << p->port); /* If interrupts are enabled -- generate one. 
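 * Routing summary: when the device exposes at least as many MSI messages as
 * ports, every port gets its own vector; otherwise ports past the last
 * message share the final vector; with no MSI at all, the legacy INTx line
 * is asserted while any enabled PxIS bit remains set.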
*/ if ((sc->ghc & AHCI_GHC_IE) == 0) return; if (nmsg > 0) { pci_generate_msi(pi, nmsg - 1); } else if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { if (~p->is & irq) { p->is |= irq; ahci_port_intr(p); } } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. 
*/ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. */ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; int i, j, skip, todo, left, extra; uint32_t dbcsz; /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. 
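 * For example, with 512-byte sectors and a PRDT of 200 entries of 1023 bytes
 * each, the iovec fills up after 128 entries (130944 bytes); the transfer is
 * trimmed by 384 bytes to 130560 (255 whole sectors) by shrinking the last
 * iovec entry, and aior->more stays set so the remainder of the command is
 * issued again from the completion path.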
*/ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; + aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. 
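 * p->pending is the bitmask of command slots with an outstanding blockif
 * request: ahci_handle_port() skips those slots when scanning PxCI, and
 * ahci_port_stop() walks the busy list to cancel them.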
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *to; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { if (ncq) { if (first) ahci_write_fis_d2h_ncq(p, slot); ahci_write_fis_sdb(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } else { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. 
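 * Each 8-byte entry of the DSM buffer read via read_prdt() above encodes a
 * 48-bit starting LBA and a 16-bit sector count; entries are turned into
 * blockif_delete() requests one at a time, and a zero-length entry is
 * skipped or, at the end of the buffer, completes the command.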
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *from; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint32_t buf[128]; uint8_t *buf8 = (uint8_t *)buf; uint16_t *buf16 = (uint16_t *)buf; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); if (cfis[4] == 0x00) { /* Log directory */ buf16[0x00] = 1; /* Version -- 1 */ buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf8, sizeof(buf)); } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { buf[0x00] = 1; /* SFQ DSM supported */ buf[0x01] = 1; /* SFQ DSM TRIM supported */ } } else { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); memset(buf, 0, sizeof(buf)); buf[0] = 0x0040; buf[1] = cyl; buf[3] = heads; buf[6] = sech; ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); buf[47] = (0x8000 | 128); buf[48] = 0; buf[49] = (1 << 8 | 1 << 9 | 1 << 11); buf[50] = (1 << 14); buf[53] = (1 << 1 | 1 << 2); if (p->mult_sectors) buf[59] = (0x100 | p->mult_sectors); if (sectors <= 0x0fffffff) { buf[60] = sectors; buf[61] = (sectors >> 16); } else { buf[60] = 0xffff; buf[61] = 0x0fff; } buf[63] = 0x7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << 
((p->xfermode & 7) + 8)); buf[64] = 0x3; buf[65] = 120; buf[66] = 120; buf[67] = 120; buf[68] = 120; buf[69] = 0; buf[75] = 31; buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); buf[80] = 0x3f0; buf[81] = 0x28; buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); buf[84] = (1 << 14); buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); buf[100] = sectors; buf[101] = (sectors >> 16); buf[102] = (sectors >> 32); buf[103] = (sectors >> 48); if (candelete && !ro) { buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; buf[105] = 1; buf[169] = ATA_SUPPORT_DSM_TRIM; } buf[106] = 0x4000; buf[209] = 0x4000; if (psectsz > sectsz) { buf[106] |= 0x2000; buf[106] |= ffsl(psectsz / sectsz) - 1; buf[209] |= (psectoff / sectsz); } if (sectsz > 512) { buf[106] |= 0x1000; buf[117] = sectsz / 2; buf[118] = ((sectsz / 2) >> 16); } buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); buf[222] = 0x1020; buf[255] = 0x00a5; ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; memset(buf, 0, sizeof(buf)); buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); buf[49] = (1 << 9 | 1 << 8); buf[50] = (1 << 14 | 1); buf[53] = (1 << 2 | 1 << 1); buf[62] = 0x3f; buf[63] = 7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 3; buf[65] = 120; buf[66] = 120; buf[67] = 120; buf[68] = 120; buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); buf[78] = (1 << 5); buf[80] = 0x3f0; buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[83] = (1 << 14); buf[84] = (1 << 14); buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); buf[222] = 0x1020; buf[255] = 0x00a5; ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, 
tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { int msf, size; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } bp = buf + 2; *bp++ = 1; *bp++ = 1; if (start_track <= 1) { *bp++ = 0; *bp++ = 0x14; *bp++ = 1; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } } *bp++ = 0; *bp++ = 0x14; *bp++ = 0xaa; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 1: { uint8_t buf[12]; memset(buf, 0, sizeof(buf)); buf[1] = 0xa; buf[2] = 0x1; buf[3] = 0x1; if (len > sizeof(buf)) len = sizeof(buf); write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 2: { int msf, size; uint64_t sectors; uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; bp = buf + 2; *bp++ = 1; *bp++ = 1; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa1; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa2; *bp++ = 0; *bp++ = 0; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } default: { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; 
cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); break; } } } static void atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[16]; memset(buf, 0, sizeof(buf)); buf[3] = 8; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; uint8_t *acmd; uint64_t lba; uint32_t len; int err; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); else len = be32dec(acmd + 6); if (len == 0) { cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } lba *= 2048; len *= 2048; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; + aior->readop = 1; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_read(p->bctx, breq); assert(err == 0); } static void atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[64]; uint8_t *acmd; int len; acmd = cfis + 0x40; len = acmd[4]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, len); buf[0] = 0x70 | (1 << 7); buf[2] = p->sense_key; buf[7] = 10; buf[12] = p->asc; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd = cfis + 0x40; uint32_t tfd; switch (acmd[4] & 3) { case 0: case 1: case 3: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; tfd = ATA_S_READY | ATA_S_DSC; break; case 2: /* TODO eject media */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x53; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; uint8_t pc, code; int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); pc = acmd[2] >> 6; code = acmd[2] & 0x3f; switch (pc) { case 0: switch (code) { case MODEPAGE_RW_ERROR_RECOVERY: { uint8_t buf[16]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 16 - 2); buf[2] = 0x70; buf[8] = 0x01; buf[9] = 16 - 10; buf[11] = 0x05; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } case MODEPAGE_CD_CAPABILITIES: { uint8_t buf[30]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 30 - 2); buf[2] = 0x70; buf[8] = 0x2A; buf[9] = 30 - 10; buf[10] = 0x08; buf[12] = 0x71; be16enc(&buf[18], 2); be16enc(&buf[20], 512); write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } default: goto error; break; } break; case 3: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x39; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; 
error: case 1: case 2: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_get_event_status_notification(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; acmd = cfis + 0x40; /* we don't support asynchronous operation */ if (!(acmd[1] & 1)) { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } else { uint8_t buf[8]; int len; len = be16dec(acmd + 7); if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 8 - 2); buf[2] = 0x04; buf[3] = 0x10; buf[5] = 0x02; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; acmd = cfis + 0x40; #ifdef AHCI_DEBUG { int i; DPRINTF("ACMD:"); for (i = 0; i < 16; i++) DPRINTF("%02x ", acmd[i]); DPRINTF(""); } #endif switch (acmd[0]) { case TEST_UNIT_READY: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case INQUIRY: atapi_inquiry(p, slot, cfis); break; case READ_CAPACITY: atapi_read_capacity(p, slot, cfis); break; case PREVENT_ALLOW: /* TODO */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case READ_TOC: atapi_read_toc(p, slot, cfis); break; case REPORT_LUNS: atapi_report_luns(p, slot, cfis); break; case READ_10: case READ_12: atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); break; case START_STOP_UNIT: atapi_start_stop_unit(p, slot, cfis); break; case MODE_SENSE_10: atapi_mode_sense(p, slot, cfis); break; case GET_EVENT_STATUS_NOTIFICATION: atapi_get_event_status_notification(p, slot, cfis); break; default: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x20; ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); break; case ATA_SETFEATURES: { switch (cfis[3]) { case ATA_SF_ENAB_SATA_SF: switch (cfis[12]) { case ATA_SATA_SF_AN: p->tfd = ATA_S_DSC | ATA_S_READY; break; default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } break; case ATA_SF_ENAB_WCACHE: case ATA_SF_DIS_WCACHE: case ATA_SF_ENAB_RCACHE: case ATA_SF_DIS_RCACHE: p->tfd = ATA_S_DSC | ATA_S_READY; break; case ATA_SF_SETXFER: { switch (cfis[12] & 0xf8) { case ATA_PIO: case ATA_PIO0: break; case ATA_WDMA0: case ATA_UDMA0: p->xfermode = (cfis[12] & 0x7); break; } p->tfd = ATA_S_DSC | ATA_S_READY; break; } default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; } case ATA_SET_MULTI: if (cfis[12] != 0 && (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); } else { p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; case ATA_READ: case ATA_WRITE: case ATA_READ48: case ATA_WRITE48: case ATA_READ_MUL: case ATA_WRITE_MUL: case 
ATA_READ_MUL48: case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; case ATA_DATA_SET_MANAGEMENT: if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && cfis[13] == 0 && cfis[12] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_SEND_FPDMA_QUEUED: if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && cfis[11] == 0 && cfis[3] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_READ_LOG_EXT: case ATA_READ_LOG_DMA_EXT: ahci_handle_read_log(p, slot, cfis); break; case ATA_SECURITY_FREEZE_LOCK: case ATA_SMART_CMD: case ATA_NOP: ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_CHECK_POWER_MODE: cfis[12] = 0xff; /* always on */ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: case ATA_READ_VERIFY: case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: handle_atapi_identify(p, slot, cfis); break; case ATA_PACKET_CMD: if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x", cfis[2]); ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; #ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; #endif struct pci_ahci_softc *sc; uint8_t *cfis; #ifdef AHCI_DEBUG int cfl, i; #endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); #ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; #endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); #ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); DPRINTF("cfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) DPRINTF(""); DPRINTF("%02x ", cfis[i]); } DPRINTF(""); for (i = 0; i < hdr->prdtl; i++) { DPRINTF("%d@%08"PRIx64"", prdt->dbc & 0x3fffff, prdt->dba); prdt++; } #endif if (cfis[0] != FIS_TYPE_REGH2D) { WPRINTF("Not a H2D FIS:%02x", cfis[0]); return; } if (cfis[1] & 0x80) { ahci_handle_cmd(p, slot, cfis); } else { if (cfis[15] & (1 << 2)) p->reset = 1; else if (p->reset) { p->reset = 0; ahci_port_reset(p); } p->ci &= ~(1 << slot); } } static void ahci_handle_port(struct ahci_port *p) { if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that * are already in-flight. Stop if device is busy or in error. */ for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) break; if (p->waitforclear) break; if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; ahci_handle_slot(p, p->ccs); } } } /* * blockif callback routine - this runs in the context of the blockif * i/o thread, so the mutex needs to be acquired. 
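ahci_handle_port() above walks the 32 command slots round-robin, starting at the current command slot (CCS) and skipping slots whose commands are already in flight (the pending mask). A simplified sketch of that slot selection; next_pending_slot() is an illustrative helper, not code from this file:

static int
next_pending_slot(uint32_t ci, uint32_t pending, int ccs)
{
	int i, slot;

	for (i = 0; i < 32; i++) {
		slot = (ccs + i) & 31;
		if ((ci & ~pending & (1u << slot)) != 0)
			return (slot);
	}
	return (-1);		/* nothing issued that is not already in flight */
}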
*/ static void ata_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; int slot, ncq, dsm; DPRINTF("%s %d", __func__, err); ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; if (cfis[2] == ATA_DATA_SET_MANAGEMENT || (cfis[2] == ATA_SEND_FPDMA_QUEUED && (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) dsm = 1; pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { if (dsm) ahci_handle_dsm_trim(p, slot, cfis, aior->done); else ahci_handle_rw(p, slot, cfis, aior->done); goto out; } if (!err) tfd = ATA_S_READY | ATA_S_DSC; else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; if (ncq) ahci_write_fis_sdb(p, slot, cfis, tfd); else ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void atapi_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; int slot; DPRINTF("%s %d", __func__, err); aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { atapi_read(p, slot, cfis, aior->done); goto out; } if (!err) { tfd = ATA_S_READY | ATA_S_DSC; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. 
*/ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void pci_ahci_ioreq_init(struct ahci_port *pr) { struct ahci_ioreq *vr; int i; pr->ioqsz = blockif_queuesz(pr->bctx); pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); STAILQ_INIT(&pr->iofhd); /* * Add all i/o request entries to the free queue */ for (i = 0; i < pr->ioqsz; i++) { vr = &pr->ioreq[i]; vr->io_pr = pr; if (!pr->atapi) vr->io_req.br_callback = ata_ioreq_cb; else vr->io_req.br_callback = atapi_ioreq_cb; vr->io_req.br_param = vr; STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); } TAILQ_INIT(&pr->iobhd); } static void pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; struct ahci_port *p = &sc->port[port]; DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"", port, offset, value); switch (offset) { case AHCI_P_CLB: p->clb = value; break; case AHCI_P_CLBU: p->clbu = value; break; case AHCI_P_FB: p->fb = value; break; case AHCI_P_FBU: p->fbu = value; break; case AHCI_P_IS: p->is &= ~value; ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; ahci_port_intr(p); break; case AHCI_P_CMD: { p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { uint64_t clb; p->cmd |= AHCI_P_CMD_CR; clb = (uint64_t)p->clbu << 32 | p->clb; p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, AHCI_CL_SIZE * AHCI_MAX_SLOTS); } if (value & AHCI_P_CMD_FRE) { uint64_t fb; p->cmd |= AHCI_P_CMD_FR; fb = (uint64_t)p->fbu << 32 | p->fb; /* we don't support FBSCP, so rfis size is 256Bytes */ p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); } else { p->cmd &= ~AHCI_P_CMD_FR; } if (value & AHCI_P_CMD_CLO) { p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } if (value & AHCI_P_CMD_ICC_MASK) { p->cmd &= ~AHCI_P_CMD_ICC_MASK; } ahci_handle_port(p); break; } case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"", offset); break; case AHCI_P_SCTL: p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); } break; case AHCI_P_SERR: p->serr &= ~value; break; case AHCI_P_SACT: p->sact |= value; break; case AHCI_P_CI: p->ci |= value; ahci_handle_port(p); break; case AHCI_P_SNTF: case AHCI_P_FBS: default: break; } } static void pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"", offset, value); switch (offset) { case AHCI_CAP: case AHCI_PI: case AHCI_VS: case AHCI_CAP2: DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"", offset); break; case AHCI_GHC: if (value & AHCI_GHC_HR) { ahci_reset(sc); break; } if (value & AHCI_GHC_IE) sc->ghc |= AHCI_GHC_IE; else sc->ghc &= ~AHCI_GHC_IE; ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; ahci_generate_intr(sc, value); break; default: break; } } static void pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst 
*pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); if (offset < AHCI_OFFSET) pci_ahci_host_write(sc, offset, value); else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) pci_ahci_port_write(sc, offset, value); else WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"", offset); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; switch (offset) { case AHCI_CAP: case AHCI_GHC: case AHCI_IS: case AHCI_PI: case AHCI_VS: case AHCI_CCCC: case AHCI_CCCP: case AHCI_EM_LOC: case AHCI_EM_CTL: case AHCI_CAP2: { uint32_t *p = &sc->cap; p += (offset - AHCI_CAP) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x", offset, value); return (value); } static uint64_t pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; switch (offset) { case AHCI_P_CLB: case AHCI_P_CLBU: case AHCI_P_FB: case AHCI_P_FBU: case AHCI_P_IS: case AHCI_P_IE: case AHCI_P_CMD: case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: case AHCI_P_SCTL: case AHCI_P_SERR: case AHCI_P_SACT: case AHCI_P_CI: case AHCI_P_SNTF: case AHCI_P_FBS: { uint32_t *p= &sc->port[port].clb; p += (offset - AHCI_P_CLB) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x", port, offset, value); return value; } static uint64_t pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; uint64_t offset; uint32_t value; assert(baridx == 5); assert(size == 1 || size == 2 || size == 4); assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"", regoff); } value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); return (value); } static int pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) { char bident[sizeof("XX:XX:XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; int ret, slots, p; MD5_CTX mdctx; u_char digest[16]; char *next, *next2; ret = 0; #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; pthread_mutex_init(&sc->mtx, NULL); sc->ports = 0; sc->pi = 0; slots = 32; for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) { /* Identify and cut off type of present port. */ if (strncmp(opts, "hd:", 3) == 0) { atapi = 0; opts += 3; } else if (strncmp(opts, "cd:", 3) == 0) { atapi = 1; opts += 3; } /* Find and cut off the next port options. */ next = strstr(opts, ",hd:"); next2 = strstr(opts, ",cd:"); if (next == NULL || (next2 != NULL && next2 < next)) next = next2; if (next != NULL) { next[0] = 0; next++; } if (opts[0] == 0) continue; /* * Attempt to open the backing image. Use the PCI slot/func * and the port number for the identifier string. 
*/ snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot, pi->pi_func, p); bctxt = blockif_open(opts, bident); if (bctxt == NULL) { sc->ports = p; ret = 1; goto open_fail; } sc->port[p].bctx = bctxt; sc->port[p].pr_sc = sc; sc->port[p].port = p; sc->port[p].atapi = atapi; /* * Create an identifier for the backing file. * Use parts of the md5 sum of the filename */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); MD5Final(digest, &mdctx); snprintf(sc->port[p].ident, AHCI_PORT_IDENT, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* * Allocate blockif request structures and add them * to the free list */ pci_ahci_ioreq_init(&sc->port[p]); sc->pi |= (1 << p); if (sc->port[p].ioqsz < slots) slots = sc->port[p].ioqsz; } sc->ports = p; /* Intel ICH8 AHCI */ --slots; if (sc->ports < DEF_PORTS) sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); p = MIN(sc->ports, 16); p = flsl(p) - ((p & (p - 1)) ? 0 : 1); pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); pci_lintr_request(pi); open_fail: if (ret) { for (p = 0; p < sc->ports; p++) { if (sc->port[p].bctx != NULL) blockif_close(sc->port[p].bctx); } free(sc); } return (ret); } static int pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { return (pci_ahci_init(ctx, pi, opts, 0)); } static int pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { return (pci_ahci_init(ctx, pi, opts, 1)); } +#ifdef BHYVE_SNAPSHOT +static int +pci_ahci_snapshot_save_queues(struct ahci_port *port, + struct vm_snapshot_meta *meta) +{ + int ret; + int idx; + struct ahci_ioreq *ioreq; + + STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) { + idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq); + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + } + + idx = -1; + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + + TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) { + idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq); + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + + /* + * Snapshot only the busy requests; other requests are + * not valid. + */ + ret = blockif_snapshot_req(&ioreq->io_req, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to snapshot req\r\n", + __func__); + goto done; + } + } + + idx = -1; + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + +done: + return (ret); +} + +static int +pci_ahci_snapshot_restore_queues(struct ahci_port *port, + struct vm_snapshot_meta *meta) +{ + int ret; + int idx; + struct ahci_ioreq *ioreq; + + /* Empty the free queue before restoring. */ + while (!STAILQ_EMPTY(&port->iofhd)) + STAILQ_REMOVE_HEAD(&port->iofhd, io_flist); + + /* Restore the free queue. */ + while (1) { + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + if (idx == -1) + break; + + STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist); + } + + /* Restore the busy queue. 
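pci_ahci_snapshot_save_queues() above serializes each per-port queue as a -1-terminated list of indices into port->ioreq[], recovering the index from the element's byte offset within the array. An equivalent, purely illustrative formulation using element-typed pointer arithmetic (ioreq_index() is not part of this patch):

static int
ioreq_index(const struct ahci_port *port, const struct ahci_ioreq *ioreq)
{
	/* Same value as ((void *)ioreq - (void *)port->ioreq) / sizeof(*ioreq). */
	return (ioreq - port->ioreq);
}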
*/ + while (1) { + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + if (idx == -1) + break; + + ioreq = &port->ioreq[idx]; + TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist); + + /* + * Restore only the busy requests; other requests are + * not valid. + */ + ret = blockif_snapshot_req(&ioreq->io_req, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to restore request\r\n", + __func__); + goto done; + } + + /* Re-enqueue the requests in the block interface. */ + if (ioreq->readop) + ret = blockif_read(port->bctx, &ioreq->io_req); + else + ret = blockif_write(port->bctx, &ioreq->io_req); + + if (ret != 0) { + fprintf(stderr, + "%s: failed to re-enqueue request\r\n", + __func__); + goto done; + } + } + +done: + return (ret); +} + +static int +pci_ahci_snapshot(struct vm_snapshot_meta *meta) +{ + int i, j, ret; + void *bctx; + struct pci_devinst *pi; + struct pci_ahci_softc *sc; + struct ahci_port *port; + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *ioreq; + + pi = meta->dev_data; + sc = pi->pi_arg; + + /* TODO: add mtx lock/unlock */ + + SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done); + + for (i = 0; i < MAX_PORTS; i++) { + port = &sc->port[i]; + + if (meta->op == VM_SNAPSHOT_SAVE) + bctx = port->bctx; + + SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done); + + /* Mostly for restore; save is ensured by the lines above. 
*/ + if (((bctx == NULL) && (port->bctx != NULL)) || + ((bctx != NULL) && (port->bctx == NULL))) { + fprintf(stderr, "%s: ports not matching\r\n", __func__); + ret = EINVAL; + goto done; + } + + if (port->bctx == NULL) + continue; + + if (port->port != i) { + fprintf(stderr, "%s: ports not matching: " + "actual: %d expected: %d\r\n", + __func__, port->port, i); + ret = EINVAL; + goto done; + } + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst, + AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta, + ret, done); + + SNAPSHOT_VAR_OR_LEAVE(port->ident, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); + + for (j = 0; j < port->ioqsz; j++) { + ioreq = &port->ioreq[j]; + + /* blockif_req snapshot done only for busy requests. */ + hdr = (struct ahci_cmd_hdr *)(port->cmd_lst + + ioreq->slot * AHCI_CL_SIZE); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis, + 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry), + false, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done); + } + + /* Perform save / restore specific operations. 
*/ + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = pci_ahci_snapshot_save_queues(port, meta); + if (ret != 0) + goto done; + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + ret = pci_ahci_snapshot_restore_queues(port, meta); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + + ret = blockif_snapshot(port->bctx, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to restore blockif\r\n", + __func__); + goto done; + } + } + +done: + return (ret); +} + +static int +pci_ahci_pause(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct pci_ahci_softc *sc; + struct blockif_ctxt *bctxt; + int i; + + sc = pi->pi_arg; + + for (i = 0; i < MAX_PORTS; i++) { + bctxt = sc->port[i].bctx; + if (bctxt == NULL) + continue; + + blockif_pause(bctxt); + } + + return (0); +} + +static int +pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct pci_ahci_softc *sc; + struct blockif_ctxt *bctxt; + int i; + + sc = pi->pi_arg; + + for (i = 0; i < MAX_PORTS; i++) { + bctxt = sc->port[i].bctx; + if (bctxt == NULL) + continue; + + blockif_resume(bctxt); + } + + return (0); +} +#endif + /* * Use separate emulation names to distinguish drive and atapi devices */ struct pci_devemu pci_de_ahci = { .pe_emu = "ahci", .pe_init = pci_ahci_hd_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, +#endif }; PCI_EMUL_SET(pci_de_ahci); struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_init = pci_ahci_hd_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, +#endif }; PCI_EMUL_SET(pci_de_ahci_hd); struct pci_devemu pci_de_ahci_cd = { .pe_emu = "ahci-cd", .pe_init = pci_ahci_atapi_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, +#endif }; PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr.sbin/bhyve/pci_e82545.c b/usr.sbin/bhyve/pci_e82545.c index dca981be85fa..c1443b6aa613 100644 --- a/usr.sbin/bhyve/pci_e82545.c +++ b/usr.sbin/bhyve/pci_e82545.c @@ -1,2388 +1,2547 @@ /* * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Alexander Motin * Copyright (c) 2015 Peter Grehan * Copyright (c) 2013 Jeremiah Lott, Avere Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif +#include + #include #include #include #include #include #include #include #include #include #include #include #include "e1000_regs.h" #include "e1000_defines.h" #include "mii.h" #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "mevent.h" #include "net_utils.h" #include "net_backends.h" /* Hardware/register definitions XXX: move some to common code. */ #define E82545_VENDOR_ID_INTEL 0x8086 #define E82545_DEV_ID_82545EM_COPPER 0x100F #define E82545_SUBDEV_ID 0x1008 #define E82545_REVISION_4 4 #define E82545_MDIC_DATA_MASK 0x0000FFFF #define E82545_MDIC_OP_MASK 0x0c000000 #define E82545_MDIC_IE 0x20000000 #define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ #define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ #define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ #define E82545_BAR_REGISTER 0 #define E82545_BAR_REGISTER_LEN (128*1024) #define E82545_BAR_FLASH 1 #define E82545_BAR_FLASH_LEN (64*1024) #define E82545_BAR_IO 2 #define E82545_BAR_IO_LEN 8 #define E82545_IOADDR 0x00000000 #define E82545_IODATA 0x00000004 #define E82545_IO_REGISTER_MAX 0x0001FFFF #define E82545_IO_FLASH_BASE 0x00080000 #define E82545_IO_FLASH_MAX 0x000FFFFF #define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) #define E82545_RAR_MAX 15 #define E82545_MTA_MAX 127 #define E82545_VFTA_MAX 127 /* Slightly modified from the driver versions, hardcoded for 3 opcode bits, * followed by 6 address bits. * TODO: make opcode bits and addr bits configurable? * NVM Commands - Microwire */ #define E82545_NVM_OPCODE_BITS 3 #define E82545_NVM_ADDR_BITS 6 #define E82545_NVM_DATA_BITS 16 #define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) #define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) #define E82545_NVM_OPCODE_MASK \ (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) #define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ #define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ #define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ #define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ #define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128K */ #define E1000_ICR_SRPD 0x00010000 /* This is an arbitrary number. There is no hard limit on the chip. 
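The Microwire layout defined above packs a 3-bit opcode in front of a 6-bit word address, nine bits in total, shifted into the EEPROM MSB first. A small sketch of composing a READ command word from those macros; nvm_read_cmd() is illustrative only:

static uint16_t
nvm_read_cmd(uint16_t addr)
{
	/* 0x6 opcode in bits 8:6, word address in bits 5:0. */
	return (E82545_NVM_OPCODE_READ | (addr & E82545_NVM_ADDR_MASK));
}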
*/ #define I82545_MAX_TXSEGS 64 /* Legacy receive descriptor */ struct e1000_rx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ uint16_t length; /* Length of data DMAed into data buffer */ uint16_t csum; /* Packet checksum */ uint8_t status; /* Descriptor status */ uint8_t errors; /* Descriptor Errors */ uint16_t special; }; /* Transmit descriptor types */ #define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000) #define E1000_TXD_TYP_L (0) #define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C) #define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D) /* Legacy transmit descriptor */ struct e1000_tx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t cso; /* Checksum offset */ uint8_t cmd; /* Descriptor control */ } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t css; /* Checksum start */ uint16_t special; } fields; } upper; }; /* Context descriptor */ struct e1000_context_desc { union { uint32_t ip_config; struct { uint8_t ipcss; /* IP checksum start */ uint8_t ipcso; /* IP checksum offset */ uint16_t ipcse; /* IP checksum end */ } ip_fields; } lower_setup; union { uint32_t tcp_config; struct { uint8_t tucss; /* TCP checksum start */ uint8_t tucso; /* TCP checksum offset */ uint16_t tucse; /* TCP checksum end */ } tcp_fields; } upper_setup; uint32_t cmd_and_length; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t hdr_len; /* Header length */ uint16_t mss; /* Maximum segment size */ } fields; } tcp_seg_setup; }; /* Data descriptor */ struct e1000_data_desc { uint64_t buffer_addr; /* Address of the descriptor's buffer address */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t typ_len_ext; uint8_t cmd; } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t popts; /* Packet Options */ uint16_t special; } fields; } upper; }; union e1000_tx_udesc { struct e1000_tx_desc td; struct e1000_context_desc cd; struct e1000_data_desc dd; }; /* Tx checksum info for a packet. */ struct ck_info { int ck_valid; /* ck_info is valid */ uint8_t ck_start; /* start byte of cksum calcuation */ uint8_t ck_off; /* offset of cksum insertion */ uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */ }; /* * Debug printf */ static int e82545_debug = 0; #define WPRINTF(msg,params...) PRINTLN("e82545: " msg, params) #define DPRINTF(msg,params...) 
if (e82545_debug) WPRINTF(msg, params) #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) /* s/w representation of the RAL/RAH regs */ struct eth_uni { int eu_valid; int eu_addrsel; struct ether_addr eu_eth; }; struct e82545_softc { struct pci_devinst *esc_pi; struct vmctx *esc_ctx; struct mevent *esc_mevpitr; pthread_mutex_t esc_mtx; struct ether_addr esc_mac; net_backend_t *esc_be; /* General */ uint32_t esc_CTRL; /* x0000 device ctl */ uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ uint32_t esc_FCAH; /* x002C flow ctl addr hi */ uint32_t esc_FCT; /* x0030 flow ctl type */ uint32_t esc_VET; /* x0038 VLAN eth type */ uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ uint32_t esc_LEDCTL; /* x0E00 LED control */ uint32_t esc_PBA; /* x1000 pkt buffer allocation */ /* Interrupt control */ int esc_irq_asserted; uint32_t esc_ICR; /* x00C0 cause read/clear */ uint32_t esc_ITR; /* x00C4 intr throttling */ uint32_t esc_ICS; /* x00C8 cause set */ uint32_t esc_IMS; /* x00D0 mask set/read */ uint32_t esc_IMC; /* x00D8 mask clear */ /* Transmit */ union e1000_tx_udesc *esc_txdesc; struct e1000_context_desc esc_txctx; pthread_t esc_tx_tid; pthread_cond_t esc_tx_cond; int esc_tx_enabled; int esc_tx_active; uint32_t esc_TXCW; /* x0178 transmit config */ uint32_t esc_TCTL; /* x0400 transmit ctl */ uint32_t esc_TIPG; /* x0410 inter-packet gap */ uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ uint64_t esc_tdba; /* verified 64-bit desc table addr */ uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ uint16_t esc_TDH; /* x3810 desc table head idx */ uint16_t esc_TDHr; /* internal read version of TDH */ uint16_t esc_TDT; /* x3818 desc table tail idx */ uint32_t esc_TIDV; /* x3820 intr delay */ uint32_t esc_TXDCTL; /* x3828 desc control */ uint32_t esc_TADV; /* x382C intr absolute delay */ /* L2 frame acceptance */ struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ /* Receive */ struct e1000_rx_desc *esc_rxdesc; pthread_cond_t esc_rx_cond; int esc_rx_enabled; int esc_rx_active; int esc_rx_loopback; uint32_t esc_RCTL; /* x0100 receive ctl */ uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ uint64_t esc_rdba; /* verified 64-bit desc table addr */ uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ uint32_t esc_RDLEN; /* x2808 #descriptors */ uint16_t esc_RDH; /* x2810 desc table head idx */ uint16_t esc_RDT; /* x2818 desc table tail idx */ uint32_t esc_RDTR; /* x2820 intr delay */ uint32_t esc_RXDCTL; /* x2828 desc control */ uint32_t esc_RADV; /* x282C intr absolute delay */ uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ /* IO Port register access */ uint32_t io_addr; /* Shadow copy of MDIC */ uint32_t mdi_control; /* Shadow copy of EECD */ uint32_t eeprom_control; /* Latest NVM in/out */ uint16_t nvm_data; uint16_t nvm_opaddr; /* stats */ uint32_t missed_pkt_count; /* dropped for no room in rx queue */ uint32_t pkt_rx_by_size[6]; uint32_t pkt_tx_by_size[6]; uint32_t good_pkt_rx_count; uint32_t bcast_pkt_rx_count; uint32_t mcast_pkt_rx_count; uint32_t good_pkt_tx_count; uint32_t bcast_pkt_tx_count; uint32_t mcast_pkt_tx_count; uint32_t oversize_rx_count; 
uint32_t tso_tx_count; uint64_t good_octets_rx; uint64_t good_octets_tx; uint64_t missed_octets; /* counts missed and oversized */ uint8_t nvm_bits:6; /* number of bits remaining in/out */ uint8_t nvm_mode:2; #define E82545_NVM_MODE_OPADDR 0x0 #define E82545_NVM_MODE_DATAIN 0x1 #define E82545_NVM_MODE_DATAOUT 0x2 /* EEPROM data */ uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; }; static void e82545_reset(struct e82545_softc *sc, int dev); static void e82545_rx_enable(struct e82545_softc *sc); static void e82545_rx_disable(struct e82545_softc *sc); static void e82545_rx_callback(int fd, enum ev_type type, void *param); static void e82545_tx_start(struct e82545_softc *sc); static void e82545_tx_enable(struct e82545_softc *sc); static void e82545_tx_disable(struct e82545_softc *sc); static inline int e82545_size_stat_index(uint32_t size) { if (size <= 64) { return 0; } else if (size >= 1024) { return 5; } else { /* should be 1-4 */ return (ffs(size) - 6); } } static void e82545_init_eeprom(struct e82545_softc *sc) { uint16_t checksum, i; /* mac addr */ sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | (((uint16_t)sc->esc_mac.octet[1]) << 8); sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | (((uint16_t)sc->esc_mac.octet[3]) << 8); sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | (((uint16_t)sc->esc_mac.octet[5]) << 8); /* pci ids */ sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; /* fill in the checksum */ checksum = 0; for (i = 0; i < NVM_CHECKSUM_REG; i++) { checksum += sc->eeprom_data[i]; } checksum = NVM_SUM - checksum; sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; DPRINTF("eeprom checksum: 0x%x", checksum); } static void e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr, uint8_t phy_addr, uint32_t data) { DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x", reg_addr, phy_addr, data); } static uint32_t e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr, uint8_t phy_addr) { //DPRINTF("Read mdi reg:0x%x phy:0x%x", reg_addr, phy_addr); switch (reg_addr) { case PHY_STATUS: return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | MII_SR_AUTONEG_COMPLETE); case PHY_AUTONEG_ADV: return NWAY_AR_SELECTOR_FIELD; case PHY_LP_ABILITY: return 0; case PHY_1000T_STATUS: return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | SR_1000T_LOCAL_RX_STATUS); case PHY_ID1: return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; case PHY_ID2: return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; default: DPRINTF("Unknown mdi read reg:0x%x phy:0x%x", reg_addr, phy_addr); return 0; } /* not reached */ } static void e82545_eecd_strobe(struct e82545_softc *sc) { /* Microwire state machine */ /* DPRINTF("eeprom state machine srtobe " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data);*/ if (sc->nvm_bits == 0) { DPRINTF("eeprom state machine not expecting data! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); return; } sc->nvm_bits--; if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { /* shifting out */ if (sc->nvm_data & 0x8000) { sc->eeprom_control |= E1000_EECD_DO; } else { sc->eeprom_control &= ~E1000_EECD_DO; } sc->nvm_data <<= 1; if (sc->nvm_bits == 0) { /* read done, back to opcode mode. 
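e82545_init_eeprom() above fills NVM_CHECKSUM_REG so that the 16-bit sum of all words up to and including the checksum register equals NVM_SUM. A self-check sketch, assuming the NVM_* constants are the same ones the init routine uses; eeprom_checksum_ok() is not part of this patch:

static int
eeprom_checksum_ok(const uint16_t *eeprom_data)
{
	uint16_t sum = 0;
	int i;

	for (i = 0; i <= NVM_CHECKSUM_REG; i++)
		sum += eeprom_data[i];	/* 16-bit arithmetic wraps as intended */
	return (sum == NVM_SUM);
}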
*/ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { /* shifting in */ sc->nvm_data <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_data |= 1; } if (sc->nvm_bits == 0) { /* eeprom write */ uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; if (op != E82545_NVM_OPCODE_WRITE) { DPRINTF("Illegal eeprom write op 0x%x", sc->nvm_opaddr); } else if (addr >= E82545_NVM_EEPROM_SIZE) { DPRINTF("Illegal eeprom write addr 0x%x", sc->nvm_opaddr); } else { DPRINTF("eeprom write eeprom[0x%x] = 0x%x", addr, sc->nvm_data); sc->eeprom_data[addr] = sc->nvm_data; } /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { sc->nvm_opaddr <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_opaddr |= 1; } if (sc->nvm_bits == 0) { uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; switch (op) { case E82545_NVM_OPCODE_EWEN: DPRINTF("eeprom write enable: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; break; case E82545_NVM_OPCODE_READ: { uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; sc->nvm_mode = E82545_NVM_MODE_DATAOUT; sc->nvm_bits = E82545_NVM_DATA_BITS; if (addr < E82545_NVM_EEPROM_SIZE) { sc->nvm_data = sc->eeprom_data[addr]; DPRINTF("eeprom read: eeprom[0x%x] = 0x%x", addr, sc->nvm_data); } else { DPRINTF("eeprom illegal read: 0x%x", sc->nvm_opaddr); sc->nvm_data = 0; } break; } case E82545_NVM_OPCODE_WRITE: sc->nvm_mode = E82545_NVM_MODE_DATAIN; sc->nvm_bits = E82545_NVM_DATA_BITS; sc->nvm_data = 0; break; default: DPRINTF("eeprom unknown op: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } } else { DPRINTF("eeprom state machine wrong state! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); } } static void e82545_itr_callback(int fd, enum ev_type type, void *param) { uint32_t new; struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); new = sc->esc_ICR & sc->esc_IMS; if (new && !sc->esc_irq_asserted) { DPRINTF("itr callback: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); } else { mevent_delete(sc->esc_mevpitr); sc->esc_mevpitr = NULL; } pthread_mutex_unlock(&sc->esc_mtx); } static void e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) { uint32_t new; DPRINTF("icr assert: 0x%x", bits); /* * An interrupt is only generated if bits are set that * aren't already in the ICR, these bits are unmasked, * and there isn't an interrupt already pending. 
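The condition described by the comment in e82545_icr_assert() can be restated as a small predicate: the legacy interrupt line is only asserted for bits that are newly pending, unmasked, and only when no interrupt is already asserted or throttled by the ITR timer. A sketch, where should_assert_lintr() is a hypothetical helper and the throttled flag stands for an active esc_mevpitr timer:

static int
should_assert_lintr(uint32_t icr, uint32_t ims, uint32_t bits,
    int throttled, int asserted)
{
	uint32_t newbits = bits & ~icr & ims;	/* newly pending, unmasked */

	return (newbits != 0 && !throttled && !asserted);
}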
*/ new = bits & ~sc->esc_ICR & sc->esc_IMS; sc->esc_ICR |= bits; if (new == 0) { DPRINTF("icr assert: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("icr assert: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("icr assert: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_ims_change(struct e82545_softc *sc, uint32_t bits) { uint32_t new; /* * Changing the mask may allow previously asserted * but masked interrupt requests to generate an interrupt. */ new = bits & sc->esc_ICR & ~sc->esc_IMS; sc->esc_IMS |= bits; if (new == 0) { DPRINTF("ims change: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("ims change: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("ims change: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) { DPRINTF("icr deassert: 0x%x", bits); sc->esc_ICR &= ~bits; /* * If there are no longer any interrupt sources and there * was an asserted interrupt, clear it */ if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { DPRINTF("icr deassert: lintr deassert %x", bits); pci_lintr_deassert(sc->esc_pi); sc->esc_irq_asserted = 0; } } static void e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) { DPRINTF("intr_write: off %x, val %x", offset, value); switch (offset) { case E1000_ICR: e82545_icr_deassert(sc, value); break; case E1000_ITR: sc->esc_ITR = value; break; case E1000_ICS: sc->esc_ICS = value; /* not used: store for debug */ e82545_icr_assert(sc, value); break; case E1000_IMS: e82545_ims_change(sc, value); break; case E1000_IMC: sc->esc_IMC = value; /* for debug */ sc->esc_IMS &= ~value; // XXX clear interrupts if all ICR bits now masked // and interrupt was pending ? break; default: break; } } static uint32_t e82545_intr_read(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; retval = 0; DPRINTF("intr_read: off %x", offset); switch (offset) { case E1000_ICR: retval = sc->esc_ICR; sc->esc_ICR = 0; e82545_icr_deassert(sc, ~0); break; case E1000_ITR: retval = sc->esc_ITR; break; case E1000_ICS: /* write-only register */ break; case E1000_IMS: retval = sc->esc_IMS; break; case E1000_IMC: /* write-only register */ break; default: break; } return (retval); } static void e82545_devctl(struct e82545_softc *sc, uint32_t val) { sc->esc_CTRL = val & ~E1000_CTRL_RST; if (val & E1000_CTRL_RST) { DPRINTF("e1k: s/w reset, ctl %x", val); e82545_reset(sc, 1); } /* XXX check for phy reset ? */ } static void e82545_rx_update_rdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | sc->esc_RDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, sc->esc_rdba, sc->esc_RDLEN); } static void e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ sc->esc_RCTL = val & ~0xF9204c01; DPRINTF("rx_ctl - %s RCTL %x, val %x", on ? 
"on" : "off", sc->esc_RCTL, val); /* state change requested */ if (on != sc->esc_rx_enabled) { if (on) { /* Catch disallowed/unimplemented settings */ //assert(!(val & E1000_RCTL_LBM_TCVR)); if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { sc->esc_rx_loopback = 1; } else { sc->esc_rx_loopback = 0; } e82545_rx_update_rdba(sc); e82545_rx_enable(sc); } else { e82545_rx_disable(sc); sc->esc_rx_loopback = 0; sc->esc_rdba = 0; sc->esc_rxdesc = NULL; } } } static void e82545_tx_update_tdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, sc->esc_TDLEN); } static void e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); /* ignore TCTL_EN settings that don't change state */ if (on == sc->esc_tx_enabled) return; if (on) { e82545_tx_update_tdba(sc); e82545_tx_enable(sc); } else { e82545_tx_disable(sc); sc->esc_tdba = 0; sc->esc_txdesc = NULL; } /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ sc->esc_TCTL = val & ~0xFE800005; } int e82545_bufsz(uint32_t rctl) { switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { case (E1000_RCTL_SZ_2048): return (2048); case (E1000_RCTL_SZ_1024): return (1024); case (E1000_RCTL_SZ_512): return (512); case (E1000_RCTL_SZ_256): return (256); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); } return (256); /* Forbidden value. */ } /* XXX one packet at a time until this is debugged */ static void e82545_rx_callback(int fd, enum ev_type type, void *param) { struct e82545_softc *sc = param; struct e1000_rx_desc *rxd; struct iovec vec[64]; int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; uint32_t cause = 0; uint16_t *tp, tag, head; pthread_mutex_lock(&sc->esc_mtx); DPRINTF("rx_run: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped", sc->esc_rx_enabled, sc->esc_rx_loopback); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } bufsz = e82545_bufsz(sc->esc_RCTL); maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; size = sc->esc_RDLEN / 16; head = sc->esc_RDH; left = (size + sc->esc_RDT - head) % size; if (left < maxpktdesc) { DPRINTF("rx overflow (%d < %d) -- packet(s) dropped", left, maxpktdesc); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } sc->esc_rx_active = 1; pthread_mutex_unlock(&sc->esc_mtx); for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { /* Grab rx descriptor pointed to by the head pointer */ for (i = 0; i < maxpktdesc; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; vec[i].iov_base = paddr_guest2host(sc->esc_ctx, rxd->buffer_addr, bufsz); vec[i].iov_len = bufsz; } len = netbe_recv(sc->esc_be, vec, maxpktdesc); if (len <= 0) { DPRINTF("netbe_recv() returned %d", len); goto done; } /* * Adjust the packet length based on whether the CRC needs * to be stripped or if the packet is less than the minimum * eth packet size. */ if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) len = ETHER_MIN_LEN - ETHER_CRC_LEN; if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) len += ETHER_CRC_LEN; n = (len + bufsz - 1) / bufsz; DPRINTF("packet read %d bytes, %d segs, head %d", len, n, head); /* Apply VLAN filter. 
*/ tp = (uint16_t *)vec[0].iov_base + 6; if ((sc->esc_RCTL & E1000_RCTL_VFE) && (ntohs(tp[0]) == sc->esc_VET)) { tag = ntohs(tp[1]) & 0x0fff; if ((sc->esc_fvlan[tag >> 5] & (1 << (tag & 0x1f))) != 0) { DPRINTF("known VLAN %d", tag); } else { DPRINTF("unknown VLAN %d", tag); n = 0; continue; } } /* Update all consumed descriptors. */ for (i = 0; i < n - 1; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; rxd->status = E1000_RXD_STAT_DD; } rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = len % bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; /* XXX signal no checksum for now */ rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; /* Schedule receive interrupts. */ if (len <= sc->esc_RSRPD) { cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; } else { /* XXX: RDRT and RADV timers should be here. */ cause |= E1000_ICR_RXT0; } head = (head + n) % size; left -= n; } done: pthread_mutex_lock(&sc->esc_mtx); sc->esc_rx_active = 0; if (sc->esc_rx_enabled == 0) pthread_cond_signal(&sc->esc_rx_cond); sc->esc_RDH = head; /* Respect E1000_RCTL_RDMTS */ left = (size + sc->esc_RDT - head) % size; if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) cause |= E1000_ICR_RXDMT0; /* Assert all accumulated interrupts. */ if (cause != 0) e82545_icr_assert(sc, cause); done1: DPRINTF("rx_run done: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); pthread_mutex_unlock(&sc->esc_mtx); } static uint16_t e82545_carry(uint32_t sum) { sum = (sum & 0xFFFF) + (sum >> 16); if (sum > 0xFFFF) sum -= 0xFFFF; return (sum); } static uint16_t e82545_buf_checksum(uint8_t *buf, int len) { int i; uint32_t sum = 0; /* Checksum all the pairs of bytes first... */ for (i = 0; i < (len & ~1U); i += 2) sum += *((u_int16_t *)(buf + i)); /* * If there's a single byte left over, checksum it, too. * Network byte order is big-endian, so the remaining byte is * the high byte. */ if (i < len) sum += htons(buf[i] << 8); return (e82545_carry(sum)); } static uint16_t e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len) { int now, odd; uint32_t sum = 0, s; /* Skip completely unneeded vectors. */ while (iovcnt > 0 && iov->iov_len <= off && off > 0) { off -= iov->iov_len; iov++; iovcnt--; } /* Calculate checksum of requested range. */ odd = 0; while (len > 0 && iovcnt > 0) { now = MIN(len, iov->iov_len - off); s = e82545_buf_checksum(iov->iov_base + off, now); sum += odd ? (s << 8) : s; odd ^= (now & 1); len -= now; off = 0; iov++; iovcnt--; } return (e82545_carry(sum)); } /* * Return the transmit descriptor type. */ int e82545_txdesc_type(uint32_t lower) { int type; type = 0; if (lower & E1000_TXD_CMD_DEXT) type = lower & E1000_TXD_MASK; return (type); } static void e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) { uint16_t cksum; int cklen; DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d", iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); cklen = ck->ck_len ? 
ck->ck_len - ck->ck_start + 1 : INT_MAX; cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; } static void e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) { if (sc->esc_be == NULL) return; (void) netbe_send(sc->esc_be, iov, iovcnt); } static void e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, int *tdwb) { union e1000_tx_udesc *dsc; for ( ; head != tail; head = (head + 1) % dsize) { dsc = &sc->esc_txdesc[head]; if (dsc->td.lower.data & E1000_TXD_CMD_RS) { dsc->td.upper.data |= E1000_TXD_STAT_DD; *tdwb = 1; } } } static int e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, uint16_t *rhead, int *tdwb) { uint8_t *hdr, *hdrp; struct iovec iovb[I82545_MAX_TXSEGS + 2]; struct iovec tiov[I82545_MAX_TXSEGS + 2]; struct e1000_context_desc *cd; struct ck_info ckinfo[2]; struct iovec *iov; union e1000_tx_udesc *dsc; int desc, dtype, len, ntype, iovcnt, tlen, tcp, tso; int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff; unsigned hdrlen, vlen; uint32_t tcpsum, tcpseq; uint16_t ipcs, tcpcs, ipid, ohead; ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; iovcnt = 0; tlen = 0; ntype = 0; tso = 0; ohead = head; /* iovb[0/1] may be used for writable copy of headers. */ iov = &iovb[2]; for (desc = 0; ; desc++, head = (head + 1) % dsize) { if (head == tail) { *rhead = head; return (0); } dsc = &sc->esc_txdesc[head]; dtype = e82545_txdesc_type(dsc->td.lower.data); if (desc == 0) { switch (dtype) { case E1000_TXD_TYP_C: DPRINTF("tx ctxt desc idx %d: %016jx " "%08x%08x", head, dsc->td.buffer_addr, dsc->td.upper.data, dsc->td.lower.data); /* Save context and return */ sc->esc_txctx = dsc->cd; goto done; case E1000_TXD_TYP_L: DPRINTF("tx legacy desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); /* * legacy cksum start valid in first descriptor */ ntype = dtype; ckinfo[0].ck_start = dsc->td.upper.fields.css; break; case E1000_TXD_TYP_D: DPRINTF("tx data desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); ntype = dtype; break; default: break; } } else { /* Descriptor type must be consistent */ assert(dtype == ntype); DPRINTF("tx next desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); } len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : dsc->dd.lower.data & 0xFFFFF; if (len > 0) { /* Strip checksum supplied by guest. */ if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) len -= 2; tlen += len; if (iovcnt < I82545_MAX_TXSEGS) { iov[iovcnt].iov_base = paddr_guest2host( sc->esc_ctx, dsc->td.buffer_addr, len); iov[iovcnt].iov_len = len; } iovcnt++; } /* * Pull out info that is valid in the final descriptor * and exit descriptor loop. 
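e82545_transmit_checksum() folds the 32-bit accumulator produced by e82545_iov_checksum() back into a 16-bit ones'-complement sum through e82545_carry(). A worked illustration of that fold, purely as a sanity check against the function defined above (carry_fold_example() is not part of this patch):

static void
carry_fold_example(void)
{
	/* 0x2FFFE: low word 0xFFFE plus carry 0x2 gives 0x10000, and the
	 * end-around carry reduces that to 0x0001. */
	assert(e82545_carry(0x2FFFE) == 0x0001);
}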
*/ if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { if (dtype == E1000_TXD_TYP_L) { if (dsc->td.lower.data & E1000_TXD_CMD_IC) { ckinfo[0].ck_valid = 1; ckinfo[0].ck_off = dsc->td.lower.flags.cso; ckinfo[0].ck_len = 0; } } else { cd = &sc->esc_txctx; if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) tso = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM) ckinfo[0].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM || tso) { ckinfo[0].ck_start = cd->lower_setup.ip_fields.ipcss; ckinfo[0].ck_off = cd->lower_setup.ip_fields.ipcso; ckinfo[0].ck_len = cd->lower_setup.ip_fields.ipcse; } if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM) ckinfo[1].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM || tso) { ckinfo[1].ck_start = cd->upper_setup.tcp_fields.tucss; ckinfo[1].ck_off = cd->upper_setup.tcp_fields.tucso; ckinfo[1].ck_len = cd->upper_setup.tcp_fields.tucse; } } break; } } if (iovcnt > I82545_MAX_TXSEGS) { WPRINTF("tx too many descriptors (%d > %d) -- dropped", iovcnt, I82545_MAX_TXSEGS); goto done; } hdrlen = vlen = 0; /* Estimate writable space for VLAN header insertion. */ if ((sc->esc_CTRL & E1000_CTRL_VME) && (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { hdrlen = ETHER_ADDR_LEN*2; vlen = ETHER_VLAN_ENCAP_LEN; } if (!tso) { /* Estimate required writable space for checksums. */ if (ckinfo[0].ck_valid) hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2); if (ckinfo[1].ck_valid) hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2); /* Round up writable space to the first vector. */ if (hdrlen != 0 && iov[0].iov_len > hdrlen && iov[0].iov_len < hdrlen + 100) hdrlen = iov[0].iov_len; } else { /* In case of TSO header length provided by software. */ hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; /* * Cap the header length at 240 based on 7.2.4.5 of * the Intel 82576EB (Rev 2.63) datasheet. */ if (hdrlen > 240) { WPRINTF("TSO hdrlen too large: %d", hdrlen); goto done; } /* * If VLAN insertion is requested, ensure the header * at least holds the amount of data copied during * VLAN insertion below. * * XXX: Realistic packets will include a full Ethernet * header before the IP header at ckinfo[0].ck_start, * but this check is sufficient to prevent * out-of-bounds access below. */ if (vlen != 0 && hdrlen < ETHER_ADDR_LEN*2) { WPRINTF("TSO hdrlen too small for vlan insertion " "(%d vs %d) -- dropped", hdrlen, ETHER_ADDR_LEN*2); goto done; } /* * Ensure that the header length covers the used fields * in the IP and TCP headers as well as the IP and TCP * checksums. The following fields are accessed below: * * Header | Field | Offset | Length * -------+-------+--------+------- * IPv4 | len | 2 | 2 * IPv4 | ID | 4 | 2 * IPv6 | len | 4 | 2 * TCP | seq # | 4 | 4 * TCP | flags | 13 | 1 * UDP | len | 4 | 4 */ if (hdrlen < ckinfo[0].ck_start + 6 || hdrlen < ckinfo[0].ck_off + 2) { WPRINTF("TSO hdrlen too small for IP fields (%d) " "-- dropped", hdrlen); goto done; } if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) { if (hdrlen < ckinfo[1].ck_start + 14 || (ckinfo[1].ck_valid && hdrlen < ckinfo[1].ck_off + 2)) { WPRINTF("TSO hdrlen too small for TCP fields " "(%d) -- dropped", hdrlen); goto done; } } else { if (hdrlen < ckinfo[1].ck_start + 8) { WPRINTF("TSO hdrlen too small for UDP fields " "(%d) -- dropped", hdrlen); goto done; } } } /* Allocate, fill and prepend writable header vector. 
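For TSO, the segmentation loop further below emits one wire packet per MSS-sized chunk of the payload, each prefixed with a copy of the (possibly VLAN-extended) header. The resulting packet count is a ceiling division, sketched here with an illustrative helper:

static int
tso_segment_count(int paylen, int mss)
{
	/* Matches the loop: for (left = paylen; left > 0; left -= MIN(left, mss)) */
	return ((paylen + mss - 1) / mss);
}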
*/ if (hdrlen != 0) { hdr = __builtin_alloca(hdrlen + vlen); hdr += vlen; for (left = hdrlen, hdrp = hdr; left > 0; left -= now, hdrp += now) { now = MIN(left, iov->iov_len); memcpy(hdrp, iov->iov_base, now); iov->iov_base += now; iov->iov_len -= now; if (iov->iov_len == 0) { iov++; iovcnt--; } } iov--; iovcnt++; iov->iov_base = hdr; iov->iov_len = hdrlen; } else hdr = NULL; /* Insert VLAN tag. */ if (vlen != 0) { hdr -= ETHER_VLAN_ENCAP_LEN; memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); hdrlen += ETHER_VLAN_ENCAP_LEN; hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; iov->iov_base = hdr; iov->iov_len += ETHER_VLAN_ENCAP_LEN; /* Correct checksum offsets after VLAN tag insertion. */ ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[0].ck_len != 0) ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[1].ck_len != 0) ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; } /* Simple non-TSO case. */ if (!tso) { /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); if (ckinfo[1].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); e82545_transmit_backend(sc, iov, iovcnt); goto done; } /* Doing TSO. */ tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; mss = sc->esc_txctx.tcp_seg_setup.fields.mss; paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs", tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); tcpseq = 0; if (tcp) tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; tcpcs = 0; if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; pv = 1; pvoff = 0; for (seg = 0, left = paylen; left > 0; seg++, left -= now) { now = MIN(left, mss); /* Construct IOVs for the segment. */ /* Include whole original header. */ tiov[0].iov_base = hdr; tiov[0].iov_len = hdrlen; tiovcnt = 1; /* Include respective part of payload IOV. */ for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { nnow = MIN(nleft, iov[pv].iov_len - pvoff); tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff; tiov[tiovcnt++].iov_len = nnow; if (pvoff + nnow == iov[pv].iov_len) { pv++; pvoff = 0; } else pvoff += nnow; } DPRINTF("tx segment %d %d+%d bytes %d iovs", seg, hdrlen, now, tiovcnt); /* Update IP header. */ if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { /* IPv4 -- set length and ID */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = htons(hdrlen - ckinfo[0].ck_start + now); *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(ipid + seg); } else { /* IPv6 -- set length */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(hdrlen - ckinfo[0].ck_start - 40 + now); } /* Update pseudo-header checksum. */ tcpsum = tcpcs; tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); /* Update TCP/UDP headers. */ if (tcp) { /* Update sequence number and FIN/PUSH flags. */ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = htonl(tcpseq + paylen - left); if (now < left) { hdr[ckinfo[1].ck_start + 13] &= ~(TH_FIN | TH_PUSH); } } else { /* Update payload length. 
*/ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = hdrlen - ckinfo[1].ck_start + now; } /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) { *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); } if (ckinfo[1].ck_valid) { *(uint16_t *)&hdr[ckinfo[1].ck_off] = e82545_carry(tcpsum); e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); } e82545_transmit_backend(sc, tiov, tiovcnt); } done: head = (head + 1) % dsize; e82545_transmit_done(sc, ohead, head, dsize, tdwb); *rhead = head; return (desc + 1); } static void e82545_tx_run(struct e82545_softc *sc) { uint32_t cause; uint16_t head, rhead, tail, size; int lim, tdwb, sent; head = sc->esc_TDH; tail = sc->esc_TDT; size = sc->esc_TDLEN / 16; DPRINTF("tx_run: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); pthread_mutex_unlock(&sc->esc_mtx); rhead = head; tdwb = 0; for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); if (sent == 0) break; head = rhead; } pthread_mutex_lock(&sc->esc_mtx); sc->esc_TDH = head; sc->esc_TDHr = rhead; cause = 0; if (tdwb) cause |= E1000_ICR_TXDW; if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) cause |= E1000_ICR_TXQE; if (cause) e82545_icr_assert(sc, cause); DPRINTF("tx_run done: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); } static _Noreturn void * e82545_tx_thread(void *param) { struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); for (;;) { while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) break; sc->esc_tx_active = 0; if (sc->esc_tx_enabled == 0) pthread_cond_signal(&sc->esc_tx_cond); pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } sc->esc_tx_active = 1; /* Process some tx descriptors. Lock dropped inside. 
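 *
 * esc_mtx protects the ring registers; e82545_tx_run() drops it around
 * the actual copy out of guest memory so register writes are not
 * stalled behind a transmit, and esc_tx_active/esc_tx_cond let
 * e82545_tx_disable() wait for the worker to go idle.  A minimal
 * sketch of the same worker handshake with placeholder names (mtx,
 * cond, work_avail and busy are not softc fields; assumes <pthread.h>):
 *
 *	static void
 *	worker_loop(pthread_mutex_t *mtx, pthread_cond_t *cond,
 *	    int *work_avail, int *busy)
 *	{
 *		pthread_mutex_lock(mtx);
 *		for (;;) {
 *			while (*work_avail == 0)
 *				pthread_cond_wait(cond, mtx);
 *			*busy = 1;
 *			pthread_mutex_unlock(mtx);	// heavy work unlocked
 *			// ... copy packets out of guest memory ...
 *			pthread_mutex_lock(mtx);
 *			*work_avail = 0;
 *			*busy = 0;
 *			pthread_cond_signal(cond);	// wake a disable path
 *		}
 *	}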
*/ e82545_tx_run(sc); } } static void e82545_tx_start(struct e82545_softc *sc) { if (sc->esc_tx_active == 0) pthread_cond_signal(&sc->esc_tx_cond); } static void e82545_tx_enable(struct e82545_softc *sc) { sc->esc_tx_enabled = 1; } static void e82545_tx_disable(struct e82545_softc *sc) { sc->esc_tx_enabled = 0; while (sc->esc_tx_active) pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } static void e82545_rx_enable(struct e82545_softc *sc) { sc->esc_rx_enabled = 1; } static void e82545_rx_disable(struct e82545_softc *sc) { sc->esc_rx_enabled = 0; while (sc->esc_rx_active) pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); } static void e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) { struct eth_uni *eu; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); eu->eu_addrsel = (wval >> 16) & 0x3; eu->eu_eth.octet[5] = wval >> 8; eu->eu_eth.octet[4] = wval; } else { /* RAL */ eu->eu_eth.octet[3] = wval >> 24; eu->eu_eth.octet[2] = wval >> 16; eu->eu_eth.octet[1] = wval >> 8; eu->eu_eth.octet[0] = wval; } } static uint32_t e82545_read_ra(struct e82545_softc *sc, int reg) { struct eth_uni *eu; uint32_t retval; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ retval = (eu->eu_valid << 31) | (eu->eu_addrsel << 16) | (eu->eu_eth.octet[5] << 8) | eu->eu_eth.octet[4]; } else { /* RAL */ retval = (eu->eu_eth.octet[3] << 24) | (eu->eu_eth.octet[2] << 16) | (eu->eu_eth.octet[1] << 8) | eu->eu_eth.octet[0]; } return (retval); } static void e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) { int ridx; if (offset & 0x3) { DPRINTF("Unaligned register write offset:0x%x value:0x%x", offset, value); return; } DPRINTF("Register write: 0x%x value: 0x%x", offset, value); switch (offset) { case E1000_CTRL: case E1000_CTRL_DUP: e82545_devctl(sc, value); break; case E1000_FCAL: sc->esc_FCAL = value; break; case E1000_FCAH: sc->esc_FCAH = value & ~0xFFFF0000; break; case E1000_FCT: sc->esc_FCT = value & ~0xFFFF0000; break; case E1000_VET: sc->esc_VET = value & ~0xFFFF0000; break; case E1000_FCTTV: sc->esc_FCTTV = value & ~0xFFFF0000; break; case E1000_LEDCTL: sc->esc_LEDCTL = value & ~0x30303000; break; case E1000_PBA: sc->esc_PBA = value & 0x0000FF80; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: e82545_intr_write(sc, offset, value); break; case E1000_RCTL: e82545_rx_ctl(sc, value); break; case E1000_FCRTL: sc->esc_FCRTL = value & ~0xFFFF0007; break; case E1000_FCRTH: sc->esc_FCRTH = value & ~0xFFFF0007; break; case E1000_RDBAL(0): sc->esc_RDBAL = value & ~0xF; if (sc->esc_rx_enabled) { /* Apparently legal: update cached address */ e82545_rx_update_rdba(sc); } break; case E1000_RDBAH(0): assert(!sc->esc_rx_enabled); sc->esc_RDBAH = value; break; case E1000_RDLEN(0): assert(!sc->esc_rx_enabled); sc->esc_RDLEN = value & ~0xFFF0007F; break; case E1000_RDH(0): /* XXX should only ever be zero ? Range check ? */ sc->esc_RDH = value; break; case E1000_RDT(0): /* XXX if this opens up the rx ring, do something ? 
*/ sc->esc_RDT = value; break; case E1000_RDTR: /* ignore FPD bit 31 */ sc->esc_RDTR = value & ~0xFFFF0000; break; case E1000_RXDCTL(0): sc->esc_RXDCTL = value & ~0xFEC0C0C0; break; case E1000_RADV: sc->esc_RADV = value & ~0xFFFF0000; break; case E1000_RSRPD: sc->esc_RSRPD = value & ~0xFFFFF000; break; case E1000_RXCSUM: sc->esc_RXCSUM = value & ~0xFFFFF800; break; case E1000_TXCW: sc->esc_TXCW = value & ~0x3FFF0000; break; case E1000_TCTL: e82545_tx_ctl(sc, value); break; case E1000_TIPG: sc->esc_TIPG = value; break; case E1000_AIT: sc->esc_AIT = value; break; case E1000_TDBAL(0): sc->esc_TDBAL = value & ~0xF; if (sc->esc_tx_enabled) { /* Apparently legal */ e82545_tx_update_tdba(sc); } break; case E1000_TDBAH(0): //assert(!sc->esc_tx_enabled); sc->esc_TDBAH = value; break; case E1000_TDLEN(0): //assert(!sc->esc_tx_enabled); sc->esc_TDLEN = value & ~0xFFF0007F; break; case E1000_TDH(0): //assert(!sc->esc_tx_enabled); /* XXX should only ever be zero ? Range check ? */ sc->esc_TDHr = sc->esc_TDH = value; break; case E1000_TDT(0): /* XXX range check ? */ sc->esc_TDT = value; if (sc->esc_tx_enabled) e82545_tx_start(sc); break; case E1000_TIDV: sc->esc_TIDV = value & ~0xFFFF0000; break; case E1000_TXDCTL(0): //assert(!sc->esc_tx_enabled); sc->esc_TXDCTL = value & ~0xC0C0C0; break; case E1000_TADV: sc->esc_TADV = value & ~0xFFFF0000; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; e82545_write_ra(sc, ridx, value); break; case E1000_MTA ... (E1000_MTA + (127*4)): sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; break; case E1000_EECD: { //DPRINTF("EECD write 0x%x -> 0x%x", sc->eeprom_control, value); /* edge triggered low->high */ uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? 0 : (value & E1000_EECD_SK)); uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| E1000_EECD_DI|E1000_EECD_REQ); sc->eeprom_control &= ~eecd_mask; sc->eeprom_control |= (value & eecd_mask); /* grant/revoke immediately */ if (value & E1000_EECD_REQ) { sc->eeprom_control |= E1000_EECD_GNT; } else { sc->eeprom_control &= ~E1000_EECD_GNT; } if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { e82545_eecd_strobe(sc); } return; } case E1000_MDIC: { uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT); uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT); sc->mdi_control = (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); if ((value & E1000_MDIC_READY) != 0) { DPRINTF("Incorrect MDIC ready bit: 0x%x", value); return; } switch (value & E82545_MDIC_OP_MASK) { case E1000_MDIC_OP_READ: sc->mdi_control &= ~E82545_MDIC_DATA_MASK; sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); break; case E1000_MDIC_OP_WRITE: e82545_write_mdi(sc, reg_addr, phy_addr, value & E82545_MDIC_DATA_MASK); break; default: DPRINTF("Unknown MDIC op: 0x%x", value); return; } /* TODO: barrier? 
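 *
 * MDIC packs a whole PHY management transaction into one 32-bit word:
 * data in bits 15:0, the register and PHY addresses in the REG/PHY
 * fields, an opcode, and READY/ERROR status bits.  Since this
 * emulation completes the access synchronously, READY is visible on
 * the very next read.  A guest-side access would look roughly like the
 * following (mmio_read32/mmio_write32 are hypothetical BAR 0
 * accessors, not part of this file):
 *
 *	static uint16_t
 *	phy_read(uint8_t phy, uint8_t reg)
 *	{
 *		uint32_t mdic;
 *
 *		mdic = ((uint32_t)reg << E1000_MDIC_REG_SHIFT) |
 *		    ((uint32_t)phy << E1000_MDIC_PHY_SHIFT) |
 *		    E1000_MDIC_OP_READ;
 *		mmio_write32(E1000_MDIC, mdic);
 *		do {
 *			mdic = mmio_read32(E1000_MDIC);
 *		} while ((mdic & E1000_MDIC_READY) == 0);
 *		return (mdic & E82545_MDIC_DATA_MASK);
 *	}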
*/ sc->mdi_control |= E1000_MDIC_READY; if (value & E82545_MDIC_IE) { // TODO: generate interrupt } return; } case E1000_MANC: case E1000_STATUS: return; default: DPRINTF("Unknown write register: 0x%x value:%x", offset, value); return; } } static uint32_t e82545_read_register(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; int ridx; if (offset & 0x3) { DPRINTF("Unaligned register read offset:0x%x", offset); return 0; } DPRINTF("Register read: 0x%x", offset); switch (offset) { case E1000_CTRL: retval = sc->esc_CTRL; break; case E1000_STATUS: retval = E1000_STATUS_FD | E1000_STATUS_LU | E1000_STATUS_SPEED_1000; break; case E1000_FCAL: retval = sc->esc_FCAL; break; case E1000_FCAH: retval = sc->esc_FCAH; break; case E1000_FCT: retval = sc->esc_FCT; break; case E1000_VET: retval = sc->esc_VET; break; case E1000_FCTTV: retval = sc->esc_FCTTV; break; case E1000_LEDCTL: retval = sc->esc_LEDCTL; break; case E1000_PBA: retval = sc->esc_PBA; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: retval = e82545_intr_read(sc, offset); break; case E1000_RCTL: retval = sc->esc_RCTL; break; case E1000_FCRTL: retval = sc->esc_FCRTL; break; case E1000_FCRTH: retval = sc->esc_FCRTH; break; case E1000_RDBAL(0): retval = sc->esc_RDBAL; break; case E1000_RDBAH(0): retval = sc->esc_RDBAH; break; case E1000_RDLEN(0): retval = sc->esc_RDLEN; break; case E1000_RDH(0): retval = sc->esc_RDH; break; case E1000_RDT(0): retval = sc->esc_RDT; break; case E1000_RDTR: retval = sc->esc_RDTR; break; case E1000_RXDCTL(0): retval = sc->esc_RXDCTL; break; case E1000_RADV: retval = sc->esc_RADV; break; case E1000_RSRPD: retval = sc->esc_RSRPD; break; case E1000_RXCSUM: retval = sc->esc_RXCSUM; break; case E1000_TXCW: retval = sc->esc_TXCW; break; case E1000_TCTL: retval = sc->esc_TCTL; break; case E1000_TIPG: retval = sc->esc_TIPG; break; case E1000_AIT: retval = sc->esc_AIT; break; case E1000_TDBAL(0): retval = sc->esc_TDBAL; break; case E1000_TDBAH(0): retval = sc->esc_TDBAH; break; case E1000_TDLEN(0): retval = sc->esc_TDLEN; break; case E1000_TDH(0): retval = sc->esc_TDH; break; case E1000_TDT(0): retval = sc->esc_TDT; break; case E1000_TIDV: retval = sc->esc_TIDV; break; case E1000_TXDCTL(0): retval = sc->esc_TXDCTL; break; case E1000_TADV: retval = sc->esc_TADV; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; retval = e82545_read_ra(sc, ridx); break; case E1000_MTA ... (E1000_MTA + (127*4)): retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; break; case E1000_EECD: //DPRINTF("EECD read %x", sc->eeprom_control); retval = sc->eeprom_control; break; case E1000_MDIC: retval = sc->mdi_control; break; case E1000_MANC: retval = 0; break; /* stats that we emulate. 
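 *
 * Most of the counters below are plain 32-bit registers, but the octet
 * counts are kept as 64-bit values in the softc and exposed through
 * low/high register pairs (GORCL/GORCH, GOTCL/GOTCH, TORL/TORH), so a
 * driver reassembles them from two reads.  Roughly (read_reg is a
 * hypothetical register accessor, not part of this file):
 *
 *	static uint64_t
 *	read_counter64(uint32_t lo_reg, uint32_t hi_reg)
 *	{
 *		uint64_t lo = read_reg(lo_reg);	// e.g. E1000_GORCL
 *		uint64_t hi = read_reg(hi_reg);	// e.g. E1000_GORCH
 *
 *		return ((hi << 32) | lo);
 *	}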
*/ case E1000_MPC: retval = sc->missed_pkt_count; break; case E1000_PRC64: retval = sc->pkt_rx_by_size[0]; break; case E1000_PRC127: retval = sc->pkt_rx_by_size[1]; break; case E1000_PRC255: retval = sc->pkt_rx_by_size[2]; break; case E1000_PRC511: retval = sc->pkt_rx_by_size[3]; break; case E1000_PRC1023: retval = sc->pkt_rx_by_size[4]; break; case E1000_PRC1522: retval = sc->pkt_rx_by_size[5]; break; case E1000_GPRC: retval = sc->good_pkt_rx_count; break; case E1000_BPRC: retval = sc->bcast_pkt_rx_count; break; case E1000_MPRC: retval = sc->mcast_pkt_rx_count; break; case E1000_GPTC: case E1000_TPT: retval = sc->good_pkt_tx_count; break; case E1000_GORCL: retval = (uint32_t)sc->good_octets_rx; break; case E1000_GORCH: retval = (uint32_t)(sc->good_octets_rx >> 32); break; case E1000_TOTL: case E1000_GOTCL: retval = (uint32_t)sc->good_octets_tx; break; case E1000_TOTH: case E1000_GOTCH: retval = (uint32_t)(sc->good_octets_tx >> 32); break; case E1000_ROC: retval = sc->oversize_rx_count; break; case E1000_TORL: retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); break; case E1000_TORH: retval = (uint32_t)((sc->good_octets_rx + sc->missed_octets) >> 32); break; case E1000_TPR: retval = sc->good_pkt_rx_count + sc->missed_pkt_count + sc->oversize_rx_count; break; case E1000_PTC64: retval = sc->pkt_tx_by_size[0]; break; case E1000_PTC127: retval = sc->pkt_tx_by_size[1]; break; case E1000_PTC255: retval = sc->pkt_tx_by_size[2]; break; case E1000_PTC511: retval = sc->pkt_tx_by_size[3]; break; case E1000_PTC1023: retval = sc->pkt_tx_by_size[4]; break; case E1000_PTC1522: retval = sc->pkt_tx_by_size[5]; break; case E1000_MPTC: retval = sc->mcast_pkt_tx_count; break; case E1000_BPTC: retval = sc->bcast_pkt_tx_count; break; case E1000_TSCTC: retval = sc->tso_tx_count; break; /* stats that are always 0. 
*/ case E1000_CRCERRS: case E1000_ALGNERRC: case E1000_SYMERRS: case E1000_RXERRC: case E1000_SCC: case E1000_ECOL: case E1000_MCC: case E1000_LATECOL: case E1000_COLC: case E1000_DC: case E1000_TNCRS: case E1000_SEC: case E1000_CEXTERR: case E1000_RLEC: case E1000_XONRXC: case E1000_XONTXC: case E1000_XOFFRXC: case E1000_XOFFTXC: case E1000_FCRUC: case E1000_RNBC: case E1000_RUC: case E1000_RFC: case E1000_RJC: case E1000_MGTPRC: case E1000_MGTPDC: case E1000_MGTPTC: case E1000_TSCTFC: retval = 0; break; default: DPRINTF("Unknown read register: 0x%x", offset); retval = 0; break; } return (retval); } static void e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct e82545_softc *sc; //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d", baridx, offset, value, size); sc = pi->pi_arg; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr write sz:%d value:0x%lx", size, value); } else sc->io_addr = (uint32_t)value; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data write size:%d value:0x%lx", size, value); } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io write addr:0x%x value:0x%lx", sc->io_addr, value); } else e82545_write_register(sc, sc->io_addr, (uint32_t)value); break; default: DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d", offset, value, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx", size, offset, value); } else e82545_write_register(sc, (uint32_t)offset, (uint32_t)value); break; default: DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d", baridx, offset, value, size); } pthread_mutex_unlock(&sc->esc_mtx); } static uint64_t e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct e82545_softc *sc; uint64_t retval; //DPRINTF("Read bar:%d offset:0x%lx size:%d", baridx, offset, size); sc = pi->pi_arg; retval = 0; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr read sz:%d", size); } else retval = sc->io_addr; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data read sz:%d", size); } if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io read addr:0x%x", sc->io_addr); } else retval = e82545_read_register(sc, sc->io_addr); break; default: DPRINTF("Unknown io bar read offset:0x%lx size:%d", offset, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register read size:%d offset:0x%lx", size, offset); } else retval = e82545_read_register(sc, (uint32_t)offset); break; default: DPRINTF("Unknown read bar:%d offset:0x%lx size:%d", baridx, offset, size); break; } pthread_mutex_unlock(&sc->esc_mtx); return (retval); } static void e82545_reset(struct e82545_softc *sc, int drvr) { int i; e82545_rx_disable(sc); e82545_tx_disable(sc); /* clear outstanding interrupts */ if (sc->esc_irq_asserted) pci_lintr_deassert(sc->esc_pi); /* misc */ if (!drvr) { sc->esc_FCAL = 0; sc->esc_FCAH = 0; sc->esc_FCT = 0; sc->esc_VET = 0; sc->esc_FCTTV = 0; } sc->esc_LEDCTL = 0x07061302; sc->esc_PBA = 0x00100030; /* start nvm in opcode mode. 
*/ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; e82545_init_eeprom(sc); /* interrupt */ sc->esc_ICR = 0; sc->esc_ITR = 250; sc->esc_ICS = 0; sc->esc_IMS = 0; sc->esc_IMC = 0; /* L2 filters */ if (!drvr) { memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); /* XXX not necessary on 82545 ?? */ sc->esc_uni[0].eu_valid = 1; memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, ETHER_ADDR_LEN); } else { /* Clear RAH valid bits */ for (i = 0; i < 16; i++) sc->esc_uni[i].eu_valid = 0; } /* receive */ if (!drvr) { sc->esc_RDBAL = 0; sc->esc_RDBAH = 0; } sc->esc_RCTL = 0; sc->esc_FCRTL = 0; sc->esc_FCRTH = 0; sc->esc_RDLEN = 0; sc->esc_RDH = 0; sc->esc_RDT = 0; sc->esc_RDTR = 0; sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ sc->esc_RADV = 0; sc->esc_RXCSUM = 0; /* transmit */ if (!drvr) { sc->esc_TDBAL = 0; sc->esc_TDBAH = 0; sc->esc_TIPG = 0; sc->esc_AIT = 0; sc->esc_TIDV = 0; sc->esc_TADV = 0; } sc->esc_tdba = 0; sc->esc_txdesc = NULL; sc->esc_TXCW = 0; sc->esc_TCTL = 0; sc->esc_TDLEN = 0; sc->esc_TDT = 0; sc->esc_TDHr = sc->esc_TDH = 0; sc->esc_TXDCTL = 0; } static int e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { char nstr[80]; struct e82545_softc *sc; char *devname; char *vtopts; int mac_provided; DPRINTF("Loading with options: %s", opts); /* Setup our softc */ sc = calloc(1, sizeof(*sc)); pi->pi_arg = sc; sc->esc_pi = pi; sc->esc_ctx = ctx; pthread_mutex_init(&sc->esc_mtx, NULL); pthread_cond_init(&sc->esc_rx_cond, NULL); pthread_cond_init(&sc->esc_tx_cond, NULL); pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->esc_tx_tid, nstr); pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); /* TODO: this card also supports msi, but the freebsd driver for it * does not, so I have not implemented it. */ pci_lintr_request(pi); pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, E82545_BAR_REGISTER_LEN); pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, E82545_BAR_FLASH_LEN); pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, E82545_BAR_IO_LEN); /* * Attempt to open the net backend and read the MAC address * if specified. Copied from virtio-net, slightly modified. */ mac_provided = 0; sc->esc_be = NULL; if (opts != NULL) { int err = 0; devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); /* * Parse the list of options in the form * key1=value1,...,keyN=valueN. 
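 *
 * The option string is the backend name followed by key=value pairs,
 * e.g. "tap0,mac=00:a0:98:00:00:01" (MAC value shown only as an
 * example).  The loop below peels one pair off per iteration with
 * strsep(3); the same pattern in isolation, assuming <string.h> and a
 * writable copy of the string:
 *
 *	char *cp = opts_copy;			// writable copy of opts
 *	char *backend = strsep(&cp, ",");	// -> "tap0"
 *	while (cp != NULL) {
 *		char *value = cp;
 *		char *key = strsep(&value, "=");
 *		if (value == NULL)
 *			break;			// not a key=value pair
 *		cp = value;
 *		(void) strsep(&cp, ",");	// terminate this value
 *		// key -> "mac", value -> "00:a0:98:00:00:01"
 *	}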
*/ while (vtopts != NULL) { char *value = vtopts; char *key; key = strsep(&value, "="); if (value == NULL) break; vtopts = value; (void) strsep(&vtopts, ","); if (strcmp(key, "mac") == 0) { err = net_parsemac(value, sc->esc_mac.octet); if (err) break; mac_provided = 1; } } if (err) { free(devname); return (err); } err = netbe_init(&sc->esc_be, devname, e82545_rx_callback, sc); free(devname); if (err) return (err); } if (!mac_provided) { net_genmac(pi, sc->esc_mac.octet); } netbe_rx_enable(sc->esc_be); /* H/w initiated reset */ e82545_reset(sc, 0); return (0); } +#ifdef BHYVE_SNAPSHOT +static int +e82545_snapshot(struct vm_snapshot_meta *meta) +{ + int i; + int ret; + struct e82545_softc *sc; + struct pci_devinst *pi; + uint64_t bitmap_value; + + pi = meta->dev_data; + sc = pi->pi_arg; + + /* esc_mevp and esc_mevpitr should be reinitiated at init. */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_mac, meta, ret, done); + + /* General */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_CTRL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_VET, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCTTV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_LEDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_PBA, meta, ret, done); + + /* Interrupt control */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_irq_asserted, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ITR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICS, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMS, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMC, meta, ret, done); + + /* + * Transmit + * + * The fields in the unions are in superposition to access certain + * bytes in the larger uint variables. + * e.g., ip_config = [ipcss|ipcso|ipcse0|ipcse1] + */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.lower_setup.ip_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.upper_setup.tcp_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.cmd_and_length, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.tcp_seg_setup.data, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXCW, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIPG, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_AIT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tdba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDLEN, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDHr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIDV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TADV, meta, ret, done); + + /* Has dependency on esc_TDLEN; reoreder of fields from struct. 
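 *
 * The descriptor-ring pointer itself is a host virtual address into
 * guest memory, so it cannot be stored verbatim; the macro below
 * translates it through the guest physical mapping, and the ring
 * base/length registers snapshotted above are needed before that
 * pointer can be re-derived and validated on restore, which is
 * presumably the point of the ordering note.  Fields that are not
 * plain fixed-width scalars go through a temporary, as done for the
 * NVM state further down; the shape of that pattern, with 'field'
 * standing in for any such member:
 *
 *	if (meta->op == VM_SNAPSHOT_SAVE)
 *		tmp = sc->field;
 *	SNAPSHOT_VAR_OR_LEAVE(tmp, meta, ret, done);
 *	if (meta->op == VM_SNAPSHOT_RESTORE)
 *		sc->field = tmp;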
*/ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_txdesc, sc->esc_TDLEN, + true, meta, ret, done); + + /* L2 frame acceptance */ + for (i = 0; i < nitems(sc->esc_uni); i++) { + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_valid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_addrsel, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_eth, meta, ret, done); + } + + SNAPSHOT_BUF_OR_LEAVE(sc->esc_fmcast, sizeof(sc->esc_fmcast), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->esc_fvlan, sizeof(sc->esc_fvlan), + meta, ret, done); + + /* Receive */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_loopback, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rdba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDLEN, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDTR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RADV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RSRPD, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXCSUM, meta, ret, done); + + /* Has dependency on esc_RDLEN; reoreder of fields from struct. */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_rxdesc, sc->esc_TDLEN, + true, meta, ret, done); + + /* IO Port register access */ + SNAPSHOT_VAR_OR_LEAVE(sc->io_addr, meta, ret, done); + + /* Shadow copy of MDIC */ + SNAPSHOT_VAR_OR_LEAVE(sc->mdi_control, meta, ret, done); + + /* Shadow copy of EECD */ + SNAPSHOT_VAR_OR_LEAVE(sc->eeprom_control, meta, ret, done); + + /* Latest NVM in/out */ + SNAPSHOT_VAR_OR_LEAVE(sc->nvm_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->nvm_opaddr, meta, ret, done); + + /* Stats */ + SNAPSHOT_VAR_OR_LEAVE(sc->missed_pkt_count, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->pkt_rx_by_size, sizeof(sc->pkt_rx_by_size), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->pkt_tx_by_size, sizeof(sc->pkt_tx_by_size), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->oversize_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->tso_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_rx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_tx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->missed_octets, meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) + bitmap_value = sc->nvm_bits; + SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + sc->nvm_bits = bitmap_value; + + if (meta->op == VM_SNAPSHOT_SAVE) + bitmap_value = sc->nvm_bits; + SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + sc->nvm_bits = bitmap_value; + + /* EEPROM data */ + SNAPSHOT_BUF_OR_LEAVE(sc->eeprom_data, 
sizeof(sc->eeprom_data), + meta, ret, done); + +done: + return (ret); +} +#endif + struct pci_devemu pci_de_e82545 = { .pe_emu = "e1000", .pe_init = e82545_init, .pe_barwrite = e82545_write, - .pe_barread = e82545_read + .pe_barread = e82545_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = e82545_snapshot, +#endif }; PCI_EMUL_SET(pci_de_e82545); diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 145b33b5ffd2..e4b83896241e 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -1,2142 +1,2340 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include "acpi.h" #include "bhyverun.h" #include "debug.h" #include "inout.h" #include "ioapic.h" #include "mem.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) struct funcinfo { char *fi_name; char *fi_param; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; int ii_pirq_pin; int ii_ioapic_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMBASE64 0xD000000000UL #define PCI_EMUL_MEMLIMIT64 0xFD00000000UL static struct pci_devemu *pci_emul_finddev(char *name); static void pci_lintr_route(struct pci_devinst *pi); static void pci_lintr_update(struct pci_devinst *pi); static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); static __inline void CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) { if (bytes == 1) pci_set_cfgdata8(pi, coff, val); else if (bytes == 2) pci_set_cfgdata16(pi, coff, val); else pci_set_cfgdata32(pi, coff, val); } static __inline uint32_t CFGREAD(struct pci_devinst *pi, int coff, int bytes) { if (bytes == 1) return (pci_get_cfgdata8(pi, coff)); else if (bytes == 2) return (pci_get_cfgdata16(pi, coff)); else return (pci_get_cfgdata32(pi, coff)); } /* * I/O access */ /* * Slot options are in the form: * * ::,[,] * [:],[,] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. 
* Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { EPRINTLN("Invalid PCI slot info field \"%s\"", aopt); } int pci_parse_slot(char *opt) { struct businfo *bi; struct slotinfo *si; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* :: */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* : */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } if (pci_businfo[bnum] == NULL) pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bnum]; si = &bi->slotinfo[snum]; if (si->si_funcs[fnum].fi_name != NULL) { EPRINTLN("pci slot %d:%d already occupied!", snum, fnum); goto done; } if (pci_emul_finddev(emul) == NULL) { EPRINTLN("pci slot %d:%d: unknown device \"%s\"", snum, fnum, emul); goto done; } error = 0; si->si_funcs[fnum].fi_name = emul; si->si_funcs[fnum].fi_param = config; done: if (error) free(str); return (error); } void pci_print_supported_devices() { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; printf("%s\n", pdp->pe_emu); } } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accommodate reads from * ddb. 
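 *
 * Each MSI-X table entry is MSIX_TABLE_ENTRY_SIZE (16) bytes: message
 * address low/high, message data, and vector control.  Both the read
 * and write paths therefore split a BAR offset into an entry index and
 * an offset within the entry:
 *
 *	int tab_index = offset / MSIX_TABLE_ENTRY_SIZE;	// which entry
 *	int entry_off = offset % MSIX_TABLE_ENTRY_SIZE;	// which field
 *	// entry_off  0..7  -> 64-bit message address
 *	// entry_off  8..11 -> message data
 *	// entry_off 12..15 -> vector control (the mask bit lives here)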
*/ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } static int pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && port >= pdi->pi_bar[i].addr && port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, offset, bytes); else (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, bytes, *eax); return (0); } } return (-1); } static int pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int) arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, 4, *val & 0xffffffff); (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, 4, *val >> 32); } else { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val); } } else { if (size == 8) { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, 4); *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset + 4, 4) << 32; } else { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. 
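 *
 * (The addresses being registered come from pci_emul_alloc_resource()
 * above, which is a simple bump allocator: the BAR size is rounded up
 * to a power of two and the base is aligned to that size.  Natural
 * alignment is what lets a guest size a BAR with the usual
 * write-ones-and-read-back probe; illustratively, with hypothetical
 * config accessors:
 *
 *	pci_write_config32(bar_reg, 0xffffffff);
 *	v = pci_read_config32(bar_reg) & PCIM_BAR_MEM_BASE;
 *	size = (uint32_t)-v;	// lowest settable address bit == size
 *
 * and that probe assumes the low 'size' bits of the base are always
 * zero, i.e. the base is size-aligned.)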
*/ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { int error; struct inout_port iop; struct mem_range mr; switch (pi->pi_bar[idx].type) { case PCIBAR_IO: bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); break; case PCIBAR_MEM32: case PCIBAR_MEM64: bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); break; default: error = EINVAL; break; } assert(error == 0); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. */ static void update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; if (pi->pi_bar[idx].type == PCIBAR_IO) decode = porten(pi); else decode = memen(pi); if (decode) unregister_bar(pi, idx); switch (type) { case PCIBAR_IO: case PCIBAR_MEM32: pi->pi_bar[idx].addr = addr; break; case PCIBAR_MEM64: pi->pi_bar[idx].addr &= ~0xffffffffUL; pi->pi_bar[idx].addr |= addr; break; case PCIBAR_MEMHI64: pi->pi_bar[idx].addr &= 0xffffffff; pi->pi_bar[idx].addr |= addr; break; default: assert(0); } if (decode) register_bar(pi, idx); } int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; uint16_t cmd, enbit; assert(idx >= 0 && idx <= PCI_BARMAX); if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ /* Enforce minimum BAR sizes required by the PCI standard */ if (type == PCIBAR_IO) { if (size < 4) size = 4; } else { if (size < 16) size = 16; } switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = enbit = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; enbit = PCIM_CMD_PORTEN; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless then allocation size is larger than some arbitrary * number (32MB currently). 
*/ if (size > 32 * 1024 * 1024) { /* * XXX special case for device requiring peer-peer DMA */ if (size == 0x100000000UL) baseptr = &hostbase; else baseptr = &pci_emul_membase64; limit = PCI_EMUL_MEMLIMIT64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } enbit = PCIM_CMD_MEMEN; break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; enbit = PCIM_CMD_MEMEN; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND); if ((cmd & enbit) != enbit) pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit); register_bar(pdi, idx); return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pdi->pi_lintr.pirq_pin = 0; pdi->pi_lintr.ioapic_irq = 0; pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(ctx, pdi, fi->fi_param); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) 
== 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. * Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off; off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. 
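 *
 * This is the usual pattern for emulated registers that mix read-only
 * and read-write bits: merge only the writable bits of the guest's
 * value into the current contents,
 *
 *	new = (cur & ~rwmask) | (val & rwmask);
 *
 * where rwmask here covers just the MSI enable bit and the MME
 * (multiple message enable) field; everything else in the message
 * control word keeps the value the device advertised.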
*/ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; } CFGWRITE(pi, offset, val, bytes); msgctrl = pci_get_cfgdata16(pi, capoff + 2); addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } void pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; bzero(&pciecap, sizeof(pciecap)); /* * Use the integrated endpoint type for endpoints on a root complex bus. * * NB: bhyve currently only supports a single PCI bus that is the root * complex bus, so all endpoints are integrated. */ if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) type = PCIEM_TYPE_ROOT_INT_EP; pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | type; if (type != PCIEM_TYPE_ROOT_INT_EP) { pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. */ static void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) { int capid; uint8_t capoff, nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. */ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } capid = pci_get_cfgdata8(pi, capoff); switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. 
*/ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int bytes, uint64_t *val, void *arg1, long arg2) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } #define BUSIO_ROUNDUP 32 #define BUSMEM_ROUNDUP (1024 * 1024) int init_pci(struct vmctx *ctx) { struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; size_t lowmem; int bus, slot, func; int error; pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = vm_get_lowmem_limit(ctx); pci_emul_membase64 = PCI_EMUL_MEMBASE64; for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; /* * Keep track of the i/o and memory resources allocated to * this bus. */ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_name == NULL) continue; pde = pci_emul_finddev(fi->fi_name); assert(pde != NULL); error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. */ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } lpc_pirq_routed(); /* * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, lowmem_limit) memory hole (may be absent) * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. 
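 *
 * (Within the PCI extended config window each bus gets 1MB and each
 * function 4KB of configuration space, ECAM style; the handler above
 * slices the address as bus = bits 27:20, slot = bits 19:15,
 * func = bits 14:12, register offset = bits 11:0.  Illustratively, the
 * guest physical address of register 'reg' of function (bus, slot,
 * func) is
 *
 *	ecfg_addr = PCI_EMUL_ECFG_BASE + ((uint64_t)bus << 20) +
 *	    ((uint64_t)slot << 15) + ((uint64_t)func << 12) + reg;
 *
 * which is the layout the decode below inverts.)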
*/ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } static void pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { char *name; name = lpc_pirq_name(pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. */ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int count, func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. */ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio 
window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); count = pci_count_lintr(bus); if (count != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); done: dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just 
allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. */ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; /* * Attempt to allocate an I/O APIC pin for this intpin if one * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); assert(ii->ii_ioapic_irq > 0); /* * Attempt to allocate a PIRQ pin for this intpin if one is * not yet assigned. */ if (ii->ii_pirq_pin == 0) ii->ii_pirq_pin = pirq_alloc_pin(pi); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); } void pci_lintr_assert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == IDLE) { if (pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } else pi->pi_lintr.state = PENDING; } pthread_mutex_unlock(&pi->pi_lintr.lock); } void pci_lintr_deassert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED) { pi->pi_lintr.state = IDLE; pci_irq_deassert(pi); } else if (pi->pi_lintr.state == PENDING) pi->pi_lintr.state = IDLE; pthread_mutex_unlock(&pi->pi_lintr.lock); } static void pci_lintr_update(struct pci_devinst *pi) { pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { pci_irq_deassert(pi); pi->pi_lintr.state = PENDING; } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); } int pci_count_lintr(int bus) { int count, slot, pin; struct slotinfo *slotinfo; count = 0; if (pci_businfo[bus] != NULL) { for (slot = 0; slot < MAXSLOTS; slot++) { slotinfo = &pci_businfo[bus]->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { if (slotinfo->si_intpins[pin].ii_count != 0) count++; } } } return (count); } void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) { struct businfo *bi; struct slotinfo *si; struct intxinfo *ii; int slot, pin; if ((bi = pci_businfo[bus]) == NULL) return; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { ii = &si->si_intpins[pin]; if (ii->ii_count != 0) cb(bus, slot, pin + 1, ii->ii_pirq_pin, ii->ii_ioapic_irq, arg); } } } /* * Return 1 if the emulated device in 'slot' is a multi-function device. * Return 0 otherwise. */ static int pci_emul_is_mfdev(int bus, int slot) { struct businfo *bi; struct slotinfo *si; int f, numfuncs; numfuncs = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; for (f = 0; f < MAXFUNCS; f++) { if (si->si_funcs[f].fi_devi != NULL) { numfuncs++; } } } return (numfuncs > 1); } /* * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on * whether or not is a multi-function being emulated in the pci 'slot'. 
*/ static void pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) { int mfdev; if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { mfdev = pci_emul_is_mfdev(bus, slot); switch (bytes) { case 1: case 2: *rv &= ~PCIM_MFDEV; if (mfdev) { *rv |= PCIM_MFDEV; } break; case 4: *rv &= ~(PCIM_MFDEV << 16); if (mfdev) { *rv |= (PCIM_MFDEV << 16); } break; } } } /* * Update device state in response to changes to the PCI command * register. */ void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old) { int i; uint16_t changed, new; new = pci_get_cfgdata16(pi, PCIR_COMMAND); changed = old ^ new; /* * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ for (i = 0; i <= PCI_BARMAX; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: break; case PCIBAR_IO: /* I/O address space decoding changed? */ if (changed & PCIM_CMD_PORTEN) { if (new & PCIM_CMD_PORTEN) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ if (changed & PCIM_CMD_MEMEN) { if (new & PCIM_CMD_MEMEN) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } static void pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { int rshift; uint32_t cmd, old, readonly; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. * * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are * 'write 1 to clear'. However these bits are not set to '1' by * any device emulation so it is simpler to treat them as readonly. */ rshift = (coff & 0x3) * 8; readonly = 0xFFFFF880 >> rshift; old = CFGREAD(pi, coff, bytes); new &= ~readonly; new |= (old & readonly); CFGWRITE(pi, coff, new, bytes); /* update config */ pci_emul_cmd_changed(pi, cmd); } static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *eax) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; uint64_t addr, bar, mask; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; pi = si->si_funcs[func].fi_devi; } else pi = NULL; /* * Just return if there is no device at this slot:func or if the * the guest is doing an un-aligned access. */ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || (coff & (bytes - 1)) != 0) { if (in) *eax = 0xffffffff; return; } /* * Ignore all writes beyond the standard config space and return all * ones on reads. */ if (coff >= PCI_REGMAX + 1) { if (in) { *eax = 0xffffffff; /* * Extended capabilities begin at offset 256 in config * space. Absence of extended capabilities is signaled * with all 0s in the extended capability header at * offset 256. 
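 * Reads that touch the dword at offset 0x100 therefore return zero,
 * while reads further out in the extended space return all ones.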
*/ if (coff <= PCI_REGMAX + 4) *eax = 0x00000000; } return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, eax); } else { needcfg = 1; } if (needcfg) *eax = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) return; /* * Special handling for write to BAR registers */ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) return; idx = (coff - PCIR_BAR(0)) / 4; mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *eax & mask; addr &= 0xffff; bar = addr | PCIM_BAR_IO_SPACE; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*eax << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax); } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } } } static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 0xffff : 0xff; return (0); } if (in) { x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; if (cfgenable) x |= CONF1_ENABLE; *eax = x; } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; cfgoff = x & PCI_REGMAX; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; cfgbus = (x >> 16) & PCI_BUSMAX; } return (0); } INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); static int pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int coff; assert(bytes == 1 || bytes == 2 || bytes == 4); coff = cfgoff + (port - CONF1_DATA_PORT); if (cfgenable) { pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax); } else { /* Ignore accesses to cfgdata if not enabled by cfgaddr */ if (in) *eax = 0xffffffff; } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); +#ifdef BHYVE_SNAPSHOT +/* + * Saves/restores PCI device emulated state. 
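+ * The MSI and MSI-X state, the raw config space, the BAR layout and the
+ * MSI-X table entries are all captured with the SNAPSHOT_*_OR_LEAVE macros.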
Returns 0 on success. + */ +static int +pci_snapshot_pci_dev(struct vm_snapshot_meta *meta) +{ + struct pci_devinst *pi; + int i; + int ret; + + pi = meta->dev_data; + + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_page_offset, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata), + meta, ret, done); + + for (i = 0; i < nitems(pi->pi_bar); i++) { + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done); + } + + /* Restore MSI-X table. */ + for (i = 0; i < pi->pi_msix.table_count; i++) { + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control, + meta, ret, done); + } + +done: + return (ret); +} + +static int +pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde, + struct pci_devinst **pdi) +{ + struct businfo *bi; + struct slotinfo *si; + struct funcinfo *fi; + int bus, slot, func; + + assert(dev_name != NULL); + assert(pde != NULL); + assert(pdi != NULL); + + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_name == NULL) + continue; + if (strcmp(dev_name, fi->fi_name)) + continue; + + *pde = pci_emul_finddev(fi->fi_name); + assert(*pde != NULL); + + *pdi = fi->fi_devi; + return (0); + } + } + } + + return (EINVAL); +} + +int +pci_snapshot(struct vm_snapshot_meta *meta) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(meta->dev_name != NULL); + + ret = pci_find_slotted_dev(meta->dev_name, &pde, &pdi); + if (ret != 0) { + fprintf(stderr, "%s: no such name: %s\r\n", + __func__, meta->dev_name); + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + return (0); + } + + meta->dev_data = pdi; + + if (pde->pe_snapshot == NULL) { + fprintf(stderr, "%s: not implemented yet for: %s\r\n", + __func__, meta->dev_name); + return (-1); + } + + ret = pci_snapshot_pci_dev(meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to snapshot pci dev\r\n", + __func__); + return (-1); + } + + ret = (*pde->pe_snapshot)(meta); + + return (ret); +} + +int +pci_pause(struct vmctx *ctx, const char *dev_name) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(dev_name != NULL); + + ret = pci_find_slotted_dev(dev_name, &pde, &pdi); + if (ret != 0) { + /* + * It is possible to call this function without + * checking that the device is inserted first. 
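+ * In that case the pause request is treated as a no-op and 0 is returned.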
+ */ + fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); + return (0); + } + + if (pde->pe_pause == NULL) { + /* The pause/resume functionality is optional. */ + fprintf(stderr, "%s: not implemented for: %s\n", + __func__, dev_name); + return (0); + } + + return (*pde->pe_pause)(ctx, pdi); +} + +int +pci_resume(struct vmctx *ctx, const char *dev_name) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(dev_name != NULL); + + ret = pci_find_slotted_dev(dev_name, &pde, &pdi); + if (ret != 0) { + /* + * It is possible to call this function without + * checking that the device is inserted first. + */ + fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); + return (0); + } + + if (pde->pe_resume == NULL) { + /* The pause/resume functionality is optional. */ + fprintf(stderr, "%s: not implemented for: %s\n", + __func__, dev_name); + return (0); + } + + return (*pde->pe_resume)(ctx, pdi); +} +#endif + #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { - uint8_t ioregs[DIOSZ]; + uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } i = baridx - 1; /* 'memregs' index */ if (size == 1) { sc->memregs[i][offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } - + /* * magic interrupt ?? 
*/ } if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; int i; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } - + value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } - + i = baridx - 1; /* 'memregs' index */ if (size == 1) { value = sc->memregs[i][offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } +#ifdef BHYVE_SNAPSHOT +int +pci_emul_snapshot(struct vm_snapshot_meta *meta) +{ + + return (0); +} +#endif + struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, - .pe_barread = pci_emul_dior + .pe_barread = pci_emul_dior, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_emul_snapshot, +#endif }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index fba2e8845af8..1cefa5ed042d 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -1,292 +1,303 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ #include #include #include #include #include #include #define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ struct vmctx; struct pci_devinst; struct memory_region; +struct vm_snapshot_meta; struct pci_devemu { char *pe_emu; /* Name of device emulation */ /* instance creation */ int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts); /* ACPI DSDT enumeration */ void (*pe_write_dsdt)(struct pci_devinst *); /* config space read/write callbacks */ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t val); int (*pe_cfgread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t *retval); /* BAR read/write callbacks */ void (*pe_barwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); + + /* Save/restore device state */ + int (*pe_snapshot)(struct vm_snapshot_meta *meta); + int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi); + int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi); }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); enum pcibar_type { PCIBAR_NONE, PCIBAR_IO, PCIBAR_MEM32, PCIBAR_MEM64, PCIBAR_MEMHI64 }; struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; }; #define PI_NAMESZ 40 struct msix_table_entry { uint64_t addr; uint32_t msg_data; uint32_t vector_control; } __packed; /* * In case the structure is modified to hold extra information, use a define * for the size that should be emulated. */ #define MSIX_TABLE_ENTRY_SIZE 16 #define MAX_MSIX_TABLE_ENTRIES 2048 #define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) enum lintr_stat { IDLE, ASSERTED, PENDING }; struct pci_devinst { struct pci_devemu *pi_d; struct vmctx *pi_vmctx; uint8_t pi_bus, pi_slot, pi_func; char pi_name[PI_NAMESZ]; int pi_bar_getsize; int pi_prevcap; int pi_capend; struct { int8_t pin; enum lintr_stat state; int pirq_pin; int ioapic_irq; pthread_mutex_t lock; } pi_lintr; struct { int enabled; uint64_t addr; uint64_t msg_data; int maxmsgnum; } pi_msi; struct { int enabled; int table_bar; int pba_bar; uint32_t table_offset; int table_count; uint32_t pba_offset; int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ void *pba_page; int pba_page_offset; } pi_msix; void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; struct pcibar pi_bar[PCI_BARMAX + 1]; }; struct msicap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t addrlo; uint32_t addrhi; uint16_t msgdata; } __packed; static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); struct msixcap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); struct pciecap { uint8_t capid; uint8_t nextptr; uint16_t pcie_capabilities; uint32_t dev_capabilities; /* all devices */ uint16_t dev_control; uint16_t dev_status; uint32_t link_capabilities; /* devices with links */ uint16_t link_control; uint16_t link_status; uint32_t slot_capabilities; /* ports with slots */ uint16_t slot_control; uint16_t slot_status; uint16_t root_control; /* root ports */ uint16_t root_capabilities; uint32_t 
root_status; uint32_t dev_capabilities2; /* all devices */ uint16_t dev_control2; uint16_t dev_status2; uint32_t link_capabilities2; /* devices with links */ uint16_t link_control2; uint16_t link_status2; uint32_t slot_capabilities2; /* ports with slots */ uint16_t slot_control2; uint16_t slot_status2; } __packed; static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); int init_pci(struct vmctx *ctx); void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val); void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val); void pci_callback(void); int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size); int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old); void pci_generate_msi(struct pci_devinst *pi, int msgnum); void pci_generate_msix(struct pci_devinst *pi, int msgnum); void pci_lintr_assert(struct pci_devinst *pi); void pci_lintr_deassert(struct pci_devinst *pi); void pci_lintr_request(struct pci_devinst *pi); int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); int pci_msi_maxmsgnum(struct pci_devinst *pi); int pci_parse_slot(char *opt); void pci_print_supported_devices(); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value); uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); int pci_count_lintr(int bus); void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); void pci_write_dsdt(void); uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); +#ifdef BHYVE_SNAPSHOT +int pci_snapshot(struct vm_snapshot_meta *meta); +int pci_pause(struct vmctx *ctx, const char *dev_name); +int pci_resume(struct vmctx *ctx, const char *dev_name); +#endif static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) { assert(offset <= PCI_REGMAX); *(uint8_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); *(uint16_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); *(uint32_t *)(pi->pi_cfgdata + offset) = val; } static __inline uint8_t pci_get_cfgdata8(struct pci_devinst *pi, int offset) { assert(offset <= PCI_REGMAX); return (*(uint8_t *)(pi->pi_cfgdata + offset)); } static __inline uint16_t pci_get_cfgdata16(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); return (*(uint16_t *)(pi->pi_cfgdata + offset)); } static __inline uint32_t pci_get_cfgdata32(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); return (*(uint32_t *)(pi->pi_cfgdata + offset)); } #endif /* _PCI_EMUL_H_ */ diff --git 
a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c index 8961875356da..0bd740a0908c 100644 --- a/usr.sbin/bhyve/pci_fbuf.c +++ b/usr.sbin/bhyve/pci_fbuf.c @@ -1,449 +1,466 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Nahanni Systems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include #include #include "bhyvegc.h" #include "bhyverun.h" #include "debug.h" #include "console.h" #include "inout.h" #include "pci_emul.h" #include "rfb.h" #include "vga.h" /* * bhyve Framebuffer device emulation. * BAR0 points to the current mode information. * BAR1 is the 32-bit framebuffer address. 
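 * BAR0 is a small (DMEMSZ byte) register window; BAR1 maps the 16 MB
 * framebuffer itself (FB_SIZE).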
* * -s ,fbuf,wait,vga=on|io|off,rfb=:port,w=width,h=height */ static int fbuf_debug = 1; #define DEBUG_INFO 1 #define DEBUG_VERBOSE 4 #define DPRINTF(level, params) if (level <= fbuf_debug) PRINTLN params #define KB (1024UL) #define MB (1024 * 1024UL) #define DMEMSZ 128 #define FB_SIZE (16*MB) #define COLS_MAX 1920 #define ROWS_MAX 1200 #define COLS_DEFAULT 1024 #define ROWS_DEFAULT 768 #define COLS_MIN 640 #define ROWS_MIN 480 struct pci_fbuf_softc { struct pci_devinst *fsc_pi; struct { uint32_t fbsize; uint16_t width; uint16_t height; uint16_t depth; uint16_t refreshrate; uint8_t reserved[116]; } __packed memregs; /* rfb server */ char *rfb_host; char *rfb_password; int rfb_port; int rfb_wait; int vga_enabled; int vga_full; uint32_t fbaddr; char *fb_base; uint16_t gc_width; uint16_t gc_height; void *vgasc; struct bhyvegc_image *gc_image; }; static struct pci_fbuf_softc *fbuf_sc; #define PCI_FBUF_MSI_MSGS 4 static void pci_fbuf_usage(char *opt) { EPRINTLN("Invalid fbuf emulation option \"%s\"", opt); EPRINTLN("fbuf: {wait,}{vga=on|io|off,}rfb=:port" "{,w=width}{,h=height}"); } static void pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_fbuf_softc *sc; uint8_t *p; assert(baridx == 0); sc = pi->pi_arg; DPRINTF(DEBUG_VERBOSE, ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); if (offset + size > DMEMSZ) { printf("fbuf: write too large, offset %ld size %d\n", offset, size); return; } p = (uint8_t *)&sc->memregs + offset; switch (size) { case 1: *p = value; break; case 2: *(uint16_t *)p = value; break; case 4: *(uint32_t *)p = value; break; case 8: *(uint64_t *)p = value; break; default: printf("fbuf: write unknown size %d\n", size); break; } if (!sc->gc_image->vgamode && sc->memregs.width == 0 && sc->memregs.height == 0) { DPRINTF(DEBUG_INFO, ("switching to VGA mode")); sc->gc_image->vgamode = 1; sc->gc_width = 0; sc->gc_height = 0; } else if (sc->gc_image->vgamode && sc->memregs.width != 0 && sc->memregs.height != 0) { DPRINTF(DEBUG_INFO, ("switching to VESA mode")); sc->gc_image->vgamode = 0; } } uint64_t pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_fbuf_softc *sc; uint8_t *p; uint64_t value; assert(baridx == 0); sc = pi->pi_arg; if (offset + size > DMEMSZ) { printf("fbuf: read too large, offset %ld size %d\n", offset, size); return (0); } p = (uint8_t *)&sc->memregs + offset; value = 0; switch (size) { case 1: value = *p; break; case 2: value = *(uint16_t *)p; break; case 4: value = *(uint32_t *)p; break; case 8: value = *(uint64_t *)p; break; default: printf("fbuf: read unknown size %d\n", size); break; } DPRINTF(DEBUG_VERBOSE, ("fbuf rd: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); return (value); } static int pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) { char *uopts, *uoptsbak, *xopts, *config; char *tmpstr; int ret; ret = 0; uoptsbak = uopts = strdup(opts); while ((xopts = strsep(&uopts, ",")) != NULL) { if (strcmp(xopts, "wait") == 0) { sc->rfb_wait = 1; continue; } if ((config = strchr(xopts, '=')) == NULL) { pci_fbuf_usage(xopts); ret = -1; goto done; } *config++ = '\0'; DPRINTF(DEBUG_VERBOSE, ("pci_fbuf option %s = %s", xopts, config)); if (!strcmp(xopts, "tcp") || !strcmp(xopts, "rfb")) { /* * IPv4 -- host-ip:port * IPv6 -- [host-ip%zone]:port * XXX for now port is mandatory. 
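 * For example, option strings such as rfb=0.0.0.0:5900 or
 * rfb=[fe80::1%igb0]:5900 are accepted (addresses here are illustrative).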
*/ tmpstr = strsep(&config, "]"); if (config) { if (tmpstr[0] == '[') tmpstr++; sc->rfb_host = strdup(tmpstr); if (config[0] == ':') config++; else { pci_fbuf_usage(xopts); ret = -1; goto done; } sc->rfb_port = atoi(config); } else { config = tmpstr; tmpstr = strsep(&config, ":"); if (!config) sc->rfb_port = atoi(tmpstr); else { sc->rfb_port = atoi(config); sc->rfb_host = strdup(tmpstr); } } } else if (!strcmp(xopts, "vga")) { if (!strcmp(config, "off")) { sc->vga_enabled = 0; } else if (!strcmp(config, "io")) { sc->vga_enabled = 1; sc->vga_full = 0; } else if (!strcmp(config, "on")) { sc->vga_enabled = 1; sc->vga_full = 1; } else { pci_fbuf_usage(xopts); ret = -1; goto done; } } else if (!strcmp(xopts, "w")) { sc->memregs.width = atoi(config); if (sc->memregs.width > COLS_MAX) { pci_fbuf_usage(xopts); ret = -1; goto done; } else if (sc->memregs.width == 0) sc->memregs.width = 1920; } else if (!strcmp(xopts, "h")) { sc->memregs.height = atoi(config); if (sc->memregs.height > ROWS_MAX) { pci_fbuf_usage(xopts); ret = -1; goto done; } else if (sc->memregs.height == 0) sc->memregs.height = 1080; } else if (!strcmp(xopts, "password")) { sc->rfb_password = strdup(config); } else { pci_fbuf_usage(xopts); ret = -1; goto done; } } done: free(uoptsbak); return (ret); } extern void vga_render(struct bhyvegc *gc, void *arg); void pci_fbuf_render(struct bhyvegc *gc, void *arg) { struct pci_fbuf_softc *sc; sc = arg; if (sc->vga_full && sc->gc_image->vgamode) { /* TODO: mode switching to vga and vesa should use the special * EFI-bhyve protocol port. */ vga_render(gc, sc->vgasc); return; } if (sc->gc_width != sc->memregs.width || sc->gc_height != sc->memregs.height) { bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height); sc->gc_width = sc->memregs.width; sc->gc_height = sc->memregs.height; } return; } static int pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int error, prot; struct pci_fbuf_softc *sc; if (fbuf_sc != NULL) { EPRINTLN("Only one frame buffer device is allowed."); return (-1); } sc = calloc(1, sizeof(struct pci_fbuf_softc)); pi->pi_arg = sc; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); assert(error == 0); error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); assert(error == 0); sc->fbaddr = pi->pi_bar[1].addr; sc->memregs.fbsize = FB_SIZE; sc->memregs.width = COLS_DEFAULT; sc->memregs.height = ROWS_DEFAULT; sc->memregs.depth = 32; sc->vga_enabled = 1; sc->vga_full = 0; sc->fsc_pi = pi; error = pci_fbuf_parse_opts(sc, opts); if (error != 0) goto done; /* XXX until VGA rendering is enabled */ if (sc->vga_full != 0) { EPRINTLN("pci_fbuf: VGA rendering not enabled"); goto done; } sc->fb_base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE); if (sc->fb_base == MAP_FAILED) { error = -1; goto done; } DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]", sc->fb_base, FB_SIZE)); /* * Map the framebuffer into the guest address space. * XXX This may fail if the BAR is different than a prior * run. In this case flag the error. This will be fixed * when a change_memseg api is available. 
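 * Until then the user is advised to destroy the VM and start again
 * (see the error message below).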
*/ prot = PROT_READ | PROT_WRITE; if (vm_mmap_memseg(ctx, sc->fbaddr, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) { EPRINTLN("pci_fbuf: mapseg failed - try deleting VM and restarting"); error = -1; goto done; } console_init(sc->memregs.width, sc->memregs.height, sc->fb_base); console_fb_register(pci_fbuf_render, sc); if (sc->vga_enabled) sc->vgasc = vga_init(!sc->vga_full); sc->gc_image = console_get_image(); fbuf_sc = sc; memset((void *)sc->fb_base, 0, FB_SIZE); error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password); done: if (error) free(sc); return (error); } +#ifdef BHYVE_SNAPSHOT +static int +pci_fbuf_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_BUF_OR_LEAVE(fbuf_sc->fb_base, FB_SIZE, meta, ret, err); + +err: + return (ret); +} +#endif + struct pci_devemu pci_fbuf = { .pe_emu = "fbuf", .pe_init = pci_fbuf_init, .pe_barwrite = pci_fbuf_write, - .pe_barread = pci_fbuf_read + .pe_barread = pci_fbuf_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_fbuf_snapshot, +#endif }; PCI_EMUL_SET(pci_fbuf); diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c index 1e4b513ec494..4ebdd7039cbc 100644 --- a/usr.sbin/bhyve/pci_lpc.c +++ b/usr.sbin/bhyve/pci_lpc.c @@ -1,463 +1,487 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Neel Natu * Copyright (c) 2013 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include #include "acpi.h" #include "debug.h" #include "bootrom.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "uart_emul.h" #define IO_ICU1 0x20 #define IO_ICU2 0xA0 SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt); SET_DECLARE(lpc_sysres_set, struct lpc_sysres); #define ELCR_PORT 0x4d0 SYSRES_IO(ELCR_PORT, 2); #define IO_TIMER1_PORT 0x40 #define NMISC_PORT 0x61 SYSRES_IO(NMISC_PORT, 1); static struct pci_devinst *lpc_bridge; static const char *romfile; #define LPC_UART_NUM 2 static struct lpc_uart_softc { struct uart_softc *uart_softc; const char *opts; int iobase; int irq; int enabled; } lpc_uart_softc[LPC_UART_NUM]; static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" }; /* * LPC device configuration is in the following form: * [,] * For e.g. 
"com1,stdio" or "bootrom,/var/romfile" */ int lpc_device_parse(const char *opts) { int unit, error; char *str, *cpy, *lpcdev; error = -1; str = cpy = strdup(opts); lpcdev = strsep(&str, ","); if (lpcdev != NULL) { if (strcasecmp(lpcdev, "bootrom") == 0) { romfile = str; error = 0; goto done; } for (unit = 0; unit < LPC_UART_NUM; unit++) { if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { lpc_uart_softc[unit].opts = str; error = 0; goto done; } } } done: if (error) free(cpy); return (error); } void lpc_print_supported_devices() { size_t i; printf("bootrom\n"); for (i = 0; i < LPC_UART_NUM; i++) printf("%s\n", lpc_uart_names[i]); } const char * lpc_bootrom(void) { return (romfile); } static void lpc_uart_intr_assert(void *arg) { struct lpc_uart_softc *sc = arg; assert(sc->irq >= 0); vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq); } static void lpc_uart_intr_deassert(void *arg) { /* * The COM devices on the LPC bus generate edge triggered interrupts, * so nothing more to do here. */ } static int lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int offset; struct lpc_uart_softc *sc = arg; offset = port - sc->iobase; switch (bytes) { case 1: if (in) *eax = uart_read(sc->uart_softc, offset); else uart_write(sc->uart_softc, offset, *eax); break; case 2: if (in) { *eax = uart_read(sc->uart_softc, offset); *eax |= uart_read(sc->uart_softc, offset + 1) << 8; } else { uart_write(sc->uart_softc, offset, *eax); uart_write(sc->uart_softc, offset + 1, *eax >> 8); } break; default: return (-1); } return (0); } static int lpc_init(struct vmctx *ctx) { struct lpc_uart_softc *sc; struct inout_port iop; const char *name; int unit, error; if (romfile != NULL) { error = bootrom_loadrom(ctx, romfile); if (error) return (error); } /* COM1 and COM2 */ for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; name = lpc_uart_names[unit]; if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { EPRINTLN("Unable to allocate resources for " "LPC device %s", name); return (-1); } pci_irq_reserve(sc->irq); sc->uart_softc = uart_init(lpc_uart_intr_assert, lpc_uart_intr_deassert, sc); if (uart_set_backend(sc->uart_softc, sc->opts) != 0) { EPRINTLN("Unable to initialize backend '%s' " "for LPC device %s", sc->opts, name); return (-1); } bzero(&iop, sizeof(struct inout_port)); iop.name = name; iop.port = sc->iobase; iop.size = UART_IO_BAR_SIZE; iop.flags = IOPORT_F_INOUT; iop.handler = lpc_uart_io_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); sc->enabled = 1; } return (0); } static void pci_lpc_write_dsdt(struct pci_devinst *pi) { struct lpc_dsdt **ldpp, *ldp; dsdt_line(""); dsdt_line("Device (ISA)"); dsdt_line("{"); dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); dsdt_line(" {"); dsdt_line(" Offset (0x60),"); dsdt_line(" PIRA, 8,"); dsdt_line(" PIRB, 8,"); dsdt_line(" PIRC, 8,"); dsdt_line(" PIRD, 8,"); dsdt_line(" Offset (0x68),"); dsdt_line(" PIRE, 8,"); dsdt_line(" PIRF, 8,"); dsdt_line(" PIRG, 8,"); dsdt_line(" PIRH, 8"); dsdt_line(" }"); dsdt_line(""); dsdt_indent(1); SET_FOREACH(ldpp, lpc_dsdt_set) { ldp = *ldpp; ldp->handler(); } dsdt_line(""); dsdt_line("Device (PIC)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_ICU1, 2); dsdt_fixed_ioport(IO_ICU2, 
2); dsdt_fixed_irq(2); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_line(""); dsdt_line("Device (TIMR)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_TIMER1_PORT, 4); dsdt_fixed_irq(0); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_unindent(1); dsdt_line("}"); } static void pci_lpc_sysres_dsdt(void) { struct lpc_sysres **lspp, *lsp; dsdt_line(""); dsdt_line("Device (SIO)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); SET_FOREACH(lspp, lpc_sysres_set) { lsp = *lspp; switch (lsp->type) { case LPC_SYSRES_IO: dsdt_fixed_ioport(lsp->base, lsp->length); break; case LPC_SYSRES_MEM: dsdt_fixed_mem32(lsp->base, lsp->length); break; } } dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } LPC_DSDT(pci_lpc_sysres_dsdt); static void pci_lpc_uart_dsdt(void) { struct lpc_uart_softc *sc; int unit; for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; if (!sc->enabled) continue; dsdt_line(""); dsdt_line("Device (%s)", lpc_uart_names[unit]); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); dsdt_line(" Name (_UID, %d)", unit + 1); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE); dsdt_fixed_irq(sc->irq); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } } LPC_DSDT(pci_lpc_uart_dsdt); static int pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int pirq_pin; if (bytes == 1) { pirq_pin = 0; if (coff >= 0x60 && coff <= 0x63) pirq_pin = coff - 0x60 + 1; if (coff >= 0x68 && coff <= 0x6b) pirq_pin = coff - 0x68 + 5; if (pirq_pin != 0) { pirq_write(ctx, pirq_pin, val); pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); return (0); } } return (-1); } static void pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { } static uint64_t pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { return (0); } #define LPC_DEV 0x7000 #define LPC_VENDOR 0x8086 static int pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { /* * Do not allow more than one LPC bridge to be configured. */ if (lpc_bridge != NULL) { EPRINTLN("Only one LPC bridge is allowed."); return (-1); } /* * Enforce that the LPC can only be configured on bus 0. This * simplifies the ACPI DSDT because it can provide a decode for * all legacy i/o ports behind bus 0. 
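 * (pci_bus_write_dsdt() emits the legacy I/O port decode only for bus 0.)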
*/ if (pi->pi_bus != 0) { EPRINTLN("LPC bridge can be present only on bus 0."); return (-1); } if (lpc_init(ctx) != 0) return (-1); /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); lpc_bridge = pi; return (0); } char * lpc_pirq_name(int pin) { char *name; if (lpc_bridge == NULL) return (NULL); asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1); return (name); } void lpc_pirq_routed(void) { int pin; if (lpc_bridge == NULL) return; for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1)); for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); } +#ifdef BHYVE_SNAPSHOT +static int +pci_lpc_snapshot(struct vm_snapshot_meta *meta) +{ + int unit, ret; + struct uart_softc *sc; + + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = lpc_uart_softc[unit].uart_softc; + + ret = uart_snapshot(sc, meta); + if (ret != 0) + goto done; + } + +done: + return (ret); +} +#endif + struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, .pe_write_dsdt = pci_lpc_write_dsdt, .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, - .pe_barread = pci_lpc_read + .pe_barread = pci_lpc_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_lpc_snapshot, +#endif }; PCI_EMUL_SET(pci_de_lpc); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 04ff7718c333..4fd8943efffa 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -1,528 +1,577 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include + #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "block_if.h" #define VTBLK_BSIZE 512 #define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ #define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ #define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ #define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ #define VTBLK_F_RO (1 << 5) /* Disk is read-only */ #define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ #define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ #define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ #define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ #define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ #define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ #define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ #define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ #define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* * The current blockif_delete() interface only allows a single delete * request at a time. */ #define VTBLK_MAX_DISCARD_SEG 1 /* * An arbitrary limit to prevent excessive latency due to large * delete requests. 
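 * With 512-byte sectors this works out to 32768 sectors per request.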
*/ #define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ /* * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; struct { uint16_t cylinders; uint8_t heads; uint8_t sectors; } vbc_geometry; uint32_t vbc_blk_size; struct { uint8_t physical_block_exp; uint8_t alignment_offset; uint16_t min_io_size; uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; uint8_t unused0[1]; uint16_t num_queues; uint32_t max_discard_sectors; uint32_t max_discard_seg; uint32_t discard_sector_alignment; uint32_t max_write_zeroes_sectors; uint32_t max_write_zeroes_seg; uint8_t write_zeroes_may_unmap; uint8_t unused1[3]; } __packed; /* * Fixed-size block header */ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 #define VBH_OP_SCSI_CMD 2 #define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 #define VBH_OP_DISCARD 11 #define VBH_OP_WRITE_ZEROES 13 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; /* * Debug printf */ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtblk_ioreq { struct blockif_req io_req; struct pci_vtblk_softc *io_sc; uint8_t *io_status; uint16_t io_idx; }; struct virtio_blk_discard_write_zeroes { uint64_t sector; uint32_t num_sectors; struct { uint32_t unmap:1; uint32_t reserved:31; } flags; }; /* * Per-device softc */ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); +#ifdef BHYVE_SNAPSHOT +static void pci_vtblk_pause(void *); +static void pci_vtblk_resume(void *); +static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *); +#endif static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ pci_vtblk_cfgwrite, /* write PCI config */ NULL, /* apply negotiated features */ VTBLK_S_HOSTCAPS, /* our capabilities */ +#ifdef BHYVE_SNAPSHOT + pci_vtblk_pause, /* pause blockif threads */ + pci_vtblk_resume, /* resume blockif threads */ + pci_vtblk_snapshot, /* save / restore device state */ +#endif }; static void pci_vtblk_reset(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device reset requested !")); vi_reset_dev(&sc->vbsc_vs); } static void pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) { struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ if (err == EOPNOTSUPP || err == ENOSYS) *io->io_status = VTBLK_S_UNSUPP; else if (err != 0) *io->io_status = VTBLK_S_IOERR; else *io->io_status = VTBLK_S_OK; /* * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. 
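 * (hence the length of 1 passed to vq_relchain() below).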
*/ vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); } +#ifdef BHYVE_SNAPSHOT +static void +pci_vtblk_pause(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device pause requested !\n")); + blockif_pause(sc->bc); +} + +static void +pci_vtblk_resume(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device resume requested !\n")); + blockif_resume(sc->bc); +} + +static int +pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_vtblk_softc *sc = vsc; + + SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident), + meta, ret, done); + +done: + return (ret); +} +#endif + static void pci_vtblk_done(struct blockif_req *br, int err) { struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; pthread_mutex_lock(&sc->vsc_mtx); pci_vtblk_done_locked(io, err); pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; struct pci_vtblk_ioreq *io; int i, n; int err; ssize_t iolen; int writeop, type; struct iovec iov[BLOCKIF_IOV_MAX + 2]; uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; struct virtio_blk_discard_write_zeroes *discard; n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); /* * The first descriptor will be the read-only fixed header, * and the last is for status (hence +2 above and below). * The remaining iov's are the actual data I/O vectors. * * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); io = &sc->vbsc_ios[idx]; assert((flags[0] & VRING_DESC_F_WRITE) == 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; io->io_status = (uint8_t *)iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); /* * XXX * The guest should not be setting the BARRIER flag because * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); iolen = 0; for (i = 1; i < n; i++) { /* * - write op implies read-only descriptor, * - read/ident op implies write-only descriptor, * therefore test the inverse of the descriptor bit * to the op. */ assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); iolen += iov[i].iov_len; } io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { case VBH_OP_READ: err = blockif_read(sc->bc, &io->io_req); break; case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; case VBH_OP_DISCARD: /* * We currently only support a single request, if the guest * has submitted a request that doesn't conform to the * requirements, we return a error. */ if (iov[1].iov_len != sizeof (*discard)) { pci_vtblk_done_locked(io, EINVAL); return; } /* The segments to discard are provided rather than data */ discard = (struct virtio_blk_discard_write_zeroes *) iov[1].iov_base; /* * virtio v1.1 5.2.6.2: * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP * for discard and write zeroes commands if any unknown flag is * set. 
Furthermore, the device MUST set the status byte to * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag * is set. * * Currently there are no known flags for a DISCARD request. */ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { pci_vtblk_done_locked(io, ENOTSUP); return; } /* Make sure the request doesn't exceed our size limit */ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { pci_vtblk_done_locked(io, EINVAL); return; } io->io_req.br_offset = discard->sector * VTBLK_BSIZE; io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; err = blockif_delete(sc->bc, &io->io_req); break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ /* S/n equal to buffer is not zero-terminated. */ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); pci_vtblk_done_locked(io, 0); return; default: pci_vtblk_done_locked(io, EOPNOTSUPP); return; } assert(err == 0); } static void pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); } static int pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { char bident[sizeof("XX:X:X")]; struct blockif_ctxt *bctxt; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; off_t size; int i, sectsz, sts, sto; if (opts == NULL) { WPRINTF(("virtio-block: backing device required")); return (1); } /* * The supplied backing file has to exist */ snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); bctxt = blockif_open(opts, bident); if (bctxt == NULL) { perror("Could not open backing file"); return (1); } size = blockif_size(bctxt); sectsz = blockif_sectsz(bctxt); blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); sc->bc = bctxt; for (i = 0; i < VTBLK_RINGSZ; i++) { struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; io->io_req.br_callback = pci_vtblk_done; io->io_req.br_param = io; io->io_sc = sc; io->io_idx = i; } bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); if (blockif_candelete(sc->bc)) sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ /* * Create an identifier for the backing file. Use parts of the * md5 sum of the filename */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); MD5Final(digest, &mdctx); snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* * If Linux is presented with a seg_max greater than the virtio queue * size, it can stumble into situations where it violates its own * invariants and panics. For safety, we keep seg_max clamped, paying * heed to the two extra descriptors needed for the header and status * of a request. 
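* With VTBLK_RINGSZ at 128, the MIN() below advertises at most 126 data segments, and never more than the blockif backend accepts in one request (BLOCKIF_IOV_MAX).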
*/ sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ sc->vbsc_cfg.vbc_geometry.heads = 0; sc->vbsc_cfg.vbc_geometry.sectors = 0; sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_topology.physical_block_exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; sc->vbsc_cfg.vbc_topology.alignment_offset = (sto != 0) ? ((sts - sto) / sectsz) : 0; sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could * have the device, class, and subdev_0 as fields in * the virtio constants structure. */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { blockif_close(sc->bc); free(sc); return (1); } vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } static int pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) { DPRINTF(("vtblk: write to readonly reg %d", offset)); return (1); } static int pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtblk_softc *sc = vsc; void *ptr; /* our caller has already verified offset and size */ ptr = (uint8_t *)&sc->vbsc_cfg + offset; memcpy(retval, ptr, size); return (0); } struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_barwrite = vi_pci_write, - .pe_barread = vi_pci_read + .pe_barread = vi_pci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = vi_pci_snapshot, +#endif }; PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c index adc273128585..a0fcd9055e65 100644 --- a/usr.sbin/bhyve/pci_virtio_net.c +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -1,749 +1,830 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include /* IFNAMSIZ */ #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "mevent.h" #include "virtio.h" #include "net_utils.h" #include "net_backends.h" #include "iov.h" #define VTNET_RINGSZ 1024 #define VTNET_MAXSEGS 256 #define VTNET_MAX_PKT_LEN (65536 + 64) #define VTNET_MIN_MTU ETHERMIN #define VTNET_MAX_MTU 65535 #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" */ struct virtio_net_config { uint8_t mac[6]; uint16_t status; uint16_t max_virtqueue_pairs; uint16_t mtu; } __packed; /* * Queue definitions. */ #define VTNET_RXQ 0 #define VTNET_TXQ 1 #define VTNET_CTLQ 2 /* NB: not yet supported */ #define VTNET_MAXQ 3 /* * Debug printf */ static int pci_vtnet_debug; #define DPRINTF(params) if (pci_vtnet_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* * Per-device softc */ struct pci_vtnet_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; net_backend_t *vsc_be; int resetting; /* protected by tx_mtx */ uint64_t vsc_features; /* negotiated features */ pthread_mutex_t rx_mtx; int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; size_t vhdrlen; size_t be_vhdrlen; struct virtio_net_config vsc_config; struct virtio_consts vsc_consts; }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); static void pci_vtnet_neg_features(void *, uint64_t); +#ifdef BHYVE_SNAPSHOT +static void pci_vtnet_pause(void *); +static void pci_vtnet_resume(void *); +static int pci_vtnet_snapshot(void *, struct vm_snapshot_meta *); +#endif static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ sizeof(struct virtio_net_config), /* config reg size */ pci_vtnet_reset, /* reset */ NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ +#ifdef BHYVE_SNAPSHOT + pci_vtnet_pause, /* pause rx/tx threads */ + pci_vtnet_resume, /* resume rx/tx threads */ + pci_vtnet_snapshot, /* save / restore device state */ +#endif }; static void pci_vtnet_reset(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device reset requested !")); /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); /* * Make sure receive operation is disabled at least until we * re-negotiate the features, since receive operation depends * on the value of sc->rx_merge and the header length, which * are both set in pci_vtnet_neg_features(). * Receive operation will be enabled again once the guest adds * the first receive buffers and kicks us. */ netbe_rx_disable(sc->vsc_be); /* Set sc->resetting and give a chance to the TX thread to stop. 
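* The polling loop below drops tx_mtx on every iteration so the TX thread can reacquire it, finish the chain it is working on and clear tx_in_progress.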
*/ pthread_mutex_lock(&sc->tx_mtx); sc->resetting = 1; while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } /* * Now reset rings, MSI-X vectors, and negotiated capabilities. * Do that with the TX lock held, since we need to reset * sc->resetting. */ vi_reset_dev(&sc->vsc_vs); sc->resetting = 0; pthread_mutex_unlock(&sc->tx_mtx); pthread_mutex_unlock(&sc->rx_mtx); } static __inline struct iovec * iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int hlen) { struct iovec *riov; if (iov[0].iov_len < hlen) { /* * Not enough header space in the first fragment. * That's not ok for us. */ return NULL; } iov[0].iov_len -= hlen; if (iov[0].iov_len == 0) { *iovcnt -= 1; if (*iovcnt == 0) { /* * Only space for the header. That's not * enough for us. */ return NULL; } riov = &iov[1]; } else { iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + hlen); riov = &iov[0]; } return (riov); } struct virtio_mrg_rxbuf_info { uint16_t idx; uint16_t pad; uint32_t len; }; static void pci_vtnet_rx(struct pci_vtnet_softc *sc) { int prepend_hdr_len = sc->vhdrlen - sc->be_vhdrlen; struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS]; struct iovec iov[VTNET_MAXSEGS + 1]; struct vqueue_info *vq; vq = &sc->vsc_queues[VTNET_RXQ]; for (;;) { struct virtio_net_rxhdr *hdr; uint32_t riov_bytes; struct iovec *riov; uint32_t ulen; int riov_len; int n_chains; ssize_t rlen; ssize_t plen; plen = netbe_peek_recvlen(sc->vsc_be); if (plen <= 0) { /* * No more packets (plen == 0), or backend errored * (plen < 0). Interrupt if needed and stop. */ vq_endchains(vq, /*used_all_avail=*/0); return; } plen += prepend_hdr_len; /* * Get a descriptor chain to store the next ingress * packet. In case of mergeable rx buffers, get as * many chains as necessary in order to make room * for plen bytes. */ riov_bytes = 0; riov_len = 0; riov = iov; n_chains = 0; do { int n = vq_getchain(vq, &info[n_chains].idx, riov, VTNET_MAXSEGS - riov_len, NULL); if (n == 0) { /* * No rx buffers. Enable RX kicks and double * check. */ vq_kick_enable(vq); if (!vq_has_descs(vq)) { /* * Still no buffers. Return the unused * chains (if any), interrupt if needed * (including for NOTIFY_ON_EMPTY), and * disable the backend until the next * kick. */ vq_retchains(vq, n_chains); vq_endchains(vq, /*used_all_avail=*/1); netbe_rx_disable(sc->vsc_be); return; } /* More rx buffers found, so keep going. */ vq_kick_disable(vq); continue; } assert(n >= 1 && riov_len + n <= VTNET_MAXSEGS); riov_len += n; if (!sc->rx_merge) { n_chains = 1; break; } info[n_chains].len = (uint32_t)count_iov(riov, n); riov_bytes += info[n_chains].len; riov += n; n_chains++; } while (riov_bytes < plen && riov_len < VTNET_MAXSEGS); riov = iov; hdr = riov[0].iov_base; if (prepend_hdr_len > 0) { /* * The frontend uses a virtio-net header, but the * backend does not. We need to prepend a zeroed * header. */ riov = iov_trim_hdr(riov, &riov_len, prepend_hdr_len); if (riov == NULL) { /* * The first collected chain is nonsensical, * as it is not even enough to store the * virtio-net header. Just drop it. */ vq_relchain(vq, info[0].idx, 0); vq_retchains(vq, n_chains - 1); continue; } memset(hdr, 0, prepend_hdr_len); } rlen = netbe_recv(sc->vsc_be, riov, riov_len); if (rlen != plen - prepend_hdr_len) { /* * If this happens it means there is something * wrong with the backend (e.g., some other * process is stealing our packets). 
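* The chains gathered for this packet are handed back to the queue via vq_retchains() below instead of being published, and the receive loop simply moves on.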
*/ WPRINTF(("netbe_recv: expected %zd bytes, " "got %zd", plen - prepend_hdr_len, rlen)); vq_retchains(vq, n_chains); continue; } ulen = (uint32_t)plen; /* * Publish the used buffers to the guest, reporting the * number of bytes that we wrote. */ if (!sc->rx_merge) { vq_relchain(vq, info[0].idx, ulen); } else { uint32_t iolen; int i = 0; do { iolen = info[i].len; if (iolen > ulen) { iolen = ulen; } vq_relchain_prepare(vq, info[i].idx, iolen); ulen -= iolen; i++; } while (ulen > 0); hdr->vrh_bufs = i; vq_relchain_publish(vq); assert(i == n_chains); } } } /* * Called when there is read activity on the backend file descriptor. * Each buffer posted by the guest is assumed to be able to contain * an entire ethernet frame + rx header. */ static void pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); pci_vtnet_rx(sc); pthread_mutex_unlock(&sc->rx_mtx); } /* Called on RX kick. */ static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * A qnotify means that the rx process can now begin. */ pthread_mutex_lock(&sc->rx_mtx); vq_kick_disable(vq); netbe_rx_enable(sc->vsc_be); pthread_mutex_unlock(&sc->rx_mtx); } /* TX virtqueue processing, called by the TX thread. */ static void pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) { struct iovec iov[VTNET_MAXSEGS + 1]; struct iovec *siov = iov; uint16_t idx; ssize_t len; int n; /* * Obtain chain of descriptors. The first descriptor also * contains the virtio-net header. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); if (sc->vhdrlen != sc->be_vhdrlen) { /* * The frontend uses a virtio-net header, but the backend * does not. We simply strip the header and ignore it, as * it should be zero-filled. */ siov = iov_trim_hdr(siov, &n, sc->vhdrlen); } if (siov == NULL) { /* The chain is nonsensical. Just drop it. */ len = 0; } else { len = netbe_send(sc->vsc_be, siov, n); if (len < 0) { /* * If send failed, report that 0 bytes * were read. */ len = 0; } } /* * Return the processed chain to the guest, reporting * the number of bytes that we read. */ vq_relchain(vq, idx, len); } /* Called on TX kick. */ static void pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * Any ring entries to process? 
*/ if (!vq_has_descs(vq)) return; /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); vq_kick_disable(vq); if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); } /* * Thread which will handle processing of TX desc */ static void * pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; int error; vq = &sc->vsc_queues[VTNET_TXQ]; /* * Let us wait till the tx queue pointers get initialised & * first tx signaled */ pthread_mutex_lock(&sc->tx_mtx); error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); for (;;) { /* note - tx mutex is locked here */ while (sc->resetting || !vq_has_descs(vq)) { vq_kick_enable(vq); if (!sc->resetting && vq_has_descs(vq)) break; sc->tx_in_progress = 0; error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); } vq_kick_disable(vq); sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); do { /* * Run through entries, placing them into * iovecs and sending when an end-of-packet * is found */ pci_vtnet_proctx(sc, vq); } while (vq_has_descs(vq)); /* * Generate an interrupt if needed. */ vq_endchains(vq, /*used_all_avail=*/1); pthread_mutex_lock(&sc->tx_mtx); } } #ifdef notyet static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!")); } #endif static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { struct pci_vtnet_softc *sc; char tname[MAXCOMLEN + 1]; int mac_provided; int mtu_provided; unsigned long mtu = ETHERMTU; /* * Allocate data structures for further virtio initializations. * sc also contains a copy of vtnet_vi_consts, since capabilities * change depending on the backend. */ sc = calloc(1, sizeof(struct pci_vtnet_softc)); sc->vsc_consts = vtnet_vi_consts; pthread_mutex_init(&sc->vsc_mtx, NULL); sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; #ifdef notyet sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif /* * Attempt to open the backend device and read the MAC address * if specified. */ mac_provided = 0; mtu_provided = 0; if (opts != NULL) { char *devname; char *vtopts; int err = 0; /* Get the device name. */ devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); /* * Parse the list of options in the form * key1=value1,...,keyN=valueN. */ while (vtopts != NULL) { char *value = vtopts; char *key; key = strsep(&value, "="); if (value == NULL) break; vtopts = value; (void) strsep(&vtopts, ","); if (strcmp(key, "mac") == 0) { err = net_parsemac(value, sc->vsc_config.mac); if (err) break; mac_provided = 1; } else if (strcmp(key, "mtu") == 0) { err = net_parsemtu(value, &mtu); if (err) break; if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) { err = EINVAL; errno = EINVAL; break; } mtu_provided = 1; } } if (err) { free(devname); free(sc); return (err); } err = netbe_init(&sc->vsc_be, devname, pci_vtnet_rx_callback, sc); free(devname); if (err) { free(sc); return (err); } sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF | netbe_get_cap(sc->vsc_be); } if (!mac_provided) { net_genmac(pi, sc->vsc_config.mac); } sc->vsc_config.mtu = mtu; if (mtu_provided) { sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU; } /* * Since we do not actually support multiqueue, * set the maximum virtqueue pairs to 1. 
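* The field is only meaningful to a driver when VIRTIO_NET_F_MQ is offered, which VTNET_S_HOSTCAPS does not include.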
*/ sc->vsc_config.max_virtqueue_pairs = 1; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); /* Link is up if we managed to open backend device. */ sc->vsc_config.status = (opts == NULL || sc->vsc_be); vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) { free(sc); return (1); } /* use BAR 0 to map config regs in IO space */ vi_set_io_bar(&sc->vsc_vs, 0); sc->resetting = 0; sc->rx_merge = 0; sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; pthread_mutex_init(&sc->rx_mtx, NULL); /* * Initialize tx semaphore & spawn TX processing thread. * As of now, only one thread for TX desc processing is * spawned. */ sc->tx_in_progress = 0; pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->tx_tid, tname); return (0); } static int pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) { struct pci_vtnet_softc *sc = vsc; void *ptr; if (offset < (int)sizeof(sc->vsc_config.mac)) { assert(offset + size <= (int)sizeof(sc->vsc_config.mac)); /* * The driver is allowed to change the MAC address */ ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d", offset)); } return (0); } static int pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtnet_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static void pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtnet_softc *sc = vsc; sc->vsc_features = negotiated_features; if (negotiated_features & VIRTIO_NET_F_MRG_RXBUF) { sc->vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_merge = 1; } else { /* * Without mergeable rx buffers, virtio-net header is 2 * bytes shorter than sizeof(struct virtio_net_rxhdr). */ sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; sc->rx_merge = 0; } /* Tell the backend to enable some capabilities it has advertised. */ netbe_set_cap(sc->vsc_be, negotiated_features, sc->vhdrlen); sc->be_vhdrlen = netbe_get_vnet_hdr_len(sc->vsc_be); assert(sc->be_vhdrlen == 0 || sc->be_vhdrlen == sc->vhdrlen); } +#ifdef BHYVE_SNAPSHOT +static void +pci_vtnet_pause(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device pause requested !\n")); + + /* Acquire the RX lock to block RX processing. */ + pthread_mutex_lock(&sc->rx_mtx); + + /* Wait for the transmit thread to finish its processing. */ + pthread_mutex_lock(&sc->tx_mtx); + while (sc->tx_in_progress) { + pthread_mutex_unlock(&sc->tx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->tx_mtx); + } +} + +static void +pci_vtnet_resume(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device resume requested !\n")); + + pthread_mutex_unlock(&sc->tx_mtx); + /* The RX lock should have been acquired in vtnet_pause. 
*/ + pthread_mutex_unlock(&sc->rx_mtx); +} + +static int +pci_vtnet_snapshot(void *vsc, struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device snapshot requested !\n")); + + /* + * Queues and consts should have been saved by the more generic + * vi_pci_snapshot function. We need to save only our features and + * config. + */ + + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_features, meta, ret, done); + + /* Force a reapply of the negotiated features at restore time */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + pci_vtnet_neg_features(sc, sc->vsc_features); + netbe_rx_enable(sc->vsc_be); + } + + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rx_merge, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->vhdrlen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->be_vhdrlen, meta, ret, done); + +done: + return (ret); +} +#endif + static struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, .pe_barwrite = vi_pci_write, - .pe_barread = vi_pci_read + .pe_barread = vi_pci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = vi_pci_snapshot, + .pe_pause = vi_pci_pause, + .pe_resume = vi_pci_resume, +#endif }; PCI_EMUL_SET(pci_de_vnet); diff --git a/usr.sbin/bhyve/pci_xhci.c b/usr.sbin/bhyve/pci_xhci.c index 672f35c91ef8..0847d5bb38b5 100644 --- a/usr.sbin/bhyve/pci_xhci.c +++ b/usr.sbin/bhyve/pci_xhci.c @@ -1,2866 +1,3124 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* XHCI options: -s ,xhci,{devices} devices: tablet USB tablet mouse */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +#include + #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "pci_xhci.h" #include "usb_emul.h" static int xhci_debug = 0; #define DPRINTF(params) if (xhci_debug) PRINTLN params #define WPRINTF(params) PRINTLN params #define XHCI_NAME "xhci" #define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ #define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ /* * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping * to 4k to avoid going over the guest physical memory barrier.
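* Mappings are therefore clamped to the end of the 4 KiB page containing the guest address; see the XHCI_GADDR_SIZE() helper introduced below.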
*/ #define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ #define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ #define XHCI_CAPLEN (4*8) /* offset of op register space */ #define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ #define XHCI_PORTREGS_START 0x400 #define XHCI_DOORBELL_MAX 256 #define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ /* caplength and hci-version registers */ #define XHCI_SET_CAPLEN(x) ((x) & 0xFF) #define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) #define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) /* hcsparams1 register */ #define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) #define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) #define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) /* hcsparams2 register */ #define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) #define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) #define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) #define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) /* hcsparams3 register */ #define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) #define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) /* hccparams1 register */ #define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) #define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) #define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) #define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) #define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) #define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) #define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) #define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) #define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) #define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) /* hccparams2 register */ #define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) #define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP2_CIC(x) (((x) & 0x01) << 5) /* other registers */ #define XHCI_SET_DOORBELL(x) ((x) & ~0x03) #define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) /* register masks */ #define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ #define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ #define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ /* port register set */ #define XHCI_PORTREGS_BASE 0x400 /* base offset */ #define XHCI_PORTREGS_PORT0 0x3F0 #define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ #define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) #define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) #define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & (m)) << (s))) #define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & ((m) << (s))))) +#define SNAP_DEV_NAME_LEN 128 + struct pci_xhci_trb_ring { uint64_t ringaddr; /* current dequeue guest address */ uint32_t ccs; /* consumer cycle state */ }; /* device endpoint transfer/stream rings */ struct pci_xhci_dev_ep { union { struct xhci_trb *_epu_tr; struct xhci_stream_ctx *_epu_sctx; } _ep_trbsctx; #define ep_tr _ep_trbsctx._epu_tr #define ep_sctx _ep_trbsctx._epu_sctx union { struct pci_xhci_trb_ring _epu_trb; struct pci_xhci_trb_ring *_epu_sctx_trbs; } _ep_trb_rings; #define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr #define ep_ccs _ep_trb_rings._epu_trb.ccs #define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs struct usb_data_xfer *ep_xfer; /* 
transfer chain */ }; /* device context base address array: maps slot->device context */ struct xhci_dcbaa { uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ }; /* port status registers */ struct pci_xhci_portregs { uint32_t portsc; /* port status and control */ uint32_t portpmsc; /* port pwr mgmt status & control */ uint32_t portli; /* port link info */ uint32_t porthlpmc; /* port hardware LPM control */ } __packed; #define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) /* xHC operational registers */ struct pci_xhci_opregs { uint32_t usbcmd; /* usb command */ uint32_t usbsts; /* usb status */ uint32_t pgsz; /* page size */ uint32_t dnctrl; /* device notification control */ uint64_t crcr; /* command ring control */ uint64_t dcbaap; /* device ctx base addr array ptr */ uint32_t config; /* configure */ /* guest mapped addresses: */ struct xhci_trb *cr_p; /* crcr dequeue */ struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ }; /* xHC runtime registers */ struct pci_xhci_rtsregs { uint32_t mfindex; /* microframe index */ struct { /* interrupter register set */ uint32_t iman; /* interrupter management */ uint32_t imod; /* interrupter moderation */ uint32_t erstsz; /* event ring segment table size */ uint32_t rsvd; uint64_t erstba; /* event ring seg-tbl base addr */ uint64_t erdp; /* event ring dequeue ptr */ } intrreg __packed; /* guest mapped addresses */ struct xhci_event_ring_seg *erstba_p; struct xhci_trb *erst_p; /* event ring segment tbl */ int er_deq_seg; /* event ring dequeue segment */ int er_enq_idx; /* event ring enqueue index - xHCI */ int er_enq_seg; /* event ring enqueue segment */ uint32_t er_events_cnt; /* number of events in ER */ uint32_t event_pcs; /* producer cycle state flag */ }; struct pci_xhci_softc; /* * USB device emulation container. * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each * emulated device instance. 
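* The xsc back-pointer lets per-endpoint code reach the controller softc, e.g. for guest memory access through XHCI_GADDR().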
*/ struct pci_xhci_dev_emu { struct pci_xhci_softc *xsc; /* XHCI contexts */ struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; int dev_slotstate; struct usb_devemu *dev_ue; /* USB emulated dev */ void *dev_sc; /* device's softc */ struct usb_hci hci; }; struct pci_xhci_softc { struct pci_devinst *xsc_pi; pthread_mutex_t mtx; uint32_t caplength; /* caplen & hciversion */ uint32_t hcsparams1; /* structural parameters 1 */ uint32_t hcsparams2; /* structural parameters 2 */ uint32_t hcsparams3; /* structural parameters 3 */ uint32_t hccparams1; /* capability parameters 1 */ uint32_t dboff; /* doorbell offset */ uint32_t rtsoff; /* runtime register space offset */ uint32_t hccparams2; /* capability parameters 2 */ uint32_t regsend; /* end of configuration registers */ struct pci_xhci_opregs opregs; struct pci_xhci_rtsregs rtsregs; struct pci_xhci_portregs *portregs; struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ int ndevices; int usb2_port_start; int usb3_port_start; }; /* portregs and devices arrays are set up to start from idx=1 */ #define XHCI_PORTREG_PTR(x,n) &(x)->portregs[(n)] #define XHCI_DEVINST_PTR(x,n) (x)->devices[(n)] #define XHCI_SLOTDEV_PTR(x,n) (x)->slots[(n)] #define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) +#define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \ + (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1))) #define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ - (a), \ - XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1))) + (a), XHCI_GADDR_SIZE(a)) static int xhci_in_use; /* map USB errors to XHCI */ static const int xhci_usb_errors[USB_ERR_MAX] = { [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, }; #define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? 
xhci_usb_errors[(e)] : \ XHCI_TRB_ERROR_INVALID) static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr); static void pci_xhci_dump_trb(struct xhci_trb *trb); static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs); static void pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, uint32_t evtype) { evtrb->qwTrb0 = port << 24; evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); } /* controller reset */ static void pci_xhci_reset(struct pci_xhci_softc *sc) { int i; sc->rtsregs.er_enq_idx = 0; sc->rtsregs.er_events_cnt = 0; sc->rtsregs.event_pcs = 1; for (i = 1; i <= XHCI_MAX_SLOTS; i++) { pci_xhci_reset_slot(sc, i); } } static uint32_t pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) { int do_intr = 0; int i; if (cmd & XHCI_CMD_RS) { do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; sc->opregs.usbcmd |= XHCI_CMD_RS; sc->opregs.usbsts &= ~XHCI_STS_HCH; sc->opregs.usbsts |= XHCI_STS_PCD; /* Queue port change event on controller run from stop */ if (do_intr) for (i = 1; i <= XHCI_MAX_DEVS; i++) { struct pci_xhci_dev_emu *dev; struct pci_xhci_portregs *port; struct xhci_trb evtrb; if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) continue; port = XHCI_PORTREG_PTR(sc, i); port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; port->portsc &= ~XHCI_PS_PLS_MASK; /* * XHCI 4.19.3 USB2 RxDetect->Polling, * USB3 Polling->U0 */ if (dev->dev_ue->ue_usbver == 2) port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); else port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0); pci_xhci_set_evtrb(&evtrb, i, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); if (pci_xhci_insert_event(sc, &evtrb, 0) != XHCI_TRB_ERROR_SUCCESS) break; } } else { sc->opregs.usbcmd &= ~XHCI_CMD_RS; sc->opregs.usbsts |= XHCI_STS_HCH; sc->opregs.usbsts &= ~XHCI_STS_PCD; } /* start execution of schedule; stop when set to 0 */ cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; if (cmd & XHCI_CMD_HCRST) { /* reset controller */ pci_xhci_reset(sc); cmd &= ~XHCI_CMD_HCRST; } cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); if (do_intr) pci_xhci_assert_interrupt(sc); return (cmd); } static void pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct xhci_trb evtrb; struct pci_xhci_portregs *p; int port; uint32_t oldpls, newpls; if (sc->portregs == NULL) return; port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx", offset, port, value)); assert(port >= 0); if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_write port %d > ndevices", port)); return; } if (XHCI_DEVINST_PTR(sc, port) == NULL) { DPRINTF(("pci_xhci: portregs_write to unattached port %d", port)); } p = XHCI_PORTREG_PTR(sc, port); switch (offset) { case 0: /* port reset or warm reset */ if (value & (XHCI_PS_PR | XHCI_PS_WPR)) { pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); break; } if ((p->portsc & XHCI_PS_PP) == 0) { WPRINTF(("pci_xhci: portregs_write to unpowered " "port %d", port)); break; } /* Port status and control register */ oldpls = XHCI_PS_PLS_GET(p->portsc); 
newpls = XHCI_PS_PLS_GET(value); p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; if (XHCI_DEVINST_PTR(sc, port)) p->portsc |= XHCI_PS_CCS; p->portsc |= (value & ~(XHCI_PS_OCA | XHCI_PS_PR | XHCI_PS_PED | XHCI_PS_PLS_MASK | /* link state */ XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK | /* port indicator */ XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); /* clear control bits */ p->portsc &= ~(value & (XHCI_PS_CSC | XHCI_PS_PEC | XHCI_PS_WRC | XHCI_PS_OCC | XHCI_PS_PRC | XHCI_PS_PLC | XHCI_PS_CEC | XHCI_PS_CAS)); /* port disable request; for USB3, don't care */ if (value & XHCI_PS_PED) DPRINTF(("Disable port %d request", port)); if (!(value & XHCI_PS_LWS)) break; DPRINTF(("Port new PLS: %d", newpls)); switch (newpls) { case 0: /* U0 */ case 3: /* U3 */ if (oldpls != newpls) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(newpls) | XHCI_PS_PLC; if (oldpls != 0 && newpls == 0) { pci_xhci_set_evtrb(&evtrb, port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); pci_xhci_insert_event(sc, &evtrb, 1); } } break; default: DPRINTF(("Unhandled change port %d PLS %u", port, newpls)); break; } break; case 4: /* Port power management status and control register */ p->portpmsc = value; break; case 8: /* Port link information register */ DPRINTF(("pci_xhci attempted write to PORTLI, port %d", port)); break; case 12: /* * Port hardware LPM control register. * For USB3, this register is reserved. */ p->porthlpmc = value; break; } } struct xhci_dev_ctx * pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot) { uint64_t devctx_addr; struct xhci_dev_ctx *devctx; assert(slot > 0 && slot <= sc->ndevices); assert(sc->opregs.dcbaa_p != NULL); devctx_addr = sc->opregs.dcbaa_p->dcba[slot]; if (devctx_addr == 0) { DPRINTF(("get_dev_ctx devctx_addr == 0")); return (NULL); } DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx", slot, devctx_addr)); devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL); return (devctx); } struct xhci_trb * pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb, uint64_t *guestaddr) { struct xhci_trb *next; assert(curtrb != NULL); if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) { if (guestaddr) *guestaddr = curtrb->qwTrb0 & ~0xFUL; next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL); } else { if (guestaddr) *guestaddr += sizeof(struct xhci_trb) & ~0xFUL; next = curtrb + 1; } return (next); } static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc) { sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY; sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND; sc->opregs.usbsts |= XHCI_STS_EINT; /* only trigger interrupt if permitted */ if ((sc->opregs.usbcmd & XHCI_CMD_INTE) && (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) { if (pci_msi_enabled(sc->xsc_pi)) pci_generate_msi(sc->xsc_pi, 0); else pci_lintr_assert(sc->xsc_pi); } } static void pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc) { if (!pci_msi_enabled(sc->xsc_pi)) pci_lintr_assert(sc->xsc_pi); } static void pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; uint32_t pstreams; int i; dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0); if (pstreams > 0) { DPRINTF(("init_ep %d with pstreams %d", epid, pstreams)); assert(devep->ep_sctx_trbs == NULL); devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK); devep->ep_sctx_trbs = 
calloc(pstreams, sizeof(struct pci_xhci_trb_ring)); for (i = 0; i < pstreams; i++) { devep->ep_sctx_trbs[i].ringaddr = devep->ep_sctx[i].qwSctx0 & XHCI_SCTX_0_TR_DQ_PTR_MASK; devep->ep_sctx_trbs[i].ccs = XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0); } } else { DPRINTF(("init_ep %d with no pstreams", epid)); devep->ep_ringaddr = ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK; devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2); devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr); DPRINTF(("init_ep tr DCS %x", devep->ep_ccs)); } if (devep->ep_xfer == NULL) { devep->ep_xfer = malloc(sizeof(struct usb_data_xfer)); USB_DATA_XFER_INIT(devep->ep_xfer); } } static void pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; DPRINTF(("pci_xhci disable_ep %d", epid)); dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED; devep = &dev->eps[epid]; if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0 && devep->ep_sctx_trbs != NULL) free(devep->ep_sctx_trbs); if (devep->ep_xfer != NULL) { free(devep->ep_xfer); devep->ep_xfer = NULL; } memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); } /* reset device at slot and data structures related to it */ static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) { struct pci_xhci_dev_emu *dev; dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev) { DPRINTF(("xhci reset unassigned slot (%d)?", slot)); } else { dev->dev_slotstate = XHCI_ST_DISABLED; } /* TODO: reset ring buffer pointers */ } static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr) { struct pci_xhci_rtsregs *rts; uint64_t erdp; int erdp_idx; int err; struct xhci_trb *evtrbptr; err = XHCI_TRB_ERROR_SUCCESS; rts = &sc->rtsregs; erdp = rts->intrreg.erdp & ~0xF; erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / sizeof(struct xhci_trb); DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]", evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3)); DPRINTF(("\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u", erdp_idx, rts->er_deq_seg, rts->er_enq_idx, rts->er_enq_seg, rts->event_pcs)); DPRINTF(("\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)", erdp, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize, do_intr)); evtrbptr = &rts->erst_p[rts->er_enq_idx]; /* TODO: multi-segment table */ if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { DPRINTF(("pci_xhci[%d] cannot insert event; ring full", __LINE__)); err = XHCI_TRB_ERROR_EV_RING_FULL; goto done; } if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { struct xhci_trb errev; if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { DPRINTF(("pci_xhci[%d] insert evt err: ring full", __LINE__)); errev.qwTrb0 = 0; errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( XHCI_TRB_ERROR_EV_RING_FULL); errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( XHCI_TRB_EVENT_HOST_CTRL) | rts->event_pcs; rts->er_events_cnt++; memcpy(&rts->erst_p[rts->er_enq_idx], &errev, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; err = XHCI_TRB_ERROR_EV_RING_FULL; do_intr = 1; goto done; } } else { rts->er_events_cnt++; } evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; evtrb->dwTrb3 |= rts->event_pcs; memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; if (rts->er_enq_idx == 0) rts->event_pcs ^= 1; done: if (do_intr) 
pci_xhci_assert_interrupt(sc); return (err); } static uint32_t pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs != NULL) for (i = 1; i <= XHCI_MAX_SLOTS; i++) { dev = XHCI_SLOTDEV_PTR(sc, i); if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { *slot = i; dev->dev_slotstate = XHCI_ST_ENABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; dev->hci.hci_address = i; break; } } DPRINTF(("pci_xhci enable slot (error=%d) slot %u", cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); return (cmderr); } static uint32_t pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; DPRINTF(("pci_xhci disable slot %u", slot)); cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; if (slot > sc->ndevices) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } dev = XHCI_SLOTDEV_PTR(sc, slot); if (dev) { if (dev->dev_slotstate == XHCI_ST_DISABLED) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; } else { dev->dev_slotstate = XHCI_ST_DISABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; /* TODO: reset events and endpoints */ } } done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; DPRINTF(("pci_xhci reset device slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; else { dev->dev_slotstate = XHCI_ST_DEFAULT; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, 0x1F, 27); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* reset all eps other than ep-0 */ for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_DISABLED, 0x7, 0); } cmderr = XHCI_TRB_ERROR_SUCCESS; } pci_xhci_reset_slot(sc, slot); done: return (cmderr); } static uint32_t pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { DPRINTF(("pci_xhci: address device, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: address device, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, 
dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); dev->hci.hci_address = slot; dev->dev_ctx = dev_ctx; if (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); dev_ctx->ctx_slot.dwSctx3 = XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | XHCI_SCTX_3_DEV_ADDR_SET(slot); memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); ep0_ctx = &dev_ctx->ctx_ep[1]; ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); pci_xhci_init_ep(dev, 1); dev->dev_slotstate = XHCI_ST_ADDRESSED; DPRINTF(("pci_xhci: address device, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static uint32_t pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx, *iep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci config_ep slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u", slot)); if (dev->dev_ue->ue_stop != NULL) dev->dev_ue->ue_stop(dev->dev_sc); dev->dev_slotstate = XHCI_ST_ADDRESSED; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, 0x1F, 27); /* disable endpoints */ for (i = 2; i < 32; i++) pci_xhci_disable_ep(dev, i); cmderr = XHCI_TRB_ERROR_SUCCESS; goto done; } if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed", dev->dev_slotstate)); cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } /* In addressed/configured state; * for each drop endpoint ctx flag: * ep->state = DISABLED * for each add endpoint ctx flag: * cp(ep-in, ep-out) * ep->state = RUNNING * for each drop+add endpoint flag: * reset ep resources * cp(ep-in, ep-out) * ep->state = RUNNING * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) * slot->state = configured */ input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); dev_ctx = dev->dev_ctx; DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, input_ctx->ctx_input.dwInCtx7)); for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; if (input_ctx->ctx_input.dwInCtx0 & XHCI_INCTX_0_DROP_MASK(i)) { DPRINTF((" config ep - dropping ep %d", i)); pci_xhci_disable_ep(dev, i); } if (input_ctx->ctx_input.dwInCtx1 & XHCI_INCTX_1_ADD_MASK(i)) { iep_ctx = &input_ctx->ctx_ep[i]; DPRINTF((" enable ep[%d] %08x %08x %016lx %08x", i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); pci_xhci_init_ep(dev, i); /* ep state */ ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); } } /* slot state to configured */ 
dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); dev->dev_slotstate = XHCI_ST_CONFIGURED; DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " "[3]=0x%08x", slot, dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t type; epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); DPRINTF(("pci_xhci: reset ep %u: slot %u", epid, slot)); cmderr = XHCI_TRB_ERROR_SUCCESS; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if (type == XHCI_TRB_TYPE_STOP_EP && (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { /* XXX suspend endpoint for 10ms */ } if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: reset ep: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } devep = &dev->eps[epid]; if (devep->ep_xfer != NULL) USB_DATA_XFER_RESET(devep->ep_xfer); dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) == 0) ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (type == XHCI_TRB_TYPE_RESET_EP && (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } done: return (cmderr); } static uint32_t pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, uint32_t streamid, struct xhci_stream_ctx **osctx) { struct xhci_stream_ctx *sctx; uint32_t maxpstreams; maxpstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep->dwEpCtx0); if (maxpstreams == 0) return (XHCI_TRB_ERROR_TRB); if (maxpstreams > XHCI_STREAMS_MAX) return (XHCI_TRB_ERROR_INVALID_SID); if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { DPRINTF(("pci_xhci: find_stream; LSA bit not set")); return (XHCI_TRB_ERROR_INVALID_SID); } /* only support primary stream */ if (streamid > maxpstreams) return (XHCI_TRB_ERROR_STREAM_TYPE); sctx = XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) return (XHCI_TRB_ERROR_STREAM_TYPE); *osctx = sctx; return (XHCI_TRB_ERROR_SUCCESS); } static uint32_t pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t streamid; cmderr = XHCI_TRB_ERROR_SUCCESS; dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u", (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), (uint32_t)(trb->qwTrb0 & 0x1))); DPRINTF((" stream-id %u, slot %u, epid %u, C %u", (trb->dwTrb2 >> 16) & 0xFFFF, XHCI_TRB_3_SLOT_GET(trb->dwTrb3), XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } dev_ctx = 
dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { case XHCI_ST_EPCTX_STOPPED: case XHCI_ST_EPCTX_ERROR: break; default: DPRINTF(("pci_xhci cmd set_tr invalid state %x", XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; goto done; } streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0) { struct xhci_stream_ctx *sctx; sctx = NULL; cmderr = pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); if (sctx != NULL) { assert(devep->ep_sctx != NULL); devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; devep->ep_sctx_trbs[streamid].ringaddr = trb->qwTrb0 & ~0xF; devep->ep_sctx_trbs[streamid].ccs = XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); } } else { if (streamid != 0) { DPRINTF(("pci_xhci cmd set_tr streamid %x != 0", streamid)); } ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; devep->ep_ccs = trb->qwTrb0 & 0x1; devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); DPRINTF(("pci_xhci set_tr first TRB:")); pci_xhci_dump_trb(devep->ep_tr); } ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; done: return (cmderr); } static uint32_t pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { DPRINTF(("pci_xhci: eval ctx, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot; in this emulation, slot_id = address */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: eval ctx, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ /* set max exit latency */ dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, 0xFFFF, 0); /* set interrupter target */ dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, 0x3FF, 22); } if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ /* set max packet size */ dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, 0xFFFF, 16); ep0_ctx = &dev_ctx->ctx_ep[1]; } DPRINTF(("pci_xhci: eval ctx, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static int pci_xhci_complete_commands(struct pci_xhci_softc *sc) { struct xhci_trb evtrb; struct xhci_trb 
*trb; uint64_t crcr; uint32_t ccs; /* cycle state (XHCI 4.9.2) */ uint32_t type; uint32_t slot; uint32_t cmderr; int error; error = 0; sc->opregs.crcr |= XHCI_CRCR_LO_CRR; trb = sc->opregs.cr_p; ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS; crcr = sc->opregs.crcr & ~0xF; while (1) { sc->opregs.cr_p = trb; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) break; DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x" " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u", type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs)); cmderr = XHCI_TRB_ERROR_SUCCESS; evtrb.dwTrb2 = 0; evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) | XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE); slot = 0; switch (type) { case XHCI_TRB_TYPE_LINK: /* 0x06 */ if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= XHCI_CRCR_LO_RCS; break; case XHCI_TRB_TYPE_ENABLE_SLOT: /* 0x09 */ cmderr = pci_xhci_cmd_enable_slot(sc, &slot); break; case XHCI_TRB_TYPE_DISABLE_SLOT: /* 0x0A */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_disable_slot(sc, slot); break; case XHCI_TRB_TYPE_ADDRESS_DEVICE: /* 0x0B */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_address_device(sc, slot, trb); break; case XHCI_TRB_TYPE_CONFIGURE_EP: /* 0x0C */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_config_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_EVALUATE_CTX: /* 0x0D */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_EP: /* 0x0E */ DPRINTF(("Reset Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_STOP_EP: /* 0x0F */ DPRINTF(("Stop Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_SET_TR_DEQUEUE: /* 0x10 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_set_tr(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_DEVICE: /* 0x11 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_device(sc, slot); break; case XHCI_TRB_TYPE_FORCE_EVENT: /* 0x12 */ /* TODO: */ break; case XHCI_TRB_TYPE_NEGOTIATE_BW: /* 0x13 */ break; case XHCI_TRB_TYPE_SET_LATENCY_TOL: /* 0x14 */ break; case XHCI_TRB_TYPE_GET_PORT_BW: /* 0x15 */ break; case XHCI_TRB_TYPE_FORCE_HEADER: /* 0x16 */ break; case XHCI_TRB_TYPE_NOOP_CMD: /* 0x17 */ break; default: DPRINTF(("pci_xhci: unsupported cmd %x", type)); break; } if (type != XHCI_TRB_TYPE_LINK) { /* * insert command completion event and assert intr */ evtrb.qwTrb0 = crcr; evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr); evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot); DPRINTF(("pci_xhci: command 0x%x result: 0x%x", type, cmderr)); pci_xhci_insert_event(sc, &evtrb, 1); } trb = pci_xhci_trb_next(sc, trb, &crcr); } sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs; sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR; return (error); } static void pci_xhci_dump_trb(struct xhci_trb *trb) { static const char *trbtypes[] = { "RESERVED", "NORMAL", "SETUP_STAGE", "DATA_STAGE", "STATUS_STAGE", "ISOCH", "LINK", "EVENT_DATA", "NOOP", "ENABLE_SLOT", "DISABLE_SLOT", "ADDRESS_DEVICE", "CONFIGURE_EP", "EVALUATE_CTX", "RESET_EP", "STOP_EP", "SET_TR_DEQUEUE", "RESET_DEVICE", "FORCE_EVENT", "NEGOTIATE_BW", "SET_LATENCY_TOL", "GET_PORT_BW", "FORCE_HEADER", "NOOP_CMD" }; uint32_t type; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); DPRINTF(("pci_xhci: trb[@%p] 
type x%02x %s 0:x%016lx 2:x%08x 3:x%08x", trb, type, type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); } static int pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, uint32_t slot, uint32_t epid, int *do_intr) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct xhci_trb *trb; struct xhci_trb evtrb; uint32_t trbflags; uint32_t edtla; int i, err; dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; err = XHCI_TRB_ERROR_SUCCESS; *do_intr = 0; edtla = 0; /* go through list of TRBs and insert event(s) */ for (i = xfer->head; xfer->ndata > 0; ) { evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; trb = XHCI_GADDR(sc, evtrb.qwTrb0); trbflags = trb->dwTrb3; DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " "(err %d) IOC?%d", i, xfer->data[i].processed, xfer->data[i].blen, XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, trbflags, err, trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); if (!xfer->data[i].processed) { xfer->head = i; break; } xfer->ndata--; edtla += xfer->data[i].bdone; trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, xfer->data[i].streamid, xfer->data[i].trbnext, xfer->data[i].ccs); /* Only interrupt if IOC or short packet */ if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && !((err == XHCI_TRB_ERROR_SHORT_PKT) && (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { i = (i + 1) % USB_MAX_XFER_BLOCKS; continue; } evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | XHCI_TRB_2_REM_SET(xfer->data[i].blen); evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { DPRINTF(("pci_xhci EVENT_DATA edtla %u", edtla)); evtrb.qwTrb0 = trb->qwTrb0; evtrb.dwTrb2 = (edtla & 0xFFFFF) | XHCI_TRB_2_ERROR_SET(err); evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; edtla = 0; } *do_intr = 1; err = pci_xhci_insert_event(sc, &evtrb, 0); if (err != XHCI_TRB_ERROR_SUCCESS) { break; } i = (i + 1) % USB_MAX_XFER_BLOCKS; } return (err); } static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs) { if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | (ccs & 0x1); devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); DPRINTF(("xhci update ep-ring stream %d, addr %lx", streamid, devep->ep_sctx[streamid].qwSctx0)); } else { devep->ep_ringaddr = ringaddr & ~0xFUL; devep->ep_ccs = ccs & 0x1; devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); DPRINTF(("xhci update ep-ring, addr %lx", (devep->ep_ringaddr | devep->ep_ccs))); } } /* * Outstanding transfer still in progress (device NAK'd earlier) so retry * the transfer again to see if it succeeds. 
*/ static int pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) { struct usb_data_xfer *xfer; int err; int do_intr; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); err = 0; do_intr = 0; xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); /* outstanding requests queued up */ if (dev->dev_ue->ue_data != NULL) { err = dev->dev_ue->ue_data(dev->dev_sc, xfer, epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); if (err == USB_ERR_CANCELLED) { if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == USB_NAK) err = XHCI_TRB_ERROR_SUCCESS; } else { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { pci_xhci_assert_interrupt(sc); } /* XXX should not do it if error? */ USB_DATA_XFER_RESET(xfer); } } USB_DATA_XFER_UNLOCK(xfer); return (err); } static int pci_xhci_handle_transfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) { struct xhci_trb *setup_trb; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; uint64_t val; uint32_t trbflags; int do_intr, err; int do_retry; ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); DPRINTF(("pci_xhci handle_transfer slot %u", slot)); retry: err = 0; do_retry = 0; do_intr = 0; setup_trb = NULL; while (1) { pci_xhci_dump_trb(trb); trbflags = trb->dwTrb3; if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && (trbflags & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) { DPRINTF(("Cycle-bit changed trbflags %x, ccs %x", trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); break; } xfer_block = NULL; switch (XHCI_TRB_3_TYPE_GET(trbflags)) { case XHCI_TRB_TYPE_LINK: if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= 0x1; xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_SETUP_STAGE: if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { DPRINTF(("pci_xhci: invalid setup trb")); err = XHCI_TRB_ERROR_TRB; goto errout; } setup_trb = trb; val = trb->qwTrb0; if (!xfer->ureq) xfer->ureq = malloc( sizeof(struct usb_device_request)); memcpy(xfer->ureq, &val, sizeof(struct usb_device_request)); xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_NORMAL: case XHCI_TRB_TYPE_ISOCH: if (setup_trb != NULL) { DPRINTF(("pci_xhci: trb not supposed to be in " "ctl scope")); err = XHCI_TRB_ERROR_TRB; goto errout; } /* fall through */ case XHCI_TRB_TYPE_DATA_STAGE: xfer_block = usb_data_xfer_append(xfer, (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? 
&trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); break; case XHCI_TRB_TYPE_STATUS_STAGE: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); break; case XHCI_TRB_TYPE_NOOP: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_EVENT_DATA: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { xfer_block->processed = 1; } break; default: DPRINTF(("pci_xhci: handle xfer unexpected trb type " "0x%x", XHCI_TRB_3_TYPE_GET(trbflags))); err = XHCI_TRB_ERROR_TRB; goto errout; } trb = pci_xhci_trb_next(sc, trb, &addr); DPRINTF(("pci_xhci: next trb: 0x%lx", (uint64_t)trb)); if (xfer_block) { xfer_block->trbnext = addr; xfer_block->streamid = streamid; } if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { break; } /* handle current batch that requires interrupt on complete */ if (trbflags & XHCI_TRB_3_IOC_BIT) { DPRINTF(("pci_xhci: trb IOC bit set")); if (epid == 1) do_retry = 1; break; } } DPRINTF(("pci_xhci[%d]: xfer->ndata %u", __LINE__, xfer->ndata)); if (epid == 1) { err = USB_ERR_NOT_STARTED; if (dev->dev_ue->ue_request != NULL) err = dev->dev_ue->ue_request(dev->dev_sc, xfer); setup_trb = NULL; } else { /* handle data transfer */ pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); err = XHCI_TRB_ERROR_SUCCESS; goto errout; } err = USB_TO_XHCI_ERR(err); if ((err == XHCI_TRB_ERROR_SUCCESS) || (err == XHCI_TRB_ERROR_SHORT_PKT)) { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err != XHCI_TRB_ERROR_SUCCESS) do_retry = 0; } errout: if (err == XHCI_TRB_ERROR_EV_RING_FULL) DPRINTF(("pci_xhci[%d]: event ring full", __LINE__)); if (!do_retry) USB_DATA_XFER_UNLOCK(xfer); if (do_intr) pci_xhci_assert_interrupt(sc); if (do_retry) { USB_DATA_XFER_RESET(xfer); DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs", __LINE__)); goto retry; } if (epid == 1) USB_DATA_XFER_RESET(xfer); return (err); } static void pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, uint32_t epid, uint32_t streamid) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct pci_xhci_trb_ring *sctx_tr; struct xhci_trb *trb; uint64_t ringaddr; uint32_t ccs; DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u", slot, epid, streamid)); if (slot == 0 || slot > sc->ndevices) { DPRINTF(("pci_xhci: invalid doorbell slot %u", slot)); return; } if (epid == 0 || epid >= XHCI_MAX_ENDPOINTS) { DPRINTF(("pci_xhci: invalid endpoint %u", epid)); return; } dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); if (!dev_ctx) { return; } ep_ctx = &dev_ctx->ctx_ep[epid]; sctx_tr = NULL; DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (ep_ctx->qwEpCtx2 == 0) return; /* handle pending transfers */ if (devep->ep_xfer->ndata > 0) { pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); return; } /* get next trb work item */ if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { struct xhci_stream_ctx *sctx; /* * Stream IDs of 0, 65535 (any stream), and 65534 * (prime) are invalid. 
*/ if (streamid == 0 || streamid == 65534 || streamid == 65535) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } sctx = NULL; pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); if (sctx == NULL) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } sctx_tr = &devep->ep_sctx_trbs[streamid]; ringaddr = sctx_tr->ringaddr; ccs = sctx_tr->ccs; trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x", streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } else { if (streamid != 0) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } ringaddr = devep->ep_ringaddr; ccs = devep->ep_ccs; trb = devep->ep_tr; DPRINTF(("doorbell, ccs %lx, trb ccs %x", ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?", ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); return; } pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, ringaddr, ccs, streamid); } static void pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset = (offset - sc->dboff) / sizeof(uint32_t); DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx", offset, value)); if (XHCI_HALTED(sc)) { DPRINTF(("pci_xhci: controller halted")); return; } if (offset == 0) pci_xhci_complete_commands(sc); else if (sc->portregs != NULL) pci_xhci_device_doorbell(sc, offset, XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); } static void pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct pci_xhci_rtsregs *rts; offset -= sc->rtsoff; if (offset == 0) { DPRINTF(("pci_xhci attempted write to MFINDEX")); return; } DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx", offset, value)); offset -= 0x20; /* start of intrreg */ rts = &sc->rtsregs; switch (offset) { case 0x00: if (value & XHCI_IMAN_INTR_PEND) rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); if (!(value & XHCI_IMAN_INTR_ENA)) pci_xhci_deassert_interrupt(sc); break; case 0x04: rts->intrreg.imod = value; break; case 0x08: rts->intrreg.erstsz = value & 0xFFFF; break; case 0x10: /* ERSTBA low bits */ rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | (value & ~0x3F); break; case 0x14: /* ERSTBA high bits */ rts->intrreg.erstba = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erstba); rts->erstba_p = XHCI_GADDR(sc, sc->rtsregs.intrreg.erstba & ~0x3FUL); rts->erst_p = XHCI_GADDR(sc, sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); rts->er_enq_idx = 0; rts->er_events_cnt = 0; DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u", rts->erstba_p, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize)); break; case 0x18: /* ERDP low bits */ rts->intrreg.erdp = MASK_64_HI(sc->rtsregs.intrreg.erdp) | (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | (value & ~0xF); if (value & XHCI_ERDP_LO_BUSY) { rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; } rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); break; case 0x1C: /* ERDP high bits */ rts->intrreg.erdp = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erdp); if (rts->er_events_cnt > 0) { uint64_t erdp; uint32_t erdp_i; erdp = rts->intrreg.erdp & ~0xF; erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / sizeof(struct xhci_trb); if (erdp_i <= rts->er_enq_idx) rts->er_events_cnt = rts->er_enq_idx - erdp_i; else 
rts->er_events_cnt = rts->erstba_p->dwEvrsTableSize - (erdp_i - rts->er_enq_idx); DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u", erdp, rts->er_events_cnt)); } break; default: DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx", offset)); break; } } static uint64_t pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) { int port; uint32_t *p; if (sc->portregs == NULL) return (0); port = (offset - 0x3F0) / 0x10; if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS", port)); /* return default value for unused port */ return (XHCI_PS_SPEED_SET(3)); } offset = (offset - 0x3F0) % 0x10; p = &sc->portregs[port].portsc; p += offset / sizeof(uint32_t); DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x", offset, port, *p)); return (*p); } static void pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset -= XHCI_CAPLEN; if (offset < 0x400) DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx", offset, value)); switch (offset) { case XHCI_USBCMD: sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); break; case XHCI_USBSTS: /* clear bits on write */ sc->opregs.usbsts &= ~(value & (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); break; case XHCI_PAGESIZE: /* read only */ break; case XHCI_DNCTRL: sc->opregs.dnctrl = value & 0xFFFF; break; case XHCI_CRCR_LO: if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); sc->opregs.crcr |= value & (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); } else { sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); } break; case XHCI_CRCR_HI: if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | (value << 32); sc->opregs.cr_p = XHCI_GADDR(sc, sc->opregs.crcr & ~0xF); } if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { /* Stop operation of Command Ring */ } if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { /* Abort command */ } break; case XHCI_DCBAAP_LO: sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | (value & 0xFFFFFFC0); break; case XHCI_DCBAAP_HI: sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | (value << 32); sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)", sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); break; case XHCI_CONFIG: sc->opregs.config = value & 0x03FF; break; default: if (offset >= 0x400) pci_xhci_portregs_write(sc, offset, value); break; } } static void pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_xhci_softc *sc; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) /* read only registers */ WPRINTF(("pci_xhci: write RO-CAPs offset %ld", offset)); else if (offset < sc->dboff) pci_xhci_hostop_write(sc, offset, value); else if (offset < sc->rtsoff) pci_xhci_dbregs_write(sc, offset, value); else if (offset < sc->regsend) pci_xhci_rtsregs_write(sc, offset, value); else WPRINTF(("pci_xhci: write invalid offset %ld", offset)); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; switch (offset) { case XHCI_CAPLENGTH: /* 0x00 */ value = sc->caplength; break; case XHCI_HCSPARAMS1: /* 0x04 */ value = sc->hcsparams1; break; case XHCI_HCSPARAMS2: /* 0x08 */ value = sc->hcsparams2; break; case XHCI_HCSPARAMS3: /* 0x0C */ value = 
sc->hcsparams3; break; case XHCI_HCSPARAMS0: /* 0x10 */ value = sc->hccparams1; break; case XHCI_DBOFF: /* 0x14 */ value = sc->dboff; break; case XHCI_RTSOFF: /* 0x18 */ value = sc->rtsoff; break; case XHCI_HCCPRAMS2: /* 0x1C */ value = sc->hccparams2; break; default: value = 0; break; } DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; offset = (offset - XHCI_CAPLEN); switch (offset) { case XHCI_USBCMD: /* 0x00 */ value = sc->opregs.usbcmd; break; case XHCI_USBSTS: /* 0x04 */ value = sc->opregs.usbsts; break; case XHCI_PAGESIZE: /* 0x08 */ value = sc->opregs.pgsz; break; case XHCI_DNCTRL: /* 0x14 */ value = sc->opregs.dnctrl; break; case XHCI_CRCR_LO: /* 0x18 */ value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; break; case XHCI_CRCR_HI: /* 0x1C */ value = 0; break; case XHCI_DCBAAP_LO: /* 0x30 */ value = sc->opregs.dcbaap & 0xFFFFFFFF; break; case XHCI_DCBAAP_HI: /* 0x34 */ value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; break; case XHCI_CONFIG: /* 0x38 */ value = sc->opregs.config; break; default: if (offset >= 0x400) value = pci_xhci_portregs_read(sc, offset); else value = 0; break; } if (offset < 0x400) DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_dbregs_read(struct pci_xhci_softc *sc, uint64_t offset) { /* read doorbell always returns 0 */ return (0); } static uint64_t pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->rtsoff; value = 0; if (offset == XHCI_MFINDEX) { value = sc->rtsregs.mfindex; } else if (offset >= 0x20) { int item; uint32_t *p; offset -= 0x20; item = offset % 32; assert(offset < sizeof(sc->rtsregs.intrreg)); p = &sc->rtsregs.intrreg.iman; p += item / sizeof(uint32_t); value = *p; } DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->regsend; value = 0; switch (offset) { case 0: /* rev major | rev minor | next-cap | cap-id */ value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; break; case 4: /* name string = "USB" */ value = 0x20425355; break; case 8: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; break; case 12: break; case 16: /* rev major | rev minor | next-cap | cap-id */ value = (0x03 << 24) | XHCI_ID_PROTOCOLS; break; case 20: /* name string = "USB" */ value = 0x20425355; break; case 24: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; break; case 28: break; default: DPRINTF(("pci_xhci: xecp invalid offset 0x%lx", offset)); break; } DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_xhci_softc *sc; uint32_t value; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) value = pci_xhci_hostcap_read(sc, offset); else if (offset < sc->dboff) value = pci_xhci_hostop_read(sc, offset); else if (offset < sc->rtsoff) value = pci_xhci_dbregs_read(sc, offset); else if (offset < sc->regsend) value = pci_xhci_rtsregs_read(sc, offset); else if (offset < (sc->regsend + 4*32)) value = pci_xhci_xecp_read(sc, offset); else { value = 0; 
WPRINTF(("pci_xhci: read invalid offset %ld", offset)); } pthread_mutex_unlock(&sc->mtx); switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } return (value); } static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; struct xhci_trb evtrb; int error; assert(portn <= XHCI_MAX_DEVS); DPRINTF(("xhci reset port %d", portn)); port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); port->portsc |= XHCI_PS_PED | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); if (warm && dev->dev_ue->ue_usbver == 3) { port->portsc |= XHCI_PS_WRC; } if ((port->portsc & XHCI_PS_PRC) == 0) { port->portsc |= XHCI_PS_PRC; pci_xhci_set_evtrb(&evtrb, portn, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 1); if (error != XHCI_TRB_ERROR_SUCCESS) DPRINTF(("xhci reset port insert event " "failed")); } } } static void pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc = XHCI_PS_CCS | /* connected */ XHCI_PS_PP; /* port power */ if (dev->dev_ue->ue_usbver == 2) { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } else { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | XHCI_PS_PED | /* enabled */ XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } DPRINTF(("Init port %d 0x%x", portn, port->portsc)); } else { port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; DPRINTF(("Init empty port %d 0x%x", portn, port->portsc)); } } static int pci_xhci_dev_intr(struct usb_hci *hci, int epctx) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_trb evtrb; struct pci_xhci_softc *sc; struct pci_xhci_portregs *p; struct xhci_endp_ctx *ep_ctx; int error = 0; int dir_in; int epid; dir_in = epctx & 0x80; epid = epctx & ~0x80; /* HW endpoint contexts are 0-15; convert to epid based on dir */ epid = (epid * 2) + (dir_in ? 
1 : 0); assert(epid >= 1 && epid <= 31); dev = hci->hci_sc; sc = dev->xsc; /* check if device is ready; OS has to initialise it */ if (sc->rtsregs.erstba_p == NULL || (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || dev->dev_ctx == NULL) return (0); p = XHCI_PORTREG_PTR(sc, hci->hci_port); /* raise event if link U3 (suspended) state */ if (XHCI_PS_PLS_GET(p->portsc) == 3) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); if ((p->portsc & XHCI_PS_PLC) != 0) return (0); p->portsc |= XHCI_PS_PLC; pci_xhci_set_evtrb(&evtrb, hci->hci_port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 0); if (error != XHCI_TRB_ERROR_SUCCESS) goto done; } dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { DPRINTF(("xhci device interrupt on disabled endpoint %d", epid)); return (0); } DPRINTF(("xhci device interrupt on endpoint %d", epid)); pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); done: return (error); } static int pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid, void *param) { DPRINTF(("xhci device event port %d", hci->hci_port)); return (0); } static void pci_xhci_device_usage(char *opt) { EPRINTLN("Invalid USB emulation \"%s\"", opt); } static int pci_xhci_parse_opts(struct pci_xhci_softc *sc, char *opts) { struct pci_xhci_dev_emu **devices; struct pci_xhci_dev_emu *dev; struct usb_devemu *ue; void *devsc; char *uopt, *xopts, *config; int usb3_port, usb2_port, i; uopt = NULL; usb3_port = sc->usb3_port_start - 1; usb2_port = sc->usb2_port_start - 1; devices = NULL; if (opts == NULL) goto portsfinal; devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); sc->devices = devices; sc->ndevices = 0; uopt = strdup(opts); for (xopts = strtok(uopt, ","); xopts != NULL; xopts = strtok(NULL, ",")) { if (usb2_port == ((sc->usb2_port_start-1) + XHCI_MAX_DEVS/2) || usb3_port == ((sc->usb3_port_start-1) + XHCI_MAX_DEVS/2)) { WPRINTF(("pci_xhci max number of USB 2 or 3 " "devices reached, max %d", XHCI_MAX_DEVS/2)); usb2_port = usb3_port = -1; goto done; } /* device[=] */ if ((config = strchr(xopts, '=')) == NULL) config = ""; /* no config */ else *config++ = '\0'; ue = usb_emu_finddev(xopts); if (ue == NULL) { pci_xhci_device_usage(xopts); DPRINTF(("pci_xhci device not found %s", xopts)); usb2_port = usb3_port = -1; goto done; } DPRINTF(("pci_xhci adding device %s, opts \"%s\"", xopts, config)); dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); dev->xsc = sc; dev->hci.hci_sc = dev; dev->hci.hci_intr = pci_xhci_dev_intr; dev->hci.hci_event = pci_xhci_dev_event; if (ue->ue_usbver == 2) { dev->hci.hci_port = usb2_port + 1; devices[usb2_port] = dev; usb2_port++; } else { dev->hci.hci_port = usb3_port + 1; devices[usb3_port] = dev; usb3_port++; } dev->hci.hci_address = 0; devsc = ue->ue_init(&dev->hci, config); if (devsc == NULL) { pci_xhci_device_usage(xopts); usb2_port = usb3_port = -1; goto done; } dev->dev_ue = ue; dev->dev_sc = devsc; /* assign slot number to device */ sc->slots[sc->ndevices] = dev; sc->ndevices++; } portsfinal: sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); if (sc->ndevices > 0) { /* port and slot numbering start from 1 */ sc->devices--; sc->portregs--; sc->slots--; for (i = 1; i <= XHCI_MAX_DEVS; i++) { pci_xhci_init_port(sc, i); } } else { WPRINTF(("pci_xhci no USB devices configured")); sc->ndevices = 1; } done: if (devices != NULL) 
{ if (usb2_port <= 0 && usb3_port <= 0) { sc->devices = NULL; for (i = 0; devices[i] != NULL; i++) free(devices[i]); sc->ndevices = -1; free(devices); } } free(uopt); return (sc->ndevices); } static int pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { struct pci_xhci_softc *sc; int error; if (xhci_in_use) { WPRINTF(("pci_xhci controller already defined")); return (-1); } xhci_in_use = 1; sc = calloc(1, sizeof(struct pci_xhci_softc)); pi->pi_arg = sc; sc->xsc_pi = pi; sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; sc->usb3_port_start = 1; /* discover devices */ error = pci_xhci_parse_opts(sc, opts); if (error < 0) goto done; else error = 0; sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | XHCI_SET_HCIVERSION(0x0100); sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | XHCI_SET_HCSP2_IST(0x04); sc->hcsparams3 = 0; /* no latency */ sc->hccparams1 = XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ XHCI_SET_HCCP1_SPC(1) | /* short packet */ XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | XHCI_SET_HCCP2_U3C(1); sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); /* dboff must be 32-bit aligned */ if (sc->dboff & 0x3) sc->dboff = (sc->dboff + 0x3) & ~0x3; /* rtsoff must be 32-bytes aligned */ sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); if (sc->rtsoff & 0x1F) sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x", sc->dboff, sc->rtsoff)); sc->opregs.usbsts = XHCI_STS_HCH; sc->opregs.pgsz = XHCI_PAGESIZE_4K; pci_xhci_reset(sc); sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 intrpter */ /* * Set extended capabilities pointer to be after regsend; * value of xecp field is 32-bit offset. 
*/ sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); pci_emul_add_msicap(pi, 1); /* regsend + xecp registers */ pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); DPRINTF(("pci_xhci pci_emu_alloc: %d", sc->regsend + 4*32)); pci_lintr_request(pi); pthread_mutex_init(&sc->mtx, NULL); done: if (error) { free(sc); } return (error); } +#ifdef BHYVE_SNAPSHOT +static void +pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[]) +{ + int i, j; + struct pci_xhci_dev_emu *dev, *slot; + + memset(maps, 0, sizeof(maps[0]) * XHCI_MAX_SLOTS); + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + for (j = 1; j <= XHCI_MAX_DEVS; j++) { + slot = XHCI_SLOTDEV_PTR(sc, i); + dev = XHCI_DEVINST_PTR(sc, j); + + if (slot == dev) + maps[i] = j; + } + } +} +static int +pci_xhci_snapshot_ep(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, + int idx, struct vm_snapshot_meta *meta) +{ + int k; + int ret; + struct usb_data_xfer *xfer; + struct usb_data_xfer_block *xfer_block; + + /* some sanity checks */ + if (meta->op == VM_SNAPSHOT_SAVE) + xfer = dev->eps[idx].ep_xfer; + + SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done); + if (xfer == NULL) { + ret = 0; + goto done; + } + + if (meta->op == VM_SNAPSHOT_RESTORE) { + pci_xhci_init_ep(dev, idx); + xfer = dev->eps[idx].ep_xfer; + } + + /* save / restore proper */ + for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) { + xfer_block = &xfer->data[k]; + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(xfer_block->buf, + XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret, + done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done); + if (xfer->ureq) { + /* xfer->ureq is not allocated at restore time */ + if (meta->op == VM_SNAPSHOT_RESTORE) + xfer->ureq = malloc(sizeof(struct usb_device_request)); + + SNAPSHOT_BUF_OR_LEAVE(xfer->ureq, + sizeof(struct usb_device_request), + meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, ret, done); + +done: + return (ret); +} + +static int +pci_xhci_snapshot(struct vm_snapshot_meta *meta) +{ + int i, j; + int ret; + int restore_idx; + struct pci_devinst *pi; + struct pci_xhci_softc *sc; + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + char dname[SNAP_DEV_NAME_LEN]; + int maps[XHCI_MAX_SLOTS + 1]; + + pi = meta->dev_data; + sc = pi->pi_arg; + + SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, 
meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done); + + /* opregs */ + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done); + + /* opregs.cr_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.cr_p, + XHCI_GADDR_SIZE(sc->opregs.cr_p), false, meta, ret, done); + + /* opregs.dcbaa_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.dcbaa_p, + XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), false, meta, ret, done); + + /* rtsregs */ + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done); + + /* rtsregs.intrreg */ + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done); + + /* rtsregs.erstba_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erstba_p, + XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), false, meta, ret, done); + + /* rtsregs.erst_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erst_p, + XHCI_GADDR_SIZE(sc->rtsregs.erst_p), false, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done); + + /* sanity checking */ + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + dev = XHCI_DEVINST_PTR(sc, i); + if (dev == NULL) + continue; + + if (meta->op == VM_SNAPSHOT_SAVE) + restore_idx = i; + SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done); + + /* check if the restored device (when restoring) is sane */ + if (restore_idx != i) { + fprintf(stderr, "%s: idx not matching: actual: %d, " + "expected: %d\r\n", __func__, restore_idx, i); + ret = EINVAL; + goto done; + } + + if (meta->op == VM_SNAPSHOT_SAVE) { + memset(dname, 0, sizeof(dname)); + strncpy(dname, dev->dev_ue->ue_emu, sizeof(dname) - 1); + } + + SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done); + + if (meta->op == VM_SNAPSHOT_RESTORE) { + dname[sizeof(dname) - 1] = '\0'; + if (strcmp(dev->dev_ue->ue_emu, dname)) { + fprintf(stderr, "%s: device names mismatch: " + "actual: %s, expected: %s\r\n", + __func__, dname, dev->dev_ue->ue_emu); + + ret = EINVAL; + goto done; + } + } + } + + /* portregs */ + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + port = XHCI_PORTREG_PTR(sc, i); + dev = XHCI_DEVINST_PTR(sc, i); + + if (dev == NULL) + continue; + + SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done); + } + + /* slots */ + if (meta->op == VM_SNAPSHOT_SAVE) + pci_xhci_map_devs_slots(sc, maps); + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) { + dev = XHCI_SLOTDEV_PTR(sc, 
i); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + if (maps[i] != 0) + dev = XHCI_DEVINST_PTR(sc, maps[i]); + else + dev = NULL; + + XHCI_SLOTDEV_PTR(sc, i) = dev; + } else { + /* error */ + ret = EINVAL; + goto done; + } + + if (dev == NULL) + continue; + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(dev->dev_ctx, + XHCI_GADDR_SIZE(dev->dev_ctx), false, meta, ret, done); + + for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) { + ret = pci_xhci_snapshot_ep(sc, dev, j, meta); + if (ret != 0) + goto done; + } + + SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done); + + /* devices[i]->dev_sc */ + dev->dev_ue->ue_snapshot(dev->dev_sc, meta); + + /* devices[i]->hci */ + SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(sc->ndevices, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, done); + +done: + return (ret); +} +#endif struct pci_devemu pci_de_xhci = { .pe_emu = "xhci", .pe_init = pci_xhci_init, .pe_barwrite = pci_xhci_write, - .pe_barread = pci_xhci_read + .pe_barread = pci_xhci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_xhci_snapshot, +#endif }; PCI_EMUL_SET(pci_de_xhci); diff --git a/usr.sbin/bhyve/ps2kbd.c b/usr.sbin/bhyve/ps2kbd.c index 3e6a1b67ca38..ef20fa47e0a9 100644 --- a/usr.sbin/bhyve/ps2kbd.c +++ b/usr.sbin/bhyve/ps2kbd.c @@ -1,384 +1,401 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include +#include + #include #include #include #include +#include #include #include #include #include "atkbdc.h" #include "debug.h" #include "console.h" /* keyboard device commands */ #define PS2KC_RESET_DEV 0xff #define PS2KC_DISABLE 0xf5 #define PS2KC_ENABLE 0xf4 #define PS2KC_SET_TYPEMATIC 0xf3 #define PS2KC_SEND_DEV_ID 0xf2 #define PS2KC_SET_SCANCODE_SET 0xf0 #define PS2KC_ECHO 0xee #define PS2KC_SET_LEDS 0xed #define PS2KC_BAT_SUCCESS 0xaa #define PS2KC_ACK 0xfa #define PS2KBD_FIFOSZ 16 struct fifo { uint8_t buf[PS2KBD_FIFOSZ]; int rindex; /* index to read from */ int windex; /* index to write to */ int num; /* number of bytes in the fifo */ int size; /* size of the fifo */ }; struct ps2kbd_softc { struct atkbdc_softc *atkbdc_sc; pthread_mutex_t mtx; bool enabled; struct fifo fifo; uint8_t curcmd; /* current command for next byte */ }; #define SCANCODE_E0_PREFIX 1 struct extended_translation { uint32_t keysym; uint8_t scancode; int flags; }; /* * FIXME: Pause/break and Print Screen/SysRq require special handling. */ static const struct extended_translation extended_translations[] = { {0xff08, 0x66}, /* Back space */ {0xff09, 0x0d}, /* Tab */ {0xff0d, 0x5a}, /* Return */ {0xff1b, 0x76}, /* Escape */ {0xff50, 0x6c, SCANCODE_E0_PREFIX}, /* Home */ {0xff51, 0x6b, SCANCODE_E0_PREFIX}, /* Left arrow */ {0xff52, 0x75, SCANCODE_E0_PREFIX}, /* Up arrow */ {0xff53, 0x74, SCANCODE_E0_PREFIX}, /* Right arrow */ {0xff54, 0x72, SCANCODE_E0_PREFIX}, /* Down arrow */ {0xff55, 0x7d, SCANCODE_E0_PREFIX}, /* PgUp */ {0xff56, 0x7a, SCANCODE_E0_PREFIX}, /* PgDown */ {0xff57, 0x69, SCANCODE_E0_PREFIX}, /* End */ {0xff63, 0x70, SCANCODE_E0_PREFIX}, /* Ins */ {0xff8d, 0x5a, SCANCODE_E0_PREFIX}, /* Keypad Enter */ {0xffe1, 0x12}, /* Left shift */ {0xffe2, 0x59}, /* Right shift */ {0xffe3, 0x14}, /* Left control */ {0xffe4, 0x14, SCANCODE_E0_PREFIX}, /* Right control */ /* {0xffe7, XXX}, Left meta */ /* {0xffe8, XXX}, Right meta */ {0xffe9, 0x11}, /* Left alt */ {0xfe03, 0x11, SCANCODE_E0_PREFIX}, /* AltGr */ {0xffea, 0x11, SCANCODE_E0_PREFIX}, /* Right alt */ {0xffeb, 0x1f, SCANCODE_E0_PREFIX}, /* Left Windows */ {0xffec, 0x27, SCANCODE_E0_PREFIX}, /* Right Windows */ {0xffbe, 0x05}, /* F1 */ {0xffbf, 0x06}, /* F2 */ {0xffc0, 0x04}, /* F3 */ {0xffc1, 0x0c}, /* F4 */ {0xffc2, 0x03}, /* F5 */ {0xffc3, 0x0b}, /* F6 */ {0xffc4, 0x83}, /* F7 */ {0xffc5, 0x0a}, /* F8 */ {0xffc6, 0x01}, /* F9 */ {0xffc7, 0x09}, /* F10 */ {0xffc8, 0x78}, /* F11 */ {0xffc9, 0x07}, /* F12 */ {0xffff, 0x71, SCANCODE_E0_PREFIX}, /* Del */ {0xff14, 0x7e}, /* ScrollLock */ /* NumLock and Keypads*/ {0xff7f, 0x77}, /* NumLock */ {0xffaf, 0x4a, SCANCODE_E0_PREFIX}, /* Keypad slash */ {0xffaa, 0x7c}, /* Keypad asterisk */ {0xffad, 0x7b}, /* Keypad minus */ {0xffab, 0x79}, /* Keypad plus */ {0xffb7, 0x6c}, /* Keypad 7 */ {0xff95, 0x6c}, /* Keypad home */ {0xffb8, 0x75}, /* Keypad 8 */ {0xff97, 0x75}, /* Keypad up arrow */ {0xffb9, 0x7d}, /* Keypad 9 */ {0xff9a, 0x7d}, /* Keypad PgUp */ {0xffb4, 0x6b}, /* Keypad 4 */ {0xff96, 0x6b}, /* Keypad left arrow */ {0xffb5, 0x73}, /* Keypad 5 */ {0xff9d, 0x73}, /* Keypad empty */ {0xffb6, 0x74}, /* Keypad 6 */ {0xff98, 0x74}, /* Keypad right arrow */ {0xffb1, 0x69}, /* Keypad 1 */ {0xff9c, 0x69}, /* Keypad end */ {0xffb2, 0x72}, /* Keypad 2 */ {0xff99, 0x72}, /* Keypad down arrow */ {0xffb3, 0x7a}, /* Keypad 3 */ {0xff9b, 0x7a}, /* Keypad PgDown */ {0xffb0, 0x70}, /* Keypad 0 */ {0xff9e, 0x70}, /* Keypad ins */ {0xffae, 0x71}, /* Keypad . 
*/ {0xff9f, 0x71}, /* Keypad del */ {0, 0, 0} /* Terminator */ }; /* ASCII to type 2 scancode lookup table */ static const uint8_t ascii_translations[128] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, }; static void fifo_init(struct ps2kbd_softc *sc) { struct fifo *fifo; fifo = &sc->fifo; fifo->size = sizeof(((struct fifo *)0)->buf); } static void fifo_reset(struct ps2kbd_softc *sc) { struct fifo *fifo; fifo = &sc->fifo; bzero(fifo, sizeof(struct fifo)); fifo->size = sizeof(((struct fifo *)0)->buf); } static void fifo_put(struct ps2kbd_softc *sc, uint8_t val) { struct fifo *fifo; fifo = &sc->fifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = val; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; } } static int fifo_get(struct ps2kbd_softc *sc, uint8_t *val) { struct fifo *fifo; fifo = &sc->fifo; if (fifo->num > 0) { *val = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; return (0); } return (-1); } int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val) { int retval; pthread_mutex_lock(&sc->mtx); retval = fifo_get(sc, val); pthread_mutex_unlock(&sc->mtx); return (retval); } void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val) { pthread_mutex_lock(&sc->mtx); if (sc->curcmd) { switch (sc->curcmd) { case PS2KC_SET_TYPEMATIC: fifo_put(sc, PS2KC_ACK); break; case PS2KC_SET_SCANCODE_SET: fifo_put(sc, PS2KC_ACK); break; case PS2KC_SET_LEDS: fifo_put(sc, PS2KC_ACK); break; default: EPRINTLN("Unhandled ps2 keyboard current " "command byte 0x%02x", val); break; } sc->curcmd = 0; } else { switch (val) { case 0x00: fifo_put(sc, PS2KC_ACK); break; case PS2KC_RESET_DEV: fifo_reset(sc); fifo_put(sc, PS2KC_ACK); fifo_put(sc, PS2KC_BAT_SUCCESS); break; case PS2KC_DISABLE: sc->enabled = false; fifo_put(sc, PS2KC_ACK); break; case PS2KC_ENABLE: sc->enabled = true; fifo_reset(sc); fifo_put(sc, PS2KC_ACK); break; case PS2KC_SET_TYPEMATIC: sc->curcmd = val; fifo_put(sc, PS2KC_ACK); break; case PS2KC_SEND_DEV_ID: fifo_put(sc, PS2KC_ACK); fifo_put(sc, 0xab); fifo_put(sc, 0x83); break; case PS2KC_SET_SCANCODE_SET: sc->curcmd = val; fifo_put(sc, PS2KC_ACK); break; case PS2KC_ECHO: fifo_put(sc, PS2KC_ECHO); break; case PS2KC_SET_LEDS: sc->curcmd = val; fifo_put(sc, PS2KC_ACK); break; default: EPRINTLN("Unhandled ps2 keyboard command " "0x%02x", val); break; } } pthread_mutex_unlock(&sc->mtx); } /* * Translate keysym to type 2 scancode and insert into keyboard buffer. 
*/ static void ps2kbd_keysym_queue(struct ps2kbd_softc *sc, int down, uint32_t keysym) { assert(pthread_mutex_isowned_np(&sc->mtx)); int e0_prefix, found; uint8_t code; const struct extended_translation *trans; found = 0; if (keysym < 0x80) { code = ascii_translations[keysym]; e0_prefix = 0; found = 1; } else { for (trans = &(extended_translations[0]); trans->keysym != 0; trans++) { if (keysym == trans->keysym) { code = trans->scancode; e0_prefix = trans->flags & SCANCODE_E0_PREFIX; found = 1; break; } } } if (!found) { EPRINTLN("Unhandled ps2 keyboard keysym 0x%x", keysym); return; } if (e0_prefix) fifo_put(sc, 0xe0); if (!down) fifo_put(sc, 0xf0); fifo_put(sc, code); } static void ps2kbd_event(int down, uint32_t keysym, void *arg) { struct ps2kbd_softc *sc = arg; int fifo_full; pthread_mutex_lock(&sc->mtx); if (!sc->enabled) { pthread_mutex_unlock(&sc->mtx); return; } fifo_full = sc->fifo.num == PS2KBD_FIFOSZ; ps2kbd_keysym_queue(sc, down, keysym); pthread_mutex_unlock(&sc->mtx); if (!fifo_full) atkbdc_event(sc->atkbdc_sc, 1); } struct ps2kbd_softc * ps2kbd_init(struct atkbdc_softc *atkbdc_sc) { struct ps2kbd_softc *sc; sc = calloc(1, sizeof (struct ps2kbd_softc)); pthread_mutex_init(&sc->mtx, NULL); fifo_init(sc); sc->atkbdc_sc = atkbdc_sc; console_kbd_register(ps2kbd_event, sc, 1); return (sc); } +#ifdef BHYVE_SNAPSHOT +int +ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done); + +done: + return (ret); +} +#endif + diff --git a/usr.sbin/bhyve/ps2kbd.h b/usr.sbin/bhyve/ps2kbd.h index 17be6d046673..3cf87be1b7f3 100644 --- a/usr.sbin/bhyve/ps2kbd.h +++ b/usr.sbin/bhyve/ps2kbd.h @@ -1,41 +1,46 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _PS2KBD_H_ #define _PS2KBD_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc); int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val); void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val); +#ifdef BHYVE_SNAPSHOT +int ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta); +#endif + #endif /* _PS2KBD_H_ */ diff --git a/usr.sbin/bhyve/ps2mouse.c b/usr.sbin/bhyve/ps2mouse.c index f42d2e726023..afe817710f30 100644 --- a/usr.sbin/bhyve/ps2mouse.c +++ b/usr.sbin/bhyve/ps2mouse.c @@ -1,419 +1,441 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include +#include + #include #include #include #include +#include #include #include #include #include "atkbdc.h" #include "debug.h" #include "console.h" /* mouse device commands */ #define PS2MC_RESET_DEV 0xff #define PS2MC_SET_DEFAULTS 0xf6 #define PS2MC_DISABLE 0xf5 #define PS2MC_ENABLE 0xf4 #define PS2MC_SET_SAMPLING_RATE 0xf3 #define PS2MC_SEND_DEV_ID 0xf2 #define PS2MC_SET_REMOTE_MODE 0xf0 #define PS2MC_SEND_DEV_DATA 0xeb #define PS2MC_SET_STREAM_MODE 0xea #define PS2MC_SEND_DEV_STATUS 0xe9 #define PS2MC_SET_RESOLUTION 0xe8 #define PS2MC_SET_SCALING1 0xe7 #define PS2MC_SET_SCALING2 0xe6 #define PS2MC_BAT_SUCCESS 0xaa #define PS2MC_ACK 0xfa /* mouse device id */ #define PS2MOUSE_DEV_ID 0x0 /* mouse data bits */ #define PS2M_DATA_Y_OFLOW 0x80 #define PS2M_DATA_X_OFLOW 0x40 #define PS2M_DATA_Y_SIGN 0x20 #define PS2M_DATA_X_SIGN 0x10 #define PS2M_DATA_AONE 0x08 #define PS2M_DATA_MID_BUTTON 0x04 #define PS2M_DATA_RIGHT_BUTTON 0x02 #define PS2M_DATA_LEFT_BUTTON 0x01 /* mouse status bits */ #define PS2M_STS_REMOTE_MODE 0x40 #define PS2M_STS_ENABLE_DEV 0x20 #define PS2M_STS_SCALING_21 0x10 #define PS2M_STS_MID_BUTTON 0x04 #define PS2M_STS_RIGHT_BUTTON 0x02 #define PS2M_STS_LEFT_BUTTON 0x01 #define PS2MOUSE_FIFOSZ 16 struct fifo { uint8_t buf[PS2MOUSE_FIFOSZ]; int rindex; /* index to read from */ int windex; /* index to write to */ int num; /* number of bytes in the fifo */ int size; /* size of the fifo */ }; struct ps2mouse_softc { struct atkbdc_softc *atkbdc_sc; pthread_mutex_t mtx; uint8_t status; uint8_t resolution; uint8_t sampling_rate; int ctrlenable; struct fifo fifo; uint8_t curcmd; /* current command for next byte */ int cur_x, cur_y; int delta_x, delta_y; }; static void fifo_init(struct ps2mouse_softc *sc) { struct fifo *fifo; fifo = &sc->fifo; fifo->size = sizeof(((struct fifo *)0)->buf); } static void fifo_reset(struct ps2mouse_softc *sc) { struct fifo *fifo; fifo = &sc->fifo; bzero(fifo, sizeof(struct fifo)); fifo->size = sizeof(((struct fifo *)0)->buf); } static void fifo_put(struct ps2mouse_softc *sc, uint8_t val) { struct fifo *fifo; fifo = &sc->fifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = val; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; } } static int fifo_get(struct ps2mouse_softc *sc, uint8_t *val) { struct fifo *fifo; fifo = &sc->fifo; if (fifo->num > 0) { *val = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; return (0); } return (-1); } static void movement_reset(struct ps2mouse_softc *sc) { assert(pthread_mutex_isowned_np(&sc->mtx)); sc->delta_x = 0; sc->delta_y = 0; } static void movement_update(struct ps2mouse_softc *sc, int x, int y) { sc->delta_x += x - sc->cur_x; sc->delta_y += sc->cur_y - y; sc->cur_x = x; sc->cur_y = y; } static void movement_get(struct ps2mouse_softc *sc) { uint8_t val0, val1, val2; assert(pthread_mutex_isowned_np(&sc->mtx)); val0 = PS2M_DATA_AONE; val0 |= sc->status & (PS2M_DATA_LEFT_BUTTON | PS2M_DATA_RIGHT_BUTTON | PS2M_DATA_MID_BUTTON); if (sc->delta_x >= 0) { if (sc->delta_x > 255) { val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; } else { val0 |= PS2M_DATA_X_SIGN; if (sc->delta_x < -255) { val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; } sc->delta_x = 0; if (sc->delta_y >= 0) { if (sc->delta_y > 255) { val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } else { val0 |= PS2M_DATA_Y_SIGN; if (sc->delta_y < -255) { val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } 
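/*
 * Illustrative note: the three bytes queued just below follow the standard
 * PS/2 stream-mode movement packet layout,
 *
 *	val0: Y-ovfl X-ovfl Y-sign X-sign 1 Middle Right Left
 *	val1: X movement, low 8 bits (sign/overflow carried in val0)
 *	val2: Y movement, low 8 bits (sign/overflow carried in val0)
 *
 * which is why the deltas computed above are clamped to the [-255, 255] range.
 */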
sc->delta_y = 0; if (sc->fifo.num < (sc->fifo.size - 3)) { fifo_put(sc, val0); fifo_put(sc, val1); fifo_put(sc, val2); } } static void ps2mouse_reset(struct ps2mouse_softc *sc) { assert(pthread_mutex_isowned_np(&sc->mtx)); fifo_reset(sc); movement_reset(sc); sc->status = PS2M_STS_ENABLE_DEV; sc->resolution = 4; sc->sampling_rate = 100; sc->cur_x = 0; sc->cur_y = 0; sc->delta_x = 0; sc->delta_y = 0; } int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val) { int retval; pthread_mutex_lock(&sc->mtx); retval = fifo_get(sc, val); pthread_mutex_unlock(&sc->mtx); return (retval); } int ps2mouse_fifocnt(struct ps2mouse_softc *sc) { return (sc->fifo.num); } void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable) { pthread_mutex_lock(&sc->mtx); if (enable) sc->ctrlenable = 1; else { sc->ctrlenable = 0; sc->fifo.rindex = 0; sc->fifo.windex = 0; sc->fifo.num = 0; } pthread_mutex_unlock(&sc->mtx); } void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert) { pthread_mutex_lock(&sc->mtx); fifo_reset(sc); if (sc->curcmd) { switch (sc->curcmd) { case PS2MC_SET_SAMPLING_RATE: sc->sampling_rate = val; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SET_RESOLUTION: sc->resolution = val; fifo_put(sc, PS2MC_ACK); break; default: EPRINTLN("Unhandled ps2 mouse current " "command byte 0x%02x", val); break; } sc->curcmd = 0; } else if (insert) { fifo_put(sc, val); } else { switch (val) { case 0x00: fifo_put(sc, PS2MC_ACK); break; case PS2MC_RESET_DEV: ps2mouse_reset(sc); fifo_put(sc, PS2MC_ACK); fifo_put(sc, PS2MC_BAT_SUCCESS); fifo_put(sc, PS2MOUSE_DEV_ID); break; case PS2MC_SET_DEFAULTS: ps2mouse_reset(sc); fifo_put(sc, PS2MC_ACK); break; case PS2MC_DISABLE: fifo_reset(sc); sc->status &= ~PS2M_STS_ENABLE_DEV; fifo_put(sc, PS2MC_ACK); break; case PS2MC_ENABLE: fifo_reset(sc); sc->status |= PS2M_STS_ENABLE_DEV; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SET_SAMPLING_RATE: sc->curcmd = val; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SEND_DEV_ID: fifo_put(sc, PS2MC_ACK); fifo_put(sc, PS2MOUSE_DEV_ID); break; case PS2MC_SET_REMOTE_MODE: sc->status |= PS2M_STS_REMOTE_MODE; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SEND_DEV_DATA: fifo_put(sc, PS2MC_ACK); movement_get(sc); break; case PS2MC_SET_STREAM_MODE: sc->status &= ~PS2M_STS_REMOTE_MODE; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SEND_DEV_STATUS: fifo_put(sc, PS2MC_ACK); fifo_put(sc, sc->status); fifo_put(sc, sc->resolution); fifo_put(sc, sc->sampling_rate); break; case PS2MC_SET_RESOLUTION: sc->curcmd = val; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SET_SCALING1: case PS2MC_SET_SCALING2: fifo_put(sc, PS2MC_ACK); break; default: fifo_put(sc, PS2MC_ACK); EPRINTLN("Unhandled ps2 mouse command " "0x%02x", val); break; } } pthread_mutex_unlock(&sc->mtx); } static void ps2mouse_event(uint8_t button, int x, int y, void *arg) { struct ps2mouse_softc *sc = arg; pthread_mutex_lock(&sc->mtx); movement_update(sc, x, y); sc->status &= ~(PS2M_STS_LEFT_BUTTON | PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); if (button & (1 << 0)) sc->status |= PS2M_STS_LEFT_BUTTON; if (button & (1 << 1)) sc->status |= PS2M_STS_MID_BUTTON; if (button & (1 << 2)) sc->status |= PS2M_STS_RIGHT_BUTTON; if ((sc->status & PS2M_STS_ENABLE_DEV) == 0 || !sc->ctrlenable) { /* no data reporting */ pthread_mutex_unlock(&sc->mtx); return; } movement_get(sc); pthread_mutex_unlock(&sc->mtx); if (sc->fifo.num > 0) atkbdc_event(sc->atkbdc_sc, 0); } struct ps2mouse_softc * ps2mouse_init(struct atkbdc_softc *atkbdc_sc) { struct ps2mouse_softc *sc; sc = calloc(1, sizeof (struct ps2mouse_softc)); 
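/*
 * Illustrative note, not part of the patch: the BHYVE_SNAPSHOT handler
 * added further below serializes the mouse state with
 * SNAPSHOT_VAR_OR_LEAVE().  As used throughout this change, that macro is
 * assumed to copy the named field into meta->buffer on VM_SNAPSHOT_SAVE
 * and back out of it on VM_SNAPSHOT_RESTORE (compare vm_snapshot_buf() in
 * snapshot.c), jumping to the 'done' label if the buffer runs out.
 */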
pthread_mutex_init(&sc->mtx, NULL); fifo_init(sc); sc->atkbdc_sc = atkbdc_sc; pthread_mutex_lock(&sc->mtx); ps2mouse_reset(sc); pthread_mutex_unlock(&sc->mtx); console_ptr_register(ps2mouse_event, sc, 1); return (sc); } - +#ifdef BHYVE_SNAPSHOT +int +ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->resolution, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->sampling_rate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ctrlenable, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cur_x, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cur_y, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->delta_x, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->delta_y, meta, ret, done); + +done: + return (ret); +} +#endif diff --git a/usr.sbin/bhyve/ps2mouse.h b/usr.sbin/bhyve/ps2mouse.h index 59430b01e2b1..4ae755ef4411 100644 --- a/usr.sbin/bhyve/ps2mouse.h +++ b/usr.sbin/bhyve/ps2mouse.h @@ -1,43 +1,48 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PS2MOUSE_H_ #define _PS2MOUSE_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val); void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert); void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable); int ps2mouse_fifocnt(struct ps2mouse_softc *sc); +#ifdef BHYVE_SNAPSHOT +int ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta); +#endif + #endif /* _PS2MOUSE_H_ */ diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c new file mode 100644 index 000000000000..22bfd8d28a61 --- /dev/null +++ b/usr.sbin/bhyve/snapshot.c @@ -0,0 +1,1742 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Flavius Anton + * Copyright (c) 2016 Mihai Tiganus + * Copyright (c) 2016-2019 Mihai Carabas + * Copyright (c) 2017-2019 Darius Mihai + * Copyright (c) 2017-2019 Elena Mihailescu + * Copyright (c) 2018-2019 Sergiu Weisz + * All rights reserved. 
+ * The bhyve-snapshot feature was developed under sponsorships + * from Matthew Grooms. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#include "bhyverun.h" +#include "acpi.h" +#include "atkbdc.h" +#include "inout.h" +#include "dbgport.h" +#include "fwctl.h" +#include "ioapic.h" +#include "mem.h" +#include "mevent.h" +#include "mptbl.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "smbiostbl.h" +#include "snapshot.h" +#include "xmsr.h" +#include "spinup_ap.h" +#include "rtc.h" + +#include +#include + +struct spinner_info { + const size_t *crtval; + const size_t maxval; + const size_t total; +}; + +extern int guest_ncpus; + +static struct winsize winsize; +static sig_t old_winch_handler; + +#define KB (1024UL) +#define MB (1024UL * KB) +#define GB (1024UL * MB) + +#define SNAPSHOT_CHUNK (4 * MB) +#define PROG_BUF_SZ (8192) + +#define BHYVE_RUN_DIR "/var/run/bhyve" +#define CHECKPOINT_RUN_DIR BHYVE_RUN_DIR "/checkpoint" +#define MAX_VMNAME 100 + +#define MAX_MSG_SIZE 1024 + +#define SNAPSHOT_BUFFER_SIZE (20 * MB) + +#define JSON_STRUCT_ARR_KEY "structs" +#define JSON_DEV_ARR_KEY "devices" +#define JSON_BASIC_METADATA_KEY "basic metadata" +#define JSON_SNAPSHOT_REQ_KEY "snapshot_req" +#define JSON_SIZE_KEY "size" +#define JSON_FILE_OFFSET_KEY "file_offset" + +#define JSON_NCPUS_KEY "ncpus" +#define JSON_VMNAME_KEY "vmname" +#define JSON_MEMSIZE_KEY "memsize" +#define JSON_MEMFLAGS_KEY "memflags" + +#define min(a,b) \ +({ \ + __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a < _b ? 
_a : _b; \ + }) + +const struct vm_snapshot_dev_info snapshot_devs[] = { + { "atkbdc", atkbdc_snapshot, NULL, NULL }, + { "virtio-net", pci_snapshot, pci_pause, pci_resume }, + { "virtio-blk", pci_snapshot, pci_pause, pci_resume }, + { "lpc", pci_snapshot, NULL, NULL }, + { "fbuf", pci_snapshot, NULL, NULL }, + { "xhci", pci_snapshot, NULL, NULL }, + { "e1000", pci_snapshot, NULL, NULL }, + { "ahci", pci_snapshot, pci_pause, pci_resume }, + { "ahci-hd", pci_snapshot, pci_pause, pci_resume }, + { "ahci-cd", pci_snapshot, NULL, NULL }, +}; + +const struct vm_snapshot_kern_info snapshot_kern_structs[] = { + { "vhpet", STRUCT_VHPET }, + { "vm", STRUCT_VM }, + { "vmx", STRUCT_VMX }, + { "vioapic", STRUCT_VIOAPIC }, + { "vlapic", STRUCT_VLAPIC }, + { "vmcx", STRUCT_VMCX }, + { "vatpit", STRUCT_VATPIT }, + { "vatpic", STRUCT_VATPIC }, + { "vpmtmr", STRUCT_VPMTMR }, + { "vrtc", STRUCT_VRTC }, +}; + +static cpuset_t vcpus_active, vcpus_suspended; +static pthread_mutex_t vcpu_lock; +static pthread_cond_t vcpus_idle, vcpus_can_run; +static bool checkpoint_active; + +/* + * TODO: Harden this function and all of its callers since 'base_str' is a user + * provided string. + */ +static char * +strcat_extension(const char *base_str, const char *ext) +{ + char *res; + size_t base_len, ext_len; + + base_len = strnlen(base_str, MAX_VMNAME); + ext_len = strnlen(ext, MAX_VMNAME); + + if (base_len + ext_len > MAX_VMNAME) { + fprintf(stderr, "Filename exceeds maximum length.\n"); + return (NULL); + } + + res = malloc(base_len + ext_len + 1); + if (res == NULL) { + perror("Failed to allocate memory."); + return (NULL); + } + + memcpy(res, base_str, base_len); + memcpy(res + base_len, ext, ext_len); + res[base_len + ext_len] = 0; + + return (res); +} + +void +destroy_restore_state(struct restore_state *rstate) +{ + if (rstate == NULL) { + fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); + return; + } + + if (rstate->kdata_map != MAP_FAILED) + munmap(rstate->kdata_map, rstate->kdata_len); + + if (rstate->kdata_fd > 0) + close(rstate->kdata_fd); + if (rstate->vmmem_fd > 0) + close(rstate->vmmem_fd); + + if (rstate->meta_root_obj != NULL) + ucl_object_unref(rstate->meta_root_obj); + if (rstate->meta_parser != NULL) + ucl_parser_free(rstate->meta_parser); +} + +static int +load_vmmem_file(const char *filename, struct restore_state *rstate) +{ + struct stat sb; + int err; + + rstate->vmmem_fd = open(filename, O_RDONLY); + if (rstate->vmmem_fd < 0) { + perror("Failed to open restore file"); + return (-1); + } + + err = fstat(rstate->vmmem_fd, &sb); + if (err < 0) { + perror("Failed to stat restore file"); + goto err_load_vmmem; + } + + if (sb.st_size == 0) { + fprintf(stderr, "Restore file is empty.\n"); + goto err_load_vmmem; + } + + rstate->vmmem_len = sb.st_size; + + return (0); + +err_load_vmmem: + if (rstate->vmmem_fd > 0) + close(rstate->vmmem_fd); + return (-1); +} + +static int +load_kdata_file(const char *filename, struct restore_state *rstate) +{ + struct stat sb; + int err; + + rstate->kdata_fd = open(filename, O_RDONLY); + if (rstate->kdata_fd < 0) { + perror("Failed to open kernel data file"); + return (-1); + } + + err = fstat(rstate->kdata_fd, &sb); + if (err < 0) { + perror("Failed to stat kernel data file"); + goto err_load_kdata; + } + + if (sb.st_size == 0) { + fprintf(stderr, "Kernel data file is empty.\n"); + goto err_load_kdata; + } + + rstate->kdata_len = sb.st_size; + rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ, + MAP_SHARED, rstate->kdata_fd, 0); + if 
(rstate->kdata_map == MAP_FAILED) { + perror("Failed to map restore file"); + goto err_load_kdata; + } + + return (0); + +err_load_kdata: + if (rstate->kdata_fd > 0) + close(rstate->kdata_fd); + return (-1); +} + +static int +load_metadata_file(const char *filename, struct restore_state *rstate) +{ + const ucl_object_t *obj; + struct ucl_parser *parser; + int err; + + parser = ucl_parser_new(UCL_PARSER_DEFAULT); + if (parser == NULL) { + fprintf(stderr, "Failed to initialize UCL parser.\n"); + goto err_load_metadata; + } + + err = ucl_parser_add_file(parser, filename); + if (err == 0) { + fprintf(stderr, "Failed to parse metadata file: '%s'\n", + filename); + err = -1; + goto err_load_metadata; + } + + obj = ucl_parser_get_object(parser); + if (obj == NULL) { + fprintf(stderr, "Failed to parse object.\n"); + err = -1; + goto err_load_metadata; + } + + rstate->meta_parser = parser; + rstate->meta_root_obj = (ucl_object_t *)obj; + + return (0); + +err_load_metadata: + if (parser != NULL) + ucl_parser_free(parser); + return (err); +} + +int +load_restore_file(const char *filename, struct restore_state *rstate) +{ + int err = 0; + char *kdata_filename = NULL, *meta_filename = NULL; + + assert(filename != NULL); + assert(rstate != NULL); + + memset(rstate, 0, sizeof(*rstate)); + rstate->kdata_map = MAP_FAILED; + + err = load_vmmem_file(filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest RAM file.\n"); + goto err_restore; + } + + kdata_filename = strcat_extension(filename, ".kern"); + if (kdata_filename == NULL) { + fprintf(stderr, "Failed to construct kernel data filename.\n"); + goto err_restore; + } + + err = load_kdata_file(kdata_filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest kernel data file.\n"); + goto err_restore; + } + + meta_filename = strcat_extension(filename, ".meta"); + if (meta_filename == NULL) { + fprintf(stderr, "Failed to construct kernel metadata filename.\n"); + goto err_restore; + } + + err = load_metadata_file(meta_filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest metadata file.\n"); + goto err_restore; + } + + return (0); + +err_restore: + destroy_restore_state(rstate); + if (kdata_filename != NULL) + free(kdata_filename); + if (meta_filename != NULL) + free(meta_filename); + return (-1); +} + +#define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ +do { \ + const ucl_object_t *obj__; \ + obj__ = ucl_object_lookup(obj, key); \ + if (obj__ == NULL) { \ + fprintf(stderr, "Missing key: '%s'", key); \ + return (ret); \ + } \ + if (!ucl_object_toint_safe(obj__, result_ptr)) { \ + fprintf(stderr, "Cannot convert '%s' value to int.", key); \ + return (ret); \ + } \ +} while(0) + +#define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ +do { \ + const ucl_object_t *obj__; \ + obj__ = ucl_object_lookup(obj, key); \ + if (obj__ == NULL) { \ + fprintf(stderr, "Missing key: '%s'", key); \ + return (ret); \ + } \ + if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ + fprintf(stderr, "Cannot convert '%s' value to string.", key); \ + return (ret); \ + } \ +} while(0) + +static void * +lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate, + size_t *struct_size) +{ + const ucl_object_t *structs = NULL, *obj = NULL; + ucl_object_iter_t it = NULL; + int64_t snapshot_req, size, file_offset; + + structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY); + if (structs == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_STRUCT_ARR_KEY); + return 
(NULL); + } + + if (ucl_object_type((ucl_object_t *)structs) != UCL_ARRAY) { + fprintf(stderr, "Object '%s' is not an array.\n", + JSON_STRUCT_ARR_KEY); + return (NULL); + } + + while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) { + snapshot_req = -1; + JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, + &snapshot_req, NULL); + assert(snapshot_req >= 0); + if ((enum snapshot_req) snapshot_req == struct_id) { + JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, + &size, NULL); + assert(size >= 0); + + JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, + &file_offset, NULL); + assert(file_offset >= 0); + assert(file_offset + size <= rstate->kdata_len); + + *struct_size = (size_t)size; + return (rstate->kdata_map + file_offset); + } + } + + return (NULL); +} + +static void * +lookup_check_dev(const char *dev_name, struct restore_state *rstate, + const ucl_object_t *obj, size_t *data_size) +{ + const char *snapshot_req; + int64_t size, file_offset; + + snapshot_req = NULL; + JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, + &snapshot_req, NULL); + assert(snapshot_req != NULL); + if (!strcmp(snapshot_req, dev_name)) { + JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, + &size, NULL); + assert(size >= 0); + + JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, + &file_offset, NULL); + assert(file_offset >= 0); + assert(file_offset + size <= rstate->kdata_len); + + *data_size = (size_t)size; + return (rstate->kdata_map + file_offset); + } + + return (NULL); +} + +static void* +lookup_dev(const char *dev_name, struct restore_state *rstate, + size_t *data_size) +{ + const ucl_object_t *devs = NULL, *obj = NULL; + ucl_object_iter_t it = NULL; + void *ret; + + devs = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY); + if (devs == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_DEV_ARR_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)devs) != UCL_ARRAY) { + fprintf(stderr, "Object '%s' is not an array.\n", + JSON_DEV_ARR_KEY); + return (NULL); + } + + while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) { + ret = lookup_check_dev(dev_name, rstate, obj, data_size); + if (ret != NULL) + return (ret); + } + + return (NULL); +} + +static const ucl_object_t * +lookup_basic_metadata_object(struct restore_state *rstate) +{ + const ucl_object_t *basic_meta_obj = NULL; + + basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, + JSON_BASIC_METADATA_KEY); + if (basic_meta_obj == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_BASIC_METADATA_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)basic_meta_obj) != UCL_OBJECT) { + fprintf(stderr, "Object '%s' is not a JSON object.\n", + JSON_BASIC_METADATA_KEY); + return (NULL); + } + + return (basic_meta_obj); +} + +const char * +lookup_vmname(struct restore_state *rstate) +{ + const char *vmname; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (NULL); + + JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); + return (vmname); +} + +int +lookup_memflags(struct restore_state *rstate) +{ + int64_t memflags; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); + + return ((int)memflags); +} + +size_t +lookup_memsize(struct restore_state *rstate) +{ + int64_t memsize; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + 
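/*
 * Illustrative sketch, not part of the patch: the ".meta" file parsed by
 * these lookup_* helpers is UCL/JSON.  Based only on the JSON_*_KEY macros
 * above, it is expected to look roughly like the following (numeric values
 * made up for illustration):
 *
 *	{
 *	  "basic metadata": { "ncpus": 2, "vmname": "vm0",
 *	                      "memsize": 1073741824, "memflags": 0 },
 *	  "structs": [ { "snapshot_req": 1, "size": 424, "file_offset": 0 } ],
 *	  "devices": [ { "snapshot_req": "atkbdc", "size": 80,
 *	                 "file_offset": 424 } ]
 *	}
 *
 * lookup_struct() matches "snapshot_req" as an integer request id, while
 * lookup_check_dev() matches it as a device-name string.
 */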
JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); + if (memsize < 0) + memsize = 0; + + return ((size_t)memsize); +} + + +int +lookup_guest_ncpus(struct restore_state *rstate) +{ + int64_t ncpus; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); + return ((int)ncpus); +} + +static void +winch_handler(int signal) +{ +#ifdef TIOCGWINSZ + ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); +#endif /* TIOCGWINSZ */ +} + +static int +print_progress(size_t crtval, const size_t maxval) +{ + size_t rc; + double crtval_gb, maxval_gb; + size_t i, win_width, prog_start, prog_done, prog_end; + int mval_len; + + static char prog_buf[PROG_BUF_SZ]; + static const size_t len = sizeof(prog_buf); + + static size_t div; + static char *div_str; + + static char wip_bar[] = { '/', '-', '\\', '|' }; + static int wip_idx = 0; + + if (maxval == 0) { + printf("[0B / 0B]\r\n"); + return (0); + } + + if (crtval > maxval) + crtval = maxval; + + if (maxval > 10 * GB) { + div = GB; + div_str = "GiB"; + } else if (maxval > 10 * MB) { + div = MB; + div_str = "MiB"; + } else { + div = KB; + div_str = "KiB"; + } + + crtval_gb = (double) crtval / div; + maxval_gb = (double) maxval / div; + + rc = snprintf(prog_buf, len, "%.03lf", maxval_gb); + if (rc == len) { + fprintf(stderr, "Maxval too big\n"); + return (-1); + } + mval_len = rc; + + rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |", + mval_len, crtval_gb, div_str, maxval_gb, div_str); + + if (rc == len) { + fprintf(stderr, "Buffer too small to print progress\n"); + return (-1); + } + + win_width = min(winsize.ws_col, len); + prog_start = rc; + + if (prog_start < (win_width - 2)) { + prog_end = win_width - prog_start - 2; + prog_done = prog_end * (crtval_gb / maxval_gb); + + for (i = prog_start; i < prog_start + prog_done; i++) + prog_buf[i] = '#'; + + if (crtval != maxval) { + prog_buf[i] = wip_bar[wip_idx]; + wip_idx = (wip_idx + 1) % sizeof(wip_bar); + i++; + } else { + prog_buf[i++] = '#'; + } + + for (; i < win_width - 2; i++) + prog_buf[i] = '_'; + + prog_buf[win_width - 2] = '|'; + } + + prog_buf[win_width - 1] = '\0'; + write(STDOUT_FILENO, prog_buf, win_width); + + return (0); +} + +static void * +snapshot_spinner_cb(void *arg) +{ + int rc; + size_t crtval, maxval, total; + struct spinner_info *si; + struct timespec ts; + + si = arg; + if (si == NULL) + pthread_exit(NULL); + + ts.tv_sec = 0; + ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */ + + do { + crtval = *si->crtval; + maxval = si->maxval; + total = si->total; + + rc = print_progress(crtval, total); + if (rc < 0) { + fprintf(stderr, "Failed to parse progress\n"); + break; + } + + nanosleep(&ts, NULL); + } while (crtval < maxval); + + pthread_exit(NULL); + return NULL; +} + +static int +vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src, + const size_t len, const size_t totalmem, const bool op_wr) +{ + int rc; + size_t part_done, todo, rem; + ssize_t done; + bool show_progress; + pthread_t spinner_th; + struct spinner_info *si; + + if (lseek(snapfd, foff, SEEK_SET) < 0) { + perror("Failed to change file offset"); + return (-1); + } + + show_progress = false; + if (isatty(STDIN_FILENO) && (winsize.ws_col != 0)) + show_progress = true; + + part_done = foff; + rem = len; + + if (show_progress) { + si = &(struct spinner_info) { + .crtval = &part_done, + .maxval = foff + len, + .total = totalmem + }; + + rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, 
si); + if (rc) { + perror("Unable to create spinner thread"); + show_progress = false; + } + } + + while (rem > 0) { + if (show_progress) + todo = min(SNAPSHOT_CHUNK, rem); + else + todo = rem; + + if (op_wr) + done = write(snapfd, src, todo); + else + done = read(snapfd, src, todo); + if (done < 0) { + perror("Failed to write in file"); + return (-1); + } + + src += done; + part_done += done; + rem -= done; + } + + if (show_progress) { + rc = pthread_join(spinner_th, NULL); + if (rc) + perror("Unable to end spinner thread"); + } + + return (0); +} + +static size_t +vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr) +{ + int ret; + size_t lowmem, highmem, totalmem; + char *baseaddr; + + ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem); + if (ret) { + fprintf(stderr, "%s: unable to retrieve guest memory size\r\n", + __func__); + return (0); + } + totalmem = lowmem + highmem; + + if ((op_wr == false) && (totalmem != memsz)) { + fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\r\n", + __func__, totalmem, memsz); + return (0); + } + + winsize.ws_col = 80; +#ifdef TIOCGWINSZ + ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); +#endif /* TIOCGWINSZ */ + old_winch_handler = signal(SIGWINCH, winch_handler); + + ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem, + totalmem, op_wr); + if (ret) { + fprintf(stderr, "%s: Could not %s lowmem\r\n", + __func__, op_wr ? "write" : "read"); + totalmem = 0; + goto done; + } + + if (highmem == 0) + goto done; + + ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB, + highmem, totalmem, op_wr); + if (ret) { + fprintf(stderr, "%s: Could not %s highmem\r\n", + __func__, op_wr ? "write" : "read"); + totalmem = 0; + goto done; + } + +done: + printf("\r\n"); + signal(SIGWINCH, old_winch_handler); + + return (totalmem); +} + +int +restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate) +{ + size_t restored; + + restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len, + false); + + if (restored != rstate->vmmem_len) + return (-1); + + return (0); +} + +static int +vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate, + const struct vm_snapshot_kern_info *info) +{ + void *struct_ptr; + size_t struct_size; + int ret; + struct vm_snapshot_meta *meta; + + struct_ptr = lookup_struct(info->req, rstate, &struct_size); + if (struct_ptr == NULL) { + fprintf(stderr, "%s: Failed to lookup struct %s\r\n", + __func__, info->struct_name); + ret = -1; + goto done; + } + + if (struct_size == 0) { + fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n", + __func__, info->struct_name); + ret = -1; + goto done; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + .dev_name = info->struct_name, + .dev_req = info->req, + + .buffer.buf_start = struct_ptr, + .buffer.buf_size = struct_size, + + .buffer.buf = struct_ptr, + .buffer.buf_rem = struct_size, + + .op = VM_SNAPSHOT_RESTORE, + }; + + ret = vm_snapshot_req(meta); + if (ret != 0) { + fprintf(stderr, "%s: Failed to restore struct: %s\r\n", + __func__, info->struct_name); + goto done; + } + +done: + return (ret); +} + +int +vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate) +{ + int ret; + int i; + + for (i = 0; i < nitems(snapshot_kern_structs); i++) { + ret = vm_restore_kern_struct(ctx, rstate, + &snapshot_kern_structs[i]); + if (ret != 0) + return (ret); + } + + return (0); +} + +int +vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate, + const struct vm_snapshot_dev_info *info) +{ + void 
*dev_ptr; + size_t dev_size; + int ret; + struct vm_snapshot_meta *meta; + + dev_ptr = lookup_dev(info->dev_name, rstate, &dev_size); + if (dev_ptr == NULL) { + fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name); + fprintf(stderr, "Continuing the restore/migration process\r\n"); + return (0); + } + + if (dev_size == 0) { + fprintf(stderr, "%s: Device size is 0. " + "Assuming %s is not used\r\n", + __func__, info->dev_name); + return (0); + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + .dev_name = info->dev_name, + + .buffer.buf_start = dev_ptr, + .buffer.buf_size = dev_size, + + .buffer.buf = dev_ptr, + .buffer.buf_rem = dev_size, + + .op = VM_SNAPSHOT_RESTORE, + }; + + ret = (*info->snapshot_cb)(meta); + if (ret != 0) { + fprintf(stderr, "Failed to restore dev: %s\r\n", + info->dev_name); + return (-1); + } + + return (0); +} + + +int +vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate) +{ + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + ret = vm_restore_user_dev(ctx, rstate, &snapshot_devs[i]); + if (ret != 0) + return (ret); + } + + return 0; +} + +int +vm_pause_user_devs(struct vmctx *ctx) +{ + const struct vm_snapshot_dev_info *info; + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + info = &snapshot_devs[i]; + if (info->pause_cb == NULL) + continue; + + ret = info->pause_cb(ctx, info->dev_name); + if (ret != 0) + return (ret); + } + + return (0); +} + +int +vm_resume_user_devs(struct vmctx *ctx) +{ + const struct vm_snapshot_dev_info *info; + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + info = &snapshot_devs[i]; + if (info->resume_cb == NULL) + continue; + + ret = info->resume_cb(ctx, info->dev_name); + if (ret != 0) + return (ret); + } + + return (0); +} + +static int +vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + size_t data_size; + ssize_t write_cnt; + + ret = vm_snapshot_req(meta); + if (ret != 0) { + fprintf(stderr, "%s: Failed to snapshot struct %s\r\n", + __func__, meta->dev_name); + ret = -1; + goto done; + } + + data_size = vm_get_snapshot_size(meta); + + write_cnt = write(data_fd, meta->buffer.buf_start, data_size); + if (write_cnt != data_size) { + perror("Failed to write all snapshotted data."); + ret = -1; + goto done; + } + + /* Write metadata. 
*/ + xo_open_instance_h(xop, array_key); + xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name); + xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n", + meta->dev_req); + xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); + xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); + xo_close_instance_h(xop, JSON_STRUCT_ARR_KEY); + + *offset += data_size; + +done: + return (ret); +} + +static int +vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) +{ + int ret, i, error; + size_t offset, buf_size; + char *buffer; + struct vm_snapshot_meta *meta; + + error = 0; + offset = 0; + buf_size = SNAPSHOT_BUFFER_SIZE; + + buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char)); + if (buffer == NULL) { + error = ENOMEM; + perror("Failed to allocate memory for snapshot buffer"); + goto err_vm_snapshot_kern_data; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + + .buffer.buf_start = buffer, + .buffer.buf_size = buf_size, + + .op = VM_SNAPSHOT_SAVE, + }; + + xo_open_list_h(xop, JSON_STRUCT_ARR_KEY); + for (i = 0; i < nitems(snapshot_kern_structs); i++) { + meta->dev_name = snapshot_kern_structs[i].struct_name; + meta->dev_req = snapshot_kern_structs[i].req; + + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + ret = vm_snapshot_kern_struct(data_fd, xop, JSON_DEV_ARR_KEY, + meta, &offset); + if (ret != 0) { + error = -1; + goto err_vm_snapshot_kern_data; + } + } + xo_close_list_h(xop, JSON_STRUCT_ARR_KEY); + +err_vm_snapshot_kern_data: + if (buffer != NULL) + free(buffer); + return (error); +} + +static int +vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz) +{ + int error; + int memflags; + char vmname_buf[MAX_VMNAME]; + + memset(vmname_buf, 0, MAX_VMNAME); + error = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (error != 0) { + perror("Failed to get VM name"); + goto err; + } + + memflags = vm_get_memflags(ctx); + + xo_open_container_h(xop, JSON_BASIC_METADATA_KEY); + xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus); + xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vmname_buf); + xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz); + xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", memflags); + xo_close_container_h(xop, JSON_BASIC_METADATA_KEY); + +err: + return (error); +} + +static int +vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + size_t data_size; + + data_size = vm_get_snapshot_size(meta); + + ret = write(data_fd, meta->buffer.buf_start, data_size); + if (ret != data_size) { + perror("Failed to write all snapshotted data."); + return (-1); + } + + /* Write metadata. 
*/ + xo_open_instance_h(xop, array_key); + xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); + xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); + xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); + xo_close_instance_h(xop, array_key); + + *offset += data_size; + + return (0); +} + +static int +vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info, + int data_fd, xo_handle_t *xop, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + + ret = (*info->snapshot_cb)(meta); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n", + meta->dev_name, ret); + return (ret); + } + + ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta, + offset); + if (ret != 0) + return (ret); + + return (0); +} + +static int +vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) +{ + int ret, i; + off_t offset; + void *buffer; + size_t buf_size; + struct vm_snapshot_meta *meta; + + buf_size = SNAPSHOT_BUFFER_SIZE; + + offset = lseek(data_fd, 0, SEEK_CUR); + if (offset < 0) { + perror("Failed to get data file current offset."); + return (-1); + } + + buffer = malloc(buf_size); + if (buffer == NULL) { + perror("Failed to allocate memory for snapshot buffer"); + ret = ENOSPC; + goto snapshot_err; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + + .buffer.buf_start = buffer, + .buffer.buf_size = buf_size, + + .op = VM_SNAPSHOT_SAVE, + }; + + xo_open_list_h(xop, JSON_DEV_ARR_KEY); + + /* Restore other devices that support this feature */ + for (i = 0; i < nitems(snapshot_devs); i++) { + meta->dev_name = snapshot_devs[i].dev_name; + + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop, + meta, &offset); + if (ret != 0) + goto snapshot_err; + } + + xo_close_list_h(xop, JSON_DEV_ARR_KEY); + +snapshot_err: + if (buffer != NULL) + free(buffer); + return (ret); +} + +void +checkpoint_cpu_add(int vcpu) +{ + + pthread_mutex_lock(&vcpu_lock); + CPU_SET(vcpu, &vcpus_active); + + if (checkpoint_active) { + CPU_SET(vcpu, &vcpus_suspended); + while (checkpoint_active) + pthread_cond_wait(&vcpus_can_run, &vcpu_lock); + CPU_CLR(vcpu, &vcpus_suspended); + } + pthread_mutex_unlock(&vcpu_lock); +} + +/* + * When a vCPU is suspended for any reason, it calls + * checkpoint_cpu_suspend(). This records that the vCPU is idle. + * Before returning from suspension, checkpoint_cpu_resume() is + * called. In suspend we note that the vCPU is idle. In resume we + * pause the vCPU thread until the checkpoint is complete. The reason + * for the two-step process is that vCPUs might already be stopped in + * the debug server when a checkpoint is requested. This approach + * allows us to account for and handle those vCPUs. 
+ */ +void +checkpoint_cpu_suspend(int vcpu) +{ + + pthread_mutex_lock(&vcpu_lock); + CPU_SET(vcpu, &vcpus_suspended); + if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0) + pthread_cond_signal(&vcpus_idle); + pthread_mutex_unlock(&vcpu_lock); +} + +void +checkpoint_cpu_resume(int vcpu) +{ + + pthread_mutex_lock(&vcpu_lock); + while (checkpoint_active) + pthread_cond_wait(&vcpus_can_run, &vcpu_lock); + CPU_CLR(vcpu, &vcpus_suspended); + pthread_mutex_unlock(&vcpu_lock); +} + +static void +vm_vcpu_pause(struct vmctx *ctx) +{ + + pthread_mutex_lock(&vcpu_lock); + checkpoint_active = true; + vm_suspend_cpu(ctx, -1); + while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0) + pthread_cond_wait(&vcpus_idle, &vcpu_lock); + pthread_mutex_unlock(&vcpu_lock); +} + +static void +vm_vcpu_resume(struct vmctx *ctx) +{ + + pthread_mutex_lock(&vcpu_lock); + checkpoint_active = false; + pthread_mutex_unlock(&vcpu_lock); + vm_resume_cpu(ctx, -1); + pthread_cond_broadcast(&vcpus_can_run); +} + +static int +vm_checkpoint(struct vmctx *ctx, char *checkpoint_file, bool stop_vm) +{ + int fd_checkpoint = 0, kdata_fd = 0; + int ret = 0; + int error = 0; + size_t memsz; + xo_handle_t *xop = NULL; + char *meta_filename = NULL; + char *kdata_filename = NULL; + FILE *meta_file = NULL; + + kdata_filename = strcat_extension(checkpoint_file, ".kern"); + if (kdata_filename == NULL) { + fprintf(stderr, "Failed to construct kernel data filename.\n"); + return (-1); + } + + kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); + if (kdata_fd < 0) { + perror("Failed to open kernel data snapshot file."); + error = -1; + goto done; + } + + fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); + + if (fd_checkpoint < 0) { + perror("Failed to create checkpoint file"); + error = -1; + goto done; + } + + meta_filename = strcat_extension(checkpoint_file, ".meta"); + if (meta_filename == NULL) { + fprintf(stderr, "Failed to construct vm metadata filename.\n"); + goto done; + } + + meta_file = fopen(meta_filename, "w"); + if (meta_file == NULL) { + perror("Failed to open vm metadata snapshot file."); + goto done; + } + + xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY); + if (xop == NULL) { + perror("Failed to get libxo handle on metadata file."); + goto done; + } + + vm_vcpu_pause(ctx); + + ret = vm_pause_user_devs(ctx); + if (ret != 0) { + fprintf(stderr, "Could not pause devices\r\n"); + error = ret; + goto done; + } + + memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true); + if (memsz == 0) { + perror("Could not write guest memory to file"); + error = -1; + goto done; + } + + ret = vm_snapshot_basic_metadata(ctx, xop, memsz); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot vm basic metadata.\n"); + error = -1; + goto done; + } + + + ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot vm kernel data.\n"); + error = -1; + goto done; + } + + ret = vm_snapshot_user_devs(ctx, kdata_fd, xop); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot device state.\n"); + error = -1; + goto done; + } + + xo_finish_h(xop); + + if (stop_vm) { + vm_destroy(ctx); + exit(0); + } + +done: + ret = vm_resume_user_devs(ctx); + if (ret != 0) + fprintf(stderr, "Could not resume devices\r\n"); + vm_vcpu_resume(ctx); + if (fd_checkpoint > 0) + close(fd_checkpoint); + if (meta_filename != NULL) + free(meta_filename); + if (kdata_filename != NULL) + free(kdata_filename); + if (xop != NULL) + xo_destroy(xop); + if (meta_file != 
NULL) + fclose(meta_file); + if (kdata_fd > 0) + close(kdata_fd); + return (error); +} + +int +get_checkpoint_msg(int conn_fd, struct vmctx *ctx) +{ + unsigned char buf[MAX_MSG_SIZE]; + struct checkpoint_op *checkpoint_op; + int len, recv_len, total_recv = 0; + int err = 0; + + len = sizeof(struct checkpoint_op); /* expected length */ + while ((recv_len = recv(conn_fd, buf + total_recv, len - total_recv, 0)) > 0) { + total_recv += recv_len; + } + if (recv_len < 0) { + perror("Error while receiving data from bhyvectl"); + err = -1; + goto done; + } + + checkpoint_op = (struct checkpoint_op *)buf; + switch (checkpoint_op->op) { + case START_CHECKPOINT: + err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, false); + break; + case START_SUSPEND: + err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, true); + break; + default: + fprintf(stderr, "Unrecognized checkpoint operation.\n"); + err = -1; + } + +done: + close(conn_fd); + return (err); +} + +/* + * Listen for commands from bhyvectl + */ +void * +checkpoint_thread(void *param) +{ + struct checkpoint_thread_info *thread_info; + int conn_fd, ret; + + pthread_set_name_np(pthread_self(), "checkpoint thread"); + thread_info = (struct checkpoint_thread_info *)param; + + while ((conn_fd = accept(thread_info->socket_fd, NULL, NULL)) > -1) { + ret = get_checkpoint_msg(conn_fd, thread_info->ctx); + if (ret != 0) { + fprintf(stderr, "Failed to read message on checkpoint " + "socket. Retrying.\n"); + } + } + if (conn_fd < -1) { + perror("Failed to accept connection"); + } + + return (NULL); +} + +/* + * Create directory tree to store runtime specific information: + * i.e. UNIX sockets for IPC with bhyvectl. + */ +static int +make_checkpoint_dir(void) +{ + int err; + + err = mkdir(BHYVE_RUN_DIR, 0755); + if (err < 0 && errno != EEXIST) + return (err); + + err = mkdir(CHECKPOINT_RUN_DIR, 0755); + if (err < 0 && errno != EEXIST) + return (err); + + return 0; +} + +/* + * Create the listening socket for IPC with bhyvectl + */ +int +init_checkpoint_thread(struct vmctx *ctx) +{ + struct checkpoint_thread_info *checkpoint_info = NULL; + struct sockaddr_un addr; + int socket_fd; + pthread_t checkpoint_pthread; + char vmname_buf[MAX_VMNAME]; + int ret, err = 0; + + memset(&addr, 0, sizeof(addr)); + + err = pthread_mutex_init(&vcpu_lock, NULL); + if (err != 0) + errc(1, err, "checkpoint mutex init"); + err = pthread_cond_init(&vcpus_idle, NULL); + if (err == 0) + err = pthread_cond_init(&vcpus_can_run, NULL); + if (err != 0) + errc(1, err, "checkpoint cv init"); + + socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (socket_fd < 0) { + perror("Socket creation failed (IPC with bhyvectl"); + err = -1; + goto fail; + } + + err = make_checkpoint_dir(); + if (err < 0) { + perror("Failed to create checkpoint runtime directory"); + goto fail; + } + + addr.sun_family = AF_UNIX; + + err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (err != 0) { + perror("Failed to get VM name"); + goto fail; + } + + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", + CHECKPOINT_RUN_DIR, vmname_buf); + addr.sun_len = SUN_LEN(&addr); + unlink(addr.sun_path); + + if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) { + perror("Failed to bind socket (IPC with bhyvectl)"); + err = -1; + goto fail; + } + + if (listen(socket_fd, 10) < 0) { + perror("Failed to listen on socket (IPC with bhyvectl)"); + err = -1; + goto fail; + } + + checkpoint_info = calloc(1, sizeof(*checkpoint_info)); + checkpoint_info->ctx = ctx; + checkpoint_info->socket_fd = socket_fd; 
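/*
 * Illustrative sketch, not part of the patch: a control utility such as
 * bhyvectl is expected to drive this thread by connecting to the per-VM
 * UNIX socket bound above and writing one struct checkpoint_op.  Field
 * names follow get_checkpoint_msg(); the exact structure layout lives
 * outside this file and is assumed here:
 *
 *	struct checkpoint_op op = { .op = START_CHECKPOINT };
 *	int fd = socket(PF_UNIX, SOCK_STREAM, 0);
 *	struct sockaddr_un addr = { .sun_family = AF_UNIX };
 *	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s",
 *	    CHECKPOINT_RUN_DIR, vmname);
 *	addr.sun_len = SUN_LEN(&addr);
 *	connect(fd, (struct sockaddr *)&addr, addr.sun_len);
 *	snprintf(op.snapshot_filename, sizeof(op.snapshot_filename),
 *	    "%s", "/tmp/vm0.snap");
 *	write(fd, &op, sizeof(op));
 *	close(fd);
 *
 * Sending START_SUSPEND instead makes vm_checkpoint() destroy the VM and
 * exit once the state has been written.
 */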
+ + ret = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread, + checkpoint_info); + if (ret < 0) { + err = ret; + goto fail; + } + + return (0); +fail: + free(checkpoint_info); + if (socket_fd > 0) + close(socket_fd); + unlink(addr.sun_path); + + return (err); +} + +void +vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) +{ + const char *__op; + + if (op == VM_SNAPSHOT_SAVE) + __op = "save"; + else if (op == VM_SNAPSHOT_RESTORE) + __op = "restore"; + else + __op = "unknown"; + + fprintf(stderr, "%s: snapshot-%s failed for %s\r\n", + __func__, __op, bufname); +} + +int +vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + fprintf(stderr, "%s: buffer too small\r\n", __func__); + return (E2BIG); + } + + if (op == VM_SNAPSHOT_SAVE) + memcpy(buffer->buf, (uint8_t *) data, data_size); + else if (op == VM_SNAPSHOT_RESTORE) + memcpy((uint8_t *) data, buffer->buf, data_size); + else + return (EINVAL); + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + + return (0); +} + +size_t +vm_get_snapshot_size(struct vm_snapshot_meta *meta) +{ + size_t length; + struct vm_snapshot_buffer *buffer; + + buffer = &meta->buffer; + + if (buffer->buf_size < buffer->buf_rem) { + fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n", + __func__, buffer->buf_size, buffer->buf_rem); + length = 0; + } else { + length = buffer->buf_size - buffer->buf_rem; + } + + return (length); +} + +int +vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, + struct vm_snapshot_meta *meta) +{ + int ret; + vm_paddr_t gaddr; + + if (meta->op == VM_SNAPSHOT_SAVE) { + gaddr = paddr_host2guest(meta->ctx, *addrp); + if (gaddr == (vm_paddr_t) -1) { + if (!restore_null || + (restore_null && (*addrp != NULL))) { + ret = EFAULT; + goto done; + } + } + + SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); + if (gaddr == (vm_paddr_t) -1) { + if (!restore_null) { + ret = EFAULT; + goto done; + } + } + + *addrp = paddr_guest2host(meta->ctx, gaddr, len); + } else { + ret = EINVAL; + } + +done: + return (ret); +} + +int +vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + int ret; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + fprintf(stderr, "%s: buffer too small\r\n", __func__); + ret = E2BIG; + goto done; + } + + if (op == VM_SNAPSHOT_SAVE) { + ret = 0; + memcpy(buffer->buf, (uint8_t *) data, data_size); + } else if (op == VM_SNAPSHOT_RESTORE) { + ret = memcmp((uint8_t *) data, buffer->buf, data_size); + } else { + ret = EINVAL; + goto done; + } + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + +done: + return (ret); +} diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h new file mode 100644 index 000000000000..f9ea3d573089 --- /dev/null +++ b/usr.sbin/bhyve/snapshot.h @@ -0,0 +1,105 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Flavius Anton + * Copyright (c) 2016 Mihai Tiganus + * Copyright (c) 2016-2019 Mihai Carabas + * Copyright (c) 2017-2019 Darius Mihai + * Copyright (c) 2017-2019 Elena Mihailescu + * Copyright (c) 2018-2019 Sergiu Weisz + * All rights reserved. 
+ * The bhyve-snapshot feature was developed under sponsorships + * from Matthew Grooms. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _BHYVE_SNAPSHOT_ +#define _BHYVE_SNAPSHOT_ + +#include +#include +#include + +struct vmctx; + +struct restore_state { + int kdata_fd; + int vmmem_fd; + + void *kdata_map; + size_t kdata_len; + + size_t vmmem_len; + + struct ucl_parser *meta_parser; + ucl_object_t *meta_root_obj; +}; + +struct checkpoint_thread_info { + struct vmctx *ctx; + int socket_fd; +}; + +typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *); +typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *); +typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *); + +struct vm_snapshot_dev_info { + const char *dev_name; /* device name */ + vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */ + vm_pause_dev_cb pause_cb; /* callback for device pause */ + vm_resume_dev_cb resume_cb; /* callback for device resume */ +}; + +struct vm_snapshot_kern_info { + const char *struct_name; /* kernel structure name*/ + enum snapshot_req req; /* request type */ +}; + +void destroy_restore_state(struct restore_state *rstate); + +const char *lookup_vmname(struct restore_state *rstate); +int lookup_memflags(struct restore_state *rstate); +size_t lookup_memsize(struct restore_state *rstate); +int lookup_guest_ncpus(struct restore_state *rstate); + +void checkpoint_cpu_add(int vcpu); +void checkpoint_cpu_resume(int vcpu); +void checkpoint_cpu_suspend(int vcpu); + +int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate); +int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate); + +int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate); +int vm_pause_user_devs(struct vmctx *ctx); +int vm_resume_user_devs(struct vmctx *ctx); + +int get_checkpoint_msg(int conn_fd, struct vmctx *ctx); +void *checkpoint_thread(void *param); +int init_checkpoint_thread(struct vmctx *ctx); + +int load_restore_file(const char *filename, struct restore_state *rstate); + +#endif diff --git a/usr.sbin/bhyve/uart_emul.c b/usr.sbin/bhyve/uart_emul.c index 930344a52935..a89974590a1f 100644 --- a/usr.sbin/bhyve/uart_emul.c +++ b/usr.sbin/bhyve/uart_emul.c @@ -1,721 +1,755 @@ /*- * SPDX-License-Identifier: 
BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #ifndef WITHOUT_CAPSICUM #include #include #endif +#include + #include #include #include #include #include #include #include #include #include #include #include #include #include "mevent.h" #include "uart_emul.h" #include "debug.h" #define COM1_BASE 0x3F8 #define COM1_IRQ 4 #define COM2_BASE 0x2F8 #define COM2_IRQ 3 #define DEFAULT_RCLK 1843200 #define DEFAULT_BAUD 9600 #define FCR_RX_MASK 0xC0 #define MCR_OUT1 0x04 #define MCR_OUT2 0x08 #define MSR_DELTA_MASK 0x0f #ifndef REG_SCR #define REG_SCR com_scr #endif #define FIFOSZ 16 static bool uart_stdio; /* stdio in use for i/o */ static struct termios tio_stdio_orig; static struct { int baseaddr; int irq; bool inuse; } uart_lres[] = { { COM1_BASE, COM1_IRQ, false}, { COM2_BASE, COM2_IRQ, false}, }; #define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) struct fifo { uint8_t buf[FIFOSZ]; int rindex; /* index to read from */ int windex; /* index to write to */ int num; /* number of characters in the fifo */ int size; /* size of the fifo */ }; struct ttyfd { bool opened; int rfd; /* fd for reading */ int wfd; /* fd for writing, may be == rfd */ }; struct uart_softc { pthread_mutex_t mtx; /* protects all softc elements */ uint8_t data; /* Data register (R/W) */ uint8_t ier; /* Interrupt enable register (R/W) */ uint8_t lcr; /* Line control register (R/W) */ uint8_t mcr; /* Modem control register (R/W) */ uint8_t lsr; /* Line status register (R/W) */ uint8_t msr; /* Modem status register (R/W) */ uint8_t fcr; /* FIFO control register (W) */ uint8_t scr; /* Scratch register (R/W) */ uint8_t dll; /* Baudrate divisor latch LSB */ uint8_t dlh; /* Baudrate divisor latch MSB */ struct fifo rxfifo; struct mevent *mev; struct ttyfd tty; bool thre_int_pending; /* THRE interrupt pending */ void *arg; uart_intr_func_t intr_assert; uart_intr_func_t intr_deassert; }; static void uart_drain(int fd, enum ev_type ev, void *arg); static void ttyclose(void) { tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); } static void ttyopen(struct ttyfd *tf) { struct termios orig, new; tcgetattr(tf->rfd, &orig); new = orig; cfmakeraw(&new); new.c_cflag |= CLOCAL; 
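/*
 * Descriptive comment (illustrative, not from the patch): cfmakeraw()
 * above turns off canonical input processing, echo and signal generation
 * so guest bytes pass through the host tty unmodified, and CLOCAL keeps
 * the line usable without modem-control signals.
 */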
tcsetattr(tf->rfd, TCSANOW, &new); if (uart_stdio) { tio_stdio_orig = orig; atexit(ttyclose); } raw_stdio = 1; } static int ttyread(struct ttyfd *tf) { unsigned char rb; if (read(tf->rfd, &rb, 1) == 1) return (rb); else return (-1); } static void ttywrite(struct ttyfd *tf, unsigned char wb) { (void)write(tf->wfd, &wb, 1); } static void rxfifo_reset(struct uart_softc *sc, int size) { char flushbuf[32]; struct fifo *fifo; ssize_t nread; int error; fifo = &sc->rxfifo; bzero(fifo, sizeof(struct fifo)); fifo->size = size; if (sc->tty.opened) { /* * Flush any unread input from the tty buffer. */ while (1) { nread = read(sc->tty.rfd, flushbuf, sizeof(flushbuf)); if (nread != sizeof(flushbuf)) break; } /* * Enable mevent to trigger when new characters are available * on the tty fd. */ error = mevent_enable(sc->mev); assert(error == 0); } } static int rxfifo_available(struct uart_softc *sc) { struct fifo *fifo; fifo = &sc->rxfifo; return (fifo->num < fifo->size); } static int rxfifo_putchar(struct uart_softc *sc, uint8_t ch) { struct fifo *fifo; int error; fifo = &sc->rxfifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = ch; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; if (!rxfifo_available(sc)) { if (sc->tty.opened) { /* * Disable mevent callback if the FIFO is full. */ error = mevent_disable(sc->mev); assert(error == 0); } } return (0); } else return (-1); } static int rxfifo_getchar(struct uart_softc *sc) { struct fifo *fifo; int c, error, wasfull; wasfull = 0; fifo = &sc->rxfifo; if (fifo->num > 0) { if (!rxfifo_available(sc)) wasfull = 1; c = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; if (wasfull) { if (sc->tty.opened) { error = mevent_enable(sc->mev); assert(error == 0); } } return (c); } else return (-1); } static int rxfifo_numchars(struct uart_softc *sc) { struct fifo *fifo = &sc->rxfifo; return (fifo->num); } static void uart_opentty(struct uart_softc *sc) { ttyopen(&sc->tty); sc->mev = mevent_add(sc->tty.rfd, EVF_READ, uart_drain, sc); assert(sc->mev != NULL); } static uint8_t modem_status(uint8_t mcr) { uint8_t msr; if (mcr & MCR_LOOPBACK) { /* * In the loopback mode certain bits from the MCR are * reflected back into MSR. */ msr = 0; if (mcr & MCR_RTS) msr |= MSR_CTS; if (mcr & MCR_DTR) msr |= MSR_DSR; if (mcr & MCR_OUT1) msr |= MSR_RI; if (mcr & MCR_OUT2) msr |= MSR_DCD; } else { /* * Always assert DCD and DSR so tty open doesn't block * even if CLOCAL is turned off. */ msr = MSR_DCD | MSR_DSR; } assert((msr & MSR_DELTA_MASK) == 0); return (msr); } /* * The IIR returns a prioritized interrupt reason: * - receive data available * - transmit holding register empty * - modem status change * * Return an interrupt reason if one is available. */ static int uart_intr_reason(struct uart_softc *sc) { if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) return (IIR_RLS); else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) return (IIR_RXTOUT); else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) return (IIR_TXRDY); else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0) return (IIR_MLSC); else return (IIR_NOPEND); } static void uart_reset(struct uart_softc *sc) { uint16_t divisor; divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; sc->dll = divisor; sc->dlh = divisor >> 16; sc->msr = modem_status(sc->mcr); rxfifo_reset(sc, 1); /* no fifo until enabled by software */ } /* * Toggle the COM port's intr pin depending on whether or not we have an * interrupt condition to report to the processor. 
*/ static void uart_toggle_intr(struct uart_softc *sc) { uint8_t intr_reason; intr_reason = uart_intr_reason(sc); if (intr_reason == IIR_NOPEND) (*sc->intr_deassert)(sc->arg); else (*sc->intr_assert)(sc->arg); } static void uart_drain(int fd, enum ev_type ev, void *arg) { struct uart_softc *sc; int ch; sc = arg; assert(fd == sc->tty.rfd); assert(ev == EVF_READ); /* * This routine is called in the context of the mevent thread * to take out the softc lock to protect against concurrent * access from a vCPU i/o exit */ pthread_mutex_lock(&sc->mtx); if ((sc->mcr & MCR_LOOPBACK) != 0) { (void) ttyread(&sc->tty); } else { while (rxfifo_available(sc) && ((ch = ttyread(&sc->tty)) != -1)) { rxfifo_putchar(sc, ch); } uart_toggle_intr(sc); } pthread_mutex_unlock(&sc->mtx); } void uart_write(struct uart_softc *sc, int offset, uint8_t value) { int fifosz; uint8_t msr; pthread_mutex_lock(&sc->mtx); /* * Take care of the special case DLAB accesses first */ if ((sc->lcr & LCR_DLAB) != 0) { if (offset == REG_DLL) { sc->dll = value; goto done; } if (offset == REG_DLH) { sc->dlh = value; goto done; } } switch (offset) { case REG_DATA: if (sc->mcr & MCR_LOOPBACK) { if (rxfifo_putchar(sc, value) != 0) sc->lsr |= LSR_OE; } else if (sc->tty.opened) { ttywrite(&sc->tty, value); } /* else drop on floor */ sc->thre_int_pending = true; break; case REG_IER: /* Set pending when IER_ETXRDY is raised (edge-triggered). */ if ((sc->ier & IER_ETXRDY) == 0 && (value & IER_ETXRDY) != 0) sc->thre_int_pending = true; /* * Apply mask so that bits 4-7 are 0 * Also enables bits 0-3 only if they're 1 */ sc->ier = value & 0x0F; break; case REG_FCR: /* * When moving from FIFO and 16450 mode and vice versa, * the FIFO contents are reset. */ if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; rxfifo_reset(sc, fifosz); } /* * The FCR_ENABLE bit must be '1' for the programming * of other FCR bits to be effective. */ if ((value & FCR_ENABLE) == 0) { sc->fcr = 0; } else { if ((value & FCR_RCV_RST) != 0) rxfifo_reset(sc, FIFOSZ); sc->fcr = value & (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); } break; case REG_LCR: sc->lcr = value; break; case REG_MCR: /* Apply mask so that bits 5-7 are 0 */ sc->mcr = value & 0x1F; msr = modem_status(sc->mcr); /* * Detect if there has been any change between the * previous and the new value of MSR. If there is * then assert the appropriate MSR delta bit. */ if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) sc->msr |= MSR_DCTS; if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) sc->msr |= MSR_DDSR; if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) sc->msr |= MSR_DDCD; if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) sc->msr |= MSR_TERI; /* * Update the value of MSR while retaining the delta * bits. */ sc->msr &= MSR_DELTA_MASK; sc->msr |= msr; break; case REG_LSR: /* * Line status register is not meant to be written to * during normal operation. */ break; case REG_MSR: /* * As far as I can tell MSR is a read-only register. 
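 * Reads, however, have the side effect of clearing the delta bits (see the
 * REG_MSR case in uart_read() below).  A minimal sequence, assuming for
 * simplicity that only the CTS line changes state:
 *
 *	// sc->msr currently has MSR_CTS clear
 *	uart_write(sc, REG_MCR, MCR_LOOPBACK | MCR_RTS);
 *	// modem_status() now reports MSR_CTS, so the REG_MCR case above
 *	// latches MSR_DCTS into sc->msr
 *	(void) uart_read(sc, REG_MSR);	// returns CTS plus the delta bits,
 *					// then clears MSR_DELTA_MASK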
*/ break; case REG_SCR: sc->scr = value; break; default: break; } done: uart_toggle_intr(sc); pthread_mutex_unlock(&sc->mtx); } uint8_t uart_read(struct uart_softc *sc, int offset) { uint8_t iir, intr_reason, reg; pthread_mutex_lock(&sc->mtx); /* * Take care of the special case DLAB accesses first */ if ((sc->lcr & LCR_DLAB) != 0) { if (offset == REG_DLL) { reg = sc->dll; goto done; } if (offset == REG_DLH) { reg = sc->dlh; goto done; } } switch (offset) { case REG_DATA: reg = rxfifo_getchar(sc); break; case REG_IER: reg = sc->ier; break; case REG_IIR: iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0; intr_reason = uart_intr_reason(sc); /* * Deal with side effects of reading the IIR register */ if (intr_reason == IIR_TXRDY) sc->thre_int_pending = false; iir |= intr_reason; reg = iir; break; case REG_LCR: reg = sc->lcr; break; case REG_MCR: reg = sc->mcr; break; case REG_LSR: /* Transmitter is always ready for more data */ sc->lsr |= LSR_TEMT | LSR_THRE; /* Check for new receive data */ if (rxfifo_numchars(sc) > 0) sc->lsr |= LSR_RXRDY; else sc->lsr &= ~LSR_RXRDY; reg = sc->lsr; /* The LSR_OE bit is cleared on LSR read */ sc->lsr &= ~LSR_OE; break; case REG_MSR: /* * MSR delta bits are cleared on read */ reg = sc->msr; sc->msr &= ~MSR_DELTA_MASK; break; case REG_SCR: reg = sc->scr; break; default: reg = 0xFF; break; } done: uart_toggle_intr(sc); pthread_mutex_unlock(&sc->mtx); return (reg); } int uart_legacy_alloc(int which, int *baseaddr, int *irq) { if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse) return (-1); uart_lres[which].inuse = true; *baseaddr = uart_lres[which].baseaddr; *irq = uart_lres[which].irq; return (0); } struct uart_softc * uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, void *arg) { struct uart_softc *sc; sc = calloc(1, sizeof(struct uart_softc)); sc->arg = arg; sc->intr_assert = intr_assert; sc->intr_deassert = intr_deassert; pthread_mutex_init(&sc->mtx, NULL); uart_reset(sc); return (sc); } static int uart_stdio_backend(struct uart_softc *sc) { #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif if (uart_stdio) return (-1); sc->tty.rfd = STDIN_FILENO; sc->tty.wfd = STDOUT_FILENO; sc->tty.opened = true; if (fcntl(sc->tty.rfd, F_SETFL, O_NONBLOCK) != 0) return (-1); if (fcntl(sc->tty.wfd, F_SETFL, O_NONBLOCK) != 0) return (-1); #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ); if (caph_rights_limit(sc->tty.rfd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(sc->tty.rfd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif uart_stdio = true; return (0); } static int uart_tty_backend(struct uart_softc *sc, const char *opts) { #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif int fd; fd = open(opts, O_RDWR | O_NONBLOCK); if (fd < 0) return (-1); if (!isatty(fd)) { close(fd); return (-1); } sc->tty.rfd = sc->tty.wfd = fd; sc->tty.opened = true; #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif return (0); } int uart_set_backend(struct uart_softc *sc, const char *opts) { int retval; if (opts == NULL) return (0); if (strcmp("stdio", opts) == 0) retval = 
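/*
 * Illustrative only: this mirrors how a serial port option is usually
 * wired up.  uart_set_backend(sc, "stdio") puts the console on the bhyve
 * process's stdin/stdout, while a path argument such as "/dev/nmdm0A"
 * (an example null-modem device, assumed here) is handed to
 * uart_tty_backend() and must refer to a tty.  A NULL opts string leaves
 * the uart without any backend at all.
 */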
uart_stdio_backend(sc); else retval = uart_tty_backend(sc, opts); if (retval == 0) uart_opentty(sc); return (retval); } + +#ifdef BHYVE_SNAPSHOT +int +uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ier, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->lcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->lsr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->msr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->fcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->scr, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->dll, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->dlh, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.rindex, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.windex, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.num, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.size, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->rxfifo.buf, sizeof(sc->rxfifo.buf), + meta, ret, done); + + sc->thre_int_pending = 1; + +done: + return (ret); +} +#endif diff --git a/usr.sbin/bhyve/uart_emul.h b/usr.sbin/bhyve/uart_emul.h index a87202df1f96..5a53294da89e 100644 --- a/usr.sbin/bhyve/uart_emul.h +++ b/usr.sbin/bhyve/uart_emul.h @@ -1,47 +1,50 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _UART_EMUL_H_ #define _UART_EMUL_H_ - #define UART_IO_BAR_SIZE 8 struct uart_softc; +struct vm_snapshot_meta; typedef void (*uart_intr_func_t)(void *arg); struct uart_softc *uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, void *arg); int uart_legacy_alloc(int unit, int *ioaddr, int *irq); uint8_t uart_read(struct uart_softc *sc, int offset); void uart_write(struct uart_softc *sc, int offset, uint8_t value); int uart_set_backend(struct uart_softc *sc, const char *opt); +#ifdef BHYVE_SNAPSHOT +int uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta); +#endif #endif diff --git a/usr.sbin/bhyve/usb_emul.h b/usr.sbin/bhyve/usb_emul.h index c52411dd0650..d6e1e616cd82 100644 --- a/usr.sbin/bhyve/usb_emul.h +++ b/usr.sbin/bhyve/usb_emul.h @@ -1,158 +1,158 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _USB_EMUL_H_ #define _USB_EMUL_H_ #include #include #include #define USB_MAX_XFER_BLOCKS 8 #define USB_XFER_OUT 0 #define USB_XFER_IN 1 - struct usb_hci; struct usb_device_request; struct usb_data_xfer; +struct vm_snapshot_meta; /* Device emulation handlers */ struct usb_devemu { char *ue_emu; /* name of device emulation */ int ue_usbver; /* usb version: 2 or 3 */ int ue_usbspeed; /* usb device speed */ /* instance creation */ void *(*ue_init)(struct usb_hci *hci, char *opt); /* handlers */ int (*ue_request)(void *sc, struct usb_data_xfer *xfer); int (*ue_data)(void *sc, struct usb_data_xfer *xfer, int dir, int epctx); int (*ue_reset)(void *sc); int (*ue_remove)(void *sc); int (*ue_stop)(void *sc); + int (*ue_snapshot)(void *scarg, struct vm_snapshot_meta *meta); }; #define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x); /* * USB device events to notify HCI when state changes */ enum hci_usbev { USBDEV_ATTACH, USBDEV_RESET, USBDEV_STOP, USBDEV_REMOVE, }; /* usb controller, ie xhci, ehci */ struct usb_hci { int (*hci_intr)(struct usb_hci *hci, int epctx); int (*hci_event)(struct usb_hci *hci, enum hci_usbev evid, void *param); void *hci_sc; /* private softc for hci */ /* controller managed fields */ int hci_address; int hci_port; }; /* * Each xfer block is mapped to the hci transfer block. 
* On input into the device handler, blen is set to the lenght of buf. * The device handler is to update blen to reflect on the residual size * of the buffer, i.e. len(buf) - len(consumed). */ struct usb_data_xfer_block { void *buf; /* IN or OUT pointer */ int blen; /* in:len(buf), out:len(remaining) */ int bdone; /* bytes transferred */ uint32_t processed; /* device processed this + errcode */ void *hci_data; /* HCI private reference */ int ccs; uint32_t streamid; uint64_t trbnext; /* next TRB guest address */ }; struct usb_data_xfer { struct usb_data_xfer_block data[USB_MAX_XFER_BLOCKS]; struct usb_device_request *ureq; /* setup ctl request */ int ndata; /* # of data items */ int head; int tail; pthread_mutex_t mtx; }; enum USB_ERRCODE { USB_ACK, USB_NAK, USB_STALL, USB_NYET, USB_ERR, USB_SHORT }; #define USB_DATA_GET_ERRCODE(x) (x)->processed >> 8 #define USB_DATA_SET_ERRCODE(x,e) do { \ (x)->processed = ((x)->processed & 0xFF) | (e << 8); \ } while (0) #define USB_DATA_OK(x,i) ((x)->data[(i)].buf != NULL) #define USB_DATA_XFER_INIT(x) do { \ memset((x), 0, sizeof(*(x))); \ pthread_mutex_init(&((x)->mtx), NULL); \ } while (0) #define USB_DATA_XFER_RESET(x) do { \ memset((x)->data, 0, sizeof((x)->data)); \ (x)->ndata = 0; \ (x)->head = (x)->tail = 0; \ } while (0) #define USB_DATA_XFER_LOCK(x) do { \ pthread_mutex_lock(&((x)->mtx)); \ } while (0) #define USB_DATA_XFER_UNLOCK(x) do { \ pthread_mutex_unlock(&((x)->mtx)); \ } while (0) - struct usb_devemu *usb_emu_finddev(char *name); struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer, void *buf, int blen, void *hci_data, int ccs); #endif /* _USB_EMUL_H_ */ diff --git a/usr.sbin/bhyve/usb_mouse.c b/usr.sbin/bhyve/usb_mouse.c index da3800c11fc2..5398da818c7f 100644 --- a/usr.sbin/bhyve/usb_mouse.c +++ b/usr.sbin/bhyve/usb_mouse.c @@ -1,803 +1,831 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include +#include + #include #include #include #include #include #include #include "usb_emul.h" #include "console.h" #include "bhyvegc.h" #include "debug.h" static int umouse_debug = 0; #define DPRINTF(params) if (umouse_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* USB endpoint context (1-15) for reporting mouse data events*/ #define UMOUSE_INTR_ENDPT 1 #define UMOUSE_REPORT_DESC_TYPE 0x22 #define UMOUSE_GET_REPORT 0x01 #define UMOUSE_GET_IDLE 0x02 #define UMOUSE_GET_PROTOCOL 0x03 #define UMOUSE_SET_REPORT 0x09 #define UMOUSE_SET_IDLE 0x0A #define UMOUSE_SET_PROTOCOL 0x0B #define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } enum { UMSTR_LANG, UMSTR_MANUFACTURER, UMSTR_PRODUCT, UMSTR_SERIAL, UMSTR_CONFIG, UMSTR_MAX }; static const char *umouse_desc_strings[] = { "\x04\x09", "BHYVE", "HID Tablet", "01", "HID Tablet Device", }; struct umouse_hid_descriptor { uint8_t bLength; uint8_t bDescriptorType; uint8_t bcdHID[2]; uint8_t bCountryCode; uint8_t bNumDescriptors; uint8_t bReportDescriptorType; uint8_t wItemLength[2]; } __packed; struct umouse_config_desc { struct usb_config_descriptor confd; struct usb_interface_descriptor ifcd; struct umouse_hid_descriptor hidd; struct usb_endpoint_descriptor endpd; struct usb_endpoint_ss_comp_descriptor sscompd; } __packed; #define MOUSE_MAX_X 0x8000 #define MOUSE_MAX_Y 0x8000 static const uint8_t umouse_report_desc[] = { 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x02, /* USAGE (Mouse) */ 0xa1, 0x01, /* COLLECTION (Application) */ 0x09, 0x01, /* USAGE (Pointer) */ 0xa1, 0x00, /* COLLECTION (Physical) */ 0x05, 0x09, /* USAGE_PAGE (Button) */ 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */ 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ 0x75, 0x01, /* REPORT_SIZE (1) */ 0x95, 0x03, /* REPORT_COUNT (3) */ 0x81, 0x02, /* INPUT (Data,Var,Abs); 3 buttons */ 0x75, 0x05, /* REPORT_SIZE (5) */ 0x95, 0x01, /* REPORT_COUNT (1) */ 0x81, 0x03, /* INPUT (Cnst,Var,Abs); padding */ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x30, /* USAGE (X) */ 0x09, 0x31, /* USAGE (Y) */ 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ 0x46, 0xff, 0x7f, /* PHYSICAL_MAXIMUM (0x7fff) */ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ 0x26, 0xff, 0x7f, /* LOGICAL_MAXIMUM (0x7fff) */ 0x75, 0x10, /* REPORT_SIZE (16) */ 0x95, 0x02, /* REPORT_COUNT (2) */ 0x81, 0x02, /* INPUT (Data,Var,Abs) */ 0x05, 0x01, /* USAGE Page (Generic Desktop) */ 0x09, 0x38, /* USAGE (Wheel) */ 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ 0x45, 0x00, /* PHYSICAL_MAXIMUM (0) */ 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ 0x75, 0x08, /* REPORT_SIZE (8) */ 0x95, 0x01, /* REPORT_COUNT (1) */ 0x81, 0x06, /* INPUT (Data,Var,Rel) */ 0xc0, /* END_COLLECTION */ 0xc0 /* END_COLLECTION */ }; struct umouse_report { uint8_t buttons; /* bits: 0 left, 1 right, 2 middle */ int16_t x; /* x position */ int16_t y; /* y position */ int8_t z; /* z wheel position */ } __packed; #define MSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } static struct usb_device_descriptor umouse_dev_desc = { .bLength = sizeof(umouse_dev_desc), .bDescriptorType = UDESC_DEVICE, MSETW(.bcdUSB, UD_USB_3_0), .bMaxPacketSize = 8, /* max packet size */ MSETW(.idVendor, 0xFB5D), /* vendor */ MSETW(.idProduct, 0x0001), /* product */ MSETW(.bcdDevice, 0), /* device version */ .iManufacturer = UMSTR_MANUFACTURER, .iProduct = UMSTR_PRODUCT, .iSerialNumber = UMSTR_SERIAL, 
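	/*
	 * MSETW/HSETW pack a 16-bit value into two bytes, least significant
	 * byte first, matching USB's little-endian wire format; e.g.
	 * MSETW(.idVendor, 0xFB5D) expands to .idVendor = { 0x5D, 0xFB }.
	 */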
.bNumConfigurations = 1, }; static struct umouse_config_desc umouse_confd = { .confd = { .bLength = sizeof(umouse_confd.confd), .bDescriptorType = UDESC_CONFIG, .wTotalLength[0] = sizeof(umouse_confd), .bNumInterface = 1, .bConfigurationValue = 1, .iConfiguration = UMSTR_CONFIG, .bmAttributes = UC_BUS_POWERED | UC_REMOTE_WAKEUP, .bMaxPower = 0, }, .ifcd = { .bLength = sizeof(umouse_confd.ifcd), .bDescriptorType = UDESC_INTERFACE, .bNumEndpoints = 1, .bInterfaceClass = UICLASS_HID, .bInterfaceSubClass = UISUBCLASS_BOOT, .bInterfaceProtocol = UIPROTO_MOUSE, }, .hidd = { .bLength = sizeof(umouse_confd.hidd), .bDescriptorType = 0x21, .bcdHID = { 0x01, 0x10 }, .bCountryCode = 0, .bNumDescriptors = 1, .bReportDescriptorType = UMOUSE_REPORT_DESC_TYPE, .wItemLength = { sizeof(umouse_report_desc), 0 }, }, .endpd = { .bLength = sizeof(umouse_confd.endpd), .bDescriptorType = UDESC_ENDPOINT, .bEndpointAddress = UE_DIR_IN | UMOUSE_INTR_ENDPT, .bmAttributes = UE_INTERRUPT, .wMaxPacketSize[0] = 8, .bInterval = 0xA, }, .sscompd = { .bLength = sizeof(umouse_confd.sscompd), .bDescriptorType = UDESC_ENDPOINT_SS_COMP, .bMaxBurst = 0, .bmAttributes = 0, MSETW(.wBytesPerInterval, 0), }, }; struct umouse_bos_desc { struct usb_bos_descriptor bosd; struct usb_devcap_ss_descriptor usbssd; } __packed; struct umouse_bos_desc umouse_bosd = { .bosd = { .bLength = sizeof(umouse_bosd.bosd), .bDescriptorType = UDESC_BOS, HSETW(.wTotalLength, sizeof(umouse_bosd)), .bNumDeviceCaps = 1, }, .usbssd = { .bLength = sizeof(umouse_bosd.usbssd), .bDescriptorType = UDESC_DEVICE_CAPABILITY, .bDevCapabilityType = 3, .bmAttributes = 0, HSETW(.wSpeedsSupported, 0x08), .bFunctionalitySupport = 3, .bU1DevExitLat = 0xa, /* dummy - not used */ .wU2DevExitLat = { 0x20, 0x00 }, } }; struct umouse_softc { struct usb_hci *hci; char *opt; struct umouse_report um_report; int newdata; struct { uint8_t idle; uint8_t protocol; uint8_t feature; } hid; pthread_mutex_t mtx; pthread_mutex_t ev_mtx; int polling; struct timeval prev_evt; }; static void umouse_event(uint8_t button, int x, int y, void *arg) { struct umouse_softc *sc; struct bhyvegc_image *gc; gc = console_get_image(); if (gc == NULL) { /* not ready */ return; } sc = arg; pthread_mutex_lock(&sc->mtx); sc->um_report.buttons = 0; sc->um_report.z = 0; if (button & 0x01) sc->um_report.buttons |= 0x01; /* left */ if (button & 0x02) sc->um_report.buttons |= 0x04; /* middle */ if (button & 0x04) sc->um_report.buttons |= 0x02; /* right */ if (button & 0x8) sc->um_report.z = 1; if (button & 0x10) sc->um_report.z = -1; /* scale coords to mouse resolution */ sc->um_report.x = MOUSE_MAX_X * x / gc->width; sc->um_report.y = MOUSE_MAX_Y * y / gc->height; sc->newdata = 1; pthread_mutex_unlock(&sc->mtx); pthread_mutex_lock(&sc->ev_mtx); sc->hci->hci_intr(sc->hci, UE_DIR_IN | UMOUSE_INTR_ENDPT); pthread_mutex_unlock(&sc->ev_mtx); } static void * umouse_init(struct usb_hci *hci, char *opt) { struct umouse_softc *sc; sc = calloc(1, sizeof(struct umouse_softc)); sc->hci = hci; sc->hid.protocol = 1; /* REPORT protocol */ sc->opt = strdup(opt); pthread_mutex_init(&sc->mtx, NULL); pthread_mutex_init(&sc->ev_mtx, NULL); console_ptr_register(umouse_event, sc, 10); return (sc); } #define UREQ(x,y) ((x) | ((y) << 8)) static int umouse_request(void *scarg, struct usb_data_xfer *xfer) { struct umouse_softc *sc; struct usb_data_xfer_block *data; const char *str; uint16_t value; uint16_t index; uint16_t len; uint16_t slen; uint8_t *udata; int err; int i, idx; int eshort; sc = scarg; data = NULL; udata = NULL; idx = 
xfer->head; for (i = 0; i < xfer->ndata; i++) { xfer->data[idx].bdone = 0; if (data == NULL && USB_DATA_OK(xfer,i)) { data = &xfer->data[idx]; udata = data->buf; } xfer->data[idx].processed = 1; idx = (idx + 1) % USB_MAX_XFER_BLOCKS; } err = USB_ERR_NORMAL_COMPLETION; eshort = 0; if (!xfer->ureq) { DPRINTF(("umouse_request: port %d", sc->hci->hci_port)); goto done; } value = UGETW(xfer->ureq->wValue); index = UGETW(xfer->ureq->wIndex); len = UGETW(xfer->ureq->wLength); DPRINTF(("umouse_request: port %d, type 0x%x, req 0x%x, val 0x%x, " "idx 0x%x, len %u", sc->hci->hci_port, xfer->ureq->bmRequestType, xfer->ureq->bRequest, value, index, len)); switch (UREQ(xfer->ureq->bRequest, xfer->ureq->bmRequestType)) { case UREQ(UR_GET_CONFIG, UT_READ_DEVICE): DPRINTF(("umouse: (UR_GET_CONFIG, UT_READ_DEVICE)")); if (!data) break; *udata = umouse_confd.confd.bConfigurationValue; data->blen = len > 0 ? len - 1 : 0; eshort = data->blen > 0; data->bdone += 1; break; case UREQ(UR_GET_DESCRIPTOR, UT_READ_DEVICE): DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_DEVICE) val %x", value >> 8)); if (!data) break; switch (value >> 8) { case UDESC_DEVICE: DPRINTF(("umouse: (->UDESC_DEVICE) len %u ?= " "sizeof(umouse_dev_desc) %lu", len, sizeof(umouse_dev_desc))); if ((value & 0xFF) != 0) { err = USB_ERR_IOERROR; goto done; } if (len > sizeof(umouse_dev_desc)) { data->blen = len - sizeof(umouse_dev_desc); len = sizeof(umouse_dev_desc); } else data->blen = 0; memcpy(data->buf, &umouse_dev_desc, len); data->bdone += len; break; case UDESC_CONFIG: DPRINTF(("umouse: (->UDESC_CONFIG)")); if ((value & 0xFF) != 0) { err = USB_ERR_IOERROR; goto done; } if (len > sizeof(umouse_confd)) { data->blen = len - sizeof(umouse_confd); len = sizeof(umouse_confd); } else data->blen = 0; memcpy(data->buf, &umouse_confd, len); data->bdone += len; break; case UDESC_STRING: DPRINTF(("umouse: (->UDESC_STRING)")); str = NULL; if ((value & 0xFF) < UMSTR_MAX) str = umouse_desc_strings[value & 0xFF]; else goto done; if ((value & 0xFF) == UMSTR_LANG) { udata[0] = 4; udata[1] = UDESC_STRING; data->blen = len - 2; len -= 2; data->bdone += 2; if (len >= 2) { udata[2] = str[0]; udata[3] = str[1]; data->blen -= 2; data->bdone += 2; } else data->blen = 0; goto done; } slen = 2 + strlen(str) * 2; udata[0] = slen; udata[1] = UDESC_STRING; if (len > slen) { data->blen = len - slen; len = slen; } else data->blen = 0; for (i = 2; i < len; i += 2) { udata[i] = *str++; udata[i+1] = '\0'; } data->bdone += slen; break; case UDESC_BOS: DPRINTF(("umouse: USB3 BOS")); if (len > sizeof(umouse_bosd)) { data->blen = len - sizeof(umouse_bosd); len = sizeof(umouse_bosd); } else data->blen = 0; memcpy(udata, &umouse_bosd, len); data->bdone += len; break; default: DPRINTF(("umouse: unknown(%d)->ERROR", value >> 8)); err = USB_ERR_IOERROR; goto done; } eshort = data->blen > 0; break; case UREQ(UR_GET_DESCRIPTOR, UT_READ_INTERFACE): DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_INTERFACE) " "0x%x", (value >> 8))); if (!data) break; switch (value >> 8) { case UMOUSE_REPORT_DESC_TYPE: if (len > sizeof(umouse_report_desc)) { data->blen = len - sizeof(umouse_report_desc); len = sizeof(umouse_report_desc); } else data->blen = 0; memcpy(data->buf, umouse_report_desc, len); data->bdone += len; break; default: DPRINTF(("umouse: IO ERROR")); err = USB_ERR_IOERROR; goto done; } eshort = data->blen > 0; break; case UREQ(UR_GET_INTERFACE, UT_READ_INTERFACE): DPRINTF(("umouse: (UR_GET_INTERFACE, UT_READ_INTERFACE)")); if (index != 0) { DPRINTF(("umouse get_interface, invalid index 
%d", index)); err = USB_ERR_IOERROR; goto done; } if (!data) break; if (len > 0) { *udata = 0; data->blen = len - 1; } eshort = data->blen > 0; data->bdone += 1; break; case UREQ(UR_GET_STATUS, UT_READ_DEVICE): DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_DEVICE)")); if (data != NULL && len > 1) { if (sc->hid.feature == UF_DEVICE_REMOTE_WAKEUP) USETW(udata, UDS_REMOTE_WAKEUP); else USETW(udata, 0); data->blen = len - 2; data->bdone += 2; } eshort = data->blen > 0; break; case UREQ(UR_GET_STATUS, UT_READ_INTERFACE): case UREQ(UR_GET_STATUS, UT_READ_ENDPOINT): DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_INTERFACE)")); if (data != NULL && len > 1) { USETW(udata, 0); data->blen = len - 2; data->bdone += 2; } eshort = data->blen > 0; break; case UREQ(UR_SET_ADDRESS, UT_WRITE_DEVICE): /* XXX Controller should've handled this */ DPRINTF(("umouse set address %u", value)); break; case UREQ(UR_SET_CONFIG, UT_WRITE_DEVICE): DPRINTF(("umouse set config %u", value)); break; case UREQ(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE): DPRINTF(("umouse set descriptor %u", value)); break; case UREQ(UR_CLEAR_FEATURE, UT_WRITE_DEVICE): DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x", value)); if (value == UF_DEVICE_REMOTE_WAKEUP) sc->hid.feature = 0; break; case UREQ(UR_SET_FEATURE, UT_WRITE_DEVICE): DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x", value)); if (value == UF_DEVICE_REMOTE_WAKEUP) sc->hid.feature = UF_DEVICE_REMOTE_WAKEUP; break; case UREQ(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE): case UREQ(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT): case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE): case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT): DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)")); err = USB_ERR_IOERROR; goto done; case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE): DPRINTF(("umouse set interface %u", value)); break; case UREQ(UR_ISOCH_DELAY, UT_WRITE_DEVICE): DPRINTF(("umouse set isoch delay %u", value)); break; case UREQ(UR_SET_SEL, 0): DPRINTF(("umouse set sel")); break; case UREQ(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT): DPRINTF(("umouse synch frame")); break; /* HID device requests */ case UREQ(UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE): DPRINTF(("umouse: (UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE) " "0x%x", (value >> 8))); if (!data) break; if ((value >> 8) == 0x01 && len >= sizeof(sc->um_report)) { /* TODO read from backend */ if (len > sizeof(sc->um_report)) { data->blen = len - sizeof(sc->um_report); len = sizeof(sc->um_report); } else data->blen = 0; memcpy(data->buf, &sc->um_report, len); data->bdone += len; } else { err = USB_ERR_IOERROR; goto done; } eshort = data->blen > 0; break; case UREQ(UMOUSE_GET_IDLE, UT_READ_CLASS_INTERFACE): if (data != NULL && len > 0) { *udata = sc->hid.idle; data->blen = len - 1; data->bdone += 1; } eshort = data->blen > 0; break; case UREQ(UMOUSE_GET_PROTOCOL, UT_READ_CLASS_INTERFACE): if (data != NULL && len > 0) { *udata = sc->hid.protocol; data->blen = len - 1; data->bdone += 1; } eshort = data->blen > 0; break; case UREQ(UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE): DPRINTF(("umouse: (UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE) ignored")); break; case UREQ(UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE): sc->hid.idle = UGETW(xfer->ureq->wValue) >> 8; DPRINTF(("umouse: (UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE) %x", sc->hid.idle)); break; case UREQ(UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE): sc->hid.protocol = UGETW(xfer->ureq->wValue) >> 8; DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_CLASS_INTERFACE) %x", sc->hid.protocol)); break; default: DPRINTF(("**** 
umouse request unhandled")); err = USB_ERR_IOERROR; break; } done: if (xfer->ureq && (xfer->ureq->bmRequestType & UT_WRITE) && (err == USB_ERR_NORMAL_COMPLETION) && (data != NULL)) data->blen = 0; else if (eshort) err = USB_ERR_SHORT_XFER; DPRINTF(("umouse request error code %d (0=ok), blen %u txlen %u", err, (data ? data->blen : 0), (data ? data->bdone : 0))); return (err); } static int umouse_data_handler(void *scarg, struct usb_data_xfer *xfer, int dir, int epctx) { struct umouse_softc *sc; struct usb_data_xfer_block *data; uint8_t *udata; int len, i, idx; int err; DPRINTF(("umouse handle data - DIR=%s|EP=%d, blen %d", dir ? "IN" : "OUT", epctx, xfer->data[0].blen)); /* find buffer to add data */ udata = NULL; err = USB_ERR_NORMAL_COMPLETION; /* handle xfer at first unprocessed item with buffer */ data = NULL; idx = xfer->head; for (i = 0; i < xfer->ndata; i++) { data = &xfer->data[idx]; if (data->buf != NULL && data->blen != 0) { break; } else { data->processed = 1; data = NULL; } idx = (idx + 1) % USB_MAX_XFER_BLOCKS; } if (!data) goto done; udata = data->buf; len = data->blen; if (udata == NULL) { DPRINTF(("umouse no buffer provided for input")); err = USB_ERR_NOMEM; goto done; } sc = scarg; if (dir) { pthread_mutex_lock(&sc->mtx); if (!sc->newdata) { err = USB_ERR_CANCELLED; USB_DATA_SET_ERRCODE(&xfer->data[xfer->head], USB_NAK); pthread_mutex_unlock(&sc->mtx); goto done; } if (sc->polling) { err = USB_ERR_STALLED; USB_DATA_SET_ERRCODE(data, USB_STALL); pthread_mutex_unlock(&sc->mtx); goto done; } sc->polling = 1; if (len > 0) { sc->newdata = 0; data->processed = 1; data->bdone += 6; memcpy(udata, &sc->um_report, 6); data->blen = len - 6; if (data->blen > 0) err = USB_ERR_SHORT_XFER; } sc->polling = 0; pthread_mutex_unlock(&sc->mtx); } else { USB_DATA_SET_ERRCODE(data, USB_STALL); err = USB_ERR_STALLED; } done: return (err); } static int umouse_reset(void *scarg) { struct umouse_softc *sc; sc = scarg; sc->newdata = 0; return (0); } static int umouse_remove(void *scarg) { return (0); } static int umouse_stop(void *scarg) { return (0); } +#ifdef BHYVE_SNAPSHOT +static int +umouse_snapshot(void *scarg, struct vm_snapshot_meta *meta) +{ + int ret; + struct umouse_softc *sc; + + sc = scarg; + + SNAPSHOT_VAR_OR_LEAVE(sc->um_report, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->newdata, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hid.idle, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hid.protocol, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hid.feature, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->polling, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_usec, meta, ret, done); + +done: + return (ret); +} +#endif struct usb_devemu ue_mouse = { .ue_emu = "tablet", .ue_usbver = 3, .ue_usbspeed = USB_SPEED_HIGH, .ue_init = umouse_init, .ue_request = umouse_request, .ue_data = umouse_data_handler, .ue_reset = umouse_reset, .ue_remove = umouse_remove, - .ue_stop = umouse_stop + .ue_stop = umouse_stop, +#ifdef BHYVE_SNAPSHOT + .ue_snapshot = umouse_snapshot, +#endif }; USB_EMUL_SET(ue_mouse); diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c index d899a5779570..f3deb72b081c 100644 --- a/usr.sbin/bhyve/virtio.c +++ b/usr.sbin/bhyve/virtio.c @@ -1,808 +1,956 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * Copyright (c) 2019 Joyent, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" /* * Functions for dealing with generalized "virtual devices" as * defined by */ /* * In case we decide to relax the "virtio softc comes at the * front of virtio-based device softc" constraint, let's use * this to convert. */ #define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and * the PCI emulation. */ void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues) { int i; /* vs and dev_softc addresses must match */ assert((void *)vs == dev_softc); vs->vs_vc = vc; vs->vs_pi = pi; pi->pi_arg = vs; vs->vs_queues = queues; for (i = 0; i < vc->vc_nvq; i++) { queues[i].vq_vs = vs; queues[i].vq_num = i; } } /* * Reset device (device-wide). This erases all queues, i.e., * all the queues become invalid (though we don't wipe out the * internal pointers, we just clear the VQ_ALLOC flag). * * It resets negotiated features to "none". * * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. */ void vi_reset_dev(struct virtio_softc *vs) { struct vqueue_info *vq; int i, nvq; if (vs->vs_mtx) assert(pthread_mutex_isowned_np(vs->vs_mtx)); nvq = vs->vs_vc->vc_nvq; for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } vs->vs_negotiated_caps = 0; vs->vs_curq = 0; /* vs->vs_status = 0; -- redundant */ if (vs->vs_isr) pci_lintr_deassert(vs->vs_pi); vs->vs_isr = 0; vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; } /* * Set I/O BAR (usually 0) to map PCI config registers. */ void vi_set_io_bar(struct virtio_softc *vs, int barnum) { size_t size; /* * ??? should we use CFG0 if MSI-X is disabled? * Existing code did not... */ size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); } /* * Initialize MSI-X vector capabilities if we're to use MSI-X, * or MSI capabilities if not. * * We assume we want one MSI-X vector per queue, here, plus one * for the config vec. 
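 * For example (numbers purely illustrative): a device exposing three
 * virtqueues has vc_nvq == 3 and therefore requests nvec = 3 + 1 = 4
 * MSI-X table entries, while a single-queue device requests 2.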
*/ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) { int nvec; if (use_msix) { vs->vs_flags |= VIRTIO_USE_MSIX; VS_LOCK(vs); vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ VS_UNLOCK(vs); nvec = vs->vs_vc->vc_nvq + 1; if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); /* Legacy interrupts are mandatory for virtio devices */ pci_lintr_request(vs->vs_pi); return (0); } /* * Initialize the currently-selected virtio queue (vs->vs_curq). * The guest just gave us a page frame number, from which we can * calculate the addresses of the queue. */ void vi_vq_init(struct virtio_softc *vs, uint32_t pfn) { struct vqueue_info *vq; uint64_t phys; size_t size; char *base; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_pfn = pfn; phys = (uint64_t)pfn << VRING_PFN; size = vring_size(vq->vq_qsize); base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); /* First page(s) are descriptors... */ vq->vq_desc = (struct virtio_desc *)base; base += vq->vq_qsize * sizeof(struct virtio_desc); /* ... immediately followed by "avail" ring (entirely uint16_t's) */ vq->vq_avail = (struct vring_avail *)base; base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* Then it's rounded up to the next page... */ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); /* ... and the last page(s) are the used ring. */ vq->vq_used = (struct vring_used *)base; /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; } /* * Helper inline for vq_getchain(): record the i'th "real" * descriptor. */ static inline void _vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, struct iovec *iov, int n_iov, uint16_t *flags) { if (i >= n_iov) return; iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); iov[i].iov_len = vd->vd_len; if (flags != NULL) flags[i] = vd->vd_flags; } #define VQ_MAX_DESCRIPTORS 512 /* see below */ /* * Examine the chain of descriptors starting at the "next one" to * make sure that they describe a sensible request. If so, return * the number of "real" descriptors that would be needed/used in * acting on this request. This may be smaller than the number of * available descriptors, e.g., if there are two available but * they are two separate requests, this just returns 1. Or, it * may be larger: if there are indirect descriptors involved, * there may only be one descriptor available but it may be an * indirect pointing to eight more. We return 8 in this case, * i.e., we do not count the indirect descriptors, only the "real" * ones. * * Basically, this vets the vd_flags and vd_next field of each * descriptor and tells you how many are involved. Since some may * be indirect, this also needs the vmctx (in the pci_devinst * at vs->vs_pi) so that it can find indirect descriptors. * * As we process each descriptor, we copy and adjust it (guest to * host address wise, also using the vmtctx) into the given iov[] * array (of the given size). If the array overflows, we stop * placing values into the array but keep processing descriptors, * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. * So you, the caller, must not assume that iov[] is as big as the * return value (you can process the same thing twice to allocate * a larger iov array if needed, or supply a zero length to find * out how much space is needed). 
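 *
 * A typical processing loop looks roughly like this (sketch only; the iov
 * array size, flag handling and completion length are device-specific):
 *
 *	struct iovec iov[8];
 *	uint16_t flags[8], idx;
 *	int n;
 *
 *	while (vq_has_descs(vq)) {
 *		n = vq_getchain(vq, &idx, iov, 8, flags);
 *		if (n <= 0)
 *			break;		// 0: ring empty, -1: malformed chain
 *		// only the first MIN(n, 8) iov entries were filled in
 *		// ... consume or fill those entries ...
 *		vq_relchain(vq, idx, bytes_written);
 *	}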
* * If you want to verify the WRITE flag on each descriptor, pass a * non-NULL "flags" pointer to an array of "uint16_t" of the same size * as n_iov and we'll copy each vd_flags field after unwinding any * indirects. * * If some descriptor(s) are invalid, this prints a diagnostic message * and returns -1. If no descriptors are ready now it simply returns 0. * * You are assumed to have done a vq_ring_ready() if needed (note * that vq_has_descs() does one). */ int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags) { int i; u_int ndesc, n_indir; u_int idx, next; volatile struct virtio_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; const char *name; vs = vq->vq_vs; name = vs->vs_vc->vc_name; /* * Note: it's the responsibility of the guest not to * update vq->vq_avail->va_idx until all of the descriptors * the guest has written are valid (including all their * vd_next fields and vd_flags). * * Compute (va_idx - last_avail) in integers mod 2**16. This is * the number of descriptors the device has made available * since the last time we updated vq->vq_last_avail. * * We just need to do the subtraction as an unsigned int, * then trim off excess bits. */ idx = vq->vq_last_avail; ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); if (ndesc == 0) return (0); if (ndesc > vq->vq_qsize) { /* XXX need better way to diagnose issues */ EPRINTLN( "%s: ndesc (%u) out of range, driver confused?", name, (u_int)ndesc); return (-1); } /* * Now count/parse "involved" descriptors starting from * the head of the chain. * * To prevent loops, we could be more complicated and * check whether we're re-visiting a previously visited * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { if (next >= vq->vq_qsize) { EPRINTLN( "%s: descriptor index %u out of range, " "driver confused?", name, next); return (-1); } vdir = &vq->vq_desc[next]; if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, n_iov, flags); i++; } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { EPRINTLN( "%s: descriptor has forbidden INDIRECT flag, " "driver confused?", name); return (-1); } else { n_indir = vdir->vd_len / 16; if ((vdir->vd_len & 0xf) || n_indir == 0) { EPRINTLN( "%s: invalid indir len 0x%x, " "driver confused?", name, (u_int)vdir->vd_len); return (-1); } vindir = paddr_guest2host(ctx, vdir->vd_addr, vdir->vd_len); /* * Indirects start at the 0th, then follow * their own embedded "next"s until those run * out. Each one's indirect flag must be off * (we don't really have to check, could just * ignore errors...). */ next = 0; for (;;) { vp = &vindir[next]; if (vp->vd_flags & VRING_DESC_F_INDIRECT) { EPRINTLN( "%s: indirect desc has INDIR flag," " driver confused?", name); return (-1); } _vq_record(i, vp, ctx, iov, n_iov, flags); if (++i > VQ_MAX_DESCRIPTORS) goto loopy; if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) break; next = vp->vd_next; if (next >= n_indir) { EPRINTLN( "%s: invalid next %u > %u, " "driver confused?", name, (u_int)next, n_indir); return (-1); } } } if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) return (i); } loopy: EPRINTLN( "%s: descriptor loop? count > %d - driver confused?", name, i); return (-1); } /* * Return the first n_chain request chains back to the available queue. 
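 * A device does this when it has pulled chains off the ring but cannot
 * service them yet (backend buffer exhaustion and the like); after
 * vq_retchains(vq, 1) the same chain is handed out again by the next
 * vq_getchain() call.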
* * (These chains are the ones you handled when you called vq_getchain() * and used its positive return value.) */ void vq_retchains(struct vqueue_info *vq, uint16_t n_chains) { vq->vq_last_avail -= n_chains; } void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { volatile struct vring_used *vuh; volatile struct virtio_used *vue; uint16_t mask; /* * Notes: * - mask is N-1 where N is a power of 2 so computes x % N * - vuh points to the "used" data shared with guest * - vue points to the "used" ring entry we want to update * * (I apologize for the two fields named vu_idx; the * virtio spec calls the one that vue points to, "id"...) */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; vue = &vuh->vu_ring[vq->vq_next_used++ & mask]; vue->vu_idx = idx; vue->vu_tlen = iolen; } void vq_relchain_publish(struct vqueue_info *vq) { /* * Ensure the used descriptor is visible before updating the index. * This is necessary on ISAs with memory ordering less strict than x86 * (and even on x86 to act as a compiler barrier). */ atomic_thread_fence_rel(); vq->vq_used->vu_idx = vq->vq_next_used; } /* * Return specified request chain to the guest, setting its I/O length * to the provided value. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { vq_relchain_prepare(vq, idx, iolen); vq_relchain_publish(vq); } /* * Driver has finished processing "available" chains and calling * vq_relchain on each one. If driver used all the available * chains, used_all should be set. * * If the "used" index moved we may need to inform the guest, i.e., * deliver an interrupt. Even if the used index did NOT move we * may need to deliver an interrupt, if the avail ring is empty and * we are supposed to interrupt on empty. * * Note that used_all_avail is provided by the caller because it's * a snapshot of the ring state when he decided to finish interrupt * processing -- it's possible that descriptors became available after * that point. (It's also typically a constant 1/True as well.) */ void vq_endchains(struct vqueue_info *vq, int used_all_avail) { struct virtio_softc *vs; uint16_t event_idx, new_idx, old_idx; int intr; /* * Interrupt generation: if we're using EVENT_IDX, * interrupt if we've crossed the event threshold. * Otherwise interrupt is generated if we added "used" entries, * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. * * In any case, though, if NOTIFY_ON_EMPTY is set and the * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; old_idx = vq->vq_save_used; vq->vq_save_used = new_idx = vq->vq_used->vu_idx; /* * Use full memory barrier between vu_idx store from preceding * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or * va_flags below. */ atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { event_idx = VQ_USED_EVENT_IDX(vq); /* * This calculation is per docs and the kernel * (see src/sys/dev/virtio/virtio_ring.h). 
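 * Worked example, all arithmetic mod 2^16: with old_idx = 5 and
 * new_idx = 8, an event_idx of 6 gives (8 - 6 - 1) = 1 < (8 - 5) = 3,
 * i.e. the used index moved past the guest's event threshold, so we
 * interrupt; an event_idx of 9 gives (8 - 9 - 1) = 0xfffe, which is not
 * < 3, so the interrupt is suppressed.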
*/ intr = (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx); } else { intr = new_idx != old_idx && !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); } if (intr) vq_interrupt(vs, vq); } /* Note: these are in sorted order to make for a fast search */ static struct config_reg { uint16_t cr_offset; /* register offset */ uint8_t cr_size; /* size (bytes) */ uint8_t cr_ro; /* true => reg is read only */ const char *cr_name; /* name of reg */ } config_regs[] = { { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" }, { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" }, { VTCFG_R_PFN, 4, 0, "PFN" }, { VTCFG_R_QNUM, 2, 1, "QNUM" }, { VTCFG_R_QSEL, 2, 0, "QSEL" }, { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" }, { VTCFG_R_STATUS, 1, 0, "STATUS" }, { VTCFG_R_ISR, 1, 0, "ISR" }, { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" }, { VTCFG_R_QVEC, 2, 0, "QVEC" }, }; static inline struct config_reg * vi_find_cr(int offset) { u_int hi, lo, mid; struct config_reg *cr; lo = 0; hi = sizeof(config_regs) / sizeof(*config_regs) - 1; while (hi >= lo) { mid = (hi + lo) >> 1; cr = &config_regs[mid]; if (cr->cr_offset == offset) return (cr); if (cr->cr_offset < offset) lo = mid + 1; else hi = mid - 1; } return (NULL); } /* * Handle pci config space reads. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct virtio_softc *vs = pi->pi_arg; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; uint32_t value; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { return (pci_emul_msix_tread(pi, offset, size)); } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; if (size != 1 && size != 2 && size != 4) goto bad; if (pci_msix_enabled(pi)) virtio_config_size = VTCFG_R_CFG1; else virtio_config_size = VTCFG_R_CFG0; if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. * If that fails, fall into general code. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size) { if (cr != NULL) { /* offset must be OK, so size must be bad */ EPRINTLN( "%s: read from %s: bad size %d", name, cr->cr_name, size); } else { EPRINTLN( "%s: read from bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VTCFG_R_HOSTCAP: value = vc->vc_hv_caps; break; case VTCFG_R_GUESTCAP: value = vs->vs_negotiated_caps; break; case VTCFG_R_PFN: if (vs->vs_curq < vc->vc_nvq) value = vs->vs_queues[vs->vs_curq].vq_pfn; break; case VTCFG_R_QNUM: value = vs->vs_curq < vc->vc_nvq ? 
vs->vs_queues[vs->vs_curq].vq_qsize : 0; break; case VTCFG_R_QSEL: value = vs->vs_curq; break; case VTCFG_R_QNOTIFY: value = 0; /* XXX */ break; case VTCFG_R_STATUS: value = vs->vs_status; break; case VTCFG_R_ISR: value = vs->vs_isr; vs->vs_isr = 0; /* a read clears this flag */ if (value) pci_lintr_deassert(pi); break; case VTCFG_R_CFGVEC: value = vs->vs_msix_cfg_idx; break; case VTCFG_R_QVEC: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_msix_idx : VIRTIO_MSI_NO_VECTOR; break; } done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); return (value); } /* * Handle pci config space writes. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct virtio_softc *vs = pi->pi_arg; struct vqueue_info *vq; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { pci_emul_msix_twrite(pi, offset, size, value); return; } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; if (size != 1 && size != 2 && size != 4) goto bad; if (pci_msix_enabled(pi)) virtio_config_size = VTCFG_R_CFG1; else virtio_config_size = VTCFG_R_CFG0; if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size || cr->cr_ro) { if (cr != NULL) { /* offset must be OK, wrong size and/or reg is R/O */ if (cr->cr_size != size) EPRINTLN( "%s: write to %s: bad size %d", name, cr->cr_name, size); if (cr->cr_ro) EPRINTLN( "%s: write to read-only reg %s", name, cr->cr_name); } else { EPRINTLN( "%s: write to bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VTCFG_R_GUESTCAP: vs->vs_negotiated_caps = value & vc->vc_hv_caps; if (vc->vc_apply_features) (*vc->vc_apply_features)(DEV_SOFTC(vs), vs->vs_negotiated_caps); break; case VTCFG_R_PFN: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vi_vq_init(vs, value); break; case VTCFG_R_QSEL: /* * Note that the guest is allowed to select an * invalid queue; we just need to return a QNUM * of 0 while the bad queue is selected. 
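 * For example, a guest probing for queues on a two-queue device may write
 * QSEL = 5; the write is stored as-is, a subsequent QNUM read (see
 * vi_pci_read() above) returns 0 because vs_curq >= vc_nvq, and writes to
 * PFN or QVEC while the bad queue is selected are rejected via bad_qindex.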
*/ vs->vs_curq = value; break; case VTCFG_R_QNOTIFY: if (value >= vc->vc_nvq) { EPRINTLN("%s: queue %d notify out of range", name, (int)value); goto done; } vq = &vs->vs_queues[value]; if (vq->vq_notify) (*vq->vq_notify)(DEV_SOFTC(vs), vq); else if (vc->vc_qnotify) (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); else EPRINTLN( "%s: qnotify queue %d: missing vq/vc notify", name, (int)value); break; case VTCFG_R_STATUS: vs->vs_status = value; if (value == 0) (*vc->vc_reset)(DEV_SOFTC(vs)); break; case VTCFG_R_CFGVEC: vs->vs_msix_cfg_idx = value; break; case VTCFG_R_QVEC: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_msix_idx = value; break; } goto done; bad_qindex: EPRINTLN( "%s: write config reg %s: curq %d >= max %d", name, cr->cr_name, vs->vs_curq, vc->vc_nvq); done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); } + +#ifdef BHYVE_SNAPSHOT +int +vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct virtio_softc *vs; + struct virtio_consts *vc; + + vs = pi->pi_arg; + vc = vs->vs_vc; + + vc = vs->vs_vc; + assert(vc->vc_pause != NULL); + (*vc->vc_pause)(DEV_SOFTC(vs)); + + return (0); +} + +int +vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct virtio_softc *vs; + struct virtio_consts *vc; + + vs = pi->pi_arg; + vc = vs->vs_vc; + + vc = vs->vs_vc; + assert(vc->vc_resume != NULL); + (*vc->vc_resume)(DEV_SOFTC(vs)); + + return (0); +} + +static int +vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done); + +done: + return (ret); +} + +static int +vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done); + SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done); + SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done); + +done: + return (ret); +} + +static int +vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta) +{ + int i; + int ret; + struct virtio_consts *vc; + struct vqueue_info *vq; + uint64_t addr_size; + + vc = vs->vs_vc; + + /* Save virtio queue info */ + for (i = 0; i < vc->vc_nvq; i++) { + vq = &vs->vs_queues[i]; + + SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done); + SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_next_used, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done); + + addr_size = vq->vq_qsize * sizeof(struct virtio_desc); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size, + false, meta, ret, done); + + addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size, + false, meta, ret, done); + + addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size, + false, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size(vq->vq_qsize), + meta, ret, done); + } + +done: 
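+	/*
+	 * Note on the device hook (sketch of intent, not extra code): a
+	 * virtio device with state beyond the generic softc and queues sets
+	 * .vc_snapshot in its virtio_consts, and vi_pci_snapshot() below
+	 * calls it after the queue state above has been saved; devices wired
+	 * into vi_pci_pause()/vi_pci_resume() must likewise provide
+	 * vc_pause/vc_resume, since those entry points assert they are set.
+	 */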
+ return (ret); +} + +int +vi_pci_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_devinst *pi; + struct virtio_softc *vs; + struct virtio_consts *vc; + + pi = meta->dev_data; + vs = pi->pi_arg; + vc = vs->vs_vc; + + /* Save virtio softc */ + ret = vi_pci_snapshot_softc(vs, meta); + if (ret != 0) + goto done; + + /* Save virtio consts */ + ret = vi_pci_snapshot_consts(vc, meta); + if (ret != 0) + goto done; + + /* Save virtio queue info */ + ret = vi_pci_snapshot_queues(vs, meta); + if (ret != 0) + goto done; + + /* Save device softc, if needed */ + if (vc->vc_snapshot != NULL) { + ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta); + if (ret != 0) + goto done; + } + +done: + return (ret); +} +#endif diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index ab95d9d213e3..e9432e012b27 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -1,494 +1,504 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VIRTIO_H_ #define _VIRTIO_H_ #include /* * These are derived from several virtio specifications. * * Some useful links: * https://github.com/rustyrussell/virtio-spec * http://people.redhat.com/pbonzini/virtio-spec.pdf */ /* * A virtual device has zero or more "virtual queues" (virtqueue). * Each virtqueue uses at least two 4096-byte pages, laid out thus: * * +-----------------------------------------------+ * | "desc": descriptors, 16 bytes each | * | ----------------------------------------- | * | "avail": 2 uint16; uint16; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * | "used": 2 x uint16; elems; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * * The number that appears here is always a power of two and is * limited to no more than 32768 (as it must fit in a 16-bit field). * If is sufficiently large, the above will occupy more than * two pages. In any case, all pages must be physically contiguous * within the guest's physical address space. 
* * The 16-byte "desc" descriptors consist of a 64-bit guest * physical address , a 32-bit length , a 16-bit * , and a 16-bit field (all in guest byte order). * * There are three flags that may be set : * NEXT descriptor is chained, so use its "next" field * WRITE descriptor is for host to write into guest RAM * (else host is to read from guest RAM) * INDIRECT descriptor address field is (guest physical) * address of a linear array of descriptors * * Unless INDIRECT is set, is the number of bytes that may * be read/written from guest physical address . If * INDIRECT is set, WRITE is ignored and provides the length * of the indirect descriptors (and must be a multiple of * 16). Note that NEXT may still be set in the main descriptor * pointing to the indirect, and should be set in each indirect * descriptor that uses the next descriptor (these should generally * be numbered sequentially). However, INDIRECT must not be set * in the indirect descriptors. Upon reaching an indirect descriptor * without a NEXT bit, control returns to the direct descriptors. * * Except inside an indirect, each value must be in the * range [0 .. N) (i.e., the half-open interval). (Inside an * indirect, each must be in the range [0 .. /16).) * * The "avail" data structures reside in the same pages as the * "desc" structures since both together are used by the device to * pass information to the hypervisor's virtual driver. These * begin with a 16-bit field and 16-bit index , then * have 16-bit values, followed by one final 16-bit * field . The entries are simply indices * indices into the descriptor ring (and thus must meet the same * constraints as each value). However, is counted * up from 0 (initially) and simply wraps around after 65535; it * is taken mod to find the next available entry. * * The "used" ring occupies a separate page or pages, and contains * values written from the virtual driver back to the guest OS. * This begins with a 16-bit and 16-bit , then there * are "vring_used" elements, followed by a 16-bit . * The "vring_used" elements consist of a 32-bit and a * 32-bit (vu_tlen below). The is simply the index of * the head of a descriptor chain the guest made available * earlier, and the is the number of bytes actually written, * e.g., in the case of a network driver that provided a large * receive buffer but received only a small amount of data. * * The two event fields, and , in the * avail and used rings (respectively -- note the reversal!), are * always provided, but are used only if the virtual device * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature * negotiation. Similarly, both rings provide a flag -- * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in * their field, indicating that the guest does not need an * interrupt, or that the hypervisor driver does not need a * notify, when descriptors are added to the corresponding ring. * (These are provided only for interrupt optimization and need * not be implemented.) 
*/ #define VRING_ALIGN 4096 #define VRING_DESC_F_NEXT (1 << 0) #define VRING_DESC_F_WRITE (1 << 1) #define VRING_DESC_F_INDIRECT (1 << 2) struct virtio_desc { /* AKA vring_desc */ uint64_t vd_addr; /* guest physical address */ uint32_t vd_len; /* length of scatter/gather seg */ uint16_t vd_flags; /* VRING_F_DESC_* */ uint16_t vd_next; /* next desc if F_NEXT */ } __packed; struct virtio_used { /* AKA vring_used_elem */ uint32_t vu_idx; /* head of used descriptor chain */ uint32_t vu_tlen; /* length written-to */ } __packed; #define VRING_AVAIL_F_NO_INTERRUPT 1 struct vring_avail { uint16_t va_flags; /* VRING_AVAIL_F_* */ uint16_t va_idx; /* counts to 65535, then cycles */ uint16_t va_ring[]; /* size N, reported in QNUM value */ /* uint16_t va_used_event; -- after N ring entries */ } __packed; #define VRING_USED_F_NO_NOTIFY 1 struct vring_used { uint16_t vu_flags; /* VRING_USED_F_* */ uint16_t vu_idx; /* counts to 65535, then cycles */ struct virtio_used vu_ring[]; /* size N */ /* uint16_t vu_avail_event; -- after N ring entries */ } __packed; /* * The address of any given virtual queue is determined by a single * Page Frame Number register. The guest writes the PFN into the * PCI config space. However, a device that has two or more * virtqueues can have a different PFN, and size, for each queue. * The number of queues is determinable via the PCI config space * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means * queue #0, 1 means queue#1, etc. Once a queue is selected, the * remaining PFN and QNUM registers refer to that queue. * * QNUM is a read-only register containing a nonzero power of two * that indicates the (hypervisor's) queue size. Or, if reading it * produces zero, the hypervisor does not have a corresponding * queue. (The number of possible queues depends on the virtual * device. The block device has just one; the network device * provides either two -- 0 = receive, 1 = transmit -- or three, * with 2 = control.) * * PFN is a read/write register giving the physical page address of * the virtqueue in guest memory (the guest must allocate enough space * based on the hypervisor's provided QNUM). * * QNOTIFY is effectively write-only: when the guest writes a queue * number to the register, the hypervisor should scan the specified * virtqueue. (Reading QNOTIFY currently always gets 0). */ /* * PFN register shift amount */ #define VRING_PFN 12 /* * Virtio device types * * XXX Should really be merged with defines */ #define VIRTIO_TYPE_NET 1 #define VIRTIO_TYPE_BLOCK 2 #define VIRTIO_TYPE_CONSOLE 3 #define VIRTIO_TYPE_ENTROPY 4 #define VIRTIO_TYPE_BALLOON 5 #define VIRTIO_TYPE_IOMEMORY 6 #define VIRTIO_TYPE_RPMSG 7 #define VIRTIO_TYPE_SCSI 8 #define VIRTIO_TYPE_9P 9 /* experimental IDs start at 65535 and work down */ /* * PCI vendor/device IDs */ #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 #define VIRTIO_DEV_CONSOLE 0x1003 #define VIRTIO_DEV_RANDOM 0x1005 #define VIRTIO_DEV_SCSI 0x1008 /* * PCI config space constants. * * If MSI-X is enabled, the ISR register is generally not used, * and the configuration vector and queue vector appear at offsets * 20 and 22 with the remaining configuration registers at 24. * If MSI-X is not enabled, those two registers disappear and * the remaining configuration registers start at offset 20. 
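 *
 * A minimal sketch of the queue-setup sequence implied by the registers
 * described above (editorial illustration only; "bar0", "gpa" and the
 * port I/O helpers outw/inw/outl are placeholders for whatever the guest
 * driver uses, while the offsets are the VTCFG_R_* constants and
 * VRING_PFN shift defined below):
 *
 *	outw(bar0 + VTCFG_R_QSEL, 0);		// select queue 0
 *	qsz = inw(bar0 + VTCFG_R_QNUM);		// hypervisor queue size (0 = no queue)
 *	outl(bar0 + VTCFG_R_PFN, gpa >> VRING_PFN);	// publish ring page frame
 *	// ...fill descriptors and the avail ring...
 *	outw(bar0 + VTCFG_R_QNOTIFY, 0);	// kick the hypervisor to scan queue 0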
*/ #define VTCFG_R_HOSTCAP 0 #define VTCFG_R_GUESTCAP 4 #define VTCFG_R_PFN 8 #define VTCFG_R_QNUM 12 #define VTCFG_R_QSEL 14 #define VTCFG_R_QNOTIFY 16 #define VTCFG_R_STATUS 18 #define VTCFG_R_ISR 19 #define VTCFG_R_CFGVEC 20 #define VTCFG_R_QVEC 22 #define VTCFG_R_CFG0 20 /* No MSI-X */ #define VTCFG_R_CFG1 24 /* With MSI-X */ #define VTCFG_R_MSIX 20 /* * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, * but a guest writing 0 to this register means "please reset". */ #define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */ #define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */ #define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */ #define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */ /* * Bits in VTCFG_R_ISR. These apply only if not using MSI-X. * * (We don't [yet?] ever use CONF_CHANGED.) */ #define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ #define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ #define VIRTIO_MSI_NO_VECTOR 0xFFFF /* * Feature flags. * Note: bits 0 through 23 are reserved to each device type. */ #define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) #define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) #define VIRTIO_RING_F_EVENT_IDX (1 << 29) /* From section 2.3, "Virtqueue Configuration", of the virtio specification */ static inline size_t vring_size(u_int qsz) { size_t size; /* constant 3 below = va_flags, va_idx, va_used_event */ size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz); size = roundup2(size, VRING_ALIGN); /* constant 3 below = vu_flags, vu_idx, vu_avail_event */ size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz; size = roundup2(size, VRING_ALIGN); return (size); } struct vmctx; struct pci_devinst; struct vqueue_info; +struct vm_snapshot_meta; /* * A virtual device, with some number (possibly 0) of virtual * queues and some size (possibly 0) of configuration-space * registers private to the device. The virtio_softc should come * at the front of each "derived class", so that a pointer to the * virtio_softc is also a pointer to the more specific, derived- * from-virtio driver's softc. * * Note: inside each hypervisor virtio driver, changes to these * data structures must be locked against other threads, if any. * Except for PCI config space register read/write, we assume each * driver does the required locking, but we need a pointer to the * lock (if there is one) for PCI config space read/write ops. * * When the guest reads or writes the device's config space, the * generic layer checks for operations on the special registers * described above. If the offset of the register(s) being read * or written is past the CFG area (CFG0 or CFG1), the request is * passed on to the virtual device, after subtracting off the * generic-layer size. (So, drivers can just use the offset as * an offset into "struct config", for instance.) * * (The virtio layer also makes sure that the read or write is to/ * from a "good" config offset, hence vc_cfgsize, and on BAR #0. * However, the driver must verify the read or write size and offset * and that no one is writing a readonly register.) * * The BROKED flag ("this thing done gone and broked") is for future * use. */ #define VIRTIO_USE_MSIX 0x01 #define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ #define VIRTIO_BROKED 0x08 /* ??? 
*/ struct virtio_softc { struct virtio_consts *vs_vc; /* constants (see below) */ int vs_flags; /* VIRTIO_* flags from above */ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ struct pci_devinst *vs_pi; /* PCI device instance */ uint32_t vs_negotiated_caps; /* negotiated capabilities */ struct vqueue_info *vs_queues; /* one per vc_nvq */ int vs_curq; /* current queue */ uint8_t vs_status; /* value from last status write */ uint8_t vs_isr; /* ISR flags, if not MSI-X */ uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ }; #define VS_LOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_lock(vs->vs_mtx); \ } while (0) #define VS_UNLOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_unlock(vs->vs_mtx); \ } while (0) struct virtio_consts { const char *vc_name; /* name of driver (for diagnostics) */ int vc_nvq; /* number of virtual queues */ size_t vc_cfgsize; /* size of dev-specific config regs */ void (*vc_reset)(void *); /* called on virtual device reset */ void (*vc_qnotify)(void *, struct vqueue_info *); /* called on QNOTIFY if no VQ notify */ int (*vc_cfgread)(void *, int, int, uint32_t *); /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ void (*vc_apply_features)(void *, uint64_t); /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ + void (*vc_pause)(void *); /* called to pause device activity */ + void (*vc_resume)(void *); /* called to resume device activity */ + int (*vc_snapshot)(void *, struct vm_snapshot_meta *); + /* called to save / restore device state */ }; /* * Data structure allocated (statically) per virtual queue. * * Drivers may change vq_qsize after a reset. When the guest OS * requests a device reset, the hypervisor first calls * vs->vs_vc->vc_reset(); then the data structure below is * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). * * The remaining fields should only be fussed-with by the generic * code. * * Note: the addresses of vq_desc, vq_avail, and vq_used are all * computable from each other, but it's a lot simpler if we just * keep a pointer to each one. The event indices are similarly * (but more easily) computable, and this time we'll compute them: * they're just XX_ring[N]. */ #define VQ_ALLOC 0x01 /* set once we have a pfn */ #define VQ_BROKED 0x02 /* ??? */ struct vqueue_info { uint16_t vq_qsize; /* size of this queue (a power of 2) */ void (*vq_notify)(void *, struct vqueue_info *); /* called instead of vc_notify, if not NULL */ struct virtio_softc *vq_vs; /* backpointer to softc */ uint16_t vq_num; /* we're the num'th queue in the softc */ uint16_t vq_flags; /* flags (see above) */ uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */ uint16_t vq_next_used; /* index of the next used slot to be filled */ uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */ uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ volatile struct virtio_desc *vq_desc; /* descriptor array */ volatile struct vring_avail *vq_avail; /* the "avail" ring */ volatile struct vring_used *vq_used; /* the "used" ring */ }; /* as noted above, these are sort of backwards, name-wise */ #define VQ_AVAIL_EVENT_IDX(vq) \ (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) #define VQ_USED_EVENT_IDX(vq) \ ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) /* * Is this ring ready for I/O? 
*/ static inline int vq_ring_ready(struct vqueue_info *vq) { return (vq->vq_flags & VQ_ALLOC); } /* * Are there "available" descriptors? (This does not count * how many, just returns True if there are some.) */ static inline int vq_has_descs(struct vqueue_info *vq) { return (vq_ring_ready(vq) && vq->vq_last_avail != vq->vq_avail->va_idx); } /* * Deliver an interrupt to guest on the given virtual queue * (if possible, or a generic MSI interrupt if not using MSI-X). */ static inline void vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) { if (pci_msix_enabled(vs->vs_pi)) pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); else { VS_LOCK(vs); vs->vs_isr |= VTCFG_ISR_QUEUES; pci_generate_msi(vs->vs_pi, 0); pci_lintr_assert(vs->vs_pi); VS_UNLOCK(vs); } } static inline void vq_kick_enable(struct vqueue_info *vq) { vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; /* * Full memory barrier to make sure the store to vu_flags * happens before the load from va_idx, which results from * a subsequent call to vq_has_descs(). */ atomic_thread_fence_seq_cst(); } static inline void vq_kick_disable(struct vqueue_info *vq) { vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; } struct iovec; void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues); int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); void vi_reset_dev(struct virtio_softc *); void vi_set_io_bar(struct virtio_softc *, int); int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags); void vq_retchains(struct vqueue_info *vq, uint16_t n_chains); void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_relchain_publish(struct vqueue_info *vq); void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_endchains(struct vqueue_info *vq, int used_all_avail); uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); +#ifdef BHYVE_SNAPSHOT +int vi_pci_snapshot(struct vm_snapshot_meta *meta); +int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi); +int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi); +#endif #endif /* _VIRTIO_H_ */ diff --git a/usr.sbin/bhyvectl/Makefile b/usr.sbin/bhyvectl/Makefile index 0ffca5675cb9..58eaf49dae3a 100644 --- a/usr.sbin/bhyvectl/Makefile +++ b/usr.sbin/bhyvectl/Makefile @@ -1,17 +1,23 @@ # # $FreeBSD$ # +.include + PROG= bhyvectl SRCS= bhyvectl.c PACKAGE= bhyve MAN= bhyvectl.8 LIBADD= vmmapi util WARNS?= 3 CFLAGS+= -I${SRCTOP}/sys/amd64/vmm +.if ${MK_BHYVE_SNAPSHOT} != "no" +CFLAGS+= -DBHYVE_SNAPSHOT +.endif + .include diff --git a/usr.sbin/bhyvectl/bhyvectl.8 b/usr.sbin/bhyvectl/bhyvectl.8 index 035f9f6c7586..6adf87ca4537 100644 --- a/usr.sbin/bhyvectl/bhyvectl.8 +++ b/usr.sbin/bhyvectl/bhyvectl.8 @@ -1,97 +1,114 @@ .\" Copyright (c) 2015 Christian Brueffer .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. 
Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd November 13, 2016 +.Dd May 04, 2020 .Dt BHYVECTL 8 .Os .Sh NAME .Nm bhyvectl .Nd "control utility for bhyve instances" .Sh SYNOPSIS .Nm .Fl -vm= Ns Ar .Op Fl -create .Op Fl -destroy .Op Fl -get-stats .Op Fl -inject-nmi .Op Fl -force-reset .Op Fl -force-poweroff +.Op Fl -checkpoint= Ns Ar +.Op Fl -suspend= Ns Ar .Sh DESCRIPTION The .Nm command is a control utility for active .Xr bhyve 8 virtual machine instances. .Pp .Em Note : Most .Nm flags are intended for querying and setting the state of an active instance. These commands are intended for development purposes, and are not documented here. A complete list can be obtained by executing .Nm without any arguments. .Pp The user-facing options are as follows: .Bl -tag -width ".Fl d Ar argument" .It Fl -vm= Ns Ar Operate on the virtual machine .Ar . .It Fl -create Create the specified VM. .It Fl -destroy Destroy the specified VM. .It Fl -get-stats Retrieve statistics for the specified VM. .It Fl -inject-nmi Inject a non-maskable interrupt (NMI) into the VM. .It Fl -force-reset Force the VM to reset. .It Fl -force-poweroff Force the VM to power off. +.It Fl -checkpoint= Ns Ar +Save a snapshot of a virtual machine. +The guest memory contents are saved in the file given in +.Ar . +The guest device and vCPU state are saved in the file +.Ar .kern . +.It Fl -suspend= Ns Ar +Save a snapshot of a virtual machine similar to +.Fl -checkpoint . +The virtual machine will terminate after the snapshot has been +saved. .El .Sh EXIT STATUS .Ex -std .Sh EXAMPLES Destroy the VM called fbsd10: .Pp .Dl "bhyvectl --vm=fbsd10 --destroy" +.Sh COMPATIBILITY +The snapshot file format is not yet stable and is subject to future changes. +Backwards compatibility support for the current snapshot file format is not +guaranteed when future changes are made. .Sh SEE ALSO .Xr bhyve 8 , .Xr bhyveload 8 .Sh HISTORY The .Nm command first appeared in .Fx 10.1 . .Sh AUTHORS .An -nosplit The .Nm utility was written by .An Peter Grehan and .An Neel Natu . diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c index 8274e6eafccb..d2c4a1488fe8 100644 --- a/usr.sbin/bhyvectl/bhyvectl.c +++ b/usr.sbin/bhyvectl/bhyvectl.c @@ -1,2350 +1,2469 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include + #include "amd/vmcb.h" #include "intel/vmcs.h" #define MB (1UL << 20) #define GB (1UL << 30) #define REQ_ARG required_argument #define NO_ARG no_argument #define OPT_ARG optional_argument +#define CHECKPOINT_RUN_DIR "/var/run/bhyve/checkpoint" +#define MAX_VMNAME 100 + static const char *progname; static void usage(bool cpu_intel) { (void)fprintf(stderr, "Usage: %s --vm=\n" " [--cpu=]\n" " [--create]\n" " [--destroy]\n" +#ifdef BHYVE_SNAPSHOT + " [--checkpoint=]\n" + " [--suspend=]\n" +#endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" " [--get-desc-ds]\n" " [--set-desc-es]\n" " [--get-desc-es]\n" " [--set-desc-gs]\n" " [--get-desc-gs]\n" " [--set-desc-fs]\n" " [--get-desc-fs]\n" " [--set-desc-cs]\n" " [--get-desc-cs]\n" " [--set-desc-ss]\n" " [--get-desc-ss]\n" " [--set-desc-tr]\n" " [--get-desc-tr]\n" " [--set-desc-ldtr]\n" " [--get-desc-ldtr]\n" " [--set-desc-gdtr]\n" " [--get-desc-gdtr]\n" " [--set-desc-idtr]\n" " [--get-desc-idtr]\n" " [--run]\n" " [--capname=]\n" " [--getcap]\n" " [--setcap=<0|1>]\n" " [--desc-base=]\n" " [--desc-limit=]\n" " [--desc-access=]\n" " [--set-cr0=]\n" " [--get-cr0]\n" " [--set-cr2=]\n" " [--get-cr2]\n" " [--set-cr3=]\n" " [--get-cr3]\n" " [--set-cr4=]\n" " [--get-cr4]\n" " [--set-dr0=]\n" " [--get-dr0]\n" " [--set-dr1=]\n" " [--get-dr1]\n" " [--set-dr2=]\n" " [--get-dr2]\n" " [--set-dr3=]\n" " [--get-dr3]\n" " [--set-dr6=]\n" " [--get-dr6]\n" " [--set-dr7=]\n" " [--get-dr7]\n" " [--set-rsp=]\n" " [--get-rsp]\n" " [--set-rip=]\n" " [--get-rip]\n" " [--get-rax]\n" " [--set-rax=]\n" " [--get-rbx]\n" " [--get-rcx]\n" " [--get-rdx]\n" " [--get-rsi]\n" " [--get-rdi]\n" " [--get-rbp]\n" " [--get-r8]\n" " [--get-r9]\n" " [--get-r10]\n" " [--get-r11]\n" " [--get-r12]\n" " [--get-r13]\n" " [--get-r14]\n" " [--get-r15]\n" " [--set-rflags=]\n" " [--get-rflags]\n" " [--set-cs]\n" " [--get-cs]\n" " [--set-ds]\n" " [--get-ds]\n" " [--set-es]\n" " [--get-es]\n" " [--set-fs]\n" " [--get-fs]\n" " [--set-gs]\n" " [--get-gs]\n" " [--set-ss]\n" " [--get-ss]\n" " [--get-tr]\n" " [--get-ldtr]\n" " [--set-x2apic-state=]\n" " [--get-x2apic-state]\n" " [--unassign-pptdev=]\n" " [--set-mem=]\n" " [--get-lowmem]\n" " 
[--get-highmem]\n" " [--get-gpa-pmap]\n" " [--assert-lapic-lvt=]\n" " [--inject-nmi]\n" " [--force-reset]\n" " [--force-poweroff]\n" " [--get-rtc-time]\n" " [--set-rtc-time=]\n" " [--get-rtc-nvram]\n" " [--set-rtc-nvram=]\n" " [--rtc-nvram-offset=]\n" " [--get-active-cpus]\n" " [--get-suspended-cpus]\n" " [--get-intinfo]\n" " [--get-eptp]\n" " [--set-exception-bitmap]\n" " [--get-exception-bitmap]\n" " [--get-tsc-offset]\n" " [--get-guest-pat]\n" " [--get-io-bitmap-address]\n" " [--get-msr-bitmap]\n" " [--get-msr-bitmap-address]\n" " [--get-guest-sysenter]\n" " [--get-exit-reason]\n" " [--get-cpu-topology]\n", progname); if (cpu_intel) { (void)fprintf(stderr, " [--get-vmcs-pinbased-ctls]\n" " [--get-vmcs-procbased-ctls]\n" " [--get-vmcs-procbased-ctls2]\n" " [--get-vmcs-entry-interruption-info]\n" " [--set-vmcs-entry-interruption-info=]\n" " [--get-vmcs-guest-physical-address\n" " [--get-vmcs-guest-linear-address\n" " [--get-vmcs-host-pat]\n" " [--get-vmcs-host-cr0]\n" " [--get-vmcs-host-cr3]\n" " [--get-vmcs-host-cr4]\n" " [--get-vmcs-host-rip]\n" " [--get-vmcs-host-rsp]\n" " [--get-vmcs-cr0-mask]\n" " [--get-vmcs-cr0-shadow]\n" " [--get-vmcs-cr4-mask]\n" " [--get-vmcs-cr4-shadow]\n" " [--get-vmcs-cr3-targets]\n" " [--get-vmcs-apic-access-address]\n" " [--get-vmcs-virtual-apic-address]\n" " [--get-vmcs-tpr-threshold]\n" " [--get-vmcs-vpid]\n" " [--get-vmcs-instruction-error]\n" " [--get-vmcs-exit-ctls]\n" " [--get-vmcs-entry-ctls]\n" " [--get-vmcs-link]\n" " [--get-vmcs-exit-qualification]\n" " [--get-vmcs-exit-interruption-info]\n" " [--get-vmcs-exit-interruption-error]\n" " [--get-vmcs-interruptibility]\n" ); } else { (void)fprintf(stderr, " [--get-vmcb-intercepts]\n" " [--get-vmcb-asid]\n" " [--get-vmcb-exit-details]\n" " [--get-vmcb-tlb-ctrl]\n" " [--get-vmcb-virq]\n" " [--get-avic-apic-bar]\n" " [--get-avic-backing-page]\n" " [--get-avic-table]\n" ); } exit(1); } static int get_rtc_time, set_rtc_time; static int get_rtc_nvram, set_rtc_nvram; static int rtc_nvram_offset; static uint8_t rtc_nvram_value; static time_t rtc_secs; static int get_stats, getcap, setcap, capval, get_gpa_pmap; static int inject_nmi, assert_lapic_lvt; static int force_reset, force_poweroff; static const char *capname; static int create, destroy, get_memmap, get_memseg; static int get_intinfo; static int get_active_cpus, get_suspended_cpus; static uint64_t memsize; static int set_cr0, get_cr0, set_cr2, get_cr2, set_cr3, get_cr3; static int set_cr4, get_cr4; static int set_efer, get_efer; static int set_dr0, get_dr0; static int set_dr1, get_dr1; static int set_dr2, get_dr2; static int set_dr3, get_dr3; static int set_dr6, get_dr6; static int set_dr7, get_dr7; static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; static int set_rax, get_rax; static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp; static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15; static int set_desc_ds, get_desc_ds; static int set_desc_es, get_desc_es; static int set_desc_fs, get_desc_fs; static int set_desc_gs, get_desc_gs; static int set_desc_cs, get_desc_cs; static int set_desc_ss, get_desc_ss; static int set_desc_gdtr, get_desc_gdtr; static int set_desc_idtr, get_desc_idtr; static int set_desc_tr, get_desc_tr; static int set_desc_ldtr, get_desc_ldtr; static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; static int set_x2apic_state, get_x2apic_state; enum x2apic_state x2apic_state; static 
int unassign_pptdev, bus, slot, func; static int run; static int get_cpu_topology; +#ifdef BHYVE_SNAPSHOT +static int vm_checkpoint_opt; +static int vm_suspend_opt; +#endif /* * VMCB specific. */ static int get_vmcb_intercept, get_vmcb_exit_details, get_vmcb_tlb_ctrl; static int get_vmcb_virq, get_avic_table; /* * VMCS-specific fields */ static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2; static int get_eptp, get_io_bitmap, get_tsc_offset; static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info; static int get_vmcs_interruptibility; uint32_t vmcs_entry_interruption_info; static int get_vmcs_gpa, get_vmcs_gla; static int get_exception_bitmap, set_exception_bitmap, exception_bitmap; static int get_cr0_mask, get_cr0_shadow; static int get_cr4_mask, get_cr4_shadow; static int get_cr3_targets; static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; static int get_msr_bitmap, get_msr_bitmap_address; static int get_vpid_asid; static int get_inst_err, get_exit_ctls, get_entry_ctls; static int get_host_cr0, get_host_cr3, get_host_cr4; static int get_host_rip, get_host_rsp; static int get_guest_pat, get_host_pat; static int get_guest_sysenter, get_vmcs_link; static int get_exit_reason, get_vmcs_exit_qualification; static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; static int get_vmcs_exit_inst_length; static uint64_t desc_base; static uint32_t desc_limit, desc_access; static int get_all; static void dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) { printf("vm exit[%d]\n", vcpu); printf("\trip\t\t0x%016lx\n", vmexit->rip); printf("\tinst_length\t%d\n", vmexit->inst_length); switch (vmexit->exitcode) { case VM_EXITCODE_INOUT: printf("\treason\t\tINOUT\n"); printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); printf("\tflags\t\t%s%s\n", vmexit->u.inout.string ? "STRING " : "", vmexit->u.inout.rep ? 
"REP " : ""); printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); break; case VM_EXITCODE_VMX: printf("\treason\t\tVMX\n"); printf("\tstatus\t\t%d\n", vmexit->u.vmx.status); printf("\texit_reason\t0x%08x (%u)\n", vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); break; case VM_EXITCODE_SVM: printf("\treason\t\tSVM\n"); printf("\texit_reason\t\t%#lx\n", vmexit->u.svm.exitcode); printf("\texitinfo1\t\t%#lx\n", vmexit->u.svm.exitinfo1); printf("\texitinfo2\t\t%#lx\n", vmexit->u.svm.exitinfo2); break; default: printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); break; } } /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000 #define MSR_AMD6TH_END 0xC0001FFF /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000 #define MSR_AMD7TH_END 0xC0011FFF static const char * msr_name(uint32_t msr) { static char buf[32]; switch(msr) { case MSR_TSC: return ("MSR_TSC"); case MSR_EFER: return ("MSR_EFER"); case MSR_STAR: return ("MSR_STAR"); case MSR_LSTAR: return ("MSR_LSTAR"); case MSR_CSTAR: return ("MSR_CSTAR"); case MSR_SF_MASK: return ("MSR_SF_MASK"); case MSR_FSBASE: return ("MSR_FSBASE"); case MSR_GSBASE: return ("MSR_GSBASE"); case MSR_KGSBASE: return ("MSR_KGSBASE"); case MSR_SYSENTER_CS_MSR: return ("MSR_SYSENTER_CS_MSR"); case MSR_SYSENTER_ESP_MSR: return ("MSR_SYSENTER_ESP_MSR"); case MSR_SYSENTER_EIP_MSR: return ("MSR_SYSENTER_EIP_MSR"); case MSR_PAT: return ("MSR_PAT"); } snprintf(buf, sizeof(buf), "MSR %#08x", msr); return (buf); } static inline void print_msr_pm(uint64_t msr, int vcpu, int readable, int writeable) { if (readable || writeable) { printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, readable ? 'R' : '-', writeable ? 'W' : '-'); } } /* * Reference APM vol2, section 15.11 MSR Intercepts. */ static void dump_amd_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 4; bit = (msr % 4) * 2; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 2048; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); /* MSR 0xC0010000 to 0xC0011FF is only for AMD */ byte += 4096; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, writeable); } } /* * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address */ static void dump_intel_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 8; bit = msr & 0x7; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 1024; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 
0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); } } static int dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) { int error, fd, map_size; const char *bitmap; error = -1; bitmap = MAP_FAILED; fd = open("/dev/mem", O_RDONLY, 0); if (fd < 0) { perror("Couldn't open /dev/mem"); goto done; } if (cpu_intel) map_size = PAGE_SIZE; else map_size = 2 * PAGE_SIZE; bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); if (bitmap == MAP_FAILED) { perror("mmap failed"); goto done; } if (cpu_intel) dump_intel_msr_pm(bitmap, vcpu); else dump_amd_msr_pm(bitmap, vcpu); error = 0; done: if (bitmap != MAP_FAILED) munmap((void *)bitmap, map_size); if (fd >= 0) close(fd); return (error); } static int vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) { return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); } static int vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) { return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); } static int vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, uint64_t *ret_val) { return (vm_get_register(ctx, vcpu, VMCB_ACCESS(off, bytes), ret_val)); } static int vm_set_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, uint64_t val) { return (vm_set_register(ctx, vcpu, VMCB_ACCESS(off, bytes), val)); } enum { VMNAME = 1000, /* avoid collision with return values from getopt */ VCPU, SET_MEM, SET_EFER, SET_CR0, SET_CR2, SET_CR3, SET_CR4, SET_DR0, SET_DR1, SET_DR2, SET_DR3, SET_DR6, SET_DR7, SET_RSP, SET_RIP, SET_RAX, SET_RFLAGS, DESC_BASE, DESC_LIMIT, DESC_ACCESS, SET_CS, SET_DS, SET_ES, SET_FS, SET_GS, SET_SS, SET_TR, SET_LDTR, SET_X2APIC_STATE, SET_EXCEPTION_BITMAP, SET_VMCS_ENTRY_INTERRUPTION_INFO, SET_CAP, CAPNAME, UNASSIGN_PPTDEV, GET_GPA_PMAP, ASSERT_LAPIC_LVT, SET_RTC_TIME, SET_RTC_NVRAM, RTC_NVRAM_OFFSET, +#ifdef BHYVE_SNAPSHOT + SET_CHECKPOINT_FILE, + SET_SUSPEND_FILE, +#endif }; static void print_cpus(const char *banner, const cpuset_t *cpus) { int i, first; first = 1; printf("%s:\t", banner); if (!CPU_EMPTY(cpus)) { for (i = 0; i < CPU_SETSIZE; i++) { if (CPU_ISSET(i, cpus)) { printf("%s%d", first ? 
" " : ", ", i); first = 0; } } } else printf(" (none)"); printf("\n"); } static void print_intinfo(const char *banner, uint64_t info) { int type; printf("%s:\t", banner); if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; switch (type) { case VM_INTINFO_HWINTR: printf("extint"); break; case VM_INTINFO_NMI: printf("nmi"); break; case VM_INTINFO_SWINTR: printf("swint"); break; default: printf("exception"); break; } printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); if (info & VM_INTINFO_DEL_ERRCODE) printf(" errcode %#x", (u_int)(info >> 32)); } else { printf("n/a"); } printf("\n"); } static bool cpu_vendor_intel(void) { u_int regs[4]; char cpu_vendor[13]; do_cpuid(0, regs); ((u_int *)&cpu_vendor)[0] = regs[1]; ((u_int *)&cpu_vendor)[1] = regs[3]; ((u_int *)&cpu_vendor)[2] = regs[2]; cpu_vendor[12] = '\0'; if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { return (false); } else if (strcmp(cpu_vendor, "HygonGenuine") == 0) { return (false); } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { return (true); } else { fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); exit(1); } } static int get_all_registers(struct vmctx *ctx, int vcpu) { uint64_t cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; uint64_t rsp, rip, rflags, efer; uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; uint64_t r8, r9, r10, r11, r12, r13, r14, r15; int error = 0; if (!error && (get_efer || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); if (error == 0) printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); } if (!error && (get_cr0 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); if (error == 0) printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); } if (!error && (get_cr2 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR2, &cr2); if (error == 0) printf("cr2[%d]\t\t0x%016lx\n", vcpu, cr2); } if (!error && (get_cr3 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); if (error == 0) printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); } if (!error && (get_cr4 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); if (error == 0) printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); } if (!error && (get_dr0 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR0, &dr0); if (error == 0) printf("dr0[%d]\t\t0x%016lx\n", vcpu, dr0); } if (!error && (get_dr1 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR1, &dr1); if (error == 0) printf("dr1[%d]\t\t0x%016lx\n", vcpu, dr1); } if (!error && (get_dr2 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR2, &dr2); if (error == 0) printf("dr2[%d]\t\t0x%016lx\n", vcpu, dr2); } if (!error && (get_dr3 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR3, &dr3); if (error == 0) printf("dr3[%d]\t\t0x%016lx\n", vcpu, dr3); } if (!error && (get_dr6 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR6, &dr6); if (error == 0) printf("dr6[%d]\t\t0x%016lx\n", vcpu, dr6); } if (!error && (get_dr7 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7); if (error == 0) printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7); } if (!error && (get_rsp || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp); if (error == 0) printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } if (!error && (get_rip || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); if (error == 0) printf("rip[%d]\t\t0x%016lx\n", vcpu, rip); } if (!error && (get_rax || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax); 
if (error == 0) printf("rax[%d]\t\t0x%016lx\n", vcpu, rax); } if (!error && (get_rbx || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx); if (error == 0) printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx); } if (!error && (get_rcx || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx); if (error == 0) printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx); } if (!error && (get_rdx || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx); if (error == 0) printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx); } if (!error && (get_rsi || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi); if (error == 0) printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi); } if (!error && (get_rdi || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi); if (error == 0) printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi); } if (!error && (get_rbp || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp); if (error == 0) printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp); } if (!error && (get_r8 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8); if (error == 0) printf("r8[%d]\t\t0x%016lx\n", vcpu, r8); } if (!error && (get_r9 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9); if (error == 0) printf("r9[%d]\t\t0x%016lx\n", vcpu, r9); } if (!error && (get_r10 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10); if (error == 0) printf("r10[%d]\t\t0x%016lx\n", vcpu, r10); } if (!error && (get_r11 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11); if (error == 0) printf("r11[%d]\t\t0x%016lx\n", vcpu, r11); } if (!error && (get_r12 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12); if (error == 0) printf("r12[%d]\t\t0x%016lx\n", vcpu, r12); } if (!error && (get_r13 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13); if (error == 0) printf("r13[%d]\t\t0x%016lx\n", vcpu, r13); } if (!error && (get_r14 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14); if (error == 0) printf("r14[%d]\t\t0x%016lx\n", vcpu, r14); } if (!error && (get_r15 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15); if (error == 0) printf("r15[%d]\t\t0x%016lx\n", vcpu, r15); } if (!error && (get_rflags || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error == 0) printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); } return (error); } static int get_all_segments(struct vmctx *ctx, int vcpu) { uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; int error = 0; if (!error && (get_desc_ds || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_es || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_fs || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gs || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, 
desc_access); } } if (!error && (get_desc_ss || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_cs || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_tr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_ldtr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gdtr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", vcpu, desc_base, desc_limit); } } if (!error && (get_desc_idtr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("idtr[%d]\t\t0x%016lx/0x%08x\n", vcpu, desc_base, desc_limit); } } if (!error && (get_cs || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs); if (error == 0) printf("cs[%d]\t\t0x%04lx\n", vcpu, cs); } if (!error && (get_ds || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds); if (error == 0) printf("ds[%d]\t\t0x%04lx\n", vcpu, ds); } if (!error && (get_es || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es); if (error == 0) printf("es[%d]\t\t0x%04lx\n", vcpu, es); } if (!error && (get_fs || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs); if (error == 0) printf("fs[%d]\t\t0x%04lx\n", vcpu, fs); } if (!error && (get_gs || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs); if (error == 0) printf("gs[%d]\t\t0x%04lx\n", vcpu, gs); } if (!error && (get_ss || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss); if (error == 0) printf("ss[%d]\t\t0x%04lx\n", vcpu, ss); } if (!error && (get_tr || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr); if (error == 0) printf("tr[%d]\t\t0x%04lx\n", vcpu, tr); } if (!error && (get_ldtr || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr); if (error == 0) printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); } return (error); } static int get_misc_vmcs(struct vmctx *ctx, int vcpu) { uint64_t ctl, cr0, cr3, cr4, rsp, rip, pat, addr, u64; int error = 0; if (!error && (get_cr0_mask || get_all)) { uint64_t cr0mask; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask); if (error == 0) printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask); } if (!error && (get_cr0_shadow || get_all)) { uint64_t cr0shadow; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW, &cr0shadow); if (error == 0) printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow); } if (!error && (get_cr4_mask || get_all)) { uint64_t cr4mask; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask); if (error == 0) printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask); } if (!error && (get_cr4_shadow || get_all)) { uint64_t cr4shadow; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW, 
&cr4shadow); if (error == 0) printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow); } if (!error && (get_cr3_targets || get_all)) { uint64_t target_count, target_addr; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, &target_count); if (error == 0) { printf("cr3_target_count[%d]\t0x%016lx\n", vcpu, target_count); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0, &target_addr); if (error == 0) { printf("cr3_target0[%d]\t\t0x%016lx\n", vcpu, target_addr); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1, &target_addr); if (error == 0) { printf("cr3_target1[%d]\t\t0x%016lx\n", vcpu, target_addr); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2, &target_addr); if (error == 0) { printf("cr3_target2[%d]\t\t0x%016lx\n", vcpu, target_addr); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3, &target_addr); if (error == 0) { printf("cr3_target3[%d]\t\t0x%016lx\n", vcpu, target_addr); } } if (!error && (get_pinbased_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); if (error == 0) printf("pinbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_procbased_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_PRI_PROC_BASED_CTLS, &ctl); if (error == 0) printf("procbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_procbased_ctls2 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_SEC_PROC_BASED_CTLS, &ctl); if (error == 0) printf("procbased_ctls2[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_vmcs_gla || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_LINEAR_ADDRESS, &u64); if (error == 0) printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); } if (!error && (get_vmcs_gpa || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_PHYSICAL_ADDRESS, &u64); if (error == 0) printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); } if (!error && (get_vmcs_entry_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); if (error == 0) { printf("entry_interruption_info[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_tpr_threshold || get_all)) { uint64_t threshold; error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, &threshold); if (error == 0) printf("tpr_threshold[%d]\t0x%016lx\n", vcpu, threshold); } if (!error && (get_inst_err || get_all)) { uint64_t insterr; error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, &insterr); if (error == 0) { printf("instruction_error[%d]\t0x%016lx\n", vcpu, insterr); } } if (!error && (get_exit_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); if (error == 0) printf("exit_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_entry_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); if (error == 0) printf("entry_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_host_pat || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat); if (error == 0) printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); } if (!error && (get_host_cr0 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); if (error == 0) printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0); } if (!error && (get_host_cr3 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3); if (error == 0) printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3); } if (!error && (get_host_cr4 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4); if (error == 0) printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4); } if (!error && (get_host_rip || 
get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip); if (error == 0) printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip); } if (!error && (get_host_rsp || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp); if (error == 0) printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } if (!error && (get_vmcs_link || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); if (error == 0) printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); } if (!error && (get_vmcs_exit_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); if (error == 0) { printf("vmcs_exit_interruption_info[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_exit_interruption_error || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, &u64); if (error == 0) { printf("vmcs_exit_interruption_error[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_interruptibility || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_INTERRUPTIBILITY, &u64); if (error == 0) { printf("vmcs_guest_interruptibility[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_exit_inst_length || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INSTRUCTION_LENGTH, &u64); if (error == 0) printf("vmcs_exit_inst_length[%d]\t0x%08x\n", vcpu, (uint32_t)u64); } if (!error && (get_vmcs_exit_qualification || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, &u64); if (error == 0) printf("vmcs_exit_qualification[%d]\t0x%016lx\n", vcpu, u64); } return (error); } static int get_misc_vmcb(struct vmctx *ctx, int vcpu) { uint64_t ctl, addr; int error = 0; if (!error && (get_vmcb_intercept || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_CR_INTERCEPT, 4, &ctl); if (error == 0) printf("cr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_DR_INTERCEPT, 4, &ctl); if (error == 0) printf("dr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &ctl); if (error == 0) printf("exc_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST1_INTERCEPT, 4, &ctl); if (error == 0) printf("inst1_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST2_INTERCEPT, 4, &ctl); if (error == 0) printf("inst2_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); } if (!error && (get_vmcb_tlb_ctrl || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TLB_CTRL, 4, &ctl); if (error == 0) printf("TLB ctrl[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_vmcb_exit_details || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO1, 8, &ctl); if (error == 0) printf("exitinfo1[%d]\t0x%016lx\n", vcpu, ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO2, 8, &ctl); if (error == 0) printf("exitinfo2[%d]\t0x%016lx\n", vcpu, ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINTINFO, 8, &ctl); if (error == 0) printf("exitintinfo[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_vmcb_virq || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_VIRQ, 8, &ctl); if (error == 0) printf("v_irq/tpr[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_apic_access_addr || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_BAR, 8, &addr); if (error == 0) printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpu, addr); } if (!error && (get_virtual_apic_addr || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PAGE, 8, &addr); if (error 
== 0) printf("AVIC backing page[%d]\t0x%016lx\n", vcpu, addr); } if (!error && (get_avic_table || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_LT, 8, &addr); if (error == 0) printf("AVIC logical table[%d]\t0x%016lx\n", vcpu, addr); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PT, 8, &addr); if (error == 0) printf("AVIC physical table[%d]\t0x%016lx\n", vcpu, addr); } return (error); } static struct option * setup_options(bool cpu_intel) { const struct option common_opts[] = { { "vm", REQ_ARG, 0, VMNAME }, { "cpu", REQ_ARG, 0, VCPU }, { "set-mem", REQ_ARG, 0, SET_MEM }, { "set-efer", REQ_ARG, 0, SET_EFER }, { "set-cr0", REQ_ARG, 0, SET_CR0 }, { "set-cr2", REQ_ARG, 0, SET_CR2 }, { "set-cr3", REQ_ARG, 0, SET_CR3 }, { "set-cr4", REQ_ARG, 0, SET_CR4 }, { "set-dr0", REQ_ARG, 0, SET_DR0 }, { "set-dr1", REQ_ARG, 0, SET_DR1 }, { "set-dr2", REQ_ARG, 0, SET_DR2 }, { "set-dr3", REQ_ARG, 0, SET_DR3 }, { "set-dr6", REQ_ARG, 0, SET_DR6 }, { "set-dr7", REQ_ARG, 0, SET_DR7 }, { "set-rsp", REQ_ARG, 0, SET_RSP }, { "set-rip", REQ_ARG, 0, SET_RIP }, { "set-rax", REQ_ARG, 0, SET_RAX }, { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, { "desc-base", REQ_ARG, 0, DESC_BASE }, { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, { "desc-access",REQ_ARG, 0, DESC_ACCESS }, { "set-cs", REQ_ARG, 0, SET_CS }, { "set-ds", REQ_ARG, 0, SET_DS }, { "set-es", REQ_ARG, 0, SET_ES }, { "set-fs", REQ_ARG, 0, SET_FS }, { "set-gs", REQ_ARG, 0, SET_GS }, { "set-ss", REQ_ARG, 0, SET_SS }, { "set-tr", REQ_ARG, 0, SET_TR }, { "set-ldtr", REQ_ARG, 0, SET_LDTR }, { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, { "set-exception-bitmap", REQ_ARG, 0, SET_EXCEPTION_BITMAP }, { "capname", REQ_ARG, 0, CAPNAME }, { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, { "setcap", REQ_ARG, 0, SET_CAP }, { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME }, { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, { "getcap", NO_ARG, &getcap, 1 }, { "get-stats", NO_ARG, &get_stats, 1 }, { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, { "get-desc-es",NO_ARG, &get_desc_es, 1 }, { "set-desc-es",NO_ARG, &set_desc_es, 1 }, { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, { "get-memmap", NO_ARG, &get_memmap, 1 }, { "get-memseg", NO_ARG, &get_memseg, 1 }, { "get-efer", NO_ARG, &get_efer, 1 }, { "get-cr0", NO_ARG, &get_cr0, 1 }, { "get-cr2", NO_ARG, &get_cr2, 1 }, { "get-cr3", NO_ARG, &get_cr3, 1 }, { "get-cr4", NO_ARG, &get_cr4, 1 }, { "get-dr0", NO_ARG, &get_dr0, 1 }, { "get-dr1", NO_ARG, &get_dr1, 1 }, { "get-dr2", NO_ARG, &get_dr2, 1 }, { "get-dr3", NO_ARG, &get_dr3, 1 }, { "get-dr6", NO_ARG, &get_dr6, 1 }, { "get-dr7", 
NO_ARG, &get_dr7, 1 }, { "get-rsp", NO_ARG, &get_rsp, 1 }, { "get-rip", NO_ARG, &get_rip, 1 }, { "get-rax", NO_ARG, &get_rax, 1 }, { "get-rbx", NO_ARG, &get_rbx, 1 }, { "get-rcx", NO_ARG, &get_rcx, 1 }, { "get-rdx", NO_ARG, &get_rdx, 1 }, { "get-rsi", NO_ARG, &get_rsi, 1 }, { "get-rdi", NO_ARG, &get_rdi, 1 }, { "get-rbp", NO_ARG, &get_rbp, 1 }, { "get-r8", NO_ARG, &get_r8, 1 }, { "get-r9", NO_ARG, &get_r9, 1 }, { "get-r10", NO_ARG, &get_r10, 1 }, { "get-r11", NO_ARG, &get_r11, 1 }, { "get-r12", NO_ARG, &get_r12, 1 }, { "get-r13", NO_ARG, &get_r13, 1 }, { "get-r14", NO_ARG, &get_r14, 1 }, { "get-r15", NO_ARG, &get_r15, 1 }, { "get-rflags", NO_ARG, &get_rflags, 1 }, { "get-cs", NO_ARG, &get_cs, 1 }, { "get-ds", NO_ARG, &get_ds, 1 }, { "get-es", NO_ARG, &get_es, 1 }, { "get-fs", NO_ARG, &get_fs, 1 }, { "get-gs", NO_ARG, &get_gs, 1 }, { "get-ss", NO_ARG, &get_ss, 1 }, { "get-tr", NO_ARG, &get_tr, 1 }, { "get-ldtr", NO_ARG, &get_ldtr, 1 }, { "get-eptp", NO_ARG, &get_eptp, 1 }, { "get-exception-bitmap", NO_ARG, &get_exception_bitmap, 1 }, { "get-io-bitmap-address", NO_ARG, &get_io_bitmap, 1 }, { "get-tsc-offset", NO_ARG, &get_tsc_offset, 1 }, { "get-msr-bitmap", NO_ARG, &get_msr_bitmap, 1 }, { "get-msr-bitmap-address", NO_ARG, &get_msr_bitmap_address, 1 }, { "get-guest-pat", NO_ARG, &get_guest_pat, 1 }, { "get-guest-sysenter", NO_ARG, &get_guest_sysenter, 1 }, { "get-exit-reason", NO_ARG, &get_exit_reason, 1 }, { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, { "get-all", NO_ARG, &get_all, 1 }, { "run", NO_ARG, &run, 1 }, { "create", NO_ARG, &create, 1 }, { "destroy", NO_ARG, &destroy, 1 }, { "inject-nmi", NO_ARG, &inject_nmi, 1 }, { "force-reset", NO_ARG, &force_reset, 1 }, { "force-poweroff", NO_ARG, &force_poweroff, 1 }, { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, { "get-intinfo", NO_ARG, &get_intinfo, 1 }, { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, +#ifdef BHYVE_SNAPSHOT + { "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE}, + { "suspend", REQ_ARG, 0, SET_SUSPEND_FILE}, +#endif }; const struct option intel_opts[] = { { "get-vmcs-pinbased-ctls", NO_ARG, &get_pinbased_ctls, 1 }, { "get-vmcs-procbased-ctls", NO_ARG, &get_procbased_ctls, 1 }, { "get-vmcs-procbased-ctls2", NO_ARG, &get_procbased_ctls2, 1 }, { "get-vmcs-guest-linear-address", NO_ARG, &get_vmcs_gla, 1 }, { "get-vmcs-guest-physical-address", NO_ARG, &get_vmcs_gpa, 1 }, { "get-vmcs-entry-interruption-info", NO_ARG, &get_vmcs_entry_interruption_info, 1}, { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, { "get-vmcs-cr4-shadow", NO_ARG, &get_cr4_shadow, 1 }, { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1 }, { "get-vmcs-tpr-threshold", NO_ARG, &get_tpr_threshold, 1 }, { "get-vmcs-vpid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, { "get-vmcs-entry-ctls", NO_ARG, &get_entry_ctls, 1 }, { "get-vmcs-instruction-error", NO_ARG, &get_inst_err, 1 }, { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, { "get-vmcs-host-cr0", NO_ARG, &get_host_cr0, 1 }, { "set-vmcs-entry-interruption-info", REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, { "get-vmcs-exit-qualification", NO_ARG, &get_vmcs_exit_qualification, 1 }, { "get-vmcs-exit-inst-length", NO_ARG, &get_vmcs_exit_inst_length, 1 }, { "get-vmcs-interruptibility", NO_ARG, &get_vmcs_interruptibility, 1 }, { "get-vmcs-exit-interruption-error", NO_ARG, 
&get_vmcs_exit_interruption_error, 1 }, { "get-vmcs-exit-interruption-info", NO_ARG, &get_vmcs_exit_interruption_info, 1 }, { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, { "get-vmcs-host-cr3", NO_ARG, &get_host_cr3, 1 }, { "get-vmcs-host-cr4", NO_ARG, &get_host_cr4, 1 }, { "get-vmcs-host-rip", NO_ARG, &get_host_rip, 1 }, { "get-vmcs-host-rsp", NO_ARG, &get_host_rsp, 1 }, { "get-apic-access-address", NO_ARG, &get_apic_access_addr, 1}, { "get-virtual-apic-address", NO_ARG, &get_virtual_apic_addr, 1} }; const struct option amd_opts[] = { { "get-vmcb-intercepts", NO_ARG, &get_vmcb_intercept, 1 }, { "get-vmcb-asid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcb-exit-details", NO_ARG, &get_vmcb_exit_details, 1 }, { "get-vmcb-tlb-ctrl", NO_ARG, &get_vmcb_tlb_ctrl, 1 }, { "get-vmcb-virq", NO_ARG, &get_vmcb_virq, 1 }, { "get-avic-apic-bar", NO_ARG, &get_apic_access_addr, 1 }, { "get-avic-backing-page", NO_ARG, &get_virtual_apic_addr, 1 }, { "get-avic-table", NO_ARG, &get_avic_table, 1 } }; const struct option null_opt = { NULL, 0, NULL, 0 }; struct option *all_opts; char *cp; int optlen; optlen = sizeof(common_opts); if (cpu_intel) optlen += sizeof(intel_opts); else optlen += sizeof(amd_opts); optlen += sizeof(null_opt); all_opts = malloc(optlen); cp = (char *)all_opts; memcpy(cp, common_opts, sizeof(common_opts)); cp += sizeof(common_opts); if (cpu_intel) { memcpy(cp, intel_opts, sizeof(intel_opts)); cp += sizeof(intel_opts); } else { memcpy(cp, amd_opts, sizeof(amd_opts)); cp += sizeof(amd_opts); } memcpy(cp, &null_opt, sizeof(null_opt)); cp += sizeof(null_opt); return (all_opts); } static const char * wday_str(int idx) { static const char *weekdays[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; if (idx >= 0 && idx < 7) return (weekdays[idx]); else return ("UNK"); } static const char * mon_str(int idx) { static const char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; if (idx >= 0 && idx < 12) return (months[idx]); else return ("UNK"); } static int show_memmap(struct vmctx *ctx) { char name[SPECNAMELEN + 1], numbuf[8]; vm_ooffset_t segoff; vm_paddr_t gpa; size_t maplen, seglen; int error, flags, prot, segid, delim; printf("Address Length Segment Offset "); printf("Prot Flags\n"); gpa = 0; while (1) { error = vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen, &prot, &flags); if (error) return (errno == ENOENT ? 0 : error); error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); if (error) return (error); printf("%-12lX", gpa); humanize_number(numbuf, sizeof(numbuf), maplen, "B", HN_AUTOSCALE, HN_NOSPACE); printf("%-12s", numbuf); printf("%-12s", name[0] ? name : "sysmem"); printf("%-12lX", segoff); printf("%c%c%c ", prot & PROT_READ ? 'R' : '-', prot & PROT_WRITE ? 'W' : '-', prot & PROT_EXEC ? 'X' : '-'); delim = '\0'; if (flags & VM_MEMMAP_F_WIRED) { printf("%cwired", delim); delim = '/'; } if (flags & VM_MEMMAP_F_IOMMU) { printf("%ciommu", delim); delim = '/'; } printf("\n"); gpa += maplen; } } static int show_memseg(struct vmctx *ctx) { char name[SPECNAMELEN + 1], numbuf[8]; size_t seglen; int error, segid; printf("ID Length Name\n"); segid = 0; while (1) { error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); if (error) return (errno == EINVAL ? 0 : error); if (seglen) { printf("%-4d", segid); humanize_number(numbuf, sizeof(numbuf), seglen, "B", HN_AUTOSCALE, HN_NOSPACE); printf("%-12s", numbuf); printf("%s", name[0] ? 
name : "sysmem"); printf("\n"); } segid++; } } +#ifdef BHYVE_SNAPSHOT +static int +send_checkpoint_op_req(struct vmctx *ctx, struct checkpoint_op *op) +{ + struct sockaddr_un addr; + int socket_fd, len, len_sent, total_sent; + int err = 0; + char vmname_buf[MAX_VMNAME]; + + socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (socket_fd < 0) { + perror("Error creating bhyvectl socket"); + err = -1; + goto done; + } + + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + + err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (err != 0) { + perror("Failed to get VM name"); + goto done; + } + + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", CHECKPOINT_RUN_DIR, vmname_buf); + + if (connect(socket_fd, (struct sockaddr *)&addr, + sizeof(struct sockaddr_un)) != 0) { + perror("Connect to VM socket failed"); + err = -1; + goto done; + } + + len = sizeof(*op); + total_sent = 0; + while ((len_sent = send(socket_fd, (char *)op + total_sent, len - total_sent, 0)) > 0) { + total_sent += len_sent; + } + + if (len_sent < 0) { + perror("Failed to send checkpoint operation request"); + err = -1; + } + +done: + if (socket_fd > 0) + close(socket_fd); + return (err); +} + +static int +send_start_checkpoint(struct vmctx *ctx, const char *checkpoint_file) +{ + struct checkpoint_op op; + + op.op = START_CHECKPOINT; + strncpy(op.snapshot_filename, checkpoint_file, MAX_SNAPSHOT_VMNAME); + op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0; + + return (send_checkpoint_op_req(ctx, &op)); +} + +static int +send_start_suspend(struct vmctx *ctx, const char *suspend_file) +{ + struct checkpoint_op op; + + op.op = START_SUSPEND; + strncpy(op.snapshot_filename, suspend_file, MAX_SNAPSHOT_VMNAME); + op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0; + + return (send_checkpoint_op_req(ctx, &op)); +} +#endif + int main(int argc, char *argv[]) { char *vmname; int error, ch, vcpu, ptenum; vm_paddr_t gpa_pmap; struct vm_exit vmexit; uint64_t rax, cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; uint64_t rsp, rip, rflags, efer, pat; uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; struct vmctx *ctx; cpuset_t cpus; bool cpu_intel; uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; struct tm tm; struct option *opts; +#ifdef BHYVE_SNAPSHOT + char *checkpoint_file, *suspend_file; +#endif cpu_intel = cpu_vendor_intel(); opts = setup_options(cpu_intel); vcpu = 0; vmname = NULL; assert_lapic_lvt = -1; progname = basename(argv[0]); while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { switch (ch) { case 0: break; case VMNAME: vmname = optarg; break; case VCPU: vcpu = atoi(optarg); break; case SET_MEM: memsize = atoi(optarg) * MB; memsize = roundup(memsize, 2 * MB); break; case SET_EFER: efer = strtoul(optarg, NULL, 0); set_efer = 1; break; case SET_CR0: cr0 = strtoul(optarg, NULL, 0); set_cr0 = 1; break; case SET_CR2: cr2 = strtoul(optarg, NULL, 0); set_cr2 = 1; break; case SET_CR3: cr3 = strtoul(optarg, NULL, 0); set_cr3 = 1; break; case SET_CR4: cr4 = strtoul(optarg, NULL, 0); set_cr4 = 1; break; case SET_DR0: dr0 = strtoul(optarg, NULL, 0); set_dr0 = 1; break; case SET_DR1: dr1 = strtoul(optarg, NULL, 0); set_dr1 = 1; break; case SET_DR2: dr2 = strtoul(optarg, NULL, 0); set_dr2 = 1; break; case SET_DR3: dr3 = strtoul(optarg, NULL, 0); set_dr3 = 1; break; case SET_DR6: dr6 = strtoul(optarg, NULL, 0); set_dr6 = 1; break; case SET_DR7: dr7 = strtoul(optarg, NULL, 0); set_dr7 = 1; break; case SET_RSP: rsp = strtoul(optarg, NULL, 0); set_rsp = 1; break; case SET_RIP: rip = strtoul(optarg, NULL, 
0); set_rip = 1; break; case SET_RAX: rax = strtoul(optarg, NULL, 0); set_rax = 1; break; case SET_RFLAGS: rflags = strtoul(optarg, NULL, 0); set_rflags = 1; break; case DESC_BASE: desc_base = strtoul(optarg, NULL, 0); break; case DESC_LIMIT: desc_limit = strtoul(optarg, NULL, 0); break; case DESC_ACCESS: desc_access = strtoul(optarg, NULL, 0); break; case SET_CS: cs = strtoul(optarg, NULL, 0); set_cs = 1; break; case SET_DS: ds = strtoul(optarg, NULL, 0); set_ds = 1; break; case SET_ES: es = strtoul(optarg, NULL, 0); set_es = 1; break; case SET_FS: fs = strtoul(optarg, NULL, 0); set_fs = 1; break; case SET_GS: gs = strtoul(optarg, NULL, 0); set_gs = 1; break; case SET_SS: ss = strtoul(optarg, NULL, 0); set_ss = 1; break; case SET_TR: tr = strtoul(optarg, NULL, 0); set_tr = 1; break; case SET_LDTR: ldtr = strtoul(optarg, NULL, 0); set_ldtr = 1; break; case SET_X2APIC_STATE: x2apic_state = strtol(optarg, NULL, 0); set_x2apic_state = 1; break; case SET_EXCEPTION_BITMAP: exception_bitmap = strtoul(optarg, NULL, 0); set_exception_bitmap = 1; break; case SET_VMCS_ENTRY_INTERRUPTION_INFO: vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); set_vmcs_entry_interruption_info = 1; break; case SET_CAP: capval = strtoul(optarg, NULL, 0); setcap = 1; break; case SET_RTC_TIME: rtc_secs = strtoul(optarg, NULL, 0); set_rtc_time = 1; break; case SET_RTC_NVRAM: rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); set_rtc_nvram = 1; break; case RTC_NVRAM_OFFSET: rtc_nvram_offset = strtoul(optarg, NULL, 0); break; case GET_GPA_PMAP: gpa_pmap = strtoul(optarg, NULL, 0); get_gpa_pmap = 1; break; case CAPNAME: capname = optarg; break; case UNASSIGN_PPTDEV: unassign_pptdev = 1; if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) usage(cpu_intel); break; case ASSERT_LAPIC_LVT: assert_lapic_lvt = atoi(optarg); break; +#ifdef BHYVE_SNAPSHOT + case SET_CHECKPOINT_FILE: + vm_checkpoint_opt = 1; + checkpoint_file = optarg; + break; + case SET_SUSPEND_FILE: + vm_suspend_opt = 1; + suspend_file = optarg; + break; +#endif default: usage(cpu_intel); } } argc -= optind; argv += optind; if (vmname == NULL) usage(cpu_intel); error = 0; if (!error && create) error = vm_create(vmname); if (!error) { ctx = vm_open(vmname); if (ctx == NULL) { printf("VM:%s is not created.\n", vmname); exit (1); } } if (!error && memsize) error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (!error && set_efer) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); if (!error && set_cr0) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); if (!error && set_cr2) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR2, cr2); if (!error && set_cr3) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); if (!error && set_cr4) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); if (!error && set_dr0) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR0, dr0); if (!error && set_dr1) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR1, dr1); if (!error && set_dr2) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR2, dr2); if (!error && set_dr3) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR3, dr3); if (!error && set_dr6) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR6, dr6); if (!error && set_dr7) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); if (!error && set_rsp) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); if (!error && set_rip) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); if (!error && set_rax) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); if (!error && 
set_rflags) { error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); } if (!error && set_desc_ds) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); } if (!error && set_desc_es) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); } if (!error && set_desc_ss) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); } if (!error && set_desc_cs) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); } if (!error && set_desc_fs) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); } if (!error && set_desc_gs) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); } if (!error && set_desc_tr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, desc_base, desc_limit, desc_access); } if (!error && set_desc_ldtr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); } if (!error && set_desc_gdtr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, 0); } if (!error && set_desc_idtr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, 0); } if (!error && set_cs) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); if (!error && set_ds) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); if (!error && set_es) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); if (!error && set_fs) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); if (!error && set_gs) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); if (!error && set_ss) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); if (!error && set_tr) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); if (!error && set_ldtr) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); if (!error && set_x2apic_state) error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); if (!error && unassign_pptdev) error = vm_unassign_pptdev(ctx, bus, slot, func); if (!error && set_exception_bitmap) { if (cpu_intel) error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, exception_bitmap); else error = vm_set_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, exception_bitmap); } if (!error && cpu_intel && set_vmcs_entry_interruption_info) { error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, vmcs_entry_interruption_info); } if (!error && inject_nmi) { error = vm_inject_nmi(ctx, vcpu); } if (!error && assert_lapic_lvt != -1) { error = vm_lapic_local_irq(ctx, vcpu, assert_lapic_lvt); } if (!error && (get_memseg || get_all)) error = show_memseg(ctx); if (!error && (get_memmap || get_all)) error = show_memmap(ctx); if (!error) error = get_all_registers(ctx, vcpu); if (!error) error = get_all_segments(ctx, vcpu); if (!error) { if (cpu_intel) error = get_misc_vmcs(ctx, vcpu); else error = get_misc_vmcb(ctx, vcpu); } if (!error && (get_x2apic_state || get_all)) { error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); if (error == 0) printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); } if (!error && (get_eptp || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_NPT_BASE, 8, &eptp); if (error == 0) printf("%s[%d]\t\t0x%016lx\n", cpu_intel ? 
"eptp" : "rvi/npt", vcpu, eptp); } if (!error && (get_exception_bitmap || get_all)) { if(cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, &bm); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &bm); if (error == 0) printf("exception_bitmap[%d]\t%#lx\n", vcpu, bm); } if (!error && (get_io_bitmap || get_all)) { if (cpu_intel) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); if (error == 0) printf("io_bitmap_a[%d]\t%#lx\n", vcpu, bm); error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm); if (error == 0) printf("io_bitmap_b[%d]\t%#lx\n", vcpu, bm); } else { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_IO_PERM, 8, &bm); if (error == 0) printf("io_bitmap[%d]\t%#lx\n", vcpu, bm); } } if (!error && (get_tsc_offset || get_all)) { uint64_t tscoff; if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TSC_OFFSET, 8, &tscoff); if (error == 0) printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); } if (!error && (get_msr_bitmap_address || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_MSR_PERM, 8, &addr); if (error == 0) printf("msr_bitmap[%d]\t\t%#lx\n", vcpu, addr); } if (!error && (get_msr_bitmap || get_all)) { if (cpu_intel) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); } else { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_MSR_PERM, 8, &addr); } if (error == 0) error = dump_msr_bitmap(vcpu, addr, cpu_intel); } if (!error && (get_vpid_asid || get_all)) { uint64_t vpid; if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, 4, &vpid); if (error == 0) printf("%s[%d]\t\t0x%04lx\n", cpu_intel ? 
"vpid" : "asid", vcpu, vpid); } if (!error && (get_guest_pat || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_GUEST_PAT, 8, &pat); if (error == 0) printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); } if (!error && (get_guest_sysenter || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_SYSENTER_CS, &cs); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_SYSENTER_CS, 8, &cs); if (error == 0) printf("guest_sysenter_cs[%d]\t%#lx\n", vcpu, cs); if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_SYSENTER_ESP, 8, &rsp); if (error == 0) printf("guest_sysenter_sp[%d]\t%#lx\n", vcpu, rsp); if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_SYSENTER_EIP, &rip); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_SYSENTER_EIP, 8, &rip); if (error == 0) printf("guest_sysenter_ip[%d]\t%#lx\n", vcpu, rip); } if (!error && (get_exit_reason || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXIT_REASON, 8, &u64); if (error == 0) printf("exit_reason[%d]\t%#lx\n", vcpu, u64); } if (!error && setcap) { int captype; captype = vm_capability_name2type(capname); error = vm_set_capability(ctx, vcpu, captype, capval); if (error != 0 && errno == ENOENT) printf("Capability \"%s\" is not available\n", capname); } if (!error && get_gpa_pmap) { error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); if (error == 0) { printf("gpa %#lx:", gpa_pmap); pte = &pteval[0]; while (ptenum-- > 0) printf(" %#lx", *pte++); printf("\n"); } } if (!error && set_rtc_nvram) error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); if (!error && (get_rtc_nvram || get_all)) { error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); if (error == 0) { printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, rtc_nvram_value); } } if (!error && set_rtc_time) error = vm_rtc_settime(ctx, rtc_secs); if (!error && (get_rtc_time || get_all)) { error = vm_rtc_gettime(ctx, &rtc_secs); if (error == 0) { gmtime_r(&rtc_secs, &tm); printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, 1900 + tm.tm_year); } } if (!error && (getcap || get_all)) { int captype, val, getcaptype; if (getcap && capname) getcaptype = vm_capability_name2type(capname); else getcaptype = -1; for (captype = 0; captype < VM_CAP_MAX; captype++) { if (getcaptype >= 0 && captype != getcaptype) continue; error = vm_get_capability(ctx, vcpu, captype, &val); if (error == 0) { printf("Capability \"%s\" is %s on vcpu %d\n", vm_capability_type2name(captype), val ? 
"set" : "not set", vcpu); } else if (errno == ENOENT) { error = 0; printf("Capability \"%s\" is not available\n", vm_capability_type2name(captype)); } else { break; } } } if (!error && (get_active_cpus || get_all)) { error = vm_active_cpus(ctx, &cpus); if (!error) print_cpus("active cpus", &cpus); } if (!error && (get_suspended_cpus || get_all)) { error = vm_suspended_cpus(ctx, &cpus); if (!error) print_cpus("suspended cpus", &cpus); } if (!error && (get_intinfo || get_all)) { error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); if (!error) { print_intinfo("pending", info[0]); print_intinfo("current", info[1]); } } if (!error && (get_stats || get_all)) { int i, num_stats; uint64_t *stats; struct timeval tv; const char *desc; stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); if (stats != NULL) { printf("vcpu%d stats:\n", vcpu); for (i = 0; i < num_stats; i++) { desc = vm_get_stat_desc(ctx, i); printf("%-40s\t%ld\n", desc, stats[i]); } } } if (!error && (get_cpu_topology || get_all)) { uint16_t sockets, cores, threads, maxcpus; vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); printf("cpu_topology:\tsockets=%hu, cores=%hu, threads=%hu, " "maxcpus=%hu\n", sockets, cores, threads, maxcpus); } if (!error && run) { error = vm_run(ctx, vcpu, &vmexit); if (error == 0) dump_vm_run_exitcode(&vmexit, vcpu); else printf("vm_run error %d\n", error); } if (!error && force_reset) error = vm_suspend(ctx, VM_SUSPEND_RESET); if (!error && force_poweroff) error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); if (error) printf("errno = %d\n", errno); if (!error && destroy) vm_destroy(ctx); +#ifdef BHYVE_SNAPSHOT + if (!error && vm_checkpoint_opt) + error = send_start_checkpoint(ctx, checkpoint_file); + + if (!error && vm_suspend_opt) + error = send_start_suspend(ctx, suspend_file); +#endif + free (opts); exit(error); }