Index: stable/10/lib/libvmmapi/vmmapi.c =================================================================== --- stable/10/lib/libvmmapi/vmmapi.c (revision 284898) +++ stable/10/lib/libvmmapi/vmmapi.c (revision 284899) @@ -1,1201 +1,1213 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmmapi.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) struct vmctx { int fd; uint32_t lowmem_limit; enum vm_mmap_style vms; int memflags; size_t lowmem; char *lowmem_addr; size_t highmem; char *highmem_addr; char *name; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) static int vm_device_open(const char *name) { int fd, len; char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); /* Open the device file */ fd = open(vmfile, O_RDWR, 0); free(vmfile); return (fd); } int vm_create(const char *name) { return (CREATE((char *)name)); } struct vmctx * vm_open(const char *name) { struct vmctx *vm; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = -1; vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; return (vm); err: vm_destroy(vm); return (NULL); } void vm_destroy(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); DESTROY(vm->name); free(vm); } int vm_parse_memsize(const char *optarg, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(optarg, &endptr, 0); if (*optarg != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. 
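(Illustrative aside, not part of this change: a minimal sketch of how a caller might rely on the heuristic described above; the literal sizes are assumptions.)

	size_t memsize;

	/* "512" is below 1MB and is therefore scaled up to 512MB */
	if (vm_parse_memsize("512", &memsize) == 0)
		assert(memsize == 512 * MB);

	/* "1g" has a suffix and is handed to expand_number(3) instead */
	if (vm_parse_memsize("1g", &memsize) == 0)
		assert(memsize == 1 * GB);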
*/ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(optarg, ret_memsize); return (error); } int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, int *wired) { int error; struct vm_memory_segment seg; bzero(&seg, sizeof(seg)); seg.gpa = gpa; error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); *ret_len = seg.len; if (wired != NULL) *wired = seg.wired; return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx) { return (ctx->lowmem_limit); } void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { ctx->lowmem_limit = limit; } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } static int setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr) { int error, mmap_flags; struct vm_memory_segment seg; /* * Create and optionally map 'len' bytes of memory at guest * physical address 'gpa' */ bzero(&seg, sizeof(seg)); seg.gpa = gpa; seg.len = len; error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); if (error == 0 && addr != NULL) { mmap_flags = MAP_SHARED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) mmap_flags |= MAP_NOCORE; *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, mmap_flags, ctx->fd, gpa); } return (error); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { char **addr; int error; /* XXX VM_MMAP_SPARSE not implemented yet */ assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL); ctx->vms = vms; /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then * create another 'highmem' segment above 4GB for the remainder. */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; ctx->highmem = memsize - ctx->lowmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; } if (ctx->lowmem > 0) { addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL; error = setup_memory_segment(ctx, 0, ctx->lowmem, addr); if (error) return (error); } if (ctx->highmem > 0) { addr = (vms == VM_MMAP_ALL) ? 
&ctx->highmem_addr : NULL; error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr); if (error) return (error); } return (0); } void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { /* XXX VM_MMAP_SPARSE not implemented yet */ assert(ctx->vms == VM_MMAP_ALL); if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem) return ((void *)(ctx->lowmem_addr + gaddr)); if (gaddr >= 4*GB) { gaddr -= 4*GB; if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem) return ((void *)(ctx->highmem_addr + gaddr)); } return (NULL); } size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem); } int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) { int error; error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, &seg_desc->access); return (error); } int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; vmreg.regval = val; error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); vmrun.cpuid = vcpu; error = ioctl(ctx->fd, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); return (error); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vm_exception exc; exc.cpuid = vcpu; exc.vector = vector; exc.error_code = errcode; exc.error_code_valid = errcode_valid; exc.restart_instruction = restart_instruction; return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); } int vm_apicid2vcpu(struct vmctx *ctx, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. 
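(Illustrative aside, not part of this change: a minimal sketch of the run loop a consumer such as bhyve builds on top of vm_run(); the exit handling shown is an assumption.)

	struct vm_exit vmexit;
	int error;

	for (;;) {
		/* returns when the vcpu takes a VM-exit that needs userland attention */
		error = vm_run(ctx, vcpu, &vmexit);
		if (error != 0)
			break;
		switch (vmexit.exitcode) {
		case VM_EXITCODE_HLT:
			/* e.g. stop the VM via vm_suspend(ctx, VM_SUSPEND_HALT) */
			break;
		default:
			/* emulate I/O, MMIO, etc. and loop back into the guest */
			break;
		}
	}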
*/ return (apicid); } int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); } int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq)); } int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) { struct vm_lapic_msi vmmsi; bzero(&vmmsi, sizeof(vmmsi)); vmmsi.addr = addr; vmmsi.msg = msg; return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); } int vm_ioapic_assert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); } int vm_ioapic_pincount(struct vmctx *ctx, int *pincount) { return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); } int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); } int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); } int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); } int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger) { struct vm_isa_irq_trigger isa_irq_trigger; bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); isa_irq_trigger.atpic_irq = atpic_irq; isa_irq_trigger.trigger = trigger; return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); } int vm_inject_nmi(struct vmctx *ctx, int vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); vmnmi.cpuid = vcpu; return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); } static struct { const char *name; int type; } capstrmap[] = { { "hlt_exit", VM_CAP_HALT_EXIT }, { "mtrap_exit", VM_CAP_MTRAP_EXIT }, { "pause_exit", VM_CAP_PAUSE_EXIT }, { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, { "enable_invpcid", VM_CAP_ENABLE_INVPCID }, { 0 } }; int vm_capability_name2type(const char *capname) { int i; for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { if (strcmp(capstrmap[i].name, capname) == 0) return (capstrmap[i].type); } return (-1); } const char * vm_capability_type2name(int type) { int i; for (i = 0; capstrmap[i].name != NULL; i++) { if (capstrmap[i].type == type) return (capstrmap[i].name); } return (NULL); } int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; 
bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); pptmsi.vcpu = vcpu; pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.msg = msg; pptmsi.addr = addr; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); pptmsix.vcpu = vcpu; pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) { int error; static struct vm_stats vmstats; vmstats.cpuid = vcpu; error = ioctl(ctx->fd, VM_STATS, &vmstats); if (error == 0) { if (ret_entries) *ret_entries = vmstats.num_entries; if (ret_tv) *ret_tv = vmstats.tv; return (vmstats.statbuf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; x2apic.state = state; error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. 
IA-32 Processor States Following Power-up, Reset or INIT */ int vcpu_reset(struct vmctx *vmctx, int vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; cr0 = CR0_NE; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, 
desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; /* XXX cr2, debug registers */ error = 0; done: return (error); } int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) { int error; struct vm_hpet_cap cap; bzero(&cap, sizeof(struct vm_hpet_cap)); error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); if (capabilities != NULL) *capabilities = cap.capabilities; return (error); } static int gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, int *fault, uint64_t *gpa) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } +int +vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa) +{ + int error, fault; + + error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, gpa); + if (fault) + error = fault; + return (error); +} + #ifndef min #define min(a,b) (((a) < (b)) ? (a) : (b)) #endif int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt) { void *va; uint64_t gpa; int error, fault, i, n, off; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa); if (error) return (-1); if (fault) return (1); off = gpa & PAGE_MASK; n = min(len, PAGE_SIZE - off); va = vm_map_gpa(ctx, gpa, n); if (va == NULL) return (-1); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt) { return; } void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_activate_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); return (error); } int 
vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; vmii.info1 = info1; error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); return (error); } int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; rtcdata.value = value; error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); return (error); } int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); if (error == 0) *retval = rtcdata.value; return (error); } int vm_rtc_settime(struct vmctx *ctx, time_t secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); rtctime.secs = secs; error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); return (error); } int vm_rtc_gettime(struct vmctx *ctx, time_t *secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); if (error == 0) *secs = rtctime.secs; return (error); } int vm_restart_instruction(void *arg, int vcpu) { struct vmctx *ctx = arg; return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } Index: stable/10/lib/libvmmapi/vmmapi.h =================================================================== --- stable/10/lib/libvmmapi/vmmapi.h (revision 284898) +++ stable/10/lib/libvmmapi/vmmapi.h (revision 284899) @@ -1,166 +1,168 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. 
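(Illustrative aside, not part of this change: a sketch of the compile-time decision an out-of-tree consumer such as grub-bhyve might make; tying vm_gla2gpa() availability to this particular version value is an assumption.)

	#include <vmmapi.h>

	#if defined(VMMAPI_VERSION) && VMMAPI_VERSION >= 0101
	/* newer libvmmapi: vm_gla2gpa() can be used directly */
	#else
	/* older libvmmapi: use an alternate translation path */
	#endif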
*/ #define VMMAPI_VERSION 0101 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vmctx; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. */ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ int vm_create(const char *name); struct vmctx *vm_open(const char *name); void vm_destroy(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, int *wired); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); +int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); int vm_inject_nmi(struct vmctx *ctx, int vcpu); int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx 
*ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s); int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accomodate all GPA segments. * Returns 0 on success, 1 on a guest fault condition and -1 otherwise. */ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt); void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, struct iovec *guest_iov, size_t len); void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt); /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vmctx *ctx, int vcpu); /* * FreeBSD specific APIs */ int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); #endif /* _VMMAPI_H_ */ Index: stable/10/share/examples/bhyve/vmrun.sh =================================================================== --- stable/10/share/examples/bhyve/vmrun.sh (revision 284898) +++ stable/10/share/examples/bhyve/vmrun.sh (revision 284899) @@ -1,283 +1,297 @@ #!/bin/sh # # Copyright (c) 2013 NetApp, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD$ # LOADER=/usr/sbin/bhyveload BHYVECTL=/usr/sbin/bhyvectl FBSDRUN=/usr/sbin/bhyve DEFAULT_MEMSIZE=512M DEFAULT_CPUS=2 DEFAULT_TAPDEV=tap0 DEFAULT_CONSOLE=stdio DEFAULT_VIRTIO_DISK="./diskdev" DEFAULT_ISOFILE="./release.iso" errmsg() { echo "*** $1" } usage() { local msg=$1 echo "Usage: vmrun.sh [-ahi] [-c ] [-C ] [-d ]" echo " [-e ] [-g ] [-H ]" echo " [-I ] [-m ]" echo " [-t ] " echo "" echo " -h: display this help message" echo " -a: force memory mapped local APIC access" echo " -c: number of virtual cpus (default is ${DEFAULT_CPUS})" echo " -C: console device (default is ${DEFAULT_CONSOLE})" echo " -d: virtio diskdev file (default is ${DEFAULT_VIRTIO_DISK})" echo " -e: set FreeBSD loader environment variable" echo " -g: listen for connection from kgdb at " echo " -H: host filesystem to export to the loader" echo " -i: force boot of the Installation CDROM image" echo " -I: Installation CDROM image location (default is ${DEFAULT_ISOFILE})" echo " -m: memory size (default is ${DEFAULT_MEMSIZE})" + echo " -p: pass-through a host PCI device at bus/slot/func (e.g. 10/0/0)" echo " -t: tap device for virtio-net (default is $DEFAULT_TAPDEV)" echo "" [ -n "$msg" ] && errmsg "$msg" exit 1 } if [ `id -u` -ne 0 ]; then errmsg "This script must be executed with superuser privileges" exit 1 fi kldstat -n vmm > /dev/null 2>&1 if [ $? -ne 0 ]; then errmsg "vmm.ko is not loaded" exit 1 fi force_install=0 isofile=${DEFAULT_ISOFILE} memsize=${DEFAULT_MEMSIZE} console=${DEFAULT_CONSOLE} cpus=${DEFAULT_CPUS} tap_total=0 disk_total=0 apic_opt="" gdbport=0 loader_opt="" +pass_total=0 -while getopts ac:C:d:e:g:hH:iI:m:t: c ; do +while getopts ac:C:d:e:g:hH:iI:m:p:t: c ; do case $c in a) apic_opt="-a" ;; c) cpus=${OPTARG} ;; C) console=${OPTARG} ;; d) disk_dev=${OPTARG%%,*} disk_opts=${OPTARG#${disk_dev}} eval "disk_dev${disk_total}=\"${disk_dev}\"" eval "disk_opts${disk_total}=\"${disk_opts}\"" disk_total=$(($disk_total + 1)) ;; e) loader_opt="${loader_opt} -e ${OPTARG}" ;; g) gdbport=${OPTARG} ;; H) host_base=`realpath ${OPTARG}` ;; i) force_install=1 ;; I) isofile=${OPTARG} ;; m) memsize=${OPTARG} ;; + p) + eval "pass_dev${pass_total}=\"${OPTARG}\"" + pass_total=$(($pass_total + 1)) + ;; t) eval "tap_dev${tap_total}=\"${OPTARG}\"" tap_total=$(($tap_total + 1)) ;; *) usage ;; esac done if [ $tap_total -eq 0 ] ; then tap_total=1 tap_dev0="${DEFAULT_TAPDEV}" fi if [ $disk_total -eq 0 ] ; then disk_total=1 disk_dev0="${DEFAULT_VIRTIO_DISK}" fi shift $((${OPTIND} - 1)) if [ $# -ne 1 ]; then usage "virtual machine name not specified" fi vmname="$1" if [ -n "${host_base}" ]; then loader_opt="${loader_opt} -h ${host_base}" fi make_and_check_diskdev() { local virtio_diskdev="$1" # Create the virtio diskdev file if needed if [ ! -f ${virtio_diskdev} ]; then echo "virtio disk device file \"${virtio_diskdev}\" does not exist." echo "Creating it ..." truncate -s 8G ${virtio_diskdev} > /dev/null fi if [ ! 
-r ${virtio_diskdev} ]; then echo "virtio disk device file \"${virtio_diskdev}\" is not readable" exit 1 fi if [ ! -w ${virtio_diskdev} ]; then echo "virtio disk device file \"${virtio_diskdev}\" is not writable" exit 1 fi } echo "Launching virtual machine \"$vmname\" ..." first_diskdev="$disk_dev0" ${BHYVECTL} --vm=${vmname} --destroy > /dev/null 2>&1 while [ 1 ]; do file -s ${first_diskdev} | grep "boot sector" > /dev/null rc=$? if [ $rc -ne 0 ]; then file -s ${first_diskdev} | grep ": Unix Fast File sys" > /dev/null rc=$? fi if [ $rc -ne 0 ]; then need_install=1 else need_install=0 fi if [ $force_install -eq 1 -o $need_install -eq 1 ]; then if [ ! -r ${isofile} ]; then echo -n "Installation CDROM image \"${isofile}\" " echo "is not readable" exit 1 fi BOOTDISKS="-d ${isofile}" installer_opt="-s 31:0,ahci-cd,${isofile}" else BOOTDISKS="" i=0 while [ $i -lt $disk_total ] ; do eval "disk=\$disk_dev${i}" if [ -r ${disk} ] ; then BOOTDISKS="$BOOTDISKS -d ${disk} " fi i=$(($i + 1)) done installer_opt="" fi ${LOADER} -c ${console} -m ${memsize} ${BOOTDISKS} ${loader_opt} \ ${vmname} bhyve_exit=$? if [ $bhyve_exit -ne 0 ]; then break fi # # Build up args for additional tap and disk devices now. # nextslot=2 # slot 0 is hostbridge, slot 1 is lpc devargs="" # accumulate disk/tap args here i=0 while [ $i -lt $tap_total ] ; do eval "tapname=\$tap_dev${i}" devargs="$devargs -s $nextslot:0,virtio-net,${tapname} " nextslot=$(($nextslot + 1)) i=$(($i + 1)) done i=0 while [ $i -lt $disk_total ] ; do eval "disk=\$disk_dev${i}" eval "opts=\$disk_opts${i}" make_and_check_diskdev "${disk}" devargs="$devargs -s $nextslot:0,virtio-blk,${disk}${opts} " nextslot=$(($nextslot + 1)) i=$(($i + 1)) done + + i=0 + while [ $i -lt $pass_total ] ; do + eval "pass=\$pass_dev${i}" + devargs="$devargs -s $nextslot:0,passthru,${pass} " + nextslot=$(($nextslot + 1)) + i=$(($i + 1)) + done ${FBSDRUN} -c ${cpus} -m ${memsize} ${apic_opt} -A -H -P \ -g ${gdbport} \ -s 0:0,hostbridge \ -s 1:0,lpc \ ${devargs} \ -l com1,${console} \ ${installer_opt} \ ${vmname} bhyve_exit=$? # bhyve returns the following status codes: # 0 - VM has been reset # 1 - VM has been powered off # 2 - VM has been halted # 3 - VM generated a triple fault # all other non-zero status codes are errors # if [ $bhyve_exit -ne 0 ]; then break fi done case $bhyve_exit in 0|1|2) # Cleanup /dev/vmm entry when bhyve did not exit # due to an error. ${BHYVECTL} --vm=${vmname} --destroy > /dev/null 2>&1 ;; esac exit $bhyve_exit Index: stable/10/sys/amd64/include/vmm.h =================================================================== --- stable/10/sys/amd64/include/vmm.h (revision 284898) +++ stable/10/sys/amd64/include/vmm.h (revision 284899) @@ -1,638 +1,639 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) #ifdef _KERNEL #define VM_MAX_NAMELEN 32 struct vm; struct vm_exception; struct vm_memory_segment; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, struct pmap *pmap, void *rendezvous_cookie, void *suspend_cookie); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; 
vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; }; extern struct vmm_ops vmm_ops_intel; extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg); int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, vm_offset_t *offset, struct vm_object **object); boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); int vm_inject_extint(struct vm *vm, int vcpu); int vm_extint_pending(struct vm *vm, int vcpuid); void vm_extint_clear(struct vm *vm, int vcpuid); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * If the rendezvous is being initiated from a vcpu context then the * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. 
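(Illustrative aside, not part of this change: a minimal in-kernel sketch of the pattern described above; the callback name and the choice of 'dest' are assumptions.)

	static void
	example_rendezvous_cb(struct vm *vm, int vcpuid, void *arg)
	{
		/* runs on every vcpu in 'dest'; must not sleep or block */
	}

	/* initiated from a non-vcpu context, so 'vcpuid' is -1; no locks may be held */
	vm_smp_rendezvous(vm, -1, vm_active_cpus(vm), example_rendezvous_cb, NULL);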
*/ typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); static __inline int vcpu_rendezvous_pending(void *rendezvous_cookie) { return (*(uintptr_t *)rendezvous_cookie != 0); } static __inline int vcpu_suspended(void *suspend_cookie) { return (*(int *)suspend_cookie); } /* * Return 1 if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return 0 otherwise. */ int vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vm *vm, int vcpu) { return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)); } #endif void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. */ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. 
* * Returns 0 on success. * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vm *vm, int vcpuid); #endif /* KERNEL */ #define VM_MAXCPU 16 /* maximum virtual cpus */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. 
MOV) */ uint16_t op_flags; }; #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ index:4, base:4; uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ struct vie_op op; /* opcode description */ }; enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; + uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. 
*/ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); } static __inline void vm_inject_gp(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); } static __inline void vm_inject_ac(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); } void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); int vm_restart_instruction(void *vm, int vcpuid); #endif /* _VMM_H_ */ Index: stable/10/sys/amd64/include/vmm_instruction_emul.h =================================================================== --- stable/10/sys/amd64/include/vmm_instruction_emul.h (revision 284898) +++ stable/10/sys/amd64/include/vmm_instruction_emul.h (revision 284899) @@ -1,114 +1,114 @@ /*- * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_INSTRUCTION_EMUL_H_ #define _VMM_INSTRUCTION_EMUL_H_ #include /* * Callback functions to read and write memory regions. */ typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, uint64_t *rval, int rsize, void *arg); typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, uint64_t wval, int wsize, void *arg); /* * Emulate the decoded 'vie' instruction. * * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the * callback functions. * * 'void *vm' should be 'struct vm *' when called from kernel context and * 'struct vmctx *' when called from user context. 
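(Illustrative aside, not part of this change: a sketch of the read/write callbacks a userland device model might pass to vmm_emulate_instruction(); the register contents returned are placeholders.)

	static int
	mmio_read(void *vm, int cpuid, uint64_t gpa, uint64_t *rval, int rsize, void *arg)
	{
		*rval = 0;	/* supply the value of the emulated register at 'gpa' */
		return (0);
	}

	static int
	mmio_write(void *vm, int cpuid, uint64_t gpa, uint64_t wval, int wsize, void *arg)
	{
		return (0);	/* latch 'wval' into the emulated register at 'gpa' */
	}

	/* 'vie' and 'paging' would come from vmexit.u.inst_emul after a VM_EXITCODE_INST_EMUL exit */
	error = vmm_emulate_instruction(ctx, vcpu, gpa, vie, paging, mmio_read, mmio_write, NULL);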
* s */ int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t mrr, mem_region_write_t mrw, void *mrarg); int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size); /* * Returns 1 if an alignment check exception should be injected and 0 otherwise. */ int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, uint64_t rflags, uint64_t gla); /* Returns 1 if the 'gla' is not canonical and 0 otherwise. */ int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); uint64_t vie_size2mask(int size); int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, uint64_t *gla); #ifdef _KERNEL /* * APIs to fetch and decode the instruction from nested page fault handler. * * 'vie' must be initialized before calling 'vmm_fetch_instruction()' */ int vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *guest_paging, uint64_t rip, int inst_length, struct vie *vie); /* * Translate the guest linear address 'gla' to a guest physical address. * * Returns 0 on success and '*gpa' contains the result of the translation. * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. */ -int vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, +int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa); void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); /* * Decode the instruction fetched into 'vie' so it can be emulated. * * 'gla' is the guest linear address provided by the hardware assist * that caused the nested page table fault. It is used to verify that * the software instruction decoding is in agreement with the hardware. * * Some hardware assists do not provide the 'gla' to the hypervisor. * To skip the 'gla' verification for this or any other reason pass * in VIE_INVALID_GLA instead. */ #define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); #endif /* _KERNEL */ #endif /* _VMM_INSTRUCTION_EMUL_H_ */ Index: stable/10/sys/amd64/vmm/amd/svm.c =================================================================== --- stable/10/sys/amd64/vmm/amd/svm.c (revision 284898) +++ stable/10/sys/amd64/vmm/amd/svm.c (revision 284899) @@ -1,2182 +1,2189 @@ /*- * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_ktr.h" #include "vmm_ioport.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "x86.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" #include "svm_msr.h" #include "npt.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); /* * SVM CPUID function 0x8000_000A, edx bit decoding. */ #define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ #define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ #define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ #define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ #define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ #define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ #define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ #define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ #define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ #define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ #define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ #define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ VMCB_CACHE_IOPM | \ VMCB_CACHE_I | \ VMCB_CACHE_TPR | \ VMCB_CACHE_CR2 | \ VMCB_CACHE_CR | \ VMCB_CACHE_DT | \ VMCB_CACHE_SEG | \ VMCB_CACHE_NP) static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, 0, NULL); static MALLOC_DEFINE(M_SVM, "svm", "svm"); static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); /* Per-CPU context area. */ extern struct pcpu __pcpu[]; static uint32_t svm_feature; /* AMD SVM features. */ SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RD, &svm_feature, 0, "SVM features advertised by CPUID.8000000AH:EDX"); static int disable_npf_assist; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, &disable_npf_assist, 0, NULL); /* Maximum ASIDs supported by the processor */ static uint32_t nasid; SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RD, &nasid, 0, "Number of ASIDs supported by this processor"); /* Current ASID generation for each host cpu */ static struct asid asid[MAXCPU]; /* * SVM host state saved area of size 4KB for each core. 
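 *
 * svm_enable() writes the physical address of each per-cpu area into
 * MSR_VM_HSAVE_PA so the processor has somewhere to spill host state
 * across VMRUN/#VMEXIT transitions.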
*/ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); static __inline int flush_by_asid(void) { return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); } static __inline int decode_assist(void) { return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); } static void svm_disable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer &= ~EFER_SVM; wrmsr(MSR_EFER, efer); } /* * Disable SVM on all CPUs. */ static int svm_cleanup(void) { smp_rendezvous(NULL, svm_disable, NULL, NULL); return (0); } /* * Verify that all the features required by bhyve are available. */ static int check_svm_features(void) { u_int regs[4]; /* CPUID Fn8000_000A is for SVM */ do_cpuid(0x8000000A, regs); svm_feature = regs[3]; nasid = regs[1]; KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); /* bhyve requires the Nested Paging feature */ if (!(svm_feature & AMD_CPUID_SVM_NP)) { printf("SVM: Nested Paging feature not available.\n"); return (ENXIO); } /* bhyve requires the NRIP Save feature */ if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { printf("SVM: NRIP Save feature not available.\n"); return (ENXIO); } return (0); } static void svm_enable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer |= EFER_SVM; wrmsr(MSR_EFER, efer); wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); } /* * Return 1 if SVM is enabled on this processor and 0 otherwise. */ static int svm_available(void) { uint64_t msr; /* Section 15.4 Enabling SVM from APM2. */ if ((amd_feature2 & AMDID2_SVM) == 0) { printf("SVM: not available.\n"); return (0); } msr = rdmsr(MSR_VM_CR); if ((msr & VM_CR_SVMDIS) != 0) { printf("SVM: disabled by BIOS.\n"); return (0); } return (1); } static int svm_init(int ipinum) { int error, cpu; if (!svm_available()) return (ENXIO); error = check_svm_features(); if (error) return (error); vmcb_clean &= VMCB_CACHE_DEFAULT; for (cpu = 0; cpu < MAXCPU; cpu++) { /* * Initialize the host ASIDs to their "highest" valid values. * * The next ASID allocation will rollover both 'gen' and 'num' * and start off the sequence at {1,1}. */ asid[cpu].gen = ~0UL; asid[cpu].num = nasid - 1; } svm_msr_init(); svm_npt_init(ipinum); /* Enable SVM on all CPUs */ smp_rendezvous(NULL, svm_enable, NULL, NULL); return (0); } static void svm_restore(void) { svm_enable(NULL); } /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 #define MSR_PENTIUM_END 0x1FFF /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000UL #define MSR_AMD6TH_END 0xC0001FFFUL /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000UL #define MSR_AMD7TH_END 0xC0011FFFUL /* * Get the index and bit position for a MSR in permission bitmap. * Two bits are used for each MSR: lower bit for read and higher bit for write. 
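 *
 * Worked example: for MSR_EFER (0xC0000080) the offset within the AMD
 * 6th-generation range is 0x80 and the Pentium range below it
 * contributes a base of 0x2000, so the byte index is
 * (0x80 + 0x2000) / 4 = 0x820 while the bit position is
 * (0xC0000080 % 4) * 2 = 0; bit 0 of that byte then gates reads and
 * bit 1 gates writes.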
*/ static int svm_msr_index(uint64_t msr, int *index, int *bit) { uint32_t base, off; *index = -1; *bit = (msr % 4) * 2; base = 0; if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) { *index = msr / 4; return (0); } base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { off = (msr - MSR_AMD6TH_START); *index = (off + base) / 4; return (0); } base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { off = (msr - MSR_AMD7TH_START); *index = (off + base) / 4; return (0); } return (EINVAL); } /* * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. */ static void svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) { int index, bit, error; error = svm_msr_index(msr, &index, &bit); KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, ("%s: invalid index %d for msr %#lx", __func__, index, msr)); KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " "msr %#lx", __func__, bit, msr)); if (read) perm_bitmap[index] &= ~(1UL << bit); if (write) perm_bitmap[index] &= ~(2UL << bit); } static void svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, true); } static void svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, false); } static __inline int svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) { struct vmcb_ctrl *ctrl; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); return (ctrl->intercept[idx] & bitmask ? 1 : 0); } static __inline void svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, int enabled) { struct vmcb_ctrl *ctrl; uint32_t oldval; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intercept[idx]; if (enabled) ctrl->intercept[idx] |= bitmask; else ctrl->intercept[idx] &= ~bitmask; if (ctrl->intercept[idx] != oldval) { svm_set_dirty(sc, vcpu, VMCB_CACHE_I); VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); } } static __inline void svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 0); } static __inline void svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 1); } static void vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, uint64_t msrpm_base_pa, uint64_t np_pml4) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; uint32_t mask; int n; ctrl = svm_get_vmcb_ctrl(sc, vcpu); state = svm_get_vmcb_state(sc, vcpu); ctrl->iopm_base_pa = iopm_base_pa; ctrl->msrpm_base_pa = msrpm_base_pa; /* Enable nested paging */ ctrl->np_enable = 1; ctrl->n_cr3 = np_pml4; /* * Intercept accesses to the control registers that are not shadowed * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. */ for (n = 0; n < 16; n++) { mask = (BIT(n) << 16) | BIT(n); if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); else svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); } /* * Intercept everything when tracing guest exceptions otherwise * just intercept machine check exception. 
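 *
 * The two vectors skipped in the loop below are 2 (NMI), which is
 * handled through the separate VMCB_INTCPT_NMI intercept enabled
 * further down, and 9 (coprocessor segment overrun), which modern
 * processors no longer generate.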
*/ if (vcpu_trace_exceptions(sc->vm, vcpu)) { for (n = 0; n < 32; n++) { /* * Skip unimplemented vectors in the exception bitmap. */ if (n == 2 || n == 9) { continue; } svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); } } else { svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); } /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_FERR_FREEZE); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); /* * From section "Canonicalization and Consistency Checks" in APMv2 * the VMRUN intercept bit must be set to pass the consistency check. */ svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); /* * The ASID will be set to a non-zero value just before VMRUN. */ ctrl->asid = 0; /* * Section 15.21.1, Interrupt Masking in EFLAGS * Section 15.21.2, Virtualizing APIC.TPR * * This must be set for %rflag and %cr8 isolation of guest and host. */ ctrl->v_intr_masking = 1; /* Enable Last Branch Record aka LBR for debugging */ ctrl->lbr_virt_en = 1; state->dbgctl = BIT(0); /* EFER_SVM must always be set when the guest is executing */ state->efer = EFER_SVM; /* Set up the PAT to power-on state */ state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); } /* * Initialize a virtual machine. */ static void * svm_vminit(struct vm *vm, pmap_t pmap) { struct svm_softc *svm_sc; struct svm_vcpu *vcpu; vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; int i; svm_sc = malloc(sizeof (struct svm_softc), M_SVM, M_WAITOK | M_ZERO); svm_sc->vm = vm; svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); /* * Intercept read and write accesses to all MSRs. */ memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap)); /* * Access to the following MSRs is redirected to the VMCB when the * guest is executing. Therefore it is safe to allow the guest to * read/write these MSRs directly without hypervisor involvement. */ svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); /* * Intercept writes to make sure that the EFER_SVM bit is not cleared. */ svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); /* Intercept access to all I/O ports. 
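 * A set bit in the I/O permission map forces a #VMEXIT(IOIO) for the
 * corresponding port, so filling the map with 0xFF below makes every
 * guest IN/OUT trap into the hypervisor.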
*/ memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap)); iopm_pa = vtophys(svm_sc->iopm_bitmap); msrpm_pa = vtophys(svm_sc->msr_bitmap); pml4_pa = svm_sc->nptp; for (i = 0; i < VM_MAXCPU; i++) { vcpu = svm_get_vcpu(svm_sc, i); vcpu->nextrip = ~0; vcpu->lastcpu = NOCPU; vcpu->vmcb_pa = vtophys(&vcpu->vmcb); vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); svm_msr_guest_init(svm_sc, i); } return (svm_sc); } static int svm_cpl(struct vmcb_state *state) { /* * From APMv2: * "Retrieve the CPL from the CPL field in the VMCB, not * from any segment DPL" */ return (state->cpl); } static enum vm_cpu_mode svm_vcpu_mode(struct vmcb *vmcb) { struct vmcb_segment seg; struct vmcb_state *state; int error; state = &vmcb->state; if (state->efer & EFER_LMA) { error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, error)); /* * Section 4.8.1 for APM2, check if Code Segment has * Long attribute set in descriptor. */ if (seg.attrib & VMCB_CS_ATTRIB_L) return (CPU_MODE_64BIT); else return (CPU_MODE_COMPATIBILITY); } else if (state->cr0 & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) { if ((cr0 & CR0_PG) == 0) return (PAGING_MODE_FLAT); if ((cr4 & CR4_PAE) == 0) return (PAGING_MODE_32); if (efer & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } /* * ins/outs utility routines */ static uint64_t svm_inout_str_index(struct svm_regctx *regs, int in) { uint64_t val; val = in ? regs->sctx_rdi : regs->sctx_rsi; return (val); } static uint64_t svm_inout_str_count(struct svm_regctx *regs, int rep) { uint64_t val; val = rep ? regs->sctx_rcx : 1; return (val); } static void svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { /* The segment field has standard encoding */ s = (info1 >> 10) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); } static int svm_inout_str_addrsize(uint64_t info1) { uint32_t size; size = (info1 >> 7) & 0x7; switch (size) { case 1: return (2); /* 16 bit */ case 2: return (4); /* 32 bit */ case 4: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) { struct vmcb_state *state; state = &vmcb->state; paging->cr3 = state->cr3; paging->cpl = svm_cpl(state); paging->cpu_mode = svm_vcpu_mode(vmcb); paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, state->efer); } #define UNHANDLED 0 /* * Handle guest I/O intercept. */ static int svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_regctx *regs; struct vm_inout_str *vis; uint64_t info1; int inout_string; state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); regs = svm_get_guest_regctx(svm_sc, vcpu); info1 = ctrl->exitinfo1; inout_string = info1 & BIT(2) ? 1 : 0; /* * The effective segment number in EXITINFO1[12:10] is populated * only if the processor has the DecodeAssist capability. * * XXX this is not specified explicitly in APMv2 but can be verified * empirically. 
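 *
 * The decoding below follows the IOIO EXITINFO1 layout: bit 0 is the
 * direction (1 = IN), bit 2 flags a string instruction, bit 3 a REP
 * prefix, bits 6:4 give the operand size, bits 9:7 the address size
 * and bits 31:16 the port number.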
*/ if (inout_string && !decode_assist()) return (UNHANDLED); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; vmexit->u.inout.string = inout_string; vmexit->u.inout.rep = (info1 & BIT(3)) ? 1 : 0; vmexit->u.inout.bytes = (info1 >> 4) & 0x7; vmexit->u.inout.port = (uint16_t)(info1 >> 16); vmexit->u.inout.eax = (uint32_t)(state->rax); if (inout_string) { vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); vis->rflags = state->rflags; vis->cr0 = state->cr0; vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); vis->addrsize = svm_inout_str_addrsize(info1); svm_inout_str_seginfo(svm_sc, vcpu, info1, vmexit->u.inout.in, vis); } return (UNHANDLED); } static int npf_fault_type(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_W) return (VM_PROT_WRITE); else if (exitinfo1 & VMCB_NPF_INFO1_ID) return (VM_PROT_EXECUTE); else return (VM_PROT_READ); } static bool svm_npf_emul_fault(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_ID) { return (false); } if (exitinfo1 & VMCB_NPF_INFO1_GPT) { return (false); } if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { return (false); } return (true); } static void svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) { struct vm_guest_paging *paging; struct vmcb_segment seg; struct vmcb_ctrl *ctrl; char *inst_bytes; int error, inst_len; ctrl = &vmcb->ctrl; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = VIE_INVALID_GLA; svm_paging_info(vmcb, paging); error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); switch(paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = seg.base; + vmexit->u.inst_emul.cs_d = 0; + break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = seg.base; + /* * Section 4.8.1 of APM2, Default Operand Size or D bit. */ vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? 1 : 0; break; default: + vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } /* * Copy the instruction bytes into 'vie' if available. */ if (decode_assist() && !disable_npf_assist) { inst_len = ctrl->inst_len; inst_bytes = ctrl->inst_bytes; } else { inst_len = 0; inst_bytes = NULL; } vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); } #ifdef KTR static const char * intrtype_to_str(int intr_type) { switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: return ("hwintr"); case VMCB_EVENTINJ_TYPE_NMI: return ("nmi"); case VMCB_EVENTINJ_TYPE_INTn: return ("swintr"); case VMCB_EVENTINJ_TYPE_EXCEPTION: return ("exception"); default: panic("%s: unknown intr_type %d", __func__, intr_type); } } #endif /* * Inject an event to vcpu as described in section 15.20, "Event injection". 
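 *
 * The EVENTINJ field is assembled below as the vector in bits 7:0, the
 * event type in bits 10:8, the VMCB_EVENTINJ_VALID and (when an error
 * code applies) VMCB_EVENTINJ_EC_VALID flags, and the error code itself
 * in bits 63:32.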
*/ static void svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, uint32_t error, bool ec_valid) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event already pending %#lx", __func__, ctrl->eventinj)); KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", __func__, vector)); switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: case VMCB_EVENTINJ_TYPE_NMI: case VMCB_EVENTINJ_TYPE_INTn: break; case VMCB_EVENTINJ_TYPE_EXCEPTION: if (vector >= 0 && vector <= 31 && vector != 2) break; /* FALLTHROUGH */ default: panic("%s: invalid intr_type/vector: %d/%d", __func__, intr_type, vector); } ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; if (ec_valid) { ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; ctrl->eventinj |= (uint64_t)error << 32; VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", intrtype_to_str(intr_type), vector, error); } else { VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", intrtype_to_str(intr_type), vector); } } static void svm_update_virqinfo(struct svm_softc *sc, int vcpu) { struct vm *vm; struct vlapic *vlapic; struct vmcb_ctrl *ctrl; int pending; vm = sc->vm; vlapic = vm_lapic(vm, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); /* Update %cr8 in the emulated vlapic */ vlapic_set_cr8(vlapic, ctrl->v_tpr); /* * If V_IRQ indicates that the interrupt injection attempted on then * last VMRUN was successful then update the vlapic accordingly. */ if (ctrl->v_intr_vector != 0) { pending = ctrl->v_irq; KASSERT(ctrl->v_intr_vector >= 16, ("%s: invalid " "v_intr_vector %d", __func__, ctrl->v_intr_vector)); KASSERT(!ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); VCPU_CTR2(vm, vcpu, "v_intr_vector %d %s", ctrl->v_intr_vector, pending ? "pending" : "accepted"); if (!pending) vlapic_intr_accepted(vlapic, ctrl->v_intr_vector); } } static void svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) { struct vmcb_ctrl *ctrl; uint64_t intinfo; ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); intinfo = ctrl->exitintinfo; if (!VMCB_EXITINTINFO_VALID(intinfo)) return; /* * From APMv2, Section "Intercepts during IDT interrupt delivery" * * If a #VMEXIT happened during event delivery then record the event * that was being delivered. 
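 *
 * The recorded exitintinfo is stashed via vm_exit_intinfo() and later
 * retrieved by svm_inj_intinfo(), which re-injects the interrupted
 * event through EVENTINJ on the next VMRUN.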
*/ VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); } static __inline int vintr_intercept_enabled(struct svm_softc *sc, int vcpu) { return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR)); } static __inline void enable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); KASSERT(vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be enabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); ctrl->v_irq = 1; ctrl->v_ign_tpr = 1; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static __inline void disable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(!vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be disabled", __func__)); return; } #ifdef KTR if (ctrl->v_intr_vector == 0) VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); else VCPU_CTR0(sc->vm, vcpu, "Clearing V_IRQ interrupt injection"); #endif ctrl->v_irq = 0; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static int svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val) { struct vmcb_ctrl *ctrl; int oldval, newval; ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intr_shadow; newval = val ? 1 : 0; if (newval != oldval) { ctrl->intr_shadow = newval; VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval); } return (0); } static int svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); *val = ctrl->intr_shadow; return (0); } /* * Once an NMI is injected it blocks delivery of further NMIs until the handler * executes an IRET. The IRET intercept is enabled when an NMI is injected to * to track when the vcpu is done handling the NMI. */ static int nmi_blocked(struct svm_softc *sc, int vcpu) { int blocked; blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); return (blocked); } static void enable_nmi_blocking(struct svm_softc *sc, int vcpu) { KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); } static void clear_nmi_blocking(struct svm_softc *sc, int vcpu) { int error; KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); /* * When the IRET intercept is cleared the vcpu will attempt to execute * the "iret" when it runs next. However, it is possible to inject * another NMI into the vcpu before the "iret" has actually executed. * * For e.g. if the "iret" encounters a #NPF when accessing the stack * it will trap back into the hypervisor. If an NMI is pending for * the vcpu it will be injected into the guest. * * XXX this needs to be fixed */ svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); /* * Set 'intr_shadow' to prevent an NMI from being injected on the * immediate VMRUN. 
*/ error = svm_modify_intr_shadow(sc, vcpu, 1); KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); } static int emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); else if (num == MSR_EFER) error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, val); else error = svm_wrmsr(sc, vcpu, num, val, retu); return (error); } static int emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) { struct vmcb_state *state; struct svm_regctx *ctx; uint64_t result; int error; if (lapic_msr(num)) error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); else error = svm_rdmsr(sc, vcpu, num, &result, retu); if (error == 0) { state = svm_get_vmcb_state(sc, vcpu); ctx = svm_get_guest_regctx(sc, vcpu); state->rax = result & 0xffffffff; ctx->sctx_rdx = result >> 32; } return (error); } #ifdef KTR static const char * exit_reason_to_str(uint64_t reason) { static char reasonbuf[32]; switch (reason) { case VMCB_EXIT_INVALID: return ("invalvmcb"); case VMCB_EXIT_SHUTDOWN: return ("shutdown"); case VMCB_EXIT_NPF: return ("nptfault"); case VMCB_EXIT_PAUSE: return ("pause"); case VMCB_EXIT_HLT: return ("hlt"); case VMCB_EXIT_CPUID: return ("cpuid"); case VMCB_EXIT_IO: return ("inout"); case VMCB_EXIT_MC: return ("mchk"); case VMCB_EXIT_INTR: return ("extintr"); case VMCB_EXIT_NMI: return ("nmi"); case VMCB_EXIT_VINTR: return ("vintr"); case VMCB_EXIT_MSR: return ("msr"); case VMCB_EXIT_IRET: return ("iret"); case VMCB_EXIT_MONITOR: return ("monitor"); case VMCB_EXIT_MWAIT: return ("mwait"); default: snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); return (reasonbuf); } } #endif /* KTR */ /* * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs * that are due to instruction intercepts as well as MSR and IOIO intercepts * and exceptions caused by INT3, INTO and BOUND instructions. * * Return 1 if the nRIP is valid and 0 otherwise. */ static int nrip_valid(uint64_t exitcode) { switch (exitcode) { case 0x00 ... 0x0F: /* read of CR0 through CR15 */ case 0x10 ... 0x1F: /* write of CR0 through CR15 */ case 0x20 ... 0x2F: /* read of DR0 through DR15 */ case 0x30 ... 0x3F: /* write of DR0 through DR15 */ case 0x43: /* INT3 */ case 0x44: /* INTO */ case 0x45: /* BOUND */ case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ return (1); default: return (0); } } /* * Collateral for a generic SVM VM-exit. */ static void vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) { vme->exitcode = VM_EXITCODE_SVM; vme->u.svm.exitcode = code; vme->u.svm.exitinfo1 = info1; vme->u.svm.exitinfo2 = info2; } static int svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct svm_regctx *ctx; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; int error, errcode_valid, handled, idtvec, reflect; bool retu; ctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb = svm_get_vmcb(svm_sc, vcpu); state = &vmcb->state; ctrl = &vmcb->ctrl; handled = 0; code = ctrl->exitcode; info1 = ctrl->exitinfo1; info2 = ctrl->exitinfo2; vmexit->exitcode = VM_EXITCODE_BOGUS; vmexit->rip = state->rip; vmexit->inst_length = nrip_valid(code) ? 
ctrl->nrip - state->rip : 0; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); /* * #VMEXIT(INVALID) needs to be handled early because the VMCB is * in an inconsistent state and can trigger assertions that would * never happen otherwise. */ if (code == VMCB_EXIT_INVALID) { vm_exit_svm(vmexit, code, info1, info2); return (0); } KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " "injection valid bit is set %#lx", __func__, ctrl->eventinj)); KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", vmexit->inst_length, code, info1, info2)); svm_update_virqinfo(svm_sc, vcpu); svm_save_intinfo(svm_sc, vcpu); switch (code) { case VMCB_EXIT_IRET: /* * Restart execution at "iret" but with the intercept cleared. */ vmexit->inst_length = 0; clear_nmi_blocking(svm_sc, vcpu); handled = 1; break; case VMCB_EXIT_VINTR: /* interrupt window exiting */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); handled = 1; break; case VMCB_EXIT_INTR: /* external interrupt */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); handled = 1; break; case VMCB_EXIT_NMI: /* external NMI */ handled = 1; break; case 0x40 ... 0x5F: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); reflect = 1; idtvec = code - 0x40; switch (idtvec) { case IDT_MC: /* * Call the machine check handler by hand. Also don't * reflect the machine check back into the guest. */ reflect = 0; VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); break; case IDT_PF: error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, info2); KASSERT(error == 0, ("%s: error %d updating cr2", __func__, error)); /* fallthru */ case IDT_NP: case IDT_SS: case IDT_GP: case IDT_AC: case IDT_TS: errcode_valid = 1; break; case IDT_DF: errcode_valid = 1; info1 = 0; break; case IDT_BP: case IDT_OF: case IDT_BR: /* * The 'nrip' field is populated for INT3, INTO and * BOUND exceptions and this also implies that * 'inst_length' is non-zero. * * Reset 'inst_length' to zero so the guest %rip at * event injection is identical to what it was when * the exception originally happened. */ VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " "to zero before injecting exception %d", vmexit->inst_length, idtvec); vmexit->inst_length = 0; /* fallthru */ default: errcode_valid = 0; info1 = 0; break; } KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " "when reflecting exception %d into guest", vmexit->inst_length, idtvec)); if (reflect) { /* Reflect the exception back into the guest */ VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " "%d/%#x into the guest", idtvec, (int)info1); error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, errcode_valid, info1, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); } handled = 1; break; case VMCB_EXIT_MSR: /* MSR access. 
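 * EXITINFO1 gives the direction: 1 for WRMSR and 0 for RDMSR. %ecx
 * selects the MSR and, for writes, %edx:%eax carry the 64-bit value
 * that the code below reassembles into 'val'.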
*/ eax = state->rax; ecx = ctx->sctx_rcx; edx = ctx->sctx_rdx; retu = false; if (info1) { vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); val = (uint64_t)edx << 32 | eax; VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", ecx, val); if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = val; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } } else { VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } } break; case VMCB_EXIT_IO: handled = svm_handle_io(svm_sc, vcpu, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); break; case VMCB_EXIT_CPUID: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); handled = x86_emulate_cpuid(svm_sc->vm, vcpu, (uint32_t *)&state->rax, (uint32_t *)&ctx->sctx_rbx, (uint32_t *)&ctx->sctx_rcx, (uint32_t *)&ctx->sctx_rdx); break; case VMCB_EXIT_HLT: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = state->rflags; break; case VMCB_EXIT_PAUSE: vmexit->exitcode = VM_EXITCODE_PAUSE; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); break; case VMCB_EXIT_NPF: /* EXITINFO2 contains the faulting guest physical address */ if (info1 & VMCB_NPF_INFO1_RSV) { VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " "reserved bits set: info1(%#lx) info2(%#lx)", info1, info2); } else if (vm_mem_allocated(svm_sc->vm, info2)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = info2; vmexit->u.paging.fault_type = npf_fault_type(info1); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " "on gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } else if (svm_npf_emul_fault(info1)) { svm_handle_inst_emul(vmcb, info2, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " "for gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } break; case VMCB_EXIT_MONITOR: vmexit->exitcode = VM_EXITCODE_MONITOR; break; case VMCB_EXIT_MWAIT: vmexit->exitcode = VM_EXITCODE_MWAIT; break; default: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", handled ? "handled" : "unhandled", exit_reason_to_str(code), vmexit->rip, vmexit->inst_length); if (handled) { vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; state->rip = vmexit->rip; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic SVM exit. */ vm_exit_svm(vmexit, code, info1, info2); } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. 
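 *
 * Either way svm_vmexit() returns 0 here, which terminates the VMRUN
 * loop in svm_vmrun() and lets the populated 'vmexit' propagate to the
 * bhyve process; returning 1 would have resumed the guest instead.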
*/ } } return (handled); } static void svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) { uint64_t intinfo; if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) return; KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " "valid: %#lx", __func__, intinfo)); svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), VMCB_EXITINTINFO_VECTOR(intinfo), VMCB_EXITINTINFO_EC(intinfo), VMCB_EXITINTINFO_EC_VALID(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); } /* * Inject event to virtual cpu. */ static void svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_vcpu *vcpustate; uint8_t v_tpr; int vector, need_intr_window, pending_apic_vector; state = svm_get_vmcb_state(sc, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); vcpustate = svm_get_vcpu(sc, vcpu); need_intr_window = 0; pending_apic_vector = 0; if (vcpustate->nextrip != state->rip) { ctrl->intr_shadow = 0; VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vcpustate->nextrip, state->rip); } /* * Inject pending events or exceptions for this vcpu. * * An event might be pending because the previous #VMEXIT happened * during event delivery (i.e. ctrl->exitintinfo). * * An event might also be pending because an exception was injected * by the hypervisor (e.g. #PF during instruction emulation). */ svm_inj_intinfo(sc, vcpu); /* NMI event has priority over interrupts. */ if (vm_nmi_pending(sc->vm, vcpu)) { if (nmi_blocked(sc, vcpu)) { /* * Can't inject another NMI if the guest has not * yet executed an "iret" after the last NMI. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " "to NMI-blocking"); } else if (ctrl->intr_shadow) { /* * Can't inject an NMI if the vcpu is in an intr_shadow. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " "interrupt shadow"); need_intr_window = 1; goto done; } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { /* * If there is already an exception/interrupt pending * then defer the NMI until after that. */ VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " "eventinj %#lx", ctrl->eventinj); /* * Use self-IPI to trigger a VM-exit as soon as * possible after the event injection is completed. * * This works only if the external interrupt exiting * is at a lower priority than the event injection. * * Although not explicitly specified in APMv2 the * relative priorities were verified empirically. */ ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ } else { vm_nmi_clear(sc->vm, vcpu); /* Inject NMI, vector number is not used */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, IDT_NMI, 0, false); /* virtual NMI blocking is now in effect */ enable_nmi_blocking(sc, vcpu); VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); } } if (!vm_extint_pending(sc->vm, vcpu)) { /* * APIC interrupts are delivered using the V_IRQ offload. * * The primary benefit is that the hypervisor doesn't need to * deal with the various conditions that inhibit interrupts. * It also means that TPR changes via CR8 will be handled * without any hypervisor involvement. * * Note that the APIC vector must remain pending in the vIRR * until it is confirmed that it was delivered to the guest. * This can be confirmed based on the value of V_IRQ at the * next #VMEXIT (1 = pending, 0 = delivered). * * Also note that it is possible that another higher priority * vector can become pending before this vector is delivered * to the guest. 
This is alright because vcpu_notify_event() * will send an IPI and force the vcpu to trap back into the * hypervisor. The higher priority vector will be injected on * the next VMRUN. */ if (vlapic_pending_intr(vlapic, &vector)) { KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); pending_apic_vector = vector; } goto done; } /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(sc->vm, &vector); KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); /* * If the guest has disabled interrupts or is in an interrupt shadow * then we cannot inject the pending interrupt. */ if ((state->rflags & PSL_I) == 0) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, state->rflags); need_intr_window = 1; goto done; } if (ctrl->intr_shadow) { VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " "interrupt shadow", vector); need_intr_window = 1; goto done; } if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "eventinj %#lx", vector, ctrl->eventinj); need_intr_window = 1; goto done; } /* * Legacy PIC interrupts are delivered via the event injection * mechanism. */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); vm_extint_clear(sc->vm, vcpu); vatpic_intr_accepted(sc->vm, vector); /* * Force a VM-exit as soon as the vcpu is ready to accept another * interrupt. This is done because the PIC might have another vector * that it wants to inject. Also, if the APIC has a pending interrupt * that was preempted by the ExtInt then it allows us to inject the * APIC vector as soon as possible. */ need_intr_window = 1; done: /* * The guest can modify the TPR by writing to %CR8. In guest mode * the processor reflects this write to V_TPR without hypervisor * intervention. * * The guest can also modify the TPR by writing to it via the memory * mapped APIC page. In this case, the write will be emulated by the * hypervisor. For this reason V_TPR must be updated before every * VMRUN. */ v_tpr = vlapic_get_cr8(vlapic); KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); if (ctrl->v_tpr != v_tpr) { VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", ctrl->v_tpr, v_tpr); ctrl->v_tpr = v_tpr; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } if (pending_apic_vector) { /* * If an APIC vector is being injected then interrupt window * exiting is not possible on this VMRUN. */ KASSERT(!need_intr_window, ("intr_window exiting impossible")); VCPU_CTR1(sc->vm, vcpu, "Injecting vector %d using V_IRQ", pending_apic_vector); ctrl->v_irq = 1; ctrl->v_ign_tpr = 0; ctrl->v_intr_vector = pending_apic_vector; ctrl->v_intr_prio = pending_apic_vector >> 4; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } else if (need_intr_window) { /* * We use V_IRQ in conjunction with the VINTR intercept to * trap into the hypervisor as soon as a virtual interrupt * can be delivered. * * Since injected events are not subject to intercept checks * we need to ensure that the V_IRQ is not actually going to * be delivered on VM entry. The KASSERT below enforces this. 
*/ KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, ("Bogus intr_window_exiting: eventinj (%#lx), " "intr_shadow (%u), rflags (%#lx)", ctrl->eventinj, ctrl->intr_shadow, state->rflags)); enable_intr_window_exiting(sc, vcpu); } else { disable_intr_window_exiting(sc, vcpu); } } static __inline void restore_host_tss(void) { struct system_segment_descriptor *tss_sd; /* * The TSS descriptor was in use prior to launching the guest so it * has been marked busy. * * 'ltr' requires the descriptor to be marked available so change the * type to "64-bit available TSS". */ tss_sd = PCPU_GET(tss); tss_sd->sd_type = SDT_SYSTSS; ltr(GSEL(GPROC0_SEL, SEL_KPL)); } static void check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) { struct svm_vcpu *vcpustate; struct vmcb_ctrl *ctrl; long eptgen; bool alloc_asid; KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " "active on cpu %u", __func__, thiscpu)); vcpustate = svm_get_vcpu(sc, vcpuid); ctrl = svm_get_vmcb_ctrl(sc, vcpuid); /* * The TLB entries associated with the vcpu's ASID are not valid * if either of the following conditions is true: * * 1. The vcpu's ASID generation is different than the host cpu's * ASID generation. This happens when the vcpu migrates to a new * host cpu. It can also happen when the number of vcpus executing * on a host cpu is greater than the number of ASIDs available. * * 2. The pmap generation number is different than the value cached in * the 'vcpustate'. This happens when the host invalidates pages * belonging to the guest. * * asidgen eptgen Action * mismatch mismatch * 0 0 (a) * 0 1 (b1) or (b2) * 1 0 (c) * 1 1 (d) * * (a) There is no mismatch in eptgen or ASID generation and therefore * no further action is needed. * * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is * retained and the TLB entries associated with this ASID * are flushed by VMRUN. * * (b2) If the cpu does not support FlushByAsid then a new ASID is * allocated. * * (c) A new ASID is allocated. * * (d) A new ASID is allocated. */ alloc_asid = false; eptgen = pmap->pm_eptgen; ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; if (vcpustate->asid.gen != asid[thiscpu].gen) { alloc_asid = true; /* (c) and (d) */ } else if (vcpustate->eptgen != eptgen) { if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ else alloc_asid = true; /* (b2) */ } else { /* * This is the common case (a). */ KASSERT(!alloc_asid, ("ASID allocation not necessary")); KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); } if (alloc_asid) { if (++asid[thiscpu].num >= nasid) { asid[thiscpu].num = 1; if (++asid[thiscpu].gen == 0) asid[thiscpu].gen = 1; /* * If this cpu does not support "flush-by-asid" * then flush the entire TLB on a generation * bump. Subsequent ASID allocation in this * generation can be done without a TLB flush. */ if (!flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; } vcpustate->asid.gen = asid[thiscpu].gen; vcpustate->asid.num = asid[thiscpu].num; ctrl->asid = vcpustate->asid.num; svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); /* * If this cpu supports "flush-by-asid" then the TLB * was not flushed after the generation bump. The TLB * is flushed selectively after every new ASID allocation. 
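 *
 * VMCB_TLB_FLUSH_GUEST asks the processor to flush only the entries
 * tagged with this guest's ASID on the next VMRUN, in contrast to the
 * VMCB_TLB_FLUSH_ALL fallback used above when flush-by-asid is not
 * supported.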
*/ if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; } vcpustate->eptgen = eptgen; KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); KASSERT(ctrl->asid == vcpustate->asid.num, ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); } static __inline void disable_gintr(void) { __asm __volatile("clgi"); } static __inline void enable_gintr(void) { __asm __volatile("stgi"); } /* * Start vcpu with specified RIP. */ static int svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, void *rend_cookie, void *suspended_cookie) { struct svm_regctx *gctx; struct svm_softc *svm_sc; struct svm_vcpu *vcpustate; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct vm_exit *vmexit; struct vlapic *vlapic; struct vm *vm; uint64_t vmcb_pa; u_int thiscpu; int handled; svm_sc = arg; vm = svm_sc->vm; vcpustate = svm_get_vcpu(svm_sc, vcpu); state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); vmexit = vm_exitinfo(vm, vcpu); vlapic = vm_lapic(vm, vcpu); /* * Stash 'curcpu' on the stack as 'thiscpu'. * * The per-cpu data area is not accessible until MSR_GSBASE is restored * after the #VMEXIT. Since VMRUN is executed inside a critical section * 'curcpu' and 'thiscpu' are guaranteed to identical. */ thiscpu = curcpu; gctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; if (vcpustate->lastcpu != thiscpu) { /* * Force new ASID allocation by invalidating the generation. */ vcpustate->asid.gen = 0; /* * Invalidate the VMCB state cache by marking all fields dirty. */ svm_set_dirty(svm_sc, vcpu, 0xffffffff); /* * XXX * Setting 'vcpustate->lastcpu' here is bit premature because * we may return from this function without actually executing * the VMRUN instruction. This could happen if a rendezvous * or an AST is pending on the first time through the loop. * * This works for now but any new side-effects of vcpu * migration should take this case into account. */ vcpustate->lastcpu = thiscpu; vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); } svm_msr_guest_enter(svm_sc, vcpu); /* Update Guest RIP */ state->rip = rip; do { /* * Disable global interrupts to guarantee atomicity during * loading of guest state. This includes not only the state * loaded by the "vmrun" instruction but also software state * maintained by the hypervisor: suspended and rendezvous * state, NPT generation number, vlapic interrupts etc. */ disable_gintr(); if (vcpu_suspended(suspended_cookie)) { enable_gintr(); vm_exit_suspended(vm, vcpu, state->rip); break; } if (vcpu_rendezvous_pending(rend_cookie)) { enable_gintr(); vm_exit_rendezvous(vm, vcpu, state->rip); break; } /* We are asked to give the cpu by scheduler. */ - if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) { + if (vcpu_should_yield(vm, vcpu)) { enable_gintr(); vm_exit_astpending(vm, vcpu, state->rip); break; } svm_inj_interrupts(svm_sc, vcpu, vlapic); /* Activate the nested pmap on 'thiscpu' */ CPU_SET_ATOMIC_ACQ(thiscpu, &pmap->pm_active); /* * Check the pmap generation and the ASID generation to * ensure that the vcpu does not use stale TLB mappings. */ check_asid(svm_sc, vcpu, pmap, thiscpu); ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; vcpustate->dirty = 0; VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); /* Launch Virtual Machine. */ VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); svm_launch(vmcb_pa, gctx); CPU_CLR_ATOMIC(thiscpu, &pmap->pm_active); /* * Restore MSR_GSBASE to point to the pcpu data area. 
* * Note that accesses done via PCPU_GET/PCPU_SET will work * only after MSR_GSBASE is restored. * * Also note that we don't bother restoring MSR_KGSBASE * since it is not used in the kernel and will be restored * when the VMRUN ioctl returns to userspace. */ wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[thiscpu]); KASSERT(curcpu == thiscpu, ("thiscpu/curcpu (%u/%u) mismatch", thiscpu, curcpu)); /* * The host GDTR and IDTR is saved by VMRUN and restored * automatically on #VMEXIT. However, the host TSS needs * to be restored explicitly. */ restore_host_tss(); /* #VMEXIT disables interrupts so re-enable them here. */ enable_gintr(); /* Update 'nextrip' */ vcpustate->nextrip = state->rip; /* Handle #VMEXIT and if required return to user space. */ handled = svm_vmexit(svm_sc, vcpu, vmexit); } while (handled); svm_msr_guest_exit(svm_sc, vcpu); return (0); } static void svm_vmcleanup(void *arg) { struct svm_softc *sc = arg; free(sc, M_SVM); } static register_t * swctx_regptr(struct svm_regctx *regctx, int reg) { switch (reg) { case VM_REG_GUEST_RBX: return (®ctx->sctx_rbx); case VM_REG_GUEST_RCX: return (®ctx->sctx_rcx); case VM_REG_GUEST_RDX: return (®ctx->sctx_rdx); case VM_REG_GUEST_RDI: return (®ctx->sctx_rdi); case VM_REG_GUEST_RSI: return (®ctx->sctx_rsi); case VM_REG_GUEST_RBP: return (®ctx->sctx_rbp); case VM_REG_GUEST_R8: return (®ctx->sctx_r8); case VM_REG_GUEST_R9: return (®ctx->sctx_r9); case VM_REG_GUEST_R10: return (®ctx->sctx_r10); case VM_REG_GUEST_R11: return (®ctx->sctx_r11); case VM_REG_GUEST_R12: return (®ctx->sctx_r12); case VM_REG_GUEST_R13: return (®ctx->sctx_r13); case VM_REG_GUEST_R14: return (®ctx->sctx_r14); case VM_REG_GUEST_R15: return (®ctx->sctx_r15); default: return (NULL); } } static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_get_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *val = *reg; return (0); } VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); return (EINVAL); } static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_modify_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *reg = val; return (0); } /* * XXX deal with CR3 and invalidate TLB entries tagged with the * vcpu's ASID. This needs to be treated differently depending on * whether 'running' is true/false. 
*/ VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); return (EINVAL); } static int svm_setcap(void *arg, int vcpu, int type, int val) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT, val); break; case VM_CAP_PAUSE_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE, val); break; case VM_CAP_UNRESTRICTED_GUEST: /* Unrestricted guest execution cannot be disabled in SVM */ if (val == 0) error = EINVAL; break; default: error = ENOENT; break; } return (error); } static int svm_getcap(void *arg, int vcpu, int type, int *retval) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT); break; case VM_CAP_PAUSE_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE); break; case VM_CAP_UNRESTRICTED_GUEST: *retval = 1; /* unrestricted guest is always enabled */ break; default: error = ENOENT; break; } return (error); } static struct vlapic * svm_vlapic_init(void *arg, int vcpuid) { struct svm_softc *svm_sc; struct vlapic *vlapic; svm_sc = arg; vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = svm_sc->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; vlapic_init(vlapic); return (vlapic); } static void svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_SVM_VLAPIC); } struct vmm_ops vmm_ops_amd = { svm_init, svm_cleanup, svm_restore, svm_vminit, svm_vmrun, svm_vmcleanup, svm_getreg, svm_setreg, vmcb_getdesc, vmcb_setdesc, svm_getcap, svm_setcap, svm_npt_alloc, svm_npt_free, svm_vlapic_init, svm_vlapic_cleanup }; Index: stable/10/sys/amd64/vmm/intel/vmx.c =================================================================== --- stable/10/sys/amd64/vmm/intel/vmx.c (revision 284898) +++ stable/10/sys/amd64/vmm/intel/vmx.c (revision 284899) @@ -1,3411 +1,3417 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_host.h" #include "vmm_ioport.h" #include "vmm_ipi.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "ept.h" #include "vmx_cpufunc.h" #include "vmx.h" #include "vmx_msr.h" #include "x86.h" #include "vmx_controls.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ PINBASED_NMI_EXITING | \ PINBASED_VIRTUAL_NMI) #define PINBASED_CTLS_ZERO_SETTING 0 #define PROCBASED_CTLS_WINDOW_SETTING \ (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ PROCBASED_MWAIT_EXITING | \ PROCBASED_MONITOR_EXITING | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ PROCBASED_IO_BITMAPS) #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ (VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ VM_EXIT_ACKNOWLEDGE_INTERRUPT) #define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS #define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define HANDLED 1 #define UNHANDLED 0 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; static uint64_t cr0_ones_mask, cr0_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, &cr0_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, &cr0_zeros_mask, 0, NULL); static uint64_t cr4_ones_mask, cr4_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, &cr4_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, &cr4_zeros_mask, 0, NULL); static int vmx_initialized; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, &vmx_initialized, 0, "Intel VMX initialized"); /* * Optional capabilities */ static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); static int cap_halt_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, "HLT triggers a VM-exit"); static int cap_pause_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 0, "PAUSE triggers a VM-exit"); static int cap_unrestricted_guest; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, &cap_unrestricted_guest, 0, "Unrestricted guests"); static int cap_monitor_trap; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, &cap_monitor_trap, 0, "Monitor trap flag"); static int cap_invpcid; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 0, "Guests are allowed to use INVPCID"); static int virtual_interrupt_delivery; SYSCTL_INT(_hw_vmm_vmx_cap, 
OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); static int pirvec; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); static struct unrhdr *vpid_unr; static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); /* * Use the last page below 4GB as the APIC access address. This address is * occupied by the boot firmware so it is guaranteed that it will not conflict * with a page in system memory. */ #define APIC_ACCESS_ADDRESS 0xFFFFF000 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); #ifdef KTR static const char * exit_reason_to_str(int reason) { static char reasonbuf[32]; switch (reason) { case EXIT_REASON_EXCEPTION: return "exception"; case EXIT_REASON_EXT_INTR: return "extint"; case EXIT_REASON_TRIPLE_FAULT: return "triplefault"; case EXIT_REASON_INIT: return "init"; case EXIT_REASON_SIPI: return "sipi"; case EXIT_REASON_IO_SMI: return "iosmi"; case EXIT_REASON_SMI: return "smi"; case EXIT_REASON_INTR_WINDOW: return "intrwindow"; case EXIT_REASON_NMI_WINDOW: return "nmiwindow"; case EXIT_REASON_TASK_SWITCH: return "taskswitch"; case EXIT_REASON_CPUID: return "cpuid"; case EXIT_REASON_GETSEC: return "getsec"; case EXIT_REASON_HLT: return "hlt"; case EXIT_REASON_INVD: return "invd"; case EXIT_REASON_INVLPG: return "invlpg"; case EXIT_REASON_RDPMC: return "rdpmc"; case EXIT_REASON_RDTSC: return "rdtsc"; case EXIT_REASON_RSM: return "rsm"; case EXIT_REASON_VMCALL: return "vmcall"; case EXIT_REASON_VMCLEAR: return "vmclear"; case EXIT_REASON_VMLAUNCH: return "vmlaunch"; case EXIT_REASON_VMPTRLD: return "vmptrld"; case EXIT_REASON_VMPTRST: return "vmptrst"; case EXIT_REASON_VMREAD: return "vmread"; case EXIT_REASON_VMRESUME: return "vmresume"; case EXIT_REASON_VMWRITE: return "vmwrite"; case EXIT_REASON_VMXOFF: return "vmxoff"; case EXIT_REASON_VMXON: return "vmxon"; case EXIT_REASON_CR_ACCESS: return "craccess"; case EXIT_REASON_DR_ACCESS: return "draccess"; case EXIT_REASON_INOUT: return "inout"; case EXIT_REASON_RDMSR: return "rdmsr"; case EXIT_REASON_WRMSR: return "wrmsr"; case EXIT_REASON_INVAL_VMCS: return "invalvmcs"; case EXIT_REASON_INVAL_MSR: return "invalmsr"; case EXIT_REASON_MWAIT: return "mwait"; case EXIT_REASON_MTF: return "mtf"; case EXIT_REASON_MONITOR: return "monitor"; case EXIT_REASON_PAUSE: return "pause"; case EXIT_REASON_MCE_DURING_ENTRY: return "mce-during-entry"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: return "apic-access"; case EXIT_REASON_GDTR_IDTR: return "gdtridtr"; case EXIT_REASON_LDTR_TR: return "ldtrtr"; case EXIT_REASON_EPT_FAULT: return "eptfault"; case EXIT_REASON_EPT_MISCONFIG: return "eptmisconfig"; case EXIT_REASON_INVEPT: return "invept"; case EXIT_REASON_RDTSCP: return "rdtscp"; case EXIT_REASON_VMX_PREEMPT: return "vmxpreempt"; case EXIT_REASON_INVVPID: return "invvpid"; case EXIT_REASON_WBINVD: return "wbinvd"; case EXIT_REASON_XSETBV: return "xsetbv"; case EXIT_REASON_APIC_WRITE: return "apic-write"; default: snprintf(reasonbuf, 
sizeof(reasonbuf), "%d", reason); return (reasonbuf); } } #endif /* KTR */ static int vmx_allow_x2apic_msrs(struct vmx *vmx) { int i, error; error = 0; /* * Allow readonly access to the following x2APIC MSRs from the guest. */ error += guest_msr_ro(vmx, MSR_APIC_ID); error += guest_msr_ro(vmx, MSR_APIC_VERSION); error += guest_msr_ro(vmx, MSR_APIC_LDR); error += guest_msr_ro(vmx, MSR_APIC_SVR); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); error += guest_msr_ro(vmx, MSR_APIC_ESR); error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_ICR); /* * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. * * These registers get special treatment described in the section * "Virtualizing MSR-Based APIC Accesses". */ error += guest_msr_rw(vmx, MSR_APIC_TPR); error += guest_msr_rw(vmx, MSR_APIC_EOI); error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); return (error); } u_long vmx_fix_cr0(u_long cr0) { return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); } u_long vmx_fix_cr4(u_long cr4) { return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); } static void vpid_free(int vpid) { if (vpid < 0 || vpid > 0xffff) panic("vpid_free: invalid vpid %d", vpid); /* * VPIDs [0,VM_MAXCPU] are special and are not allocated from * the unit number allocator. */ if (vpid > VM_MAXCPU) free_unr(vpid_unr, vpid); } static void vpid_alloc(uint16_t *vpid, int num) { int i, x; if (num <= 0 || num > VM_MAXCPU) panic("invalid number of vpids requested: %d", num); /* * If the "enable vpid" execution control is not enabled then the * VPID is required to be 0 for all vcpus. */ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { for (i = 0; i < num; i++) vpid[i] = 0; return; } /* * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { x = alloc_unr(vpid_unr); if (x == -1) break; else vpid[i] = x; } if (i < num) { atomic_add_int(&vpid_alloc_failed, 1); /* * If the unit number allocator does not have enough unique * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. * * These VPIDs are not be unique across VMs but this does not * affect correctness because the combined mappings are also * tagged with the EP4TA which is unique for each VM. * * It is still sub-optimal because the invvpid will invalidate * combined mappings for a particular VPID across all EP4TAs. */ while (i-- > 0) vpid_free(vpid[i]); for (i = 0; i < num; i++) vpid[i] = i + 1; } } static void vpid_init(void) { /* * VPID 0 is required when the "enable VPID" execution control is * disabled. * * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the * unit number allocator does not have sufficient unique VPIDs to * satisfy the allocation. * * The remaining VPIDs are managed by the unit number allocator. */ vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } static void vmx_disable(void *arg __unused) { struct invvpid_desc invvpid_desc = { 0 }; struct invept_desc invept_desc = { 0 }; if (vmxon_enabled[curcpu]) { /* * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 
* * VMXON or VMXOFF are not required to invalidate any TLB * caching structures. This prevents potential retention of * cached information in the TLB between distinct VMX episodes. */ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); vmxoff(); } load_cr4(rcr4() & ~CR4_VMXE); } static int vmx_cleanup(void) { if (pirvec != 0) vmm_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); } static void vmx_enable(void *arg __unused) { int error; uint64_t feature_control; feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | IA32_FEATURE_CONTROL_VMX_EN | IA32_FEATURE_CONTROL_LOCK); } load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); error = vmxon(vmxon_region[curcpu]); if (error == 0) vmxon_enabled[curcpu] = 1; } static void vmx_restore(void) { if (vmxon_enabled[curcpu]) vmxon(vmxon_region[curcpu]); } static int vmx_init(int ipinum) { int error, use_tpr_shadow; uint64_t basic, fixed0, fixed1, feature_control; uint32_t tmp, procbased2_vid_bits; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits * are set (bits 0 and 2 respectively). */ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { printf("vmx_init: VMX operation disabled by BIOS\n"); return (ENXIO); } /* * Verify capabilities MSR_VMX_BASIC: * - bit 54 indicates support for INS/OUTS decoding */ basic = rdmsr(MSR_VMX_BASIC); if ((basic & (1UL << 54)) == 0) { printf("vmx_init: processor does not support desired basic " "capabilities\n"); return (EINVAL); } /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); if (error) { printf("vmx_init: processor does not support desired primary " "processor-based controls\n"); return (error); } /* Clear the processor-based ctl bits that are set on demand */ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; /* Check support for secondary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); if (error) { printf("vmx_init: processor does not support desired secondary " "processor-based controls\n"); return (error); } /* Check support for VPID */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_VPID, 0, &tmp); if (error == 0) procbased_ctls2 |= PROCBASED2_ENABLE_VPID; /* Check support for pin-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); if (error) { printf("vmx_init: processor does not support desired " "pin-based controls\n"); return (error); } /* Check support for VM-exit controls */ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, 
&exit_ctls); if (error) { printf("vmx_init: processor does not support desired " "exit controls\n"); return (error); } /* Check support for VM-entry controls */ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &entry_ctls); if (error) { printf("vmx_init: processor does not support desired " "entry controls\n"); return (error); } /* * Check support for optional features by testing them * as individual bits */ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_HLT_EXITING, 0, &tmp) == 0); cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_PROCBASED_CTLS, PROCBASED_MTF, 0, &tmp) == 0); cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_PAUSE_EXITING, 0, &tmp) == 0); cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, &tmp) == 0); /* * Check support for virtual interrupt delivery. */ procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | PROCBASED2_VIRTUALIZE_X2APIC_MODE | PROCBASED2_APIC_REGISTER_VIRTUALIZATION | PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, &tmp) == 0); error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, procbased2_vid_bits, 0, &tmp); if (error == 0 && use_tpr_shadow) { virtual_interrupt_delivery = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", &virtual_interrupt_delivery); } if (virtual_interrupt_delivery) { procbased_ctls |= PROCBASED_USE_TPR_SHADOW; procbased_ctls2 |= procbased2_vid_bits; procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; /* * No need to emulate accesses to %CR8 if virtual * interrupt delivery is enabled. */ procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; /* * Check for Posted Interrupts only if Virtual Interrupt * Delivery is enabled. */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, &tmp); if (error == 0) { pirvec = vmm_ipi_alloc(); if (pirvec == 0) { if (bootverbose) { printf("vmx_init: unable to allocate " "posted interrupt vector\n"); } } else { posted_interrupts = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", &posted_interrupts); } } } if (posted_interrupts) pinbased_ctls |= PINBASED_POSTED_INTERRUPT; /* Initialize EPT */ error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); cr0_ones_mask = fixed0 & fixed1; cr0_zeros_mask = ~fixed0 & ~fixed1; /* * CR0_PE and CR0_PG can be set to zero in VMX non-root operation * if unrestricted guest execution is allowed. */ if (cap_unrestricted_guest) cr0_ones_mask &= ~(CR0_PG | CR0_PE); /* * Do not allow the guest to set CR0_NW or CR0_CD. 
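 * In other words the guest is never allowed to disable caching;
 * emulating the no-fill cache mode is not attempted.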
*/ cr0_zeros_mask |= (CR0_NW | CR0_CD); fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; vpid_init(); vmx_msr_init(); /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); vmx_initialized = 1; return (0); } static void vmx_trigger_hostintr(int vector) { uintptr_t func; struct gate_descriptor *gd; gd = &idt[vector]; KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " "invalid vector %d", vector)); KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", vector)); KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " "has invalid type %d", vector, gd->gd_type)); KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " "has invalid dpl %d", vector, gd->gd_dpl)); KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " "for vector %d has invalid selector %d", vector, gd->gd_selector)); KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " "IST %d", vector, gd->gd_ist)); func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); vmx_call_isr(func); } static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { int error, mask_ident, shadow_ident; uint64_t mask_value; if (which != 0 && which != 4) panic("vmx_setup_cr_shadow: unknown cr%d", which); if (which == 0) { mask_ident = VMCS_CR0_MASK; mask_value = cr0_ones_mask | cr0_zeros_mask; shadow_ident = VMCS_CR0_SHADOW; } else { mask_ident = VMCS_CR4_MASK; mask_value = cr4_ones_mask | cr4_zeros_mask; shadow_ident = VMCS_CR4_SHADOW; } error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); if (error) return (error); error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); if (error) return (error); return (0); } #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; int i, error; struct vmx *vmx; struct vmcs *vmcs; uint32_t exc_bitmap; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { panic("malloc of struct vmx not aligned on %d byte boundary", PAGE_SIZE); } vmx->vm = vm; vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); /* * Clean up EPTP-tagged guest physical and combined mappings * * VMX transitions are not required to invalidate any guest physical * mappings. So, it may be possible for stale guest physical mappings * to be present in the processor TLBs. * * Combined mappings for this EP4TA are also invalidated for all VPIDs. */ ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); /* * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. * The guest FSBASE and GSBASE are saved and restored during * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are * always restored from the vmcs host state area on vm-exit. * * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in * how they are saved/restored so can be directly accessed by the * guest. * * MSR_EFER is saved and restored in the guest VMCS area on a * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * * The TSC MSR is exposed read-only. Writes are disallowed as that * will impact the host TSC. * XXX Writes would be implemented with a wrmsr trap, and * then modifying the TSC offset in the VMCS. 
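 * Roughly (field name for illustration only, and assuming the
 * "use TSC offsetting" execution control is enabled):
 *
 *	offset = desired_guest_tsc - rdtsc();
 *	vmcs_write(VMCS_TSC_OFFSET, offset);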
*/ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); vpid_alloc(vpid, VM_MAXCPU); if (virtual_interrupt_delivery) { error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, APIC_ACCESS_ADDRESS); /* XXX this should really return an error to the caller */ KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); } for (i = 0; i < VM_MAXCPU; i++) { vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", error, i); } vmx_msr_guest_init(vmx, i); error = vmcs_init(vmcs); KASSERT(error == 0, ("vmcs_init error %d", error)); VMPTRLD(vmcs); error = 0; error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); error += vmwrite(VMCS_EPTP, vmx->eptp); error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); error += vmwrite(VMCS_VPID, vpid[i]); /* exception bitmap */ if (vcpu_trace_exceptions(vm, i)) exc_bitmap = 0xffffffff; else exc_bitmap = 1 << IDT_MC; error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); if (virtual_interrupt_delivery) { error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(&vmx->apic_page[i])); error += vmwrite(VMCS_EOI_EXIT0, 0); error += vmwrite(VMCS_EOI_EXIT1, 0); error += vmwrite(VMCS_EOI_EXIT2, 0); error += vmwrite(VMCS_EOI_EXIT3, 0); } if (posted_interrupts) { error += vmwrite(VMCS_PIR_VECTOR, pirvec); error += vmwrite(VMCS_PIR_DESC, vtophys(&vmx->pir_desc[i])); } VMCLEAR(vmcs); KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; vmx->cap[i].proc_ctls2 = procbased_ctls2; vmx->state[i].nextrip = ~0; vmx->state[i].lastcpu = NOCPU; vmx->state[i].vpid = vpid[i]; /* * Set up the CR0/4 shadows, and init the read shadow * to the power-on register value from the Intel Sys Arch. * CR0 - 0x60000010 * CR4 - 0 */ error = vmx_setup_cr0_shadow(vmcs, 0x60000010); if (error != 0) panic("vmx_setup_cr0_shadow %d", error); error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); vmx->ctx[i].pmap = pmap; } return (vmx); } static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { int handled, func; func = vmxctx->guest_rax; handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), (uint32_t*)(&vmxctx->guest_rdx)); return (handled); } static __inline void vmx_run_trace(struct vmx *vmx, int vcpu) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); #endif } static __inline void vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, int handled) { #ifdef KTR VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", handled ? 
"handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif } static __inline void vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); #endif } static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); /* * Invalidate guest mappings identified by its vpid from the TLB. */ static __inline void vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; if (vmxstate->vpid == 0) return; if (!running) { /* * Set the 'lastcpu' to an invalid host cpu. * * This will invalidate TLB entries tagged with the vcpu's * vpid the next time it runs via vmx_set_pcpu_defaults(). */ vmxstate->lastcpu = NOCPU; return; } KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " "critical section", __func__, vcpu)); /* * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated * mappings associated with its 'vpid' during that run. So we must * assume that the mappings associated with 'vpid' on 'curcpu' are * stale and invalidate them. * * Note that we incur this penalty only when the scheduler chooses to * move the thread associated with this vcpu between host cpus. * * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { invvpid_desc._res1 = 0; invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); } else { /* * The invvpid can be skipped if an invept is going to * be performed before entering the guest. The invept * will invalidate combined mappings tagged with * 'vmx->eptp' for all vpids. */ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } static void vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { struct vmxstate *vmxstate; vmxstate = &vmx->state[vcpu]; if (vmxstate->lastcpu == curcpu) return; vmxstate->lastcpu = curcpu; vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); vmx_invvpid(vmx, vcpu, pmap, 1); } /* * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 
*/ CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); static void __inline vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); } } static void __inline vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } static void __inline vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); } } static void __inline vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) static void vmx_inject_nmi(struct vmx *vmx, int vcpu) { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); /* * Inject the virtual NMI. The vector must be the NMI IDT entry * or the VMCS entry check will fail. */ info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; vmcs_write(VMCS_ENTRY_INTR_INFO, info); VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); } static void vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, uint64_t guestrip) { int vector, need_nmi_exiting, extint_pending; uint64_t rflags, entryinfo; uint32_t gi, info; if (vmx->state[vcpu].nextrip != guestrip) { gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vmx->state[vcpu].nextrip, guestrip); gi &= ~HWINTR_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); info = entryinfo; vector = info & 0xff; if (vector == IDT_BP || vector == IDT_OF) { /* * VT-x requires #BP and #OF to be injected as software * exceptions. 
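 * A software exception differs from a hardware exception only in the
 * interruption type (VMCS_INTR_T_SWEXCEPTION); that type also makes the
 * VM-entry instruction length field applicable so the saved %rip ends
 * up past the INT3/INTO.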
*/ info &= ~VMCS_INTR_T_MASK; info |= VMCS_INTR_T_SWEXCEPTION; } if (info & VMCS_INTR_DEL_ERRCODE) vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); vmcs_write(VMCS_ENTRY_INTR_INFO, info); } if (vm_nmi_pending(vmx->vm, vcpu)) { /* * If there are no conditions blocking NMI injection then * inject it directly here otherwise enable "NMI window * exiting" to inject it as soon as we can. * * We also check for STI_BLOCKING because some implementations * don't allow NMI injection in this case. If we are running * on a processor that doesn't have this restriction it will * immediately exit and the NMI will be injected in the * "NMI window exiting" handler. */ need_nmi_exiting = 1; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { info = vmcs_read(VMCS_ENTRY_INTR_INFO); if ((info & VMCS_INTR_VALID) == 0) { vmx_inject_nmi(vmx, vcpu); need_nmi_exiting = 0; } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " "due to VM-entry intr info %#x", info); } } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " "Guest Interruptibility-state %#x", gi); } if (need_nmi_exiting) vmx_set_nmi_window_exiting(vmx, vcpu); } extint_pending = vm_extint_pending(vmx->vm, vcpu); if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } /* * If interrupt-window exiting is already in effect then don't bother * checking for pending interrupts. This is just an optimization and * not needed for correctness. */ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " "pending int_window_exiting"); return; } if (!extint_pending) { /* Ask the local apic for a vector to inject */ if (!vlapic_pending_intr(vlapic, &vector)) return; /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [16,255] can be delivered * through the local APIC. */ KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(vmx->vm, &vector); /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [0,255] can be delivered * through the INTR pin. */ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* Check RFLAGS.IF and the interruptibility state of the guest */ rflags = vmcs_read(VMCS_GUEST_RFLAGS); if ((rflags & PSL_I) == 0) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, rflags); goto cantinject; } gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "Guest Interruptibility-state %#x", vector, gi); goto cantinject; } info = vmcs_read(VMCS_ENTRY_INTR_INFO); if (info & VMCS_INTR_VALID) { /* * This is expected and could happen for multiple reasons: * - A vectoring VM-entry was aborted due to astpending * - A VM-exit happened during event injection. * - An exception was injected above. 
* - An NMI was injected above or after "NMI window exiting" */ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "VM-entry intr info %#x", vector, info); goto cantinject; } /* Inject the interrupt */ info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; info |= vector; vmcs_write(VMCS_ENTRY_INTR_INFO, info); if (!extint_pending) { /* Update the Local APIC ISR */ vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(vmx->vm, vcpu); vatpic_intr_accepted(vmx->vm, vector); /* * After we accepted the current ExtINT the PIC may * have posted another one. If that is the case, set * the Interrupt Window Exiting execution control so * we can inject that one too. * * Also, interrupt window exiting allows us to inject any * pending APIC vector that was preempted by the ExtINT * as soon as possible. This applies both for the software * emulated vlapic and the hardware assisted virtual APIC. */ vmx_set_int_window_exiting(vmx, vcpu); } VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); return; cantinject: /* * Set the Interrupt Window Exiting execution control so we can inject * the interrupt as soon as blocking condition goes away. */ vmx_set_int_window_exiting(vmx, vcpu); } /* * If the Virtual NMIs execution control is '1' then the logical processor * tracks virtual-NMI blocking in the Guest Interruptibility-state field of * the VMCS. An IRET instruction in VMX non-root operation will remove any * virtual-NMI blocking. * * This unblocking occurs even if the IRET causes a fault. In this case the * hypervisor needs to restore virtual-NMI blocking before resuming the guest. */ static void vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, ("NMI blocking is not in effect %#x", gi)); } static int vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { struct vmxctx *vmxctx; uint64_t xcrval; const struct xsave_limits *limits; vmxctx = &vmx->ctx[vcpu]; limits = vmm_get_xsave_limits(); /* * Note that the processor raises a GP# fault on its own if * xsetbv is executed for CPL != 0, so we do not have to * emulate that fault here. */ /* Only xcr0 is supported. */ if (vmxctx->guest_rcx != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { vm_inject_ud(vmx->vm, vcpu); return (HANDLED); } xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); if ((xcrval & ~limits->xcr0_allowed) != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } if (!(xcrval & XFEATURE_ENABLED_X87)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* AVX (YMM_Hi128) requires SSE. 
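 * (XFEATURE_AVX is the SSE and AVX state bits together, so the check
 * below rejects enabling AVX state without also enabling SSE state.)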
*/ if (xcrval & XFEATURE_ENABLED_AVX && (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, * ZMM_Hi256, and Hi16_ZMM. */ if (xcrval & XFEATURE_AVX512 && (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != (XFEATURE_AVX512 | XFEATURE_AVX)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * Intel MPX requires both bound register state flags to be * set. */ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * This runs "inside" vmrun() with the guest's FPU state, so * modifying xcr0 directly modifies the guest's xcr0, not the * host's. */ load_xcr(0, xcrval); return (HANDLED); } static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { const struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: return (vmxctx->guest_rax); case 1: return (vmxctx->guest_rcx); case 2: return (vmxctx->guest_rdx); case 3: return (vmxctx->guest_rbx); case 4: return (vmcs_read(VMCS_GUEST_RSP)); case 5: return (vmxctx->guest_rbp); case 6: return (vmxctx->guest_rsi); case 7: return (vmxctx->guest_rdi); case 8: return (vmxctx->guest_r8); case 9: return (vmxctx->guest_r9); case 10: return (vmxctx->guest_r10); case 11: return (vmxctx->guest_r11); case 12: return (vmxctx->guest_r12); case 13: return (vmxctx->guest_r13); case 14: return (vmxctx->guest_r14); case 15: return (vmxctx->guest_r15); default: panic("invalid vmx register %d", ident); } } static void vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) { struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: vmxctx->guest_rax = regval; break; case 1: vmxctx->guest_rcx = regval; break; case 2: vmxctx->guest_rdx = regval; break; case 3: vmxctx->guest_rbx = regval; break; case 4: vmcs_write(VMCS_GUEST_RSP, regval); break; case 5: vmxctx->guest_rbp = regval; break; case 6: vmxctx->guest_rsi = regval; break; case 7: vmxctx->guest_rdi = regval; break; case 8: vmxctx->guest_r8 = regval; break; case 9: vmxctx->guest_r9 = regval; break; case 10: vmxctx->guest_r10 = regval; break; case 11: vmxctx->guest_r11 = regval; break; case 12: vmxctx->guest_r12 = regval; break; case 13: vmxctx->guest_r13 = regval; break; case 14: vmxctx->guest_r14 = regval; break; case 15: vmxctx->guest_r15 = regval; break; default: panic("invalid vmx register %d", ident); } } static int vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr0 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR0_SHADOW, regval); crval = regval | cr0_ones_mask; crval &= ~cr0_zeros_mask; vmcs_write(VMCS_GUEST_CR0, crval); if (regval & CR0_PG) { uint64_t efer, entry_ctls; /* * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and * the "IA-32e mode guest" bit in VM-entry control must be * equal. 
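 * A mov to %cr0 that sets PG while EFER.LME is already set is therefore
 * the point where the guest switches into long mode, so EFER.LMA and
 * VM_ENTRY_GUEST_LMA are both updated below to keep the VMCS consistent.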
*/ efer = vmcs_read(VMCS_GUEST_IA32_EFER); if (efer & EFER_LME) { efer |= EFER_LMA; vmcs_write(VMCS_GUEST_IA32_EFER, efer); entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); entry_ctls |= VM_ENTRY_GUEST_LMA; vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); } } return (HANDLED); } static int vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr4 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR4_SHADOW, regval); crval = regval | cr4_ones_mask; crval &= ~cr4_zeros_mask; vmcs_write(VMCS_GUEST_CR4, crval); return (HANDLED); } static int vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { struct vlapic *vlapic; uint64_t cr8; int regnum; /* We only handle mov %cr8 to/from a register at this time. */ if ((exitqual & 0xe0) != 0x00) { return (UNHANDLED); } vlapic = vm_lapic(vmx->vm, vcpu); regnum = (exitqual >> 8) & 0xf; if (exitqual & 0x10) { cr8 = vlapic_get_cr8(vlapic); vmx_set_guest_reg(vmx, vcpu, regnum, cr8); } else { cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); vlapic_set_cr8(vlapic, cr8); } return (HANDLED); } /* * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL */ static int vmx_cpl(void) { uint32_t ssar; ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); return ((ssar >> 5) & 0x3); } static enum vm_cpu_mode vmx_cpu_mode(void) { uint32_t csar; if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); if (csar & 0x2000) return (CPU_MODE_64BIT); /* CS.L = 1 */ else return (CPU_MODE_COMPATIBILITY); } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode vmx_paging_mode(void) { if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) return (PAGING_MODE_FLAT); if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) return (PAGING_MODE_32); if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } static uint64_t inout_str_index(struct vmx *vmx, int vcpuid, int in) { uint64_t val; int error; enum vm_reg_name reg; reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; error = vmx_getreg(vmx, vcpuid, reg, &val); KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); return (val); } static uint64_t inout_str_count(struct vmx *vmx, int vcpuid, int rep) { uint64_t val; int error; if (rep) { error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); } else { val = 1; } return (val); } static int inout_str_addrsize(uint32_t inst_info) { uint32_t size; size = (inst_info >> 7) & 0x7; switch (size) { case 0: return (2); /* 16 bit */ case 1: return (4); /* 32 bit */ case 2: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { s = (inst_info >> 15) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); } static void vmx_paging_info(struct vm_guest_paging *paging) { paging->cr3 = vmcs_guest_cr3(); paging->cpl = vmx_cpl(); paging->cpu_mode = vmx_cpu_mode(); paging->paging_mode = vmx_paging_mode(); } static void vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) { struct vm_guest_paging *paging; uint32_t csar; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); switch (paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.inst_emul.cs_d = 0; + break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); break; default: + vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } vie_init(&vmexit->u.inst_emul.vie, NULL, 0); } static int ept_fault_type(uint64_t ept_qual) { int fault_type; if (ept_qual & EPT_VIOLATION_DATA_WRITE) fault_type = VM_PROT_WRITE; else if (ept_qual & EPT_VIOLATION_INST_FETCH) fault_type = VM_PROT_EXECUTE; else fault_type= VM_PROT_READ; return (fault_type); } static boolean_t ept_emulation_fault(uint64_t ept_qual) { int read, write; /* EPT fault on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) return (FALSE); /* EPT fault must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; if ((read | write) == 0) return (FALSE); /* * The EPT violation must have been caused by accessing a * guest-physical address that is a translation of a guest-linear * address. */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { return (FALSE); } return (TRUE); } static __inline int apic_access_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); } static __inline int x2apic_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 
1 : 0); } static int vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, uint64_t qual) { int error, handled, offset; uint32_t *apic_regs, vector; bool retu; handled = HANDLED; offset = APIC_WRITE_OFFSET(qual); if (!apic_access_virtualization(vmx, vcpuid)) { /* * In general there should not be any APIC write VM-exits * unless APIC-access virtualization is enabled. * * However self-IPI virtualization can legitimately trigger * an APIC-write VM-exit so treat it specially. */ if (x2apic_virtualization(vmx, vcpuid) && offset == APIC_OFFSET_SELF_IPI) { apic_regs = (uint32_t *)(vlapic->apic_page); vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; vlapic_self_ipi_handler(vlapic, vector); return (HANDLED); } else return (UNHANDLED); } switch (offset) { case APIC_OFFSET_ID: vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_LDR: vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: retu = false; error = vlapic_icrlo_write_handler(vlapic, &retu); if (error != 0 || retu) handled = UNHANDLED; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: vlapic_dcr_write_handler(vlapic); break; default: handled = UNHANDLED; break; } return (handled); } static bool apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) { if (apic_access_virtualization(vmx, vcpuid) && (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) return (true); else return (false); } static int vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint64_t qual; int access_type, offset, allowed; if (!apic_access_virtualization(vmx, vcpuid)) return (UNHANDLED); qual = vmexit->u.vmx.exit_qualification; access_type = APIC_ACCESS_TYPE(qual); offset = APIC_ACCESS_OFFSET(qual); allowed = 0; if (access_type == 0) { /* * Read data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } else if (access_type == 1) { /* * Write data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } if (allowed) { vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, VIE_INVALID_GLA); } /* * Regardless of whether the APIC-access is allowed this handler * always returns UNHANDLED: * - if the access is allowed then it is handled by emulating the * instruction that caused the VM-exit (outside the critical section) * - if the access is not allowed then it will be converted to an * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
*/ return (UNHANDLED); } static enum task_switch_reason vmx_task_switch_reason(uint64_t qual) { int reason; reason = (qual >> 30) & 0x3; switch (reason) { case 0: return (TSR_CALL); case 1: return (TSR_IRET); case 2: return (TSR_JMP); case 3: return (TSR_IDT_GATE); default: panic("%s: invalid reason %d", __func__, reason); } } static int emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); else error = vmx_wrmsr(vmx, vcpuid, num, val, retu); return (error); } static int emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) { struct vmxctx *vmxctx; uint64_t result; uint32_t eax, edx; int error; if (lapic_msr(num)) error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); else error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); if (error == 0) { eax = result; vmxctx = &vmx->ctx[vcpuid]; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); edx = result >> 32; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); } return (error); } static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; struct vlapic *vlapic; struct vm_inout_str *vis; struct vm_task_switch *ts; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; uint32_t intr_type, intr_vec, reason; uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; vmxctx = &vmx->ctx[vcpu]; qual = vmexit->u.vmx.exit_qualification; reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); /* * VM-entry failures during or after loading guest state. * * These VM-exits are uncommon but must be handled specially * as most VM-exit fields are not populated as usual. */ if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); __asm __volatile("int $18"); return (1); } /* * VM exits that can be triggered during event delivery need to * be handled specially by re-injecting the event if the IDT * vectoring information field's valid bit is set. * * See "Information for VM Exits During Event Delivery" in Intel SDM * for details. */ idtvec_info = vmcs_idt_vectoring_info(); if (idtvec_info & VMCS_IDT_VEC_VALID) { idtvec_info &= ~(1 << 12); /* clear undefined bit */ exitintinfo = idtvec_info; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { idtvec_err = vmcs_idt_vectoring_err(); exitintinfo |= (uint64_t)idtvec_err << 32; } error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); KASSERT(error == 0, ("%s: vm_set_intinfo error %d", __func__, error)); /* * If 'virtual NMIs' are being used and the VM-exit * happened while injecting an NMI during the previous * VM-entry, then clear "blocking by NMI" in the * Guest Interruptibility-State so the NMI can be * reinjected on the subsequent VM-entry. * * However, if the NMI was being delivered through a task * gate, then the new task must start execution with NMIs * blocked so don't clear NMI blocking in this case. 
*/ intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type == VMCS_INTR_T_NMI) { if (reason != EXIT_REASON_TASK_SWITCH) vmx_clear_nmi_blocking(vmx, vcpu); else vmx_assert_nmi_blocking(vmx, vcpu); } /* * Update VM-entry instruction length if the event being * delivered was a software interrupt or software exception. */ if (intr_type == VMCS_INTR_T_SWINTR || intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || intr_type == VMCS_INTR_T_SWEXCEPTION) { vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); } } switch (reason) { case EXIT_REASON_TASK_SWITCH: ts = &vmexit->u.task_switch; ts->tsssel = qual & 0xffff; ts->reason = vmx_task_switch_reason(qual); ts->ext = 0; ts->errcode_valid = 0; vmx_paging_info(&ts->paging); /* * If the task switch was due to a CALL, JMP, IRET, software * interrupt (INT n) or software exception (INT3, INTO), * then the saved %rip references the instruction that caused * the task switch. The instruction length field in the VMCS * is valid in this case. * * In all other cases (e.g., NMI, hardware exception) the * saved %rip is one that would have been saved in the old TSS * had the task switch completed normally so the instruction * length field is not needed in this case and is explicitly * set to 0. */ if (ts->reason == TSR_IDT_GATE) { KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, ("invalid idtvec_info %#x for IDT task switch", idtvec_info)); intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type != VMCS_INTR_T_SWINTR && intr_type != VMCS_INTR_T_SWEXCEPTION && intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { /* Task switch triggered by external event */ ts->ext = 1; vmexit->inst_length = 0; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { ts->errcode_valid = 1; ts->errcode = vmcs_idt_vectoring_err(); } } } vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " "%s errcode 0x%016lx", ts->reason, ts->tsssel, ts->ext ? 
"external" : "internal", ((uint64_t)ts->errcode << 32) | ts->errcode_valid); break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); break; case 4: handled = vmx_emulate_cr4_access(vmx, vcpu, qual); break; case 8: handled = vmx_emulate_cr8_access(vmx, vcpu, qual); break; } break; case EXIT_REASON_RDMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); error = emulate_rdmsr(vmx, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } break; case EXIT_REASON_WRMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); retu = false; eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); vmexit->exitcode = VM_EXITCODE_MTRAP; vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: /* * External interrupts serve only to cause VM exits and allow * the host interrupt handler to run. * * If this external interrupt triggers a virtual interrupt * to a VM, then that state will be recorded by the * host interrupt handler in the VM's softc. We will inject * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. * This appears to be a bug in VMware Fusion? */ if (!(intr_info & VMCS_INTR_VALID)) return (1); KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); vmx_trigger_hostintr(intr_info & 0xff); /* * This is special. We want to treat this as an 'handled' * VM-exit but not increment the instruction pointer. */ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); vmx_clear_nmi_window_exiting(vmx, vcpu); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); return (1); case EXIT_REASON_INOUT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.bytes = (qual & 0x7) + 1; vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; vmexit->u.inout.port = (uint16_t)(qual >> 16); vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); if (vmexit->u.inout.string) { inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; vmx_paging_info(&vis->paging); vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); vis->cr0 = vmcs_read(VMCS_GUEST_CR0); vis->index = inout_str_index(vmx, vcpu, in); vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); intr_vec = intr_info & 0xff; intr_type = intr_info & VMCS_INTR_T_MASK; /* * If Virtual NMIs control is 1 and the VM-exit is due to a * fault encountered during the execution of IRET then we must * restore the state of "virtual-NMI blocking" before resuming * the guest. * * See "Resuming Guest Software after Handling an Exception". * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (intr_vec != IDT_DF) && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ if (intr_type == VMCS_INTR_T_NMI) return (1); /* * Call the machine check handler by hand. Also don't reflect * the machine check back into the guest. */ if (intr_vec == IDT_MC) { VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); return (1); } if (intr_vec == IDT_PF) { error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", __func__, error)); } /* * Software exceptions exhibit trap-like behavior. This in * turn requires populating the VM-entry instruction length * so that the %rip in the trap frame is past the INT3/INTO * instruction. */ if (intr_type == VMCS_INTR_T_SWEXCEPTION) vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); /* Reflect all other exceptions back into the guest */ errcode_valid = errcode = 0; if (intr_info & VMCS_INTR_DEL_ERRCODE) { errcode_valid = 1; errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); } VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " "the guest", intr_vec, errcode); error = vm_inject_exception(vmx->vm, vcpu, intr_vec, errcode_valid, errcode, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); return (1); case EXIT_REASON_EPT_FAULT: /* * If 'gpa' lies within the address space allocated to * memory then this must be a nested page fault otherwise * this must be an instruction that accesses MMIO space. */ gpa = vmcs_gpa(); if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(vmx, vcpu, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->inst_length = 0; vmexit->u.paging.gpa = gpa; vmexit->u.paging.fault_type = ept_fault_type(qual); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); } /* * If Virtual NMIs control is 1 and the VM-exit is due to an * EPT fault during the execution of IRET then we must restore * the state of "virtual-NMI blocking" before resuming. 
* * See description of "NMI unblocking due to IRET" in * "Exit Qualification for EPT Violations". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (qual & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); break; case EXIT_REASON_VIRTUALIZED_EOI: vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; vmexit->u.ioapic_eoi.vector = qual & 0xFF; vmexit->inst_length = 0; /* trap-like */ break; case EXIT_REASON_APIC_ACCESS: handled = vmx_handle_apic_access(vmx, vcpu, vmexit); break; case EXIT_REASON_APIC_WRITE: /* * APIC-write VM exit is trap-like so the %rip is already * pointing to the next instruction. */ vmexit->inst_length = 0; vlapic = vm_lapic(vmx->vm, vcpu); handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); break; case EXIT_REASON_XSETBV: handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); break; case EXIT_REASON_MONITOR: vmexit->exitcode = VM_EXITCODE_MONITOR; break; case EXIT_REASON_MWAIT: vmexit->exitcode = VM_EXITCODE_MWAIT; break; default: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } if (handled) { /* * It is possible that control is returned to userland * even though we were able to handle the VM exit in the * kernel. * * In such a case we want to make sure that the userland * restarts guest execution at the instruction *after* * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. */ vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic VMX exit. */ vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = VM_SUCCESS; vmexit->u.vmx.inst_type = 0; vmexit->u.vmx.inst_error = 0; } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } return (handled); } static __inline void vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) { KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, ("vmx_exit_inst_error: invalid inst_fail_status %d", vmxctx->inst_fail_status)); vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = vmxctx->inst_fail_status; vmexit->u.vmx.inst_error = vmcs_instruction_error(); vmexit->u.vmx.exit_reason = ~0; vmexit->u.vmx.exit_qualification = ~0; switch (rc) { case VMX_VMRESUME_ERROR: case VMX_VMLAUNCH_ERROR: case VMX_INVEPT_ERROR: vmexit->u.vmx.inst_type = rc; break; default: panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); } } /* * If the NMI-exiting VM execution control is set to '1' then an NMI in * non-root operation causes a VM-exit. NMI blocking is in effect so it is * sufficient to simply vector to the NMI handler via a software interrupt. * However, this must be done before maskable interrupts are enabled * otherwise the "iret" issued by an interrupt handler will incorrectly * clear NMI blocking. 
*/ static __inline void vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint32_t intr_info; KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) return; intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " "to NMI has invalid vector: %#x", intr_info)); VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); __asm __volatile("int $2"); } } static int vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, void *rendezvous_cookie, void *suspend_cookie) { int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; uint32_t exit_reason; vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); vmexit = vm_exitinfo(vm, vcpu); launched = 0; KASSERT(vmxctx->pmap == pmap, ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); vmx_msr_guest_enter(vmx, vcpu); VMPTRLD(vmcs); /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); handled = UNHANDLED; /* * Interrupts are disabled from this point on until the * guest starts executing. This is done for the following * reasons: * * If an AST is asserted on this thread after the check below, * then the IPI_AST notification will not be lost, because it * will cause a VM exit due to external interrupt as soon as * the guest state is loaded. * * A posted interrupt after 'vmx_inject_interrupts()' will * not be "lost" because it will be held pending in the host * APIC because interrupts are disabled. The pending interrupt * will be recognized as soon as the guest state is loaded. * * The same reasoning applies to the IPI generated by * pmap_invalidate_ept(). */ disable_intr(); vmx_inject_interrupts(vmx, vcpu, vlapic, rip); /* * Check for vcpu suspension after injecting events because * vmx_inject_interrupts() can suspend the vcpu due to a * triple fault. 
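 *
 * Illustrative sketch of the sequence below:
 *
 *	disable_intr();
 *	vmx_inject_interrupts();	(may suspend the vcpu)
 *	check suspend/rendezvous/AST;	(still with interrupts disabled)
 *	vmx_enter_guest();		(a pending host interrupt or IPI
 *					 forces an immediate VM exit once
 *					 guest state is loaded)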
*/ if (vcpu_suspended(suspend_cookie)) { enable_intr(); vm_exit_suspended(vmx->vm, vcpu, rip); break; } if (vcpu_rendezvous_pending(rendezvous_cookie)) { enable_intr(); vm_exit_rendezvous(vmx->vm, vcpu, rip); break; } if (vcpu_should_yield(vm, vcpu)) { enable_intr(); vm_exit_astpending(vmx->vm, vcpu, rip); vmx_astpending_trace(vmx, vcpu, rip); handled = HANDLED; break; } vmx_run_trace(vmx, vcpu); rc = vmx_enter_guest(vmxctx, vmx, launched); /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; if (rc == VMX_GUEST_VMEXIT) { vmx_exit_handle_nmi(vmx, vcpu, vmexit); enable_intr(); handled = vmx_exit_process(vmx, vcpu, vmexit); } else { enable_intr(); vmx_exit_inst_error(vmxctx, rc, vmexit); } launched = 1; vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); rip = vmexit->rip; } while (handled); /* * If a VM exit has been handled then the exitcode must be BOGUS * If a VM exit is not handled then the exitcode must not be BOGUS */ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { panic("Mismatch between handled (%d) and exitcode (%d)", handled, vmexit->exitcode); } if (!handled) vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); vmx_msr_guest_exit(vmx, vcpu); return (0); } static void vmx_vmcleanup(void *arg) { int i; struct vmx *vmx = arg; if (apic_access_virtualization(vmx, 0)) vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); for (i = 0; i < VM_MAXCPU; i++) vpid_free(vmx->state[i].vpid); free(vmx, M_VMX); return; } static register_t * vmxctx_regptr(struct vmxctx *vmxctx, int reg) { switch (reg) { case VM_REG_GUEST_RAX: return (&vmxctx->guest_rax); case VM_REG_GUEST_RBX: return (&vmxctx->guest_rbx); case VM_REG_GUEST_RCX: return (&vmxctx->guest_rcx); case VM_REG_GUEST_RDX: return (&vmxctx->guest_rdx); case VM_REG_GUEST_RSI: return (&vmxctx->guest_rsi); case VM_REG_GUEST_RDI: return (&vmxctx->guest_rdi); case VM_REG_GUEST_RBP: return (&vmxctx->guest_rbp); case VM_REG_GUEST_R8: return (&vmxctx->guest_r8); case VM_REG_GUEST_R9: return (&vmxctx->guest_r9); case VM_REG_GUEST_R10: return (&vmxctx->guest_r10); case VM_REG_GUEST_R11: return (&vmxctx->guest_r11); case VM_REG_GUEST_R12: return (&vmxctx->guest_r12); case VM_REG_GUEST_R13: return (&vmxctx->guest_r13); case VM_REG_GUEST_R14: return (&vmxctx->guest_r14); case VM_REG_GUEST_R15: return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); default: break; } return (NULL); } static int vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *retval = *regp; return (0); } else return (EINVAL); } static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *regp = val; return (0); } else return (EINVAL); } static int vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) { uint64_t gi; int error; error = vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); *retval = (gi & HWINTR_BLOCKING) ? 
1 : 0; return (error); } static int vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) { struct vmcs *vmcs; uint64_t gi; int error, ident; /* * Forcing the vcpu into an interrupt shadow is not supported. */ if (val) { error = EINVAL; goto done; } vmcs = &vmx->vmcs[vcpu]; ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); error = vmcs_getreg(vmcs, running, ident, &gi); if (error == 0) { gi &= ~HWINTR_BLOCKING; error = vmcs_setreg(vmcs, running, ident, gi); } done: VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, error ? "failed" : "succeeded"); return (error); } static int vmx_shadow_reg(int reg) { int shreg; shreg = -1; switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; break; case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: break; } return (shreg); } static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) { int running, hostcpu; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); } static int vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); if (error == 0) { /* * If the "load EFER" VM-entry control is 1 then the * value of EFER.LMA must be identical to "IA-32e mode guest" * bit in the VM-entry control. */ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && (reg == VM_REG_GUEST_EFER)) { vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); if (val & EFER_LMA) ctls |= VM_ENTRY_GUEST_LMA; else ctls &= ~VM_ENTRY_GUEST_LMA; vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); } shadow = vmx_shadow_reg(reg); if (shadow > 0) { /* * Store the unmodified value in the shadow */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } if (reg == VM_REG_GUEST_CR3) { /* * Invalidate the guest vcpu's TLB mappings to emulate * the behavior of updating %cr3. * * XXX the processor retains global mappings when %cr3 * is updated but vmx_invvpid() does not. 
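 *
 * As a consequence, guest TLB entries for global pages are
 * invalidated here as well; this over-invalidates relative to a
 * native %cr3 write but is always safe.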
*/ pmap = vmx->ctx[vcpu].pmap; vmx_invvpid(vmx, vcpu, pmap, running); } } return (error); } static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { struct vmx *vmx = arg; int vcap; int ret; ret = ENOENT; vcap = vmx->cap[vcpu].set; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) ret = 0; break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) ret = 0; break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) ret = 0; break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) ret = 0; break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) ret = 0; break; default: break; } if (ret == 0) *retval = (vcap & (1 << type)) ? 1 : 0; return (ret); } static int vmx_setcap(void *arg, int vcpu, int type, int val) { struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; uint32_t baseval; uint32_t *pptr; int error; int flag; int reg; int retval; retval = ENOENT; pptr = NULL; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_HLT_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_MTF; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_PAUSE_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_ENABLE_INVPCID; reg = VMCS_SEC_PROC_BASED_CTLS; } break; default: break; } if (retval == 0) { if (val) { baseval |= flag; } else { baseval &= ~flag; } VMPTRLD(vmcs); error = vmwrite(reg, baseval); VMCLEAR(vmcs); if (error) { retval = error; } else { /* * Update optional stored flags, and record * setting */ if (pptr != NULL) { *pptr = baseval; } if (val) { vmx->cap[vcpu].set |= (1 << type); } else { vmx->cap[vcpu].set &= ~(1 << type); } } } return (retval); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; }; #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ level ? "level" : "edge", vector); \ VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? 
"yes" : "no");\ } while (0) /* * vlapic->ops handlers that utilize the APICv hardware assist described in * Chapter 29 of the Intel SDM. */ static int vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; int idx, notify; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; /* * Keep track of interrupt requests in the PIR descriptor. This is * because the virtual APIC page pointed to by the VMCS cannot be * modified if the vcpu is running. */ idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); return (notify); } static int vmx_pending_intr(struct vlapic *vlapic, int *vecptr) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t pending, pirval; uint32_t ppr, vpr; int i; /* * This function is only expected to be called from the 'HLT' exit * handler which does not care about the vector that is pending. */ KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); if (!pending) return (0); /* common case */ /* * If there is an interrupt pending then it will be recognized only * if its priority is greater than the processor priority. * * Special case: if the processor priority is zero then any pending * interrupt will be recognized. */ lapic = vlapic->apic_page; ppr = lapic->ppr & 0xf0; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; return (vpr > ppr); } } return (0); } static void vmx_intr_accepted(struct vlapic *vlapic, int vector) { panic("vmx_intr_accepted: not expected to be called"); } static void vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; uint64_t mask, val; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), ("vmx_set_tmr: vcpu cannot be running")); vlapic_vtx = (struct vlapic_vtx *)vlapic; vmx = vlapic_vtx->vmx; vmcs = &vmx->vmcs[vlapic->vcpuid]; mask = 1UL << (vector % 64); VMPTRLD(vmcs); val = vmcs_read(VMCS_EOI_EXIT(vector)); if (level) val |= mask; else val &= ~mask; vmcs_write(VMCS_EOI_EXIT(vector), val); VMCLEAR(vmcs); } static void vmx_enable_x2apic_mode(struct vlapic *vlapic) { struct vmx *vmx; struct vmcs *vmcs; uint32_t proc_ctls2; int vcpuid, error; vcpuid = vlapic->vcpuid; vmx = ((struct vlapic_vtx *)vlapic)->vmx; vmcs = &vmx->vmcs[vcpuid]; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); if (vlapic->vcpuid == 0) { /* * The nested page table mappings are shared by all vcpus * so unmap the APIC access page just once. 
*/ error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", __func__, error)); /* * The MSR bitmap is shared by all vcpus so modify it only * once in the context of vcpu 0. */ error = vmx_allow_x2apic_msrs(vmx); KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", __func__, error)); } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { ipi_cpu(hostcpu, pirvec); } /* * Transfer the pending interrupts in the PIR descriptor to the IRR * in the virtual APIC page. */ static void vmx_inject_pir(struct vlapic *vlapic) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t val, pirval; int rvi, pirbase = -1; uint16_t intr_status_old, intr_status_new; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "no posted interrupt pending"); return; } pirval = 0; pirbase = -1; lapic = vlapic->apic_page; val = atomic_readandclear_long(&pir_desc->pir[0]); if (val != 0) { lapic->irr0 |= val; lapic->irr1 |= val >> 32; pirbase = 0; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[1]); if (val != 0) { lapic->irr2 |= val; lapic->irr3 |= val >> 32; pirbase = 64; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[2]); if (val != 0) { lapic->irr4 |= val; lapic->irr5 |= val >> 32; pirbase = 128; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[3]); if (val != 0) { lapic->irr6 |= val; lapic->irr7 |= val >> 32; pirbase = 192; pirval = val; } VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); /* * Update RVI so the processor can evaluate pending virtual * interrupts on VM-entry. * * It is possible for pirval to be 0 here, even though the * pending bit has been set. The scenario is: * CPU-Y is sending a posted interrupt to CPU-X, which * is running a guest and processing posted interrupts in h/w. * CPU-X will eventually exit and the state seen in s/w is * the pending bit set, but no PIR bits set. 
* * CPU-X CPU-Y * (vm running) (host running) * rx posted interrupt * CLEAR pending bit * SET PIR bit * READ/CLEAR PIR bits * SET pending bit * (vm exit) * pending bit set, PIR 0 */ if (pirval != 0) { rvi = pirbase + flsl(pirval) - 1; intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); intr_status_new = (intr_status_old & 0xFF00) | rvi; if (intr_status_new > intr_status_old) { vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "guest_intr_status changed from 0x%04x to 0x%04x", intr_status_old, intr_status_new); } } } static struct vlapic * vmx_vlapic_init(void *arg, int vcpuid) { struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vmx->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; vlapic_vtx = (struct vlapic_vtx *)vlapic; vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; vlapic_vtx->vmx = vmx; if (virtual_interrupt_delivery) { vlapic->ops.set_intr_ready = vmx_set_intr_ready; vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; } if (posted_interrupts) vlapic->ops.post_intr = vmx_post_intr; vlapic_init(vlapic); return (vlapic); } static void vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_VLAPIC); } struct vmm_ops vmm_ops_intel = { vmx_init, vmx_cleanup, vmx_restore, vmx_vminit, vmx_run, vmx_vmcleanup, vmx_getreg, vmx_setreg, vmx_getdesc, vmx_setdesc, vmx_getcap, vmx_setcap, ept_vmspace_alloc, ept_vmspace_free, vmx_vlapic_init, vmx_vlapic_cleanup, }; Index: stable/10/sys/amd64/vmm/io/ppt.c =================================================================== --- stable/10/sys/amd64/vmm/io/ppt.c (revision 284898) +++ stable/10/sys/amd64/vmm/io/ppt.c (revision 284899) @@ -1,639 +1,651 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_ktr.h" #include "iommu.h" #include "ppt.h" /* XXX locking */ -#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) #define MAX_MSIMSGS 32 /* * If the MSI-X table is located in the middle of a BAR then that MMIO * region gets split into two segments - one segment above the MSI-X table * and the other segment below the MSI-X table - with a hole in place of * the MSI-X table so accesses to it can be trapped and emulated. * * So, allocate a MMIO segment for each BAR register + 1 additional segment. */ #define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); struct pptintr_arg { /* pptintr(pptintr_arg) */ struct pptdev *pptdev; uint64_t addr; uint64_t msg_data; }; -static struct pptdev { +struct pptdev { device_t dev; struct vm *vm; /* owner of this device */ + TAILQ_ENTRY(pptdev) next; struct vm_memory_segment mmio[MAX_MMIOSEGS]; struct { int num_msgs; /* guest state */ int startrid; /* host state */ struct resource *res[MAX_MSIMSGS]; void *cookie[MAX_MSIMSGS]; struct pptintr_arg arg[MAX_MSIMSGS]; } msi; struct { int num_msgs; int startrid; int msix_table_rid; struct resource *msix_table_res; struct resource **res; void **cookie; struct pptintr_arg *arg; } msix; -} pptdevs[64]; +}; SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices"); static int num_pptdevs; SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0, "number of pci passthru devices"); +static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list); + static int ppt_probe(device_t dev) { int bus, slot, func; struct pci_devinfo *dinfo; dinfo = (struct pci_devinfo *)device_get_ivars(dev); bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); /* * To qualify as a pci passthrough device a device must: * - be allowed by administrator to be used in this role * - be an endpoint device */ - if (vmm_is_pptdev(bus, slot, func) && - (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL) + if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL) + return (ENXIO); + else if (vmm_is_pptdev(bus, slot, func)) return (0); else - return (ENXIO); + /* + * Returning BUS_PROBE_NOWILDCARD here matches devices that the + * SR-IOV infrastructure specified as "ppt" passthrough devices. + * All normal devices that did not have "ppt" specified as their + * driver will not be matched by this. + */ + return (BUS_PROBE_NOWILDCARD); } static int ppt_attach(device_t dev) { - int n; + struct pptdev *ppt; - if (num_pptdevs >= MAX_PPTDEVS) { - printf("ppt_attach: maximum number of pci passthrough devices " - "exceeded\n"); - return (ENXIO); - } + ppt = device_get_softc(dev); - n = num_pptdevs++; - pptdevs[n].dev = dev; + num_pptdevs++; + TAILQ_INSERT_TAIL(&pptdev_list, ppt, next); + ppt->dev = dev; if (bootverbose) device_printf(dev, "attached\n"); return (0); } static int ppt_detach(device_t dev) { - /* - * XXX check whether there are any pci passthrough devices assigned - * to guests before we allow this driver to detach. 
- */ + struct pptdev *ppt; + ppt = device_get_softc(dev); + + if (ppt->vm != NULL) + return (EBUSY); + num_pptdevs--; + TAILQ_REMOVE(&pptdev_list, ppt, next); + return (0); } static device_method_t ppt_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ppt_probe), DEVMETHOD(device_attach, ppt_attach), DEVMETHOD(device_detach, ppt_detach), {0, 0} }; static devclass_t ppt_devclass; -DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0); +DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev)); DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); static struct pptdev * ppt_find(int bus, int slot, int func) { device_t dev; - int i, b, s, f; + struct pptdev *ppt; + int b, s, f; - for (i = 0; i < num_pptdevs; i++) { - dev = pptdevs[i].dev; + TAILQ_FOREACH(ppt, &pptdev_list, next) { + dev = ppt->dev; b = pci_get_bus(dev); s = pci_get_slot(dev); f = pci_get_function(dev); if (bus == b && slot == s && func == f) - return (&pptdevs[i]); + return (ppt); } return (NULL); } static void ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) { int i; struct vm_memory_segment *seg; for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) continue; (void)vm_unmap_mmio(vm, seg->gpa, seg->len); bzero(seg, sizeof(struct vm_memory_segment)); } } static void ppt_teardown_msi(struct pptdev *ppt) { int i, rid; void *cookie; struct resource *res; if (ppt->msi.num_msgs == 0) return; for (i = 0; i < ppt->msi.num_msgs; i++) { rid = ppt->msi.startrid + i; res = ppt->msi.res[i]; cookie = ppt->msi.cookie[i]; if (cookie != NULL) bus_teardown_intr(ppt->dev, res, cookie); if (res != NULL) bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); ppt->msi.res[i] = NULL; ppt->msi.cookie[i] = NULL; } if (ppt->msi.startrid == 1) pci_release_msi(ppt->dev); ppt->msi.num_msgs = 0; } static void ppt_teardown_msix_intr(struct pptdev *ppt, int idx) { int rid; struct resource *res; void *cookie; rid = ppt->msix.startrid + idx; res = ppt->msix.res[idx]; cookie = ppt->msix.cookie[idx]; if (cookie != NULL) bus_teardown_intr(ppt->dev, res, cookie); if (res != NULL) bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); ppt->msix.res[idx] = NULL; ppt->msix.cookie[idx] = NULL; } static void ppt_teardown_msix(struct pptdev *ppt) { int i; if (ppt->msix.num_msgs == 0) return; for (i = 0; i < ppt->msix.num_msgs; i++) ppt_teardown_msix_intr(ppt, i); if (ppt->msix.msix_table_res) { bus_release_resource(ppt->dev, SYS_RES_MEMORY, ppt->msix.msix_table_rid, ppt->msix.msix_table_res); ppt->msix.msix_table_res = NULL; ppt->msix.msix_table_rid = 0; } free(ppt->msix.res, M_PPTMSIX); free(ppt->msix.cookie, M_PPTMSIX); free(ppt->msix.arg, M_PPTMSIX); pci_release_msi(ppt->dev); ppt->msix.num_msgs = 0; } int ppt_avail_devices(void) { return (num_pptdevs); } int ppt_assigned_devices(struct vm *vm) { - int i, num; + struct pptdev *ppt; + int num; num = 0; - for (i = 0; i < num_pptdevs; i++) { - if (pptdevs[i].vm == vm) + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm == vm) num++; } return (num); } boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) { - int i, n; + int i; struct pptdev *ppt; struct vm_memory_segment *seg; - for (n = 0; n < num_pptdevs; n++) { - ppt = &pptdevs[n]; + TAILQ_FOREACH(ppt, &pptdev_list, next) { if (ppt->vm != vm) continue; for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) continue; if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) return (TRUE); } } return (FALSE); } int ppt_assign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; ppt = 
ppt_find(bus, slot, func); if (ppt != NULL) { /* * If this device is owned by a different VM then we * cannot change its owner. */ if (ppt->vm != NULL && ppt->vm != vm) return (EBUSY); ppt->vm = vm; iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev)); return (0); } return (ENOENT); } int ppt_unassign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; ppt = ppt_find(bus, slot, func); if (ppt != NULL) { /* * If this device is not owned by this 'vm' then bail out. */ if (ppt->vm != vm) return (EBUSY); ppt_unmap_mmio(vm, ppt); ppt_teardown_msi(ppt); ppt_teardown_msix(ppt); iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev)); ppt->vm = NULL; return (0); } return (ENOENT); } int ppt_unassign_all(struct vm *vm) { - int i, bus, slot, func; + struct pptdev *ppt; + int bus, slot, func; device_t dev; - for (i = 0; i < num_pptdevs; i++) { - if (pptdevs[i].vm == vm) { - dev = pptdevs[i].dev; + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm == vm) { + dev = ppt->dev; bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); vm_unassign_pptdev(vm, bus, slot, func); } } return (0); } int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { int i, error; struct vm_memory_segment *seg; struct pptdev *ppt; ppt = ppt_find(bus, slot, func); if (ppt != NULL) { if (ppt->vm != vm) return (EBUSY); for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) { error = vm_map_mmio(vm, gpa, len, hpa); if (error == 0) { seg->gpa = gpa; seg->len = len; } return (error); } } return (ENOSPC); } return (ENOENT); } static int pptintr(void *arg) { struct pptdev *ppt; struct pptintr_arg *pptarg; pptarg = arg; ppt = pptarg->pptdev; if (ppt->vm != NULL) lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data); else { /* * XXX * This is not expected to happen - panic? */ } /* * For legacy interrupts give other filters a chance in case * the interrupt was not generated by the passthrough device. */ if (ppt->msi.startrid == 0) return (FILTER_STRAY); else return (FILTER_HANDLED); } int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { int i, rid, flags; int msi_count, startrid, error, tmp; struct pptdev *ppt; if (numvec < 0 || numvec > MAX_MSIMSGS) return (EINVAL); ppt = ppt_find(bus, slot, func); if (ppt == NULL) return (ENOENT); if (ppt->vm != vm) /* Make sure we own this device */ return (EBUSY); /* Free any allocated resources */ ppt_teardown_msi(ppt); if (numvec == 0) /* nothing more to do */ return (0); flags = RF_ACTIVE; msi_count = pci_msi_count(ppt->dev); if (msi_count == 0) { startrid = 0; /* legacy interrupt */ msi_count = 1; flags |= RF_SHAREABLE; } else startrid = 1; /* MSI */ /* * The device must be capable of supporting the number of vectors * the guest wants to allocate. */ if (numvec > msi_count) return (EINVAL); /* * Make sure that we can allocate all the MSI vectors that are needed * by the guest. */ if (startrid == 1) { tmp = numvec; error = pci_alloc_msi(ppt->dev, &tmp); if (error) return (error); else if (tmp != numvec) { pci_release_msi(ppt->dev); return (ENOSPC); } else { /* success */ } } ppt->msi.startrid = startrid; /* * Allocate the irq resource and attach it to the interrupt handler. 
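 *
 * Illustrative example (hypothetical values): with an MSI-capable
 * device and numvec = 4, startrid is 1, so vector 'i' is allocated
 * IRQ resource rid '1 + i' and programmed with message data
 * 'msg + i'.  If msi_count is 0 the single legacy INTx (rid 0) is
 * used instead and must be shareable, which is why pptintr() returns
 * FILTER_STRAY in that case.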
*/ for (i = 0; i < numvec; i++) { ppt->msi.num_msgs = i + 1; ppt->msi.cookie[i] = NULL; rid = startrid + i; ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, &rid, flags); if (ppt->msi.res[i] == NULL) break; ppt->msi.arg[i].pptdev = ppt; ppt->msi.arg[i].addr = addr; ppt->msi.arg[i].msg_data = msg + i; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, pptintr, NULL, &ppt->msi.arg[i], &ppt->msi.cookie[i]); if (error != 0) break; } if (i < numvec) { ppt_teardown_msi(ppt); return (ENXIO); } return (0); } int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct pptdev *ppt; struct pci_devinfo *dinfo; int numvec, alloced, rid, error; size_t res_size, cookie_size, arg_size; ppt = ppt_find(bus, slot, func); if (ppt == NULL) return (ENOENT); if (ppt->vm != vm) /* Make sure we own this device */ return (EBUSY); dinfo = device_get_ivars(ppt->dev); if (!dinfo) return (ENXIO); /* * First-time configuration: * Allocate the MSI-X table * Allocate the IRQ resources * Set up some variables in ppt->msix */ if (ppt->msix.num_msgs == 0) { numvec = pci_msix_count(ppt->dev); if (numvec <= 0) return (EINVAL); ppt->msix.startrid = 1; ppt->msix.num_msgs = numvec; res_size = numvec * sizeof(ppt->msix.res[0]); cookie_size = numvec * sizeof(ppt->msix.cookie[0]); arg_size = numvec * sizeof(ppt->msix.arg[0]); ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO); ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK | M_ZERO); ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO); rid = dinfo->cfg.msix.msix_table_bar; ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (ppt->msix.msix_table_res == NULL) { ppt_teardown_msix(ppt); return (ENOSPC); } ppt->msix.msix_table_rid = rid; alloced = numvec; error = pci_alloc_msix(ppt->dev, &alloced); if (error || alloced != numvec) { ppt_teardown_msix(ppt); return (error == 0 ? ENOSPC: error); } } if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* Tear down the IRQ if it's already set up */ ppt_teardown_msix_intr(ppt, idx); /* Allocate the IRQ resource */ ppt->msix.cookie[idx] = NULL; rid = ppt->msix.startrid + idx; ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (ppt->msix.res[idx] == NULL) return (ENXIO); ppt->msix.arg[idx].pptdev = ppt; ppt->msix.arg[idx].addr = addr; ppt->msix.arg[idx].msg_data = msg; /* Setup the MSI-X interrupt */ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], INTR_TYPE_NET | INTR_MPSAFE, pptintr, NULL, &ppt->msix.arg[idx], &ppt->msix.cookie[idx]); if (error != 0) { bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]); bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]); ppt->msix.cookie[idx] = NULL; ppt->msix.res[idx] = NULL; return (ENXIO); } } else { /* Masked, tear it down if it's already been set up */ ppt_teardown_msix_intr(ppt, idx); } return (0); } Index: stable/10/sys/amd64/vmm/io/vatpic.c =================================================================== --- stable/10/sys/amd64/vmm/io/vatpic.c (revision 284898) +++ stable/10/sys/amd64/vmm/io/vatpic.c (revision 284899) @@ -1,808 +1,809 @@ /*- * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vioapic.h" #include "vatpic.h" static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); #define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) #define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) #define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) enum irqstate { IRQSTATE_ASSERT, IRQSTATE_DEASSERT, IRQSTATE_PULSE }; struct atpic { bool ready; int icw_num; int rd_cmd_reg; bool aeoi; bool poll; bool rotate; bool sfn; /* special fully-nested mode */ int irq_base; uint8_t request; /* Interrupt Request Register (IIR) */ uint8_t service; /* Interrupt Service (ISR) */ uint8_t mask; /* Interrupt Mask Register (IMR) */ uint8_t smm; /* special mask mode */ int acnt[8]; /* sum of pin asserts and deasserts */ int lowprio; /* lowest priority irq */ bool intr_raised; }; struct vatpic { struct vm *vm; struct mtx mtx; struct atpic atpic[2]; uint8_t elc[2]; }; #define VATPIC_CTR0(vatpic, fmt) \ VM_CTR0((vatpic)->vm, fmt) #define VATPIC_CTR1(vatpic, fmt, a1) \ VM_CTR1((vatpic)->vm, fmt, a1) #define VATPIC_CTR2(vatpic, fmt, a1, a2) \ VM_CTR2((vatpic)->vm, fmt, a1, a2) #define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) #define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) /* * Loop over all the pins in priority order from highest to lowest. */ #define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ tmpvar < 8; \ tmpvar++, pinvar = (pinvar + 1) & 0x7) static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); static __inline bool master_atpic(struct vatpic *vatpic, struct atpic *atpic) { if (atpic == &vatpic->atpic[0]) return (true); else return (false); } static __inline int vatpic_get_highest_isrpin(struct atpic *atpic) { int bit, pin; int i; ATPIC_PIN_FOREACH(pin, atpic, i) { bit = (1 << pin); if (atpic->service & bit) { /* * An IS bit that is masked by an IMR bit will not be * cleared by a non-specific EOI in Special Mask Mode. 
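 *
 * Worked example (hypothetical state, assuming the default priority
 * rotation with pin 0 highest): with smm set, service = 0x09 (pins 0
 * and 3 in service) and mask = 0x01, pin 0 is skipped below, so a
 * non-specific EOI clears the IS bit for pin 3 while pin 0 stays in
 * service.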
*/ if (atpic->smm && (atpic->mask & bit) != 0) continue; else return (pin); } } return (-1); } static __inline int vatpic_get_highest_irrpin(struct atpic *atpic) { int serviced; int bit, pin, tmp; /* * In 'Special Fully-Nested Mode' when an interrupt request from * a slave is in service, the slave is not locked out from the * master's priority logic. */ serviced = atpic->service; if (atpic->sfn) serviced &= ~(1 << 2); /* * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits * further interrupts at that level and enables interrupts from all * other levels that are not masked. In other words the ISR has no * bearing on the levels that can generate interrupts. */ if (atpic->smm) serviced = 0; ATPIC_PIN_FOREACH(pin, atpic, tmp) { bit = 1 << pin; /* * If there is already an interrupt in service at the same * or higher priority then bail. */ if ((serviced & bit) != 0) break; /* * If an interrupt is asserted and not masked then return * the corresponding 'pin' to the caller. */ if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) return (pin); } return (-1); } static void vatpic_notify_intr(struct vatpic *vatpic) { struct atpic *atpic; int pin; KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); /* * First check the slave. */ atpic = &vatpic->atpic[1]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * Cascade the request from the slave to the master. */ atpic->intr_raised = true; vatpic_set_pinstate(vatpic, 2, true); vatpic_set_pinstate(vatpic, 2, false); } else { VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } /* * Then check the master. */ atpic = &vatpic->atpic[0]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic master notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * From Section 3.6.2, "Interrupt Modes", in the * MPtable Specification, Version 1.4 * * PIC interrupts are routed to both the Local APIC * and the I/O APIC to support operation in 1 of 3 * modes. * * 1. Legacy PIC Mode: the PIC effectively bypasses * all APIC components. In this mode the local APIC is * disabled and LINT0 is reconfigured as INTR to * deliver the PIC interrupt directly to the CPU. * * 2. Virtual Wire Mode: the APIC is treated as a * virtual wire which delivers interrupts from the PIC * to the CPU. In this mode LINT0 is programmed as * ExtINT to indicate that the PIC is the source of * the interrupt. * * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are * fielded by the I/O APIC and delivered to the appropriate * CPU. In this mode the I/O APIC input 0 is programmed * as ExtINT to indicate that the PIC is the source of the * interrupt. 
*/ atpic->intr_raised = true; lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); vioapic_pulse_irq(vatpic->vm, 0); } else { VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } } static int vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); atpic->ready = false; atpic->icw_num = 1; + atpic->request = 0; atpic->mask = 0; atpic->lowprio = 7; atpic->rd_cmd_reg = 0; atpic->poll = 0; atpic->smm = 0; if ((val & ICW1_SNGL) != 0) { VATPIC_CTR0(vatpic, "vatpic cascade mode required"); return (-1); } if ((val & ICW1_IC4) == 0) { VATPIC_CTR0(vatpic, "vatpic icw4 required"); return (-1); } atpic->icw_num++; return (0); } static int vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); atpic->irq_base = val & 0xf8; atpic->icw_num++; return (0); } static int vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); atpic->icw_num++; return (0); } static int vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); if ((val & ICW4_8086) == 0) { VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); return (-1); } if ((val & ICW4_AEOI) != 0) atpic->aeoi = true; if ((val & ICW4_SFNM) != 0) { if (master_atpic(vatpic, atpic)) { atpic->sfn = true; } else { VATPIC_CTR1(vatpic, "Ignoring special fully nested " "mode on slave atpic: %#x", val); } } atpic->icw_num = 0; atpic->ready = true; return (0); } static int vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); atpic->mask = val & 0xff; return (0); } static int vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); atpic->rotate = ((val & OCW2_R) != 0); if ((val & OCW2_EOI) != 0) { int isr_bit; if ((val & OCW2_SL) != 0) { /* specific EOI */ isr_bit = val & 0x7; } else { /* non-specific EOI */ isr_bit = vatpic_get_highest_isrpin(atpic); } if (isr_bit != -1) { atpic->service &= ~(1 << isr_bit); if (atpic->rotate) atpic->lowprio = isr_bit; } } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { /* specific priority */ atpic->lowprio = val & 0x7; } return (0); } static int vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); if (val & OCW3_ESMM) { atpic->smm = val & OCW3_SMM ? 1 : 0; VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", master_atpic(vatpic, atpic) ? "master" : "slave", atpic->smm ? 
"enabled" : "disabled"); } if (val & OCW3_RR) { /* read register command */ atpic->rd_cmd_reg = val & OCW3_RIS; /* Polling mode */ atpic->poll = ((val & OCW3_P) != 0); } return (0); } static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) { struct atpic *atpic; int oldcnt, newcnt; bool level; KASSERT(pin >= 0 && pin < 16, ("vatpic_set_pinstate: invalid pin number %d", pin)); KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_set_pinstate: vatpic is not locked")); atpic = &vatpic->atpic[pin >> 3]; oldcnt = atpic->acnt[pin & 0x7]; if (newstate) atpic->acnt[pin & 0x7]++; else atpic->acnt[pin & 0x7]--; newcnt = atpic->acnt[pin & 0x7]; if (newcnt < 0) { VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); } level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { /* rising edge or level */ VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); atpic->request |= (1 << (pin & 0x7)); } else if (oldcnt == 1 && newcnt == 0) { /* falling edge */ VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); if (level) atpic->request &= ~(1 << (pin & 0x7)); } else { VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", pin, newstate ? "asserted" : "deasserted", newcnt); } vatpic_notify_intr(vatpic); } static int vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) { struct vatpic *vatpic; struct atpic *atpic; if (irq < 0 || irq > 15) return (EINVAL); vatpic = vm_atpic(vm); atpic = &vatpic->atpic[irq >> 3]; if (atpic->ready == false) return (0); VATPIC_LOCK(vatpic); switch (irqstate) { case IRQSTATE_ASSERT: vatpic_set_pinstate(vatpic, irq, true); break; case IRQSTATE_DEASSERT: vatpic_set_pinstate(vatpic, irq, false); break; case IRQSTATE_PULSE: vatpic_set_pinstate(vatpic, irq, true); vatpic_set_pinstate(vatpic, irq, false); break; default: panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); } VATPIC_UNLOCK(vatpic); return (0); } int vatpic_assert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); } int vatpic_deassert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); } int vatpic_pulse_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) { struct vatpic *vatpic; if (irq < 0 || irq > 15) return (EINVAL); /* * See comment in vatpic_elc_handler. These IRQs must be * edge triggered. */ if (trigger == LEVEL_TRIGGER) { switch (irq) { case 0: case 1: case 2: case 8: case 13: return (EINVAL); } } vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); if (trigger == LEVEL_TRIGGER) vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); else vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); VATPIC_UNLOCK(vatpic); return (0); } void vatpic_pending_intr(struct vm *vm, int *vecptr) { struct vatpic *vatpic; struct atpic *atpic; int pin; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; VATPIC_LOCK(vatpic); pin = vatpic_get_highest_irrpin(atpic); if (pin == 2) { atpic = &vatpic->atpic[1]; pin = vatpic_get_highest_irrpin(atpic); } /* * If there are no pins active at this moment then return the spurious * interrupt vector instead. 
*/ if (pin == -1) pin = 7; KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); *vecptr = atpic->irq_base + pin; VATPIC_UNLOCK(vatpic); } static void vatpic_pin_accepted(struct atpic *atpic, int pin) { atpic->intr_raised = false; if (atpic->acnt[pin] == 0) atpic->request &= ~(1 << pin); if (atpic->aeoi == true) { if (atpic->rotate == true) atpic->lowprio = pin; } else { atpic->service |= (1 << pin); } } void vatpic_intr_accepted(struct vm *vm, int vector) { struct vatpic *vatpic; int pin; vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); pin = vector & 0x7; if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { vatpic_pin_accepted(&vatpic->atpic[1], pin); /* * If this vector originated from the slave, * accept the cascaded interrupt too. */ vatpic_pin_accepted(&vatpic->atpic[0], 2); } else { vatpic_pin_accepted(&vatpic->atpic[0], pin); } vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); } static int vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int pin; VATPIC_LOCK(vatpic); if (atpic->poll) { atpic->poll = 0; pin = vatpic_get_highest_irrpin(atpic); if (pin >= 0) { vatpic_pin_accepted(atpic, pin); *eax = 0x80 | pin; } else { *eax = 0; } } else { if (port & ICU_IMR_OFFSET) { /* read interrrupt mask register */ *eax = atpic->mask; } else { if (atpic->rd_cmd_reg == OCW3_RIS) { /* read interrupt service register */ *eax = atpic->service; } else { /* read interrupt request register */ *eax = atpic->request; } } } VATPIC_UNLOCK(vatpic); return (0); } static int vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int error; uint8_t val; error = 0; val = *eax; VATPIC_LOCK(vatpic); if (port & ICU_IMR_OFFSET) { switch (atpic->icw_num) { case 2: error = vatpic_icw2(vatpic, atpic, val); break; case 3: error = vatpic_icw3(vatpic, atpic, val); break; case 4: error = vatpic_icw4(vatpic, atpic, val); break; default: error = vatpic_ocw1(vatpic, atpic, val); break; } } else { if (val & (1 << 4)) error = vatpic_icw1(vatpic, atpic, val); if (atpic->ready) { if (val & (1 << 3)) error = vatpic_ocw3(vatpic, atpic, val); else error = vatpic_ocw2(vatpic, atpic, val); } } if (atpic->ready) vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); return (error); } int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[1]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; bool is_master; vatpic = vm_atpic(vm); is_master = (port == IO_ELCR1); if (bytes != 1) return (-1); VATPIC_LOCK(vatpic); if (in) { if (is_master) *eax = vatpic->elc[0]; else *eax = vatpic->elc[1]; } else { /* * For the master PIC the cascade channel (IRQ2), the * heart beat timer (IRQ0), and the keyboard * controller (IRQ1) cannot be programmed for level * mode. 
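 *
 * This is why the master write below is masked with 0xf8, which
 * forces bits 0-2 (IRQ0-IRQ2) to stay edge triggered.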
* * For the slave PIC the real time clock (IRQ8) and * the floating point error interrupt (IRQ13) cannot * be programmed for level mode. */ if (is_master) vatpic->elc[0] = (*eax & 0xf8); else vatpic->elc[1] = (*eax & 0xde); } VATPIC_UNLOCK(vatpic); return (0); } struct vatpic * vatpic_init(struct vm *vm) { struct vatpic *vatpic; vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); vatpic->vm = vm; mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); return (vatpic); } void vatpic_cleanup(struct vatpic *vatpic) { free(vatpic, M_VATPIC); } Index: stable/10/sys/amd64/vmm/io/vrtc.c =================================================================== --- stable/10/sys/amd64/vmm/io/vrtc.c (revision 284898) +++ stable/10/sys/amd64/vmm/io/vrtc.c (revision 284899) @@ -1,952 +1,1009 @@ /*- * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ktr.h" #include "vatpic.h" #include "vioapic.h" #include "vrtc.h" /* Register layout of the RTC */ struct rtcdev { uint8_t sec; uint8_t alarm_sec; uint8_t min; uint8_t alarm_min; uint8_t hour; uint8_t alarm_hour; uint8_t day_of_week; uint8_t day_of_month; uint8_t month; uint8_t year; uint8_t reg_a; uint8_t reg_b; uint8_t reg_c; uint8_t reg_d; - uint8_t nvram[128 - 14]; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; } __packed; CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); struct vrtc { struct vm *vm; struct mtx mtx; struct callout callout; u_int addr; /* RTC register to read or write */ sbintime_t base_uptime; time_t base_rtctime; struct rtcdev rtcdev; }; #define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) #define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) #define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) /* * RTC time is considered "broken" if: * - RTC updates are halted by the guest * - RTC date/time fields have invalid values */ #define VRTC_BROKEN_TIME ((time_t)-1) #define RTC_IRQ 8 #define RTCSB_BIN 0x04 #define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) #define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) #define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) #define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) #define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) static void vrtc_callout_handler(void *arg); static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL); static int rtc_flag_broken_time = 1; SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); static __inline bool divider_enabled(int reg_a) { /* * The RTC is counting only when dividers are not held in reset. */ return ((reg_a & 0x70) == 0x20); } static __inline bool update_enabled(struct vrtc *vrtc) { /* * RTC date/time can be updated only if: * - divider is not held in reset * - guest has not disabled updates * - the date/time fields have valid contents */ if (!divider_enabled(vrtc->rtcdev.reg_a)) return (false); if (rtc_halted(vrtc)) return (false); if (vrtc->base_rtctime == VRTC_BROKEN_TIME) return (false); return (true); } static time_t vrtc_curtime(struct vrtc *vrtc) { sbintime_t now, delta; time_t t; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); t = vrtc->base_rtctime; if (update_enabled(vrtc)) { now = sbinuptime(); delta = now - vrtc->base_uptime; KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " "%#lx to %#lx", vrtc->base_uptime, now)); t += delta / SBT_1S; } return (t); } static __inline uint8_t rtcset(struct rtcdev *rtc, int val) { KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", __func__, val)); return ((rtc->reg_b & RTCSB_BIN) ? 
val : bin2bcd_data[val]); } static void secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; int hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (rtctime < 0) { KASSERT(rtctime == VRTC_BROKEN_TIME, ("%s: invalid vrtc time %#lx", __func__, rtctime)); return; } /* * If the RTC is halted then the guest has "ownership" of the * date/time fields. Don't update the RTC date/time fields in * this case (unless forced). */ if (rtc_halted(vrtc) && !force_update) return; ts.tv_sec = rtctime; ts.tv_nsec = 0; clock_ts_to_ct(&ts, &ct); KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", ct.sec)); KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", ct.min)); KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", ct.hour)); KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", ct.dow)); KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", ct.day)); KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", ct.mon)); KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", ct.year)); rtc = &vrtc->rtcdev; rtc->sec = rtcset(rtc, ct.sec); rtc->min = rtcset(rtc, ct.min); - hour = ct.hour; - if ((rtc->reg_b & RTCSB_24HR) == 0) - hour = (hour % 12) + 1; /* convert to a 12-hour format */ + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. + */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } rtc->hour = rtcset(rtc, hour); if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) rtc->hour |= 0x80; /* set MSB to indicate PM */ rtc->day_of_week = rtcset(rtc, ct.dow + 1); rtc->day_of_month = rtcset(rtc, ct.day); rtc->month = rtcset(rtc, ct.mon); rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); } static int rtcget(struct rtcdev *rtc, int val, int *retval) { uint8_t upper, lower; if (rtc->reg_b & RTCSB_BIN) { *retval = val; return (0); } lower = val & 0xf; upper = (val >> 4) & 0xf; if (lower > 9 || upper > 9) return (-1); *retval = upper * 10 + lower; return (0); } static time_t rtc_to_secs(struct vrtc *vrtc) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; struct vm *vm; - int error, hour, pm, year; + int century, error, hour, pm, year; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); vm = vrtc->vm; rtc = &vrtc->rtcdev; bzero(&ct, sizeof(struct clocktime)); error = rtcget(rtc, rtc->sec, &ct.sec); if (error || ct.sec < 0 || ct.sec > 59) { VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); goto fail; } error = rtcget(rtc, rtc->min, &ct.min); if (error || ct.min < 0 || ct.min > 59) { VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); goto fail; } pm = 0; hour = rtc->hour; if ((rtc->reg_b & RTCSB_24HR) == 0) { if (hour & 0x80) { hour &= ~0x80; pm = 1; } } error = rtcget(rtc, hour, &ct.hour); if ((rtc->reg_b & RTCSB_24HR) == 0) { - ct.hour -= 1; - if (pm) - ct.hour += 12; + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { + VM_CTR2(vm, "Invalid RTC 12-hour format 
%#x/%d", + rtc->hour, ct.hour); + goto fail; + } } if (error || ct.hour < 0 || ct.hour > 23) { VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); goto fail; } /* * Ignore 'rtc->dow' because some guests like Linux don't bother * setting it at all while others like OpenBSD/i386 set it incorrectly. * * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. */ ct.dow = -1; error = rtcget(rtc, rtc->day_of_month, &ct.day); if (error || ct.day < 1 || ct.day > 31) { VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, ct.day); goto fail; } error = rtcget(rtc, rtc->month, &ct.mon); if (error || ct.mon < 1 || ct.mon > 12) { VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); goto fail; } error = rtcget(rtc, rtc->year, &year); if (error || year < 0 || year > 99) { VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); goto fail; } - if (year >= 70) - ct.year = 1900 + year; - else - ct.year = 2000 + year; + error = rtcget(rtc, rtc->century, ¢ury); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); + goto fail; + } + error = clock_ct_to_ts(&ct, &ts); if (error || ts.tv_sec < 0) { VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", ct.year, ct.mon, ct.day); VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", ct.hour, ct.min, ct.sec); goto fail; } return (ts.tv_sec); /* success */ fail: - return (VRTC_BROKEN_TIME); /* failure */ + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); + return (VRTC_BROKEN_TIME); } static int vrtc_time_update(struct vrtc *vrtc, time_t newtime) { struct rtcdev *rtc; time_t oldtime; uint8_t alarm_sec, alarm_min, alarm_hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; alarm_sec = rtc->alarm_sec; alarm_min = rtc->alarm_min; alarm_hour = rtc->alarm_hour; oldtime = vrtc->base_rtctime; VM_CTR2(vrtc->vm, "Updating RTC time from %#lx to %#lx", oldtime, newtime); if (newtime == oldtime) return (0); /* * If 'newtime' indicates that RTC updates are disabled then just * record that and return. There is no need to do alarm interrupt * processing or update 'base_uptime' in this case. */ if (newtime == VRTC_BROKEN_TIME) { vrtc->base_rtctime = VRTC_BROKEN_TIME; return (0); } /* * Return an error if RTC updates are halted by the guest. */ if (rtc_halted(vrtc)) { VM_CTR0(vrtc->vm, "RTC update halted by guest"); return (EBUSY); } do { /* * If the alarm interrupt is enabled and 'oldtime' is valid * then visit all the seconds between 'oldtime' and 'newtime' * to check for the alarm condition. * * Otherwise move the RTC time forward directly to 'newtime'. */ if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) vrtc->base_rtctime++; else vrtc->base_rtctime = newtime; if (aintr_enabled(vrtc)) { /* * Update the RTC date/time fields before checking * if the alarm conditions are satisfied. 
*/ secs_to_rtc(vrtc->base_rtctime, vrtc, 0); if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && (alarm_min >= 0xC0 || alarm_min == rtc->min) && (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); } } } while (vrtc->base_rtctime != newtime); if (uintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); vrtc->base_uptime = sbinuptime(); return (0); } static sbintime_t vrtc_freq(struct vrtc *vrtc) { int ratesel; static sbintime_t pf[16] = { 0, SBT_1S / 256, SBT_1S / 128, SBT_1S / 8192, SBT_1S / 4096, SBT_1S / 2048, SBT_1S / 1024, SBT_1S / 512, SBT_1S / 256, SBT_1S / 128, SBT_1S / 64, SBT_1S / 32, SBT_1S / 16, SBT_1S / 8, SBT_1S / 4, SBT_1S / 2, }; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); /* * If both periodic and alarm interrupts are enabled then use the * periodic frequency to drive the callout. The minimum periodic * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so * piggyback the alarm on top of it. The same argument applies to * the update interrupt. */ if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { ratesel = vrtc->rtcdev.reg_a & 0xf; return (pf[ratesel]); } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else { return (0); } } static void vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) { KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (freqsbt == 0) { if (callout_active(&vrtc->callout)) { VM_CTR0(vrtc->vm, "RTC callout stopped"); callout_stop(&vrtc->callout); } return; } VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, vrtc, 0); } static void vrtc_callout_handler(void *arg) { struct vrtc *vrtc = arg; sbintime_t freqsbt; time_t rtctime; int error; VM_CTR0(vrtc->vm, "vrtc callout fired"); VRTC_LOCK(vrtc); if (callout_pending(&vrtc->callout)) /* callout was reset */ goto done; if (!callout_active(&vrtc->callout)) /* callout was stopped */ goto done; callout_deactivate(&vrtc->callout); KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, ("gratuitous vrtc callout")); if (pintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { rtctime = vrtc_curtime(vrtc); error = vrtc_time_update(vrtc, rtctime); KASSERT(error == 0, ("%s: vrtc_time_update error %d", __func__, error)); } freqsbt = vrtc_freq(vrtc); KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); vrtc_callout_reset(vrtc, freqsbt); done: VRTC_UNLOCK(vrtc); } static __inline void vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) { int active; active = callout_active(&vrtc->callout) ? 1 : 0; KASSERT((freq == 0 && !active) || (freq != 0 && active), ("vrtc callout %s with frequency %#lx", active ? 
"active" : "inactive", freq)); } static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; int oldirqf, newirqf; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; oldirqf = rtc->reg_c & RTCIR_INT; if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { newirqf = RTCIR_INT; } else { newirqf = 0; } oldval = rtc->reg_c; rtc->reg_c = newirqf | newval; changed = oldval ^ rtc->reg_c; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", oldval, rtc->reg_c); } if (!oldirqf && newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); vatpic_pulse_irq(vrtc->vm, RTC_IRQ); vioapic_pulse_irq(vrtc->vm, RTC_IRQ); } else if (oldirqf && !newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); } } static int vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; sbintime_t oldfreq, newfreq; time_t curtime, rtctime; int error; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; oldval = rtc->reg_b; oldfreq = vrtc_freq(vrtc); rtc->reg_b = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", oldval, newval); } if (changed & RTCSB_HALT) { if ((newval & RTCSB_HALT) == 0) { rtctime = rtc_to_secs(vrtc); if (rtctime == VRTC_BROKEN_TIME) { - /* - * Stop updating the RTC if the date/time - * programmed by the guest is not correct. - */ - VM_CTR0(vrtc->vm, "Invalid RTC date/time " - "programming detected"); - if (rtc_flag_broken_time) return (-1); } } else { curtime = vrtc_curtime(vrtc); KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " "between vrtc basetime (%#lx) and curtime (%#lx)", __func__, vrtc->base_rtctime, curtime)); /* * Force a refresh of the RTC date/time fields so * they reflect the time right before the guest set * the HALT bit. */ secs_to_rtc(curtime, vrtc, 1); /* * Updates are halted so mark 'base_rtctime' to denote * that the RTC date/time is in flux. */ rtctime = VRTC_BROKEN_TIME; rtc->reg_b &= ~RTCSB_UINTR; } error = vrtc_time_update(vrtc, rtctime); KASSERT(error == 0, ("vrtc_time_update error %d", error)); } /* * Side effect of changes to the interrupt enable bits. */ if (changed & RTCSB_ALL_INTRS) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); /* * Change the callout frequency if it has changed. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); /* * The side effect of bits that control the RTC date/time format * is handled lazily when those fields are actually read. */ return (0); } static void vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) { sbintime_t oldfreq, newfreq; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); newval &= ~RTCSA_TUP; oldval = vrtc->rtcdev.reg_a; oldfreq = vrtc_freq(vrtc); if (divider_enabled(oldval) && !divider_enabled(newval)) { VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else if (!divider_enabled(oldval) && divider_enabled(newval)) { /* * If the dividers are coming out of reset then update * 'base_uptime' before this happens. This is done to * maintain the illusion that the RTC date/time was frozen * while the dividers were disabled. 
*/ vrtc->base_uptime = sbinuptime(); VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else { /* NOTHING */ } vrtc->rtcdev.reg_a = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", oldval, newval); } /* * Side effect of changes to rate select and divider enable bits. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); } int vrtc_set_time(struct vm *vm, time_t secs) { struct vrtc *vrtc; int error; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); error = vrtc_time_update(vrtc, secs); VRTC_UNLOCK(vrtc); if (error) { VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, secs); } else { VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); } return (error); } time_t vrtc_get_time(struct vm *vm) { struct vrtc *vrtc; time_t t; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); t = vrtc_curtime(vrtc); VRTC_UNLOCK(vrtc); return (t); } int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) { struct vrtc *vrtc; uint8_t *ptr; vrtc = vm_rtc(vm); /* * Don't allow writes to RTC control registers or the date/time fields. */ if (offset < offsetof(struct rtcdev, nvram[0]) || - offset >= sizeof(struct rtcdev)) { + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", offset); return (EINVAL); } VRTC_LOCK(vrtc); ptr = (uint8_t *)(&vrtc->rtcdev); ptr[offset] = value; VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); VRTC_UNLOCK(vrtc); return (0); } int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) { struct vrtc *vrtc; time_t curtime; uint8_t *ptr; /* * Allow all offsets in the RTC to be read. */ if (offset < 0 || offset >= sizeof(struct rtcdev)) return (EINVAL); vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); /* * Update RTC date/time fields if necessary. */ - if (offset < 10) { + if (offset < 10 || offset == RTC_CENTURY) { curtime = vrtc_curtime(vrtc); secs_to_rtc(curtime, vrtc, 0); } ptr = (uint8_t *)(&vrtc->rtcdev); *retval = ptr[offset]; VRTC_UNLOCK(vrtc); return (0); } int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; vrtc = vm_rtc(vm); if (bytes != 1) return (-1); if (in) { *val = 0xff; return (0); } VRTC_LOCK(vrtc); vrtc->addr = *val & 0x7f; VRTC_UNLOCK(vrtc); return (0); } int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; struct rtcdev *rtc; time_t curtime; int error, offset; vrtc = vm_rtc(vm); rtc = &vrtc->rtcdev; if (bytes != 1) return (-1); VRTC_LOCK(vrtc); offset = vrtc->addr; if (offset >= sizeof(struct rtcdev)) { VRTC_UNLOCK(vrtc); return (-1); } error = 0; curtime = vrtc_curtime(vrtc); vrtc_time_update(vrtc, curtime); - if (in) { - /* - * Update RTC date/time fields if necessary. - */ - if (offset < 10) - secs_to_rtc(curtime, vrtc, 0); + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. + */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + if (in) { if (offset == 12) { /* * XXX * reg_c interrupt flags are updated only if the * corresponding interrupt enable bit in reg_b is set. 
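*
* As on real hardware, reading register C returns the latched
* interrupt flags and clears them, which is what the call to
* vrtc_set_reg_c(vrtc, 0) below implements.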
*/ *val = vrtc->rtcdev.reg_c; vrtc_set_reg_c(vrtc, 0); } else { *val = *((uint8_t *)rtc + offset); } VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", *val, offset); } else { switch (offset) { case 10: VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); vrtc_set_reg_a(vrtc, *val); break; case 11: VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); error = vrtc_set_reg_b(vrtc, *val); break; case 12: VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", *val); break; case 13: VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", *val); break; case 0: /* * High order bit of 'seconds' is readonly. */ *val &= 0x7f; /* FALLTHRU */ default: VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", offset, *val); *((uint8_t *)rtc + offset) = *val; break; + } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; } } VRTC_UNLOCK(vrtc); return (error); } void vrtc_reset(struct vrtc *vrtc) { struct rtcdev *rtc; VRTC_LOCK(vrtc); rtc = &vrtc->rtcdev; vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); vrtc_set_reg_c(vrtc, 0); KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); VRTC_UNLOCK(vrtc); } struct vrtc * vrtc_init(struct vm *vm) { struct vrtc *vrtc; struct rtcdev *rtc; time_t curtime; vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); vrtc->vm = vm; mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); callout_init(&vrtc->callout, 1); /* Allow dividers to keep time but disable everything else */ rtc = &vrtc->rtcdev; rtc->reg_a = 0x20; rtc->reg_b = RTCSB_24HR; rtc->reg_c = 0; rtc->reg_d = RTCSD_PWR; /* Reset the index register to a safe value. */ vrtc->addr = RTC_STATUSD; /* * Initialize RTC time to 00:00:00 Jan 1, 1970. */ curtime = 0; VRTC_LOCK(vrtc); vrtc->base_rtctime = VRTC_BROKEN_TIME; vrtc_time_update(vrtc, curtime); secs_to_rtc(curtime, vrtc, 0); VRTC_UNLOCK(vrtc); return (vrtc); } void vrtc_cleanup(struct vrtc *vrtc) { callout_drain(&vrtc->callout); free(vrtc, M_VRTC); } Index: stable/10/sys/amd64/vmm/vmm.c =================================================================== --- stable/10/sys/amd64/vmm/vmm.c (revision 284898) +++ stable/10/sys/amd64/vmm/vmm.c (revision 284899) @@ -1,2422 +1,2428 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_ipi.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int hostcpu; /* (o) vcpu's host cpu */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { vm_paddr_t gpa; size_t len; boolean_t wired; vm_object_t object; }; #define VM_MAX_MEMORY_SEGMENTS 2 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ void 
*rendezvous_arg; /* (x) rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ int num_mem_segs; /* (o) guest memory segments */ struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS]; struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ }; static int vmm_initialized; static struct vmm_ops *ops; #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) #define VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMSPACE_ALLOC(min, max) \ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) #define VMSPACE_FREE(vmspace) \ (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) #define VMGETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMSETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) #define VLAPIC_INIT(vmi, vcpu) \ (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) #define VLAPIC_CLEANUP(vmi, vlapic) \ (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. 
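*
* This behavior can be disabled by setting the loader tunable
* 'hw.vmm.halt_detection' below to 0.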
*/ static int halt_detection_enabled = 1; TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled); SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); +static int vmm_force_iommu = 0; +TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu); +SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0, + "Force use of I/O MMU even if no passthrough devices were found."); + static void vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } } static void vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = &vm->vcpu[vcpu_id]; if (create) { KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " "initialized", vcpu_id)); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); } vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vm *vm, int vcpuid) { return (trace_guest_exceptions); } struct vm_exit * vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; if (cpuid < 0 || cpuid >= VM_MAXCPU) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; return (&vcpu->exitinfo); } static void vmm_resume(void) { VMM_RESUME(); } static int vmm_init(void) { int error; vmm_host_state_init(); vmm_ipinum = vmm_ipi_alloc(); if (vmm_ipinum == 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_amd()) ops = &vmm_ops_amd; else return (ENXIO); vmm_resume_p = vmm_resume; return (VMM_INIT(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: vmmdev_init(); - if (ppt_avail_devices() > 0) + if (vmm_force_iommu || ppt_avail_devices() > 0) iommu_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) vmm_ipi_free(vmm_ipinum); error = VMM_CLEANUP(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - iommu initialization must happen after the pci passthru driver has had * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). 
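*
* Both constraints are satisfied by declaring the module at
* SI_SUB_SMP + 1 below, since SI_SUB_SMP orders after
* SI_SUB_CONFIGURE.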
*/ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); for (i = 0; i < VM_MAXCPU; i++) vcpu_init(vm, i, create); } int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->num_mem_segs = 0; vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm_init(vm, true); *retvm = vm; return (0); } static void vm_free_mem_seg(struct vm *vm, struct mem_seg *seg) { if (seg->object != NULL) vmm_mem_free(vm->vmspace, seg->gpa, seg->len); bzero(seg, sizeof(*seg)); } static void vm_cleanup(struct vm *vm, bool destroy) { int i; ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (i = 0; i < VM_MAXCPU; i++) vcpu_cleanup(vm, i, destroy); VMCLEANUP(vm->cookie); if (destroy) { for (i = 0; i < vm->num_mem_segs; i++) vm_free_mem_seg(vm, &vm->mem_segs[i]); vm->num_mem_segs = 0; VMSPACE_FREE(vm->vmspace); vm->vmspace = NULL; } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa) { int i; vm_paddr_t gpabase, gpalimit; for (i = 0; i < vm->num_mem_segs; i++) { gpabase = vm->mem_segs[i].gpa; gpalimit = gpabase + vm->mem_segs[i].len; if (gpa >= gpabase && gpa < gpalimit) return (TRUE); /* 'gpa' is regular memory */ } if (ppt_is_mmio(vm, gpa)) return (TRUE); /* 'gpa' is pci passthru mmio */ return (FALSE); } int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) { int available, allocated; struct mem_seg *seg; vm_object_t object; vm_paddr_t g; if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) return (EINVAL); available = allocated = 0; g = gpa; while (g < gpa + len) { if (vm_mem_allocated(vm, g)) allocated++; else available++; g += PAGE_SIZE; } /* * If there are some allocated and some available pages in the address * range then it is an error. 
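*
* In other words the request must either fall entirely within
* already allocated memory or not overlap it at all; a partial
* overlap cannot be satisfied with a single new segment.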
*/ if (allocated && available) return (EINVAL); /* * If the entire address range being requested has already been * allocated then there isn't anything more to do. */ if (allocated && available == 0) return (0); if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); seg = &vm->mem_segs[vm->num_mem_segs]; if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL) return (ENOMEM); seg->gpa = gpa; seg->len = len; seg->object = object; seg->wired = FALSE; vm->num_mem_segs++; return (0); } static vm_paddr_t vm_maxmem(struct vm *vm) { int i; vm_paddr_t gpa, maxmem; maxmem = 0; for (i = 0; i < vm->num_mem_segs; i++) { gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len; if (gpa > maxmem) maxmem = gpa; } return (maxmem); } static void vm_gpa_unwire(struct vm *vm) { int i, rv; struct mem_seg *seg; for (i = 0; i < vm->num_mem_segs; i++) { seg = &vm->mem_segs[i]; if (!seg->wired) continue; rv = vm_map_unwire(&vm->vmspace->vm_map, seg->gpa, seg->gpa + seg->len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment " "%#lx/%ld could not be unwired: %d", vm_name(vm), seg->gpa, seg->len, rv)); seg->wired = FALSE; } } static int vm_gpa_wire(struct vm *vm) { int i, rv; struct mem_seg *seg; for (i = 0; i < vm->num_mem_segs; i++) { seg = &vm->mem_segs[i]; if (seg->wired) continue; /* XXX rlimits? */ rv = vm_map_wire(&vm->vmspace->vm_map, seg->gpa, seg->gpa + seg->len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (rv != KERN_SUCCESS) break; seg->wired = TRUE; } if (i < vm->num_mem_segs) { /* * Undo the wiring before returning an error. */ vm_gpa_unwire(vm); return (EAGAIN); } return (0); } static void vm_iommu_modify(struct vm *vm, boolean_t map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_seg *seg; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < vm->num_mem_segs; i++) { seg = &vm->mem_segs[i]; KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired", vm_name(vm), seg->gpa, seg->len)); gpa = seg->gpa; while (gpa < seg->gpa + seg->len) { vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); iommu_remove_mapping(host_domain, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); iommu_create_mapping(host_domain, hpa, hpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) #define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) { vm_iommu_unmap(vm); vm_gpa_unwire(vm); } return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* * Virtual machines with pci passthru devices get special treatment: * - the guest physical memory is wired * - the iommu is programmed to do the 'gpa' to 'hpa' translation * * We need to do this before the first pci passthru device is attached. 
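*
* The 'ppt_assigned_devices(vm) == 0' check below ensures that the
* wiring and IOMMU domain creation happen only once, when the
* first passthru device is assigned to this VM.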
*/ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vm_maxmem(vm); vm->iommu = iommu_create_domain(maxaddr); error = vm_gpa_wire(vm); if (error) return (error); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } void * vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int count, pageoff; vm_page_t m; pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg) { int i; for (i = 0; i < vm->num_mem_segs; i++) { if (gpabase == vm->mem_segs[i].gpa) { seg->gpa = vm->mem_segs[i].gpa; seg->len = vm->mem_segs[i].len; seg->wired = vm->mem_segs[i].wired; return (0); } } return (-1); } int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, vm_offset_t *offset, struct vm_object **object) { int i; size_t seg_len; vm_paddr_t seg_gpa; vm_object_t seg_obj; for (i = 0; i < vm->num_mem_segs; i++) { if ((seg_obj = vm->mem_segs[i].object) == NULL) continue; seg_gpa = vm->mem_segs[i].gpa; seg_len = vm->mem_segs[i].len; if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) { *offset = gpa - seg_gpa; *object = seg_obj; vm_object_reference(seg_obj); return (0); } } return (EINVAL); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMGETREG(vm->cookie, vcpu, reg, retval)); } int vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) { struct vcpu *vcpu; int error; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); error = VMSETREG(vm->cookie, vcpuid, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); vcpu = &vm->vcpu[vcpuid]; vcpu->nextrip = val; return (0); } static boolean_t is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (TRUE); default: return (FALSE); } } static boolean_t is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (TRUE); default: return (FALSE); } } int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMGETDESC(vm->cookie, vcpu, reg, desc)); } int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* 
restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } static void vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) { KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); /* * Update 'rendezvous_func' and execute a write memory barrier to * ensure that it is visible across all host cpus. This is not needed * for correctness but it does ensure that all the vcpus will notice * that the rendezvous is requested immediately. 
*/ vm->rendezvous_func = func; wmb(); } #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ do { \ if (vcpuid >= 0) \ VCPU_CTR0(vm, vcpuid, fmt); \ else \ VM_CTR0(vm, fmt); \ } while (0) static void vm_handle_rendezvous(struct vm *vm, int vcpuid) { KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); if (vcpuid != -1 && CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); vm_set_rendezvous_func(vm, NULL); wakeup(&vm->rendezvous_func); break; } RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", 0); } mtx_unlock(&vm->rendezvous_mtx); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { struct vcpu *vcpu; const char *wmesg; int t, vcpu_halted, vm_halted; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu = &vm->vcpu[vcpuid]; vcpu_halted = 0; vm_halted = 0; vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from VMRUN() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend) break; if (vm_nmi_pending(vm, vcpuid)) break; if (!intr_disabled) { if (vm_extint_pending(vm, vcpuid) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vm, vcpuid)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VCPU_CTR0(vm, vcpuid, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. 
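*
* The 'hz' timeout below bounds each sleep to roughly one second.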
*/ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) { int rv, ftype; struct vm_map *map; struct vcpu *vcpu; struct vm_exit *vme; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? "accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) { struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; - uint64_t gla, gpa; + uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, length; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; + cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { /* * If the instruction length is not known then assume a * maximum size instruction. */ length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE; - error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip, - length, vie); + error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + + cs_base, length, vie); } else { /* * The instruction bytes have already been copied into 'vie' */ error = 0; } if (error == 1) return (0); /* Resume guest to handle page fault */ else if (error == -1) return (EFAULT); else if (error != 0) panic("%s: vmm_fetch_instruction error %d", __func__, error); if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) return (EFAULT); /* * If the instruction length was not specified then update it now * along with 'nextrip'. 
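*
* 'vie->num_processed' is the number of instruction bytes consumed
* by the decoder so it can stand in for the missing 'inst_length'.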
*/ if (vme->inst_length == 0) { vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; } /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) { int i, done; struct vcpu *vcpu; done = 0; vcpu = &vm->vcpu[vcpuid]; CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (1) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); } else { VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); vcpu_unlock(vcpu); vm_handle_rendezvous(vm, vcpuid); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm, i, false); } } *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. 
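*
* Each notified vcpu will exit to vm_handle_suspend() above and
* wait there until the remaining active vcpus have suspended
* themselves.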
*/ for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); } return (0); } void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); } void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); } int vm_run(struct vm *vm, struct vm_run *vmrun) { int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; void *rptr, *sptr; vcpuid = vmrun->cpuid; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); rptr = &vm->rendezvous_func; sptr = &vm->suspend; pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); vcpu_require_state(vm, vcpuid, VCPU_RUNNING); error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, rptr, sptr); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vm, vcpuid, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: vm_handle_rendezvous(vm, vcpuid); error = 0; break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vm, vcpuid, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: vm_inject_ud(vm, vcpuid); break; default: retu = true; /* handled in userland */ break; } } if (error == 0 && retu == false) goto restart; /* copy the exit information */ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); return (error); } int vm_restart_instruction(void *arg, int vcpuid) { struct vm *vm; struct vcpu *vcpu; enum vcpu_state state; uint64_t rip; int error; vm = arg; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; state = vcpu_get_state(vm, vcpuid, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 
'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around VMRUN() and 'nextrip' points to the next instruction. * Thus instruction restart is achieved by setting 'nextrip' * to the vcpu's %rip. */ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) { struct vcpu *vcpu; int type, vector; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. */ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. 
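* For example a #GP raised while delivering a #GP (both
* contributory), or any non-benign fault raised while delivering
* a #PF, is reported to the guest as a #DF.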
*/ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) { struct vcpu *vcpu; uint64_t info1, info2; int valid; KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); vcpu = &vm->vcpu[vcpuid]; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vm, vcpuid, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vcpu *vcpu; int error; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. */ if (vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception. 
*/ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vm, vcpuid); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); return (0); } void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { struct vm *vm; int error, restart_instruction; vm = vmarg; restart_instruction = 1; error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) { struct vm *vm; int error; vm = vmarg; VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->nmi_pending); } void vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_extint_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->extint_pending); } void vm_extint_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMGETCAP(vm->cookie, vcpu, type, retval)); } int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMSETCAP(vm->cookie, vcpu, type, val)); } struct vlapic * vm_lapic(struct vm *vm, int cpu) { return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } boolean_t vmm_is_pptdev(int bus, int 
slot, int func) { int found, i, n; int b, s, f; char *val, *cp, *cp2; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; for (i = 0; names[i] != NULL && !found; i++) { cp = val = getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = 1; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { int error; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) { struct vcpu *vcpu; enum vcpu_state state; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vm *vm, int vcpuid) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->active_cpus)) return (EBUSY); VCPU_CTR0(vm, vcpuid, "activated"); CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); return (0); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } void * vcpu_stats(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid].stats); } int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; return (0); } int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (state >= X2APIC_STATE_LAST) return (EINVAL); vm->vcpu[vcpuid].x2apic_state = state; vlapic_set_x2apic_state(vm, vcpuid, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) { int hostcpu; struct vcpu *vcpu; vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. 
*/ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { int i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpuid' is one * of the targets of the rendezvous. */ RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); vm_handle_rendezvous(vm, vcpuid); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm_set_rendezvous_func(vm, func); mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. */ for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm, i, false); } vm_handle_rendezvous(vm, vcpuid); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); - error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); if (error) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); return (-1); 
} else { return (0); } } void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. Since * these are global stats, only return the values for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vm->vmspace)); } } static void vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); Index: stable/10/sys/amd64/vmm/vmm_dev.c =================================================================== --- stable/10/sys/amd64/vmm/vmm_dev.c (revision 284898) +++ stable/10/sys/amd64/vmm/vmm_dev.c (revision 284899) @@ -1,699 +1,699 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" #include "io/ppt.h" #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vhpet.h" #include "io/vrtc.h" struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; SLIST_ENTRY(vmmdev_softc) link; int flags; }; #define VSC_LINKED 0x01 static SLIST_HEAD(, vmmdev_softc) head; static struct mtx vmmdev_mtx; static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static struct vmmdev_softc * vmmdev_lookup(const char *name) { struct vmmdev_softc *sc; #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa; void *hpa, *cookie; struct vmmdev_softc *sc; static char zerobuf[PAGE_SIZE]; error = 0; sc = vmmdev_lookup2(cdev); if (sc == NULL) error = ENXIO; prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ) error = uiomove(zerobuf, c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } return (error); } static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error, vcpu, state_changed, size; cpuset_t *cpuset; struct vmmdev_softc *sc; struct vm_memory_segment *seg; struct vm_register *vmreg; struct vm_seg_desc *vmsegdesc; struct vm_run *vmrun; struct vm_exception *vmexc; struct vm_lapic_irq *vmirq; struct vm_lapic_msi *vmmsi; struct vm_ioapic_irq *ioapic_irq; struct vm_isa_irq *isa_irq; struct vm_isa_irq_trigger *isa_irq_trigger; struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; struct vm_gpa_pte *gpapte; struct vm_suspend *vmsuspend; struct vm_gla2gpa *gg; struct vm_activate_cpu *vac; struct vm_cpuset *vm_cpuset; struct vm_intinfo *vmii; struct vm_rtc_time *rtctime; struct vm_rtc_data *rtcdata; sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); error = 0; vcpu = -1; state_changed = 0; /* * Some VMM ioctls can operate only on vcpus that are not running. 
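 *
 * For example (illustrative): VM_SET_REGISTER on a vcpu that is inside
 * VMRUN() would race with guest state cached in hardware, so the vcpu is
 * first transitioned to VCPU_FROZEN below and only returned to VCPU_IDLE
 * once the ioctl has completed.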
*/ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: case VM_ACTIVATE_CPU: case VM_SET_INTINFO: case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. */ vcpu = *(int *)data; if (vcpu < 0 || vcpu >= VM_MAXCPU) { error = EINVAL; goto done; } error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); if (error) goto done; state_changed = 1; break; case VM_MAP_PPTDEV_MMIO: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: case VM_MAP_MEMORY: case VM_REINIT: /* * ioctls that operate on the entire virtual machine must * prevent all vcpus from running. */ error = 0; for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); if (error) break; } if (error) { while (--vcpu >= 0) vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); goto done; } state_changed = 2; break; default: break; } switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; error = vm_run(sc->vm, vmrun); break; case VM_SUSPEND: vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } case VM_STATS: { CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(sc->vm, vmstats->cpuid, &vmstats->num_entries, vmstats->statbuf); break; } case VM_PPTDEV_MSI: pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->vcpu, pptmsi->bus, pptmsi->slot, pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; case VM_PPTDEV_MSIX: pptmsix = (struct vm_pptdev_msix *)data; error = ppt_setup_msix(sc->vm, pptmsix->vcpu, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, pptmsix->vector_control); break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_INJECT_EXCEPTION: vmexc = (struct vm_exception *)data; error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc->vector, vmexc->error_code_valid, vmexc->error_code, vmexc->restart_instruction); break; case VM_INJECT_NMI: vmnmi = (struct vm_nmi *)data; error = vm_inject_nmi(sc->vm, vmnmi->cpuid); break; case VM_LAPIC_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); break; case VM_LAPIC_LOCAL_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_set_local_intr(sc->vm, vmirq->cpuid, vmirq->vector); break; case VM_LAPIC_MSI: vmmsi = (struct vm_lapic_msi *)data; error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); break; case VM_IOAPIC_ASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_DEASSERT_IRQ: ioapic_irq = (struct 
vm_ioapic_irq *)data; error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PULSE_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PINCOUNT: *(int *)data = vioapic_pincount(sc->vm); break; case VM_ISA_ASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_assert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_DEASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_deassert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_PULSE_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_SET_IRQ_TRIGGER: isa_irq_trigger = (struct vm_isa_irq_trigger *)data; error = vatpic_set_irq_trigger(sc->vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; case VM_MAP_MEMORY: seg = (struct vm_memory_segment *)data; error = vm_malloc(sc->vm, seg->gpa, seg->len); break; case VM_GET_MEMORY_SEG: seg = (struct vm_memory_segment *)data; seg->len = 0; (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); error = 0; break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, &vmreg->regval); break; case VM_SET_REGISTER: vmreg = (struct vm_register *)data; error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, vmreg->regval); break; case VM_SET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_get_capability(sc->vm, vmcap->cpuid, vmcap->captype, &vmcap->capval); break; case VM_SET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_set_capability(sc->vm, vmcap->cpuid, vmcap->captype, vmcap->capval); break; case VM_SET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_set_x2apic_state(sc->vm, x2apic->cpuid, x2apic->state); break; case VM_GET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(sc->vm, x2apic->cpuid, &x2apic->state); break; case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), gpapte->gpa, gpapte->pte, &gpapte->ptenum); error = 0; break; case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; case VM_GLA2GPA: { CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); gg = (struct vm_gla2gpa *)data; - error = vmm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla, + error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla, gg->prot, &gg->gpa); KASSERT(error == 0 || error == 1 || error == -1, - ("%s: vmm_gla2gpa unknown error %d", __func__, error)); + ("%s: vm_gla2gpa unknown error %d", __func__, error)); if (error >= 0) { /* * error = 0: the translation was successful * error = 1: a fault was injected into the guest */ gg->fault = error; error = 0; } else { error = EFAULT; } break; } case VM_ACTIVATE_CPU: vac = (struct 
vm_activate_cpu *)data; error = vm_activate_cpu(sc->vm, vac->vcpuid); break; case VM_GET_CPUS: error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else error = EINVAL; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; case VM_SET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1); break; case VM_GET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, &vmii->info2); break; case VM_RTC_WRITE: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_write(sc->vm, rtcdata->offset, rtcdata->value); break; case VM_RTC_READ: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_read(sc->vm, rtcdata->offset, &rtcdata->value); break; case VM_RTC_SETTIME: rtctime = (struct vm_rtc_time *)data; error = vrtc_set_time(sc->vm, rtctime->secs); break; case VM_RTC_GETTIME: error = 0; rtctime = (struct vm_rtc_time *)data; rtctime->secs = vrtc_get_time(sc->vm); break; case VM_RESTART_INSTRUCTION: error = vm_restart_instruction(sc->vm, vcpu); break; default: error = ENOTTY; break; } if (state_changed == 1) { vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); } else if (state_changed == 2) { for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); } done: /* Make sure that no handler returns a bogus value like ERESTART */ KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { int error; struct vmmdev_softc *sc; sc = vmmdev_lookup2(cdev); if (sc != NULL && (nprot & PROT_EXEC) == 0) error = vm_get_memobj(sc->vm, *offset, size, offset, object); else error = EINVAL; return (error); } static void vmmdev_destroy(void *arg) { struct vmmdev_softc *sc = arg; if (sc->cdev != NULL) destroy_dev(sc->cdev); if (sc->vm != NULL) vm_destroy(sc->vm); if ((sc->flags & VSC_LINKED) != 0) { mtx_lock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); mtx_unlock(&vmmdev_mtx); } free(sc, M_VMMDEV); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { int error; char buf[VM_MAX_NAMELEN]; struct vmmdev_softc *sc; struct cdev *cdev; strlcpy(buf, "beavis", sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL || sc->cdev == NULL) { mtx_unlock(&vmmdev_mtx); return (EINVAL); } /* * The 'cdev' will be destroyed asynchronously when 'si_threadcount' * goes down to 0 so we should not do it again in the callback. */ cdev = sc->cdev; sc->cdev = NULL; mtx_unlock(&vmmdev_mtx); /* * Schedule the 'cdev' to be destroyed: * * - any new operations on this 'cdev' will return an error (ENXIO). * * - when the 'si_threadcount' dwindles down to zero the 'cdev' will * be destroyed and the callback will be invoked in a taskqueue * context. 
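 *
 * Usage example (illustrative): this handler runs when userland writes a
 * VM name to the node, e.g. "sysctl hw.vmm.destroy=<vmname>"; the actual
 * destruction is deferred as described above.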
*/ destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); return (0); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_vmm_destroy, "A", NULL); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { int error; struct vm *vm; struct cdev *cdev; struct vmmdev_softc *sc, *sc2; char buf[VM_MAX_NAMELEN]; strlcpy(buf, "beavis", sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); mtx_unlock(&vmmdev_mtx); if (sc != NULL) return (EEXIST); error = vm_create(buf, &vm); if (error != 0) return (error); sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->vm = vm; /* * Lookup the name again just in case somebody sneaked in when we * dropped the lock. */ mtx_lock(&vmmdev_mtx); sc2 = vmmdev_lookup(buf); if (sc2 == NULL) { SLIST_INSERT_HEAD(&head, sc, link); sc->flags |= VSC_LINKED; } mtx_unlock(&vmmdev_mtx); if (sc2 != NULL) { vmmdev_destroy(sc); return (EEXIST); } error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); if (error != 0) { vmmdev_destroy(sc); return (error); } mtx_lock(&vmmdev_mtx); sc->cdev = cdev; sc->cdev->si_drv1 = sc; mtx_unlock(&vmmdev_mtx); return (0); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_vmm_create, "A", NULL); void vmmdev_init(void) { mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); } int vmmdev_cleanup(void) { int error; if (SLIST_EMPTY(&head)) error = 0; else error = EBUSY; return (error); } Index: stable/10/sys/amd64/vmm/vmm_instruction_emul.c =================================================================== --- stable/10/sys/amd64/vmm/vmm_instruction_emul.c (revision 284898) +++ stable/10/sys/amd64/vmm/vmm_instruction_emul.c (revision 284899) @@ -1,2196 +1,2338 @@ /*- * Copyright (c) 2012 Sandvine, Inc. * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include #include #include #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #include #define KASSERT(exp,msg) assert((exp)) #endif /* _KERNEL */ #include #include #include /* struct vie_op.op_type */ enum { VIE_OP_TYPE_NONE = 0, VIE_OP_TYPE_MOV, VIE_OP_TYPE_MOVSX, VIE_OP_TYPE_MOVZX, VIE_OP_TYPE_AND, VIE_OP_TYPE_OR, VIE_OP_TYPE_SUB, VIE_OP_TYPE_TWO_BYTE, VIE_OP_TYPE_PUSH, VIE_OP_TYPE_CMP, VIE_OP_TYPE_POP, VIE_OP_TYPE_MOVS, + VIE_OP_TYPE_GROUP1, + VIE_OP_TYPE_STOS, VIE_OP_TYPE_LAST }; /* struct vie_op.op_flags */ #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ #define VIE_OP_F_NO_MODRM (1 << 3) #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) static const struct vie_op two_byte_opcodes[256] = { [0xB6] = { .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, }, [0xB7] = { .op_byte = 0xB7, .op_type = VIE_OP_TYPE_MOVZX, }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, }, }; static const struct vie_op one_byte_opcodes[256] = { [0x0F] = { .op_byte = 0x0F, .op_type = VIE_OP_TYPE_TWO_BYTE }, [0x2B] = { .op_byte = 0x2B, .op_type = VIE_OP_TYPE_SUB, }, [0x3B] = { .op_byte = 0x3B, .op_type = VIE_OP_TYPE_CMP, }, [0x88] = { .op_byte = 0x88, .op_type = VIE_OP_TYPE_MOV, }, [0x89] = { .op_byte = 0x89, .op_type = VIE_OP_TYPE_MOV, }, [0x8A] = { .op_byte = 0x8A, .op_type = VIE_OP_TYPE_MOV, }, [0x8B] = { .op_byte = 0x8B, .op_type = VIE_OP_TYPE_MOV, }, [0xA1] = { .op_byte = 0xA1, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xA3] = { .op_byte = 0xA3, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xA4] = { .op_byte = 0xA4, .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xA5] = { .op_byte = 0xA5, .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, + [0xAA] = { + .op_byte = 0xAA, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAB] = { + .op_byte = 0xAB, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, [0xC6] = { /* XXX Group 11 extended opcode - not just MOV */ .op_byte = 0xC6, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM8, }, [0xC7] = { .op_byte = 0xC7, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM, }, [0x23] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, }, [0x81] = { - /* XXX Group 1 extended opcode - not just AND */ + /* XXX Group 1 extended opcode */ .op_byte = 0x81, - .op_type = VIE_OP_TYPE_AND, + .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM, }, [0x83] = { - /* XXX Group 1 extended opcode - not just OR */ + /* XXX Group 1 extended opcode */ .op_byte = 0x83, - .op_type = VIE_OP_TYPE_OR, + .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, }, [0x8F] = { /* XXX Group 1A extended opcode - not just POP */ .op_byte = 0x8F, .op_type = VIE_OP_TYPE_POP, }, [0xFF] = { /* XXX Group 5 extended opcode - not just PUSH */ .op_byte = 0xFF, .op_type = VIE_OP_TYPE_PUSH, } }; /* struct vie.mod */ #define VIE_MOD_INDIRECT 0 #define VIE_MOD_INDIRECT_DISP8 1 #define VIE_MOD_INDIRECT_DISP32 2 #define VIE_MOD_DIRECT 3 /* struct vie.rm */ #define VIE_RM_SIB 4 #define VIE_RM_DISP32 5 #define GB (1024 * 1024 * 1024) static enum vm_reg_name gpr_map[16] = { 
VM_REG_GUEST_RAX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RBX, VM_REG_GUEST_RSP, VM_REG_GUEST_RBP, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15 }; static uint64_t size2mask[] = { [1] = 0xff, [2] = 0xffff, [4] = 0xffffffff, [8] = 0xffffffffffffffff, }; static int vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) { int error; error = vm_get_register(vm, vcpuid, reg, rval); return (error); } static void vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) { *lhbr = 0; *reg = gpr_map[vie->reg]; /* * 64-bit mode imposes limitations on accessing legacy high byte * registers (lhbr). * * The legacy high-byte registers cannot be addressed if the REX * prefix is present. In this case the values 4, 5, 6 and 7 of the * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. * * If the REX prefix is not present then the values 4, 5, 6 and 7 * of the 'ModRM:reg' field address the legacy high-byte registers, * %ah, %ch, %dh and %bh respectively. */ if (!vie->rex_present) { if (vie->reg & 0x4) { *lhbr = 1; *reg = gpr_map[vie->reg & 0x3]; } } } static int vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) { uint64_t val; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); error = vm_get_register(vm, vcpuid, reg, &val); /* * To obtain the value of a legacy high byte register shift the * base register right by 8 bits (%ah = %rax >> 8). */ if (lhbr) *rval = val >> 8; else *rval = val; return (error); } static int vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) { uint64_t origval, val, mask; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); error = vm_get_register(vm, vcpuid, reg, &origval); if (error == 0) { val = byte; mask = 0xff; if (lhbr) { /* * Shift left by 8 to store 'byte' in a legacy high * byte register. */ val <<= 8; mask <<= 8; } val |= origval & ~mask; error = vm_set_register(vm, vcpuid, reg, val); } return (error); } int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size) { int error; uint64_t origval; switch (size) { case 1: case 2: error = vie_read_register(vm, vcpuid, reg, &origval); if (error) return (error); val &= size2mask[size]; val |= origval & ~size2mask[size]; break; case 4: val &= 0xffffffffUL; break; case 8: break; default: return (EINVAL); } error = vm_set_register(vm, vcpuid, reg, val); return (error); } #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) /* * Return the status flags that would result from doing (x - y). 
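 *
 * Worked example (illustrative): getcc(1, 0x10, 0x20) evaluates the flags
 * of the 8-bit subtraction 0x10 - 0x20 = 0xf0, so PSL_C (borrow) and
 * PSL_N (sign) are set while PSL_Z is clear.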
*/ #define GETCC(sz) \ static u_long \ getcc##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("sub %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETCC(8); GETCC(16); GETCC(32); GETCC(64); static u_long getcc(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getcc: invalid operand size %d", opsize)); if (opsize == 1) return (getcc8(x, y)); else if (opsize == 2) return (getcc16(x, y)); else if (opsize == 4) return (getcc32(x, y)); else return (getcc64(x, y)); } static int emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint8_t byte; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x88: /* * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) * 88/r: mov r/m8, r8 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) */ size = 1; /* override for byte operation */ error = vie_read_bytereg(vm, vcpuid, vie, &byte); if (error == 0) error = memwrite(vm, vcpuid, gpa, byte, size, arg); break; case 0x89: /* * MOV from reg (ModRM:reg) to mem (ModRM:r/m) * 89/r: mov r/m16, r16 * 89/r: mov r/m32, r32 * REX.W + 89/r mov r/m64, r64 */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val); if (error == 0) { val &= size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); } break; case 0x8A: /* * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) * 8A/r: mov r8, r/m8 * REX + 8A/r: mov r8, r/m8 */ size = 1; /* override for byte operation */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) error = vie_write_bytereg(vm, vcpuid, vie, val); break; case 0x8B: /* * MOV from mem (ModRM:r/m) to reg (ModRM:reg) * 8B/r: mov r16, r/m16 * 8B/r: mov r32, r/m32 * REX.W 8B/r: mov r64, r/m64 */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) { reg = gpr_map[vie->reg]; error = vie_update_register(vm, vcpuid, reg, val, size); } break; case 0xA1: /* * MOV from seg:moffset to AX/EAX/RAX * A1: mov AX, moffs16 * A1: mov EAX, moffs32 * REX.W + A1: mov RAX, moffs64 */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) { reg = VM_REG_GUEST_RAX; error = vie_update_register(vm, vcpuid, reg, val, size); } break; case 0xA3: /* * MOV from AX/EAX/RAX to seg:moffset * A3: mov moffs16, AX * A3: mov moffs32, EAX * REX.W + A3: mov moffs64, RAX */ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); if (error == 0) { val &= size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); } break; case 0xC6: /* * MOV from imm8 to mem (ModRM:r/m) * C6/0 mov r/m8, imm8 * REX + C6/0 mov r/m8, imm8 */ size = 1; /* override for byte operation */ error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); break; case 0xC7: /* * MOV from imm16/imm32 to mem (ModRM:r/m) * C7/0 mov r/m16, imm16 * C7/0 mov r/m32, imm32 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) */ val = vie->immediate & size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); break; default: break; } return (error); } static int emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0xB6: /* * MOV and zero extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). 
* * 0F B6/r movzx r16, r/m8 * 0F B6/r movzx r32, r/m8 * REX.W + 0F B6/r movzx r64, r/m8 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* zero-extend byte */ val = (uint8_t)val; /* write the result */ error = vie_update_register(vm, vcpuid, reg, val, size); break; case 0xB7: /* * MOV and zero extend word from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F B7/r movzx r32, r/m16 * REX.W + 0F B7/r movzx r64, r/m16 */ error = memread(vm, vcpuid, gpa, &val, 2, arg); if (error) return (error); reg = gpr_map[vie->reg]; /* zero-extend word */ val = (uint16_t)val; error = vie_update_register(vm, vcpuid, reg, val, size); break; case 0xBE: /* * MOV and sign extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F BE/r movsx r16, r/m8 * 0F BE/r movsx r32, r/m8 * REX.W + 0F BE/r movsx r64, r/m8 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* sign extend byte */ val = (int8_t)val; /* write the result */ error = vie_update_register(vm, vcpuid, reg, val, size); break; default: break; } return (error); } /* * Helper function to calculate and validate a linear address. * * Returns 0 on success and 1 if an exception was injected into the guest. */ static int get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, int opsize, int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla) { struct seg_desc desc; uint64_t cr0, val, rflags; int error; error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = vm_get_seg_desc(vm, vcpuid, seg, &desc); KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", __func__, error, seg)); error = vie_read_register(vm, vcpuid, gpr, &val); KASSERT(error == 0, ("%s: error %d getting register %d", __func__, error, gpr)); if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, addrsize, prot, gla)) { if (seg == VM_REG_GUEST_SS) vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); return (1); } if (vie_canonical_check(paging->cpu_mode, *gla)) { if (seg == VM_REG_GUEST_SS) vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); return (1); } if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { vm_inject_ac(vm, vcpuid, 0); return (1); } return (0); } static int emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { #ifdef _KERNEL struct vm_copyinfo copyinfo[2]; #else struct iovec copyinfo[2]; #endif - uint64_t dstaddr, srcaddr, val; + uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; uint64_t rcx, rdi, rsi, rflags; int error, opsize, seg, repeat; opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; val = 0; error = 0; /* * XXX although the MOVS instruction is only supposed to be used with * the "rep" prefix some guests like FreeBSD will use "repnz" instead. * * Empirically the "repnz" prefix has identical behavior to "rep" * and the zero flag does not make a difference. 
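 *
 * For example (illustrative): a guest instruction stream encoded as
 * "f2 a5" (repnz movs) is decoded and emulated exactly like "f3 a5"
 * (rep movs) here.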
*/ repeat = vie->repz_present | vie->repnz_present; if (repeat) { error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); /* * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. */ if ((rcx & vie_size2mask(vie->addrsize)) == 0) return (0); } /* * Source Destination Comments * -------------------------------------------- * (1) memory memory n/a * (2) memory mmio emulated * (3) mmio memory emulated - * (4) mmio mmio not emulated + * (4) mmio mmio emulated * * At this point we don't have sufficient information to distinguish * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this * out because it will succeed only when operating on regular memory. * * XXX the emulation doesn't properly handle the case where 'gpa' * is straddling the boundary between the normal memory and MMIO. */ seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr); if (error) goto done; error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, copyinfo, nitems(copyinfo)); if (error == 0) { /* * case (2): read from system memory and write to mmio. */ vm_copyin(vm, vcpuid, copyinfo, &val, opsize); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); error = memwrite(vm, vcpuid, gpa, val, opsize, arg); - goto done; + if (error) + goto done; } else if (error > 0) { /* * Resume guest execution to handle fault. */ goto done; } else { /* * 'vm_copy_setup()' is expected to fail for cases (3) and (4) * if 'srcaddr' is in the mmio space. */ - } - error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); - if (error) - goto done; - - error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, - PROT_WRITE, copyinfo, nitems(copyinfo)); - if (error == 0) { - /* - * case (3): read from MMIO and write to system memory. - * - * A MMIO read can have side-effects so we commit to it - * only after vm_copy_setup() is successful. If a page-fault - * needs to be injected into the guest then it will happen - * before the MMIO read is attempted. - */ - error = memread(vm, vcpuid, gpa, &val, opsize, arg); + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); if (error) goto done; - vm_copyout(vm, vcpuid, &val, copyinfo, opsize); - vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); - } else if (error > 0) { - /* - * Resume guest execution to handle fault. - */ - goto done; - } else { - goto done; + error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, + PROT_WRITE, copyinfo, nitems(copyinfo)); + if (error == 0) { + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we + * commit to it only after vm_copy_setup() is + * successful. If a page-fault needs to be + * injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = memread(vm, vcpuid, gpa, &val, opsize, arg); + if (error) + goto done; + + vm_copyout(vm, vcpuid, &val, copyinfo, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + } else if (error > 0) { + /* + * Resume guest execution to handle fault. + */ + goto done; + } else { + /* + * Case (4): read from and write to mmio. 
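+ * For example (illustrative): a MOVS that copies between two pages of an
+ * emulated frame buffer; both the source and destination linear addresses
+ * translate to MMIO GPAs, so the data is bounced through memread() and
+ * memwrite() below.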
+ */ + error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, + PROT_READ, &srcgpa); + if (error) + goto done; + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); + if (error) + goto done; + + error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, + PROT_WRITE, &dstgpa); + if (error) + goto done; + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); + if (error) + goto done; + } } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); if (rflags & PSL_D) { rsi -= opsize; rdi -= opsize; } else { rsi += opsize; rdi += opsize; } error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); if (repeat) { rcx = rcx - 1; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, rcx, vie->addrsize); KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); /* * Repeat the instruction if the count register is not zero. */ if ((rcx & vie_size2mask(vie->addrsize)) != 0) vm_restart_instruction(vm, vcpuid); } done: if (error < 0) return (EFAULT); else return (0); } static int +emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= opsize; + else + rdi += opsize; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. 
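+ * Illustrative example: a "rep stosb" with %rcx = 4096 that targets
+ * emulated MMIO stores one byte of %al per emulated pass, advances %rdi,
+ * decrements %rcx and restarts the instruction until the count reaches
+ * zero.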
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } + + return (0); +} + +static int emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x23: /* * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the * result in reg. * * 23/r and r16, r/m16 * 23/r and r32, r/m32 * REX.W + 23/r and r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ result = val1 & val2; error = vie_update_register(vm, vcpuid, reg, result, size); break; case 0x81: + case 0x83: /* - * AND/OR mem (ModRM:r/m) with immediate and store the + * AND mem (ModRM:r/m) with immediate and store the * result in mem. * - * AND: i = 4 - * OR: i = 1 - * 81 /i op r/m16, imm16 - * 81 /i op r/m32, imm32 - * REX.W + 81 /i op r/m64, imm32 sign-extended to 64 + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 * + * 83 /4 and r/m16, imm8 sign-extended to 16 + * 83 /4 and r/m32, imm8 sign-extended to 32 + * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) break; /* - * perform the operation with the pre-fetched immediate - * operand and write the result - */ - switch (vie->reg & 7) { - case 0x4: - /* modrm:reg == b100, AND */ - result = val1 & vie->immediate; - break; - case 0x1: - /* modrm:reg == b001, OR */ - result = val1 | vie->immediate; - break; - default: - error = EINVAL; - break; - } - if (error) - break; - - error = memwrite(vm, vcpuid, gpa, result, size, arg); + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 & vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); break; default: break; } if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. * * The updated status flags are obtained by subtracting 0 from 'result'. */ rflags2 = getcc(size, result, 0); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; uint64_t val1, result, rflags, rflags2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { + case 0x81: case 0x83: /* * OR mem (ModRM:r/m) with immediate and store the * result in mem. * - * 83 /1 OR r/m16, imm8 sign-extended to 16 - * 83 /1 OR r/m32, imm8 sign-extended to 32 - * REX.W + 83/1 OR r/m64, imm8 sign-extended to 64 + * 81 /1 or r/m16, imm16 + * 81 /1 or r/m32, imm32 + * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 * - * Currently, only the OR operation of the 0x83 opcode - * is implemented (ModRM:reg = b001). 
+ * 83 /1 or r/m16, imm8 sign-extended to 16 + * 83 /1 or r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 */ - if ((vie->reg & 7) != 1) - break; /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) break; /* * perform the operation with the pre-fetched immediate * operand and write the result */ result = val1 | vie->immediate; error = memwrite(vm, vcpuid, gpa, result, size, arg); break; default: break; } if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. * * The updated status flags are obtained by subtracting 0 from 'result'. */ rflags2 = getcc(size, result, 0); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; uint64_t op1, op2, rflags, rflags2; enum vm_reg_name reg; size = vie->opsize; switch (vie->op.op_byte) { case 0x3B: /* * 3B/r CMP r16, r/m16 * 3B/r CMP r32, r/m32 * REX.W + 3B/r CMP r64, r/m64 * * Compare first operand (reg) with second operand (r/m) and * set status flags in EFLAGS register. The comparison is * performed by subtracting the second operand from the first * operand and then setting the status flags. */ /* Get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &op1); if (error) return (error); /* Get the second operand */ error = memread(vm, vcpuid, gpa, &op2, size, arg); if (error) return (error); + rflags2 = getcc(size, op1, op2); break; + case 0x81: + case 0x83: + /* + * 81 /7 cmp r/m16, imm16 + * 81 /7 cmp r/m32, imm32 + * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 + * + * 83 /7 cmp r/m16, imm8 sign-extended to 16 + * 83 /7 cmp r/m32, imm8 sign-extended to 32 + * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 + * + * Compare mem (ModRM:r/m) with immediate and set + * status flags according to the results. The + * comparison is performed by subtracting the + * immediate from the first operand and then setting + * the status flags. 
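+ * For example (illustrative): a guest polling a device register with
+ * "cmpl $0, <mmio address>" lands here; PSL_Z is set in the guest's
+ * %rflags only when the emulated read returns zero.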
+ * + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &op1, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, vie->immediate); + break; default: return (EINVAL); } - rflags2 = getcc(size, op1, op2); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; uint64_t nval, rflags, rflags2, val1, val2; enum vm_reg_name reg; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x2B: /* * SUB r/m from r and store the result in r * * 2B/r SUB r16, r/m16 * 2B/r SUB r32, r/m32 * REX.W + 2B/r SUB r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ nval = val1 - val2; error = vie_update_register(vm, vcpuid, reg, nval, size); break; default: break; } if (!error) { rflags2 = getcc(size, val1, val2); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); } return (error); } static int emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { #ifdef _KERNEL struct vm_copyinfo copyinfo[2]; #else struct iovec copyinfo[2]; #endif struct seg_desc ss_desc; uint64_t cr0, rflags, rsp, stack_gla, val; int error, size, stackaddrsize, pushop; val = 0; size = vie->opsize; pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; /* * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 */ if (paging->cpu_mode == CPU_MODE_REAL) { stackaddrsize = 2; } else if (paging->cpu_mode == CPU_MODE_64BIT) { /* * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 * - Stack pointer size is always 64-bits. * - PUSH/POP of 32-bit values is not possible in 64-bit mode. * - 16-bit PUSH/POP is supported by using the operand size * override prefix (66H). */ stackaddrsize = 8; size = vie->opsize_override ? 2 : 8; } else { /* * In protected or compability mode the 'B' flag in the * stack-segment descriptor determines the size of the * stack pointer. */ error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); KASSERT(error == 0, ("%s: error %d getting SS descriptor", __func__, error)); if (SEG_DESC_DEF32(ss_desc.access)) stackaddrsize = 4; else stackaddrsize = 2; } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); if (pushop) { rsp -= size; } if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, rsp, size, stackaddrsize, pushop ? 
PROT_WRITE : PROT_READ, &stack_gla)) { vm_inject_ss(vm, vcpuid, 0); return (0); } if (vie_canonical_check(paging->cpu_mode, stack_gla)) { vm_inject_ss(vm, vcpuid, 0); return (0); } if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { vm_inject_ac(vm, vcpuid, 0); return (0); } error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo)); if (error == -1) { /* * XXX cannot return a negative error value here because it * ends up being the return value of the VM_RUN() ioctl and * is interpreted as a pseudo-error (for e.g. ERESTART). */ return (EFAULT); } else if (error == 1) { /* Resume guest execution to handle page fault */ return (0); } if (pushop) { error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); if (error == 0) vm_copyout(vm, vcpuid, &val, copyinfo, size); } else { vm_copyin(vm, vcpuid, copyinfo, &val, size); error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); rsp += size; } vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); if (error == 0) { error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, stackaddrsize); KASSERT(error == 0, ("error %d updating rsp", error)); } return (error); } static int emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error; /* * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. * * PUSH is part of the group 5 extended opcodes and is identified * by ModRM:reg = b110. */ if ((vie->reg & 7) != 6) return (EINVAL); error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, memwrite, arg); return (error); } static int emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error; /* * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. * * POP is part of the group 1A extended opcodes and is identified * by ModRM:reg = b000. 
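/*
 * Illustrative sketch, not part of the patch: the ModRM byte carries the
 * /digit that distinguishes group members (PUSH is /6 in group 5, POP is /0
 * in group 1A, and the group-1 handler added here dispatches on /1, /4 and
 * /7).  The hypothetical helper below only mirrors how decode_modrm()
 * splits the byte into its three fields.
 */
#include <stdint.h>

struct sk_modrm {
	uint8_t	mod;	/* bits 7:6 - addressing mode */
	uint8_t	reg;	/* bits 5:3 - register, or /digit opcode extension */
	uint8_t	rm;	/* bits 2:0 - register/memory operand */
};

static struct sk_modrm
sketch_decode_modrm(uint8_t x)
{
	struct sk_modrm m;

	m.mod = (x >> 6) & 0x3;
	m.reg = (x >> 3) & 0x7;
	m.rm  = (x >> 0) & 0x7;
	return (m);
}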
*/ if ((vie->reg & 7) != 0) return (EINVAL); error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, memwrite, arg); return (error); } +static int +emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + switch (vie->reg & 7) { + case 0x1: /* OR */ + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x4: /* AND */ + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x7: /* CMP */ + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + int vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { int error; if (!vie->decoded) return (EINVAL); switch (vie->op.op_type) { + case VIE_OP_TYPE_GROUP1: + error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; case VIE_OP_TYPE_POP: error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_PUSH: error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_CMP: error = emulate_cmp(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOV: error = emulate_mov(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOVSX: case VIE_OP_TYPE_MOVZX: error = emulate_movx(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOVS: error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; + case VIE_OP_TYPE_STOS: + error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; case VIE_OP_TYPE_AND: error = emulate_and(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_OR: error = emulate_or(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_SUB: error = emulate_sub(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; default: error = EINVAL; break; } return (error); } int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("%s: invalid size %d", __func__, size)); KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) return (0); return ((gla & (size - 1)) ? 1 : 0); } int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) { uint64_t mask; if (cpu_mode != CPU_MODE_64BIT) return (0); /* * The value of the bit 47 in the 'gla' should be replicated in the * most significant 16 bits. 
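/*
 * Illustrative sketch, not part of the patch: a 64-bit linear address is
 * canonical when bits 63:48 replicate bit 47, which is the test
 * vie_canonical_check() performs.  The hypothetical helper below returns
 * true for a canonical address (the inverse sense of the real function,
 * which returns nonzero on failure).
 */
#include <stdbool.h>
#include <stdint.h>

static bool
sketch_is_canonical(uint64_t gla)
{
	uint64_t mask = ~((1ULL << 48) - 1);	/* bits 63:48 */

	if (gla & (1ULL << 47))
		return ((gla & mask) == mask);	/* upper bits must all be 1 */
	return ((gla & mask) == 0);		/* upper bits must all be 0 */
}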
*/ mask = ~((1UL << 48) - 1); if (gla & (1UL << 47)) return ((gla & mask) != mask); else return ((gla & mask) != 0); } uint64_t vie_size2mask(int size) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("vie_size2mask: invalid size %d", size)); return (size2mask[size]); } int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, struct seg_desc *desc, uint64_t offset, int length, int addrsize, int prot, uint64_t *gla) { uint64_t firstoff, low_limit, high_limit, segbase; int glasize, type; KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, ("%s: invalid segment %d", __func__, seg)); KASSERT(length == 1 || length == 2 || length == 4 || length == 8, ("%s: invalid operand size %d", __func__, length)); KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, ("%s: invalid prot %#x", __func__, prot)); firstoff = offset; if (cpu_mode == CPU_MODE_64BIT) { KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); glasize = 8; } else { KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); glasize = 4; /* * If the segment selector is loaded with a NULL selector * then the descriptor is unusable and attempting to use * it results in a #GP(0). */ if (SEG_DESC_UNUSABLE(desc->access)) return (-1); /* * The processor generates a #NP exception when a segment * register is loaded with a selector that points to a * descriptor that is not present. If this was the case then * it would have been checked before the VM-exit. */ KASSERT(SEG_DESC_PRESENT(desc->access), ("segment %d not present: %#x", seg, desc->access)); /* * The descriptor type must indicate a code/data segment. */ type = SEG_DESC_TYPE(desc->access); KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " "descriptor type %#x", seg, type)); if (prot & PROT_READ) { /* #GP on a read access to a exec-only code segment */ if ((type & 0xA) == 0x8) return (-1); } if (prot & PROT_WRITE) { /* * #GP on a write access to a code segment or a * read-only data segment. */ if (type & 0x8) /* code segment */ return (-1); if ((type & 0xA) == 0) /* read-only data seg */ return (-1); } /* * 'desc->limit' is fully expanded taking granularity into * account. */ if ((type & 0xC) == 0x4) { /* expand-down data segment */ low_limit = desc->limit + 1; high_limit = SEG_DESC_DEF32(desc->access) ? 0xffffffff : 0xffff; } else { /* code segment or expand-up data segment */ low_limit = 0; high_limit = desc->limit; } while (length > 0) { offset &= vie_size2mask(addrsize); if (offset < low_limit || offset > high_limit) return (-1); offset++; length--; } } /* * In 64-bit mode all segments except %fs and %gs have a segment * base address of 0. */ if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { segbase = 0; } else { segbase = desc->base; } /* * Truncate 'firstoff' to the effective address size before adding * it to the segment base. 
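/*
 * Illustrative sketch, not part of the patch: forming the linear address
 * from a segment base and an effective address, truncating each step to the
 * relevant size the way vie_calculate_gla() does.  Both helpers are
 * hypothetical; the real code uses the size2mask[] table via
 * vie_size2mask().
 */
#include <stdint.h>

static uint64_t
sketch_size2mask(int size)
{
	/* size is 1, 2, 4 or 8 bytes */
	return (size == 8 ? ~0ULL : (1ULL << (size * 8)) - 1);
}

static uint64_t
sketch_segbase_plus_offset(uint64_t segbase, uint64_t offset,
    int addrsize, int glasize)
{
	offset &= sketch_size2mask(addrsize);	/* effective address wraps */
	return ((segbase + offset) & sketch_size2mask(glasize));
}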
*/ firstoff &= vie_size2mask(addrsize); *gla = (segbase + firstoff) & vie_size2mask(glasize); return (0); } #ifdef _KERNEL void vie_init(struct vie *vie, const char *inst_bytes, int inst_length) { KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, ("%s: invalid instruction length (%d)", __func__, inst_length)); bzero(vie, sizeof(struct vie)); vie->base_register = VM_REG_LAST; vie->index_register = VM_REG_LAST; vie->segment_register = VM_REG_LAST; if (inst_length) { bcopy(inst_bytes, vie->inst, inst_length); vie->num_valid = inst_length; } } static int pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) { int error_code = 0; if (pte & PG_V) error_code |= PGEX_P; if (prot & VM_PROT_WRITE) error_code |= PGEX_W; if (usermode) error_code |= PGEX_U; if (rsvd) error_code |= PGEX_RSV; if (prot & VM_PROT_EXECUTE) error_code |= PGEX_I; return (error_code); } static void ptp_release(void **cookie) { if (*cookie != NULL) { vm_gpa_release(*cookie); *cookie = NULL; } } static void * ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } int -vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa) { int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; u_int retries; uint64_t *ptpbase, ptpphys, pte, pgsize; uint32_t *ptpbase32, pte32; void *cookie; usermode = (paging->cpl == 3 ? 1 : 0); writable = prot & VM_PROT_WRITE; cookie = NULL; retval = 0; retries = 0; restart: ptpphys = paging->cr3; /* root of the page tables */ ptp_release(&cookie); if (retries++ > 0) maybe_yield(); if (vie_canonical_check(paging->cpu_mode, gla)) { /* * XXX assuming a non-stack reference otherwise a stack fault * should be generated. */ vm_inject_gp(vm, vcpuid); goto fault; } if (paging->paging_mode == PAGING_MODE_FLAT) { *gpa = gla; goto done; } if (paging->paging_mode == PAGING_MODE_32) { nlevels = 2; while (--nlevels >= 0) { /* Zero out the lower 12 bits. */ ptpphys &= ~0xfff; ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); if (ptpbase32 == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 10; ptpindex = (gla >> ptpshift) & 0x3FF; pgsize = 1UL << ptpshift; pte32 = ptpbase32[ptpindex]; if ((pte32 & PG_V) == 0 || (usermode && (pte32 & PG_U) == 0) || (writable && (pte32 & PG_RW) == 0)) { pfcode = pf_error_code(usermode, prot, 0, pte32); vm_inject_pf(vm, vcpuid, pfcode, gla); goto fault; } /* * Emulate the x86 MMU's management of the accessed * and dirty flags. While the accessed flag is set * at every level of the page table, the dirty flag * is only set at the last level providing the guest * physical address. 
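/*
 * Illustrative sketch, not part of the patch: the page-table walker sets
 * the accessed bit at every level and the dirty bit only at the leaf, and
 * it restarts the walk if the compare-and-swap loses a race, which is the
 * pattern vm_gla2gpa() implements with atomic_cmpset_32()/atomic_cmpset_64().
 * The SK_* constants and the CAS builtin below are stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>

#define	SK_PG_A	0x020		/* accessed */
#define	SK_PG_M	0x040		/* modified (dirty) */

/* Returns false if the PTE changed under us and the walk must restart. */
static bool
sketch_set_accessed(volatile uint32_t *ptep)
{
	uint32_t old = *ptep;

	if (old & SK_PG_A)
		return (true);			/* already accessed */
	return (__sync_bool_compare_and_swap(ptep, old, old | SK_PG_A));
}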
*/ if ((pte32 & PG_A) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_A) == 0) { goto restart; } } /* XXX must be ignored if CR4.PSE=0 */ if (nlevels > 0 && (pte32 & PG_PS) != 0) break; ptpphys = pte32; } /* Set the dirty bit in the page table entry if necessary */ if (writable && (pte32 & PG_M) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_M) == 0) { goto restart; } } /* Zero out the lower 'ptpshift' bits */ pte32 >>= ptpshift; pte32 <<= ptpshift; *gpa = pte32 | (gla & (pgsize - 1)); goto done; } if (paging->paging_mode == PAGING_MODE_PAE) { /* Zero out the lower 5 bits and the upper 32 bits */ ptpphys &= 0xffffffe0UL; ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); if (ptpbase == NULL) goto error; ptpindex = (gla >> 30) & 0x3; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); goto fault; } ptpphys = pte; nlevels = 2; } else nlevels = 4; while (--nlevels >= 0) { /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 9; ptpindex = (gla >> ptpshift) & 0x1FF; pgsize = 1UL << ptpshift; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0 || (usermode && (pte & PG_U) == 0) || (writable && (pte & PG_RW) == 0)) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); goto fault; } /* Set the accessed bit in the page table entry */ if ((pte & PG_A) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_A) == 0) { goto restart; } } if (nlevels > 0 && (pte & PG_PS) != 0) { if (pgsize > 1 * GB) { pfcode = pf_error_code(usermode, prot, 1, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); goto fault; } break; } ptpphys = pte; } /* Set the dirty bit in the page table entry if necessary */ if (writable && (pte & PG_M) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) goto restart; } /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; *gpa = pte | (gla & (pgsize - 1)); done: ptp_release(&cookie); return (retval); error: retval = -1; goto done; fault: retval = 1; goto done; } int vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t rip, int inst_length, struct vie *vie) { struct vm_copyinfo copyinfo[2]; int error, prot; if (inst_length > VIE_INST_SIZE) panic("vmm_fetch_instruction: invalid length %d", inst_length); prot = PROT_READ | PROT_EXEC; error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, copyinfo, nitems(copyinfo)); if (error == 0) { vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); vie->num_valid = inst_length; } return (error); } static int vie_peek(struct vie *vie, uint8_t *x) { if (vie->num_processed < vie->num_valid) { *x = vie->inst[vie->num_processed]; return (0); } else return (-1); } static void vie_advance(struct vie *vie) { vie->num_processed++; } static bool segment_override(uint8_t x, int *seg) { switch (x) { case 0x2E: *seg = VM_REG_GUEST_CS; break; case 0x36: *seg = VM_REG_GUEST_SS; break; case 0x3E: *seg = VM_REG_GUEST_DS; break; case 0x26: *seg = VM_REG_GUEST_ES; break; case 0x64: *seg = VM_REG_GUEST_FS; break; case 0x65: *seg = VM_REG_GUEST_GS; break; default: return (false); } return (true); } static int decode_prefixes(struct vie *vie, enum 
vm_cpu_mode cpu_mode, int cs_d) { uint8_t x; while (1) { if (vie_peek(vie, &x)) return (-1); if (x == 0x66) vie->opsize_override = 1; else if (x == 0x67) vie->addrsize_override = 1; else if (x == 0xF3) vie->repz_present = 1; else if (x == 0xF2) vie->repnz_present = 1; else if (segment_override(x, &vie->segment_register)) vie->segment_override = 1; else break; vie_advance(vie); } /* * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: * - Only one REX prefix is allowed per instruction. * - The REX prefix must immediately precede the opcode byte or the * escape opcode byte. * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) * the mandatory prefix must come before the REX prefix. */ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { vie->rex_present = 1; vie->rex_w = x & 0x8 ? 1 : 0; vie->rex_r = x & 0x4 ? 1 : 0; vie->rex_x = x & 0x2 ? 1 : 0; vie->rex_b = x & 0x1 ? 1 : 0; vie_advance(vie); } /* * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 */ if (cpu_mode == CPU_MODE_64BIT) { /* * Default address size is 64-bits and default operand size * is 32-bits. */ vie->addrsize = vie->addrsize_override ? 4 : 8; if (vie->rex_w) vie->opsize = 8; else if (vie->opsize_override) vie->opsize = 2; else vie->opsize = 4; } else if (cs_d) { /* Default address and operand sizes are 32-bits */ vie->addrsize = vie->addrsize_override ? 2 : 4; vie->opsize = vie->opsize_override ? 2 : 4; } else { /* Default address and operand sizes are 16-bits */ vie->addrsize = vie->addrsize_override ? 4 : 2; vie->opsize = vie->opsize_override ? 4 : 2; } return (0); } static int decode_two_byte_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); vie->op = two_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); return (0); } static int decode_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); vie->op = one_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) return (decode_two_byte_opcode(vie)); return (0); } static int decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) { uint8_t x; - if (cpu_mode == CPU_MODE_REAL) - return (-1); - if (vie->op.op_flags & VIE_OP_F_NO_MODRM) return (0); + + if (cpu_mode == CPU_MODE_REAL) + return (-1); if (vie_peek(vie, &x)) return (-1); vie->mod = (x >> 6) & 0x3; vie->rm = (x >> 0) & 0x7; vie->reg = (x >> 3) & 0x7; /* * A direct addressing mode makes no sense in the context of an EPT * fault. There has to be a memory access involved to cause the * EPT fault. */ if (vie->mod == VIE_MOD_DIRECT) return (-1); if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { /* * Table 2-5: Special Cases of REX Encodings * * mod=0, r/m=5 is used in the compatibility mode to * indicate a disp32 without a base register. * * mod!=3, r/m=4 is used in the compatibility mode to * indicate that the SIB byte is present. * * The 'b' bit in the REX prefix is don't care in * this case. */ } else { vie->rm |= (vie->rex_b << 3); } vie->reg |= (vie->rex_r << 3); /* SIB */ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) goto done; vie->base_register = gpr_map[vie->rm]; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; case VIE_MOD_INDIRECT: if (vie->rm == VIE_RM_DISP32) { vie->disp_bytes = 4; /* * Table 2-7. 
RIP-Relative Addressing * * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 * whereas in compatibility mode it just implies disp32. */ if (cpu_mode == CPU_MODE_64BIT) vie->base_register = VM_REG_GUEST_RIP; else vie->base_register = VM_REG_LAST; } break; } done: vie_advance(vie); return (0); } static int decode_sib(struct vie *vie) { uint8_t x; /* Proceed only if SIB byte is present */ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) return (0); if (vie_peek(vie, &x)) return (-1); /* De-construct the SIB byte */ vie->ss = (x >> 6) & 0x3; vie->index = (x >> 3) & 0x7; vie->base = (x >> 0) & 0x7; /* Apply the REX prefix modifiers */ vie->index |= vie->rex_x << 3; vie->base |= vie->rex_b << 3; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; } if (vie->mod == VIE_MOD_INDIRECT && (vie->base == 5 || vie->base == 13)) { /* * Special case when base register is unused if mod = 0 * and base = %rbp or %r13. * * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ vie->disp_bytes = 4; } else { vie->base_register = gpr_map[vie->base]; } /* * All encodings of 'index' are valid except for %rsp (4). * * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ if (vie->index != 4) vie->index_register = gpr_map[vie->index]; /* 'scale' makes sense only in the context of an index register */ if (vie->index_register < VM_REG_LAST) vie->scale = 1 << vie->ss; vie_advance(vie); return (0); } static int decode_displacement(struct vie *vie) { int n, i; uint8_t x; union { char buf[4]; int8_t signed8; int32_t signed32; } u; if ((n = vie->disp_bytes) == 0) return (0); if (n != 1 && n != 4) panic("decode_displacement: invalid disp_bytes %d", n); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } if (n == 1) vie->displacement = u.signed8; /* sign-extended */ else vie->displacement = u.signed32; /* sign-extended */ return (0); } static int decode_immediate(struct vie *vie) { int i, n; uint8_t x; union { char buf[4]; int8_t signed8; int16_t signed16; int32_t signed32; } u; /* Figure out immediate operand size (if any) */ if (vie->op.op_flags & VIE_OP_F_IMM) { /* * Section 2.2.1.5 "Immediates", Intel SDM: * In 64-bit mode the typical size of immediate operands * remains 32-bits. When the operand size if 64-bits, the * processor sign-extends all immediates to 64-bits prior * to their use. */ if (vie->opsize == 4 || vie->opsize == 8) vie->imm_bytes = 4; else vie->imm_bytes = 2; } else if (vie->op.op_flags & VIE_OP_F_IMM8) { vie->imm_bytes = 1; } if ((n = vie->imm_bytes) == 0) return (0); KASSERT(n == 1 || n == 2 || n == 4, ("%s: invalid number of immediate bytes: %d", __func__, n)); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } /* sign-extend the immediate value before use */ if (n == 1) vie->immediate = u.signed8; else if (n == 2) vie->immediate = u.signed16; else vie->immediate = u.signed32; return (0); } static int decode_moffset(struct vie *vie) { int i, n; uint8_t x; union { char buf[8]; uint64_t u64; } u; if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) return (0); /* * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: * The memory offset size follows the address-size of the instruction. 
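/*
 * Illustrative sketch, not part of the patch: sign-extending a fetched
 * 8/16/32-bit immediate to 64 bits before use, the same effect the union of
 * signed fields achieves in decode_immediate().  decode_immediate() only
 * ever fetches 1, 2 or 4 bytes; the helper below is hypothetical.
 */
#include <stdint.h>

static int64_t
sketch_sign_extend_imm(uint64_t raw, int imm_bytes)
{
	switch (imm_bytes) {
	case 1:
		return ((int8_t)raw);
	case 2:
		return ((int16_t)raw);
	case 4:
		return ((int32_t)raw);
	default:
		return ((int64_t)raw);	/* not reached for valid encodings */
	}
}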
*/ n = vie->addrsize; KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); u.u64 = 0; for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } vie->displacement = u.u64; return (0); } /* * Verify that all the bytes in the instruction buffer were consumed. */ static int verify_inst_length(struct vie *vie) { if (vie->num_processed) return (0); else return (-1); } /* * Verify that the 'guest linear address' provided as collateral of the nested * page table fault matches with our instruction decoding. */ static int verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) { int error; uint64_t base, idx, gla2; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) return (0); base = 0; if (vie->base_register != VM_REG_LAST) { error = vm_get_register(vm, cpuid, vie->base_register, &base); if (error) { printf("verify_gla: error %d getting base reg %d\n", error, vie->base_register); return (-1); } /* * RIP-relative addressing starts from the following * instruction */ if (vie->base_register == VM_REG_GUEST_RIP) base += vie->num_valid; } idx = 0; if (vie->index_register != VM_REG_LAST) { error = vm_get_register(vm, cpuid, vie->index_register, &idx); if (error) { printf("verify_gla: error %d getting index reg %d\n", error, vie->index_register); return (-1); } } /* XXX assuming that the base address of the segment is 0 */ gla2 = base + vie->scale * idx + vie->displacement; gla2 &= size2mask[vie->addrsize]; if (gla != gla2) { printf("verify_gla mismatch: " "base(0x%0lx), scale(%d), index(0x%0lx), " "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", base, vie->scale, idx, vie->displacement, gla, gla2); return (-1); } return (0); } int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) { if (decode_prefixes(vie, cpu_mode, cs_d)) return (-1); if (decode_opcode(vie)) return (-1); if (decode_modrm(vie, cpu_mode)) return (-1); if (decode_sib(vie)) return (-1); if (decode_displacement(vie)) return (-1); if (decode_immediate(vie)) return (-1); if (decode_moffset(vie)) return (-1); if (verify_inst_length(vie)) return (-1); if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { if (verify_gla(vm, cpuid, gla, vie)) return (-1); } vie->decoded = 1; /* success */ return (0); } #endif /* _KERNEL */ Index: stable/10/sys/amd64/vmm/vmm_lapic.c =================================================================== --- stable/10/sys/amd64/vmm/vmm_lapic.c (revision 284898) +++ stable/10/sys/amd64/vmm/vmm_lapic.c (revision 284899) @@ -1,244 +1,248 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "vmm_ipi.h" #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" /* * Some MSI message definitions */ #define MSI_X86_ADDR_MASK 0xfff00000 #define MSI_X86_ADDR_BASE 0xfee00000 #define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ #define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ int lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) { struct vlapic *vlapic; if (cpu < 0 || cpu >= VM_MAXCPU) return (EINVAL); - if (vector < 32 || vector > 255) + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local APIC. + */ + if (vector < 16 || vector > 255) return (EINVAL); vlapic = vm_lapic(vm, cpu); if (vlapic_set_intr_ready(vlapic, vector, level)) vcpu_notify_event(vm, cpu, true); return (0); } int lapic_set_local_intr(struct vm *vm, int cpu, int vector) { struct vlapic *vlapic; cpuset_t dmask; int error; if (cpu < -1 || cpu >= VM_MAXCPU) return (EINVAL); if (cpu == -1) dmask = vm_active_cpus(vm); else CPU_SETOF(cpu, &dmask); error = 0; while ((cpu = CPU_FFS(&dmask)) != 0) { cpu--; CPU_CLR(cpu, &dmask); vlapic = vm_lapic(vm, cpu); error = vlapic_trigger_lvt(vlapic, vector); if (error) break; } return (error); } int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) { int delmode, vec; uint32_t dest; bool phys; VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); return (-1); } /* * Extract the x86-specific fields from the MSI addr/msg * params according to the Intel Arch spec, Vol3 Ch 10. * * The PCI specification does not support level triggered * MSI/MSI-X so ignore trigger level in 'msg'. * * The 'dest' is interpreted as a logical APIC ID if both * the Redirection Hint and Destination Mode are '1' and * physical otherwise. */ dest = (addr >> 12) & 0xff; phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); delmode = msg & APIC_DELMODE_MASK; vec = msg & 0xff; VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", phys ? 
"physical" : "logical", dest, vec); vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); return (0); } static boolean_t x2apic_msr(u_int msr) { if (msr >= 0x800 && msr <= 0xBFF) return (TRUE); else return (FALSE); } static u_int x2apic_msr_to_regoff(u_int msr) { return ((msr - 0x800) << 4); } boolean_t lapic_msr(u_int msr) { if (x2apic_msr(msr) || (msr == MSR_APICBASE)) return (TRUE); else return (FALSE); } int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu) { int error; u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); if (msr == MSR_APICBASE) { *rval = vlapic_get_apicbase(vlapic); error = 0; } else { offset = x2apic_msr_to_regoff(msr); error = vlapic_read(vlapic, 0, offset, rval, retu); } return (error); } int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu) { int error; u_int offset; struct vlapic *vlapic; vlapic = vm_lapic(vm, cpu); if (msr == MSR_APICBASE) { error = vlapic_set_apicbase(vlapic, val); } else { offset = x2apic_msr_to_regoff(msr); error = vlapic_write(vlapic, 0, offset, val, retu); } return (error); } int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, void *arg) { int error; uint64_t off; struct vlapic *vlapic; off = gpa - DEFAULT_APIC_BASE; /* * Memory mapped local apic accesses must be 4 bytes wide and * aligned on a 16-byte boundary. */ if (size != 4 || off & 0xf) return (EINVAL); vlapic = vm_lapic(vm, cpu); error = vlapic_write(vlapic, 1, off, wval, arg); return (error); } int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { int error; uint64_t off; struct vlapic *vlapic; off = gpa - DEFAULT_APIC_BASE; /* * Memory mapped local apic accesses should be aligned on a * 16-byte boundary. They are also suggested to be 4 bytes * wide, alas not all OSes follow suggestions. */ off &= ~3; if (off & 0xf) return (EINVAL); vlapic = vm_lapic(vm, cpu); error = vlapic_read(vlapic, 1, off, rval, arg); return (error); } Index: stable/10/sys/kern/subr_bus.c =================================================================== --- stable/10/sys/kern/subr_bus.c (revision 284898) +++ stable/10/sys/kern/subr_bus.c (revision 284899) @@ -1,4998 +1,5000 @@ /*- * Copyright (c) 1997,1998,2003 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bus.h" #include "opt_random.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL); SYSCTL_NODE(, OID_AUTO, dev, CTLFLAG_RW, NULL, NULL); /* * Used to attach drivers to devclasses. */ typedef struct driverlink *driverlink_t; struct driverlink { kobj_class_t driver; TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */ int pass; TAILQ_ENTRY(driverlink) passlink; }; /* * Forward declarations */ typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t; typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t; typedef TAILQ_HEAD(device_list, device) device_list_t; struct devclass { TAILQ_ENTRY(devclass) link; devclass_t parent; /* parent in devclass hierarchy */ driver_list_t drivers; /* bus devclasses store drivers for bus */ char *name; device_t *devices; /* array of devices indexed by unit */ int maxunit; /* size of devices array */ int flags; #define DC_HAS_CHILDREN 1 struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; }; /** * @brief Implementation of device. */ struct device { /* * A device is a kernel object. The first field must be the * current ops table for the object. */ KOBJ_FIELDS; /* * Device hierarchy. */ TAILQ_ENTRY(device) link; /**< list of devices in parent */ TAILQ_ENTRY(device) devlink; /**< global device list membership */ device_t parent; /**< parent of this device */ device_list_t children; /**< list of child devices */ /* * Details of this device. */ driver_t *driver; /**< current driver */ devclass_t devclass; /**< current device class */ int unit; /**< current unit number */ char* nameunit; /**< name+unit e.g. 
foodev0 */ char* desc; /**< driver specific description */ int busy; /**< count of calls to device_busy() */ device_state_t state; /**< current device state */ uint32_t devflags; /**< api level flags for device_get_flags() */ u_int flags; /**< internal device flags */ #define DF_ENABLED 0x01 /* device should be probed/attached */ #define DF_FIXEDCLASS 0x02 /* devclass specified at create time */ #define DF_WILDCARD 0x04 /* unit was originally wildcard */ #define DF_DESCMALLOCED 0x08 /* description was malloced */ #define DF_QUIET 0x10 /* don't print verbose attach message */ #define DF_DONENOMATCH 0x20 /* don't execute DEVICE_NOMATCH again */ #define DF_EXTERNALSOFTC 0x40 /* softc not allocated by us */ #define DF_REBID 0x80 /* Can rebid after attach */ u_int order; /**< order from device_add_child_ordered() */ void *ivars; /**< instance variables */ void *softc; /**< current driver's variables */ struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */ struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */ }; static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures"); static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc"); #ifdef BUS_DEBUG static int bus_debug = 1; TUNABLE_INT("bus.debug", &bus_debug); SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0, "Debug bus code"); #define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");} #define DEVICENAME(d) ((d)? device_get_name(d): "no device") #define DRIVERNAME(d) ((d)? d->name : "no driver") #define DEVCLANAME(d) ((d)? d->name : "no devclass") /** * Produce the indenting, indent*2 spaces plus a '.' ahead of that to * prevent syslog from deleting initial spaces */ #define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJparent ? dc->parent->name : ""; break; default: return (EINVAL); } return (SYSCTL_OUT(req, value, strlen(value))); } static void devclass_sysctl_init(devclass_t dc) { if (dc->sysctl_tree != NULL) return; sysctl_ctx_init(&dc->sysctl_ctx); dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name, CTLFLAG_RD, NULL, ""); SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A", "parent class"); } enum { DEVICE_SYSCTL_DESC, DEVICE_SYSCTL_DRIVER, DEVICE_SYSCTL_LOCATION, DEVICE_SYSCTL_PNPINFO, DEVICE_SYSCTL_PARENT, }; static int device_sysctl_handler(SYSCTL_HANDLER_ARGS) { device_t dev = (device_t)arg1; const char *value; char *buf; int error; buf = NULL; switch (arg2) { case DEVICE_SYSCTL_DESC: value = dev->desc ? dev->desc : ""; break; case DEVICE_SYSCTL_DRIVER: value = dev->driver ? dev->driver->name : ""; break; case DEVICE_SYSCTL_LOCATION: value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO); bus_child_location_str(dev, buf, 1024); break; case DEVICE_SYSCTL_PNPINFO: value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO); bus_child_pnpinfo_str(dev, buf, 1024); break; case DEVICE_SYSCTL_PARENT: value = dev->parent ? 
dev->parent->nameunit : ""; break; default: return (EINVAL); } error = SYSCTL_OUT(req, value, strlen(value)); if (buf != NULL) free(buf, M_BUS); return (error); } static void device_sysctl_init(device_t dev) { devclass_t dc = dev->devclass; int domain; if (dev->sysctl_tree != NULL) return; devclass_sysctl_init(dc); sysctl_ctx_init(&dev->sysctl_ctx); dev->sysctl_tree = SYSCTL_ADD_NODE(&dev->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, dev->nameunit + strlen(dc->name), CTLFLAG_RD, NULL, ""); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A", "device description"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A", "device driver name"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A", "device location relative to parent"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A", "device identification"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A", "parent device"); if (bus_get_domain(dev, &domain) == 0) SYSCTL_ADD_INT(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%domain", CTLFLAG_RD, NULL, domain, "NUMA domain"); } static void device_sysctl_update(device_t dev) { devclass_t dc = dev->devclass; if (dev->sysctl_tree == NULL) return; sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name)); } static void device_sysctl_fini(device_t dev) { if (dev->sysctl_tree == NULL) return; sysctl_ctx_free(&dev->sysctl_ctx); dev->sysctl_tree = NULL; } /* * /dev/devctl implementation */ /* * This design allows only one reader for /dev/devctl. This is not desirable * in the long run, but will get a lot of hair out of this implementation. * Maybe we should make this device a clonable device. * * Also note: we specifically do not attach a device to the device_t tree * to avoid potential chicken and egg problems. One could argue that all * of this belongs to the root node. One could also further argue that the * sysctl interface that we have not might more properly be an ioctl * interface, but at this stage of the game, I'm not inclined to rock that * boat. * * I'm also not sure that the SIGIO support is done correctly or not, as * I copied it from a driver that had SIGIO support that likely hasn't been * tested since 3.4 or 2.2.8! 
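/*
 * Illustrative userland sketch, not part of the patch: /dev/devctl hands
 * back one full event string per read(2), so a consumer loops over
 * whole-buffer reads (this is essentially what devd does).  The buffer size
 * here is only an example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[1025];
	ssize_t n;
	int fd;

	fd = open("/dev/devctl", O_RDONLY);	/* only one reader at a time */
	if (fd < 0)
		return (1);
	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);		/* one record per read */
	}
	close(fd);
	return (0);
}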
*/ /* Deprecated way to adjust queue length */ static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS); /* XXX Need to support old-style tunable hw.bus.devctl_disable" */ SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_disable, "I", "devctl disable -- deprecated"); #define DEVCTL_DEFAULT_QUEUE_LEN 1000 static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS); static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; TUNABLE_INT("hw.bus.devctl_queue", &devctl_queue_length); SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_queue, "I", "devctl queue length"); static d_open_t devopen; static d_close_t devclose; static d_read_t devread; static d_ioctl_t devioctl; static d_poll_t devpoll; static d_kqfilter_t devkqfilter; static struct cdevsw dev_cdevsw = { .d_version = D_VERSION, .d_open = devopen, .d_close = devclose, .d_read = devread, .d_ioctl = devioctl, .d_poll = devpoll, .d_kqfilter = devkqfilter, .d_name = "devctl", }; struct dev_event_info { char *dei_data; TAILQ_ENTRY(dev_event_info) dei_link; }; TAILQ_HEAD(devq, dev_event_info); static struct dev_softc { int inuse; int nonblock; int queued; int async; struct mtx mtx; struct cv cv; struct selinfo sel; struct devq devq; struct sigio *sigio; } devsoftc; static void filt_devctl_detach(struct knote *kn); static int filt_devctl_read(struct knote *kn, long hint); struct filterops devctl_rfiltops = { .f_isfd = 1, .f_detach = filt_devctl_detach, .f_event = filt_devctl_read, }; static struct cdev *devctl_dev; static void devinit(void) { devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "devctl"); mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF); cv_init(&devsoftc.cv, "dev cv"); TAILQ_INIT(&devsoftc.devq); knlist_init_mtx(&devsoftc.sel.si_note, &devsoftc.mtx); } static int devopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); if (devsoftc.inuse) { mtx_unlock(&devsoftc.mtx); return (EBUSY); } /* move to init */ devsoftc.inuse = 1; mtx_unlock(&devsoftc.mtx); return (0); } static int devclose(struct cdev *dev, int fflag, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); devsoftc.inuse = 0; devsoftc.nonblock = 0; devsoftc.async = 0; cv_broadcast(&devsoftc.cv); funsetown(&devsoftc.sigio); mtx_unlock(&devsoftc.mtx); return (0); } /* * The read channel for this device is used to report changes to * userland in realtime. We are required to free the data as well as * the n1 object because we allocate them separately. Also note that * we return one record at a time. If you try to read this device a * character at a time, you will lose the rest of the data. Listening * programs are expected to cope. */ static int devread(struct cdev *dev, struct uio *uio, int ioflag) { struct dev_event_info *n1; int rv; mtx_lock(&devsoftc.mtx); while (TAILQ_EMPTY(&devsoftc.devq)) { if (devsoftc.nonblock) { mtx_unlock(&devsoftc.mtx); return (EAGAIN); } rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx); if (rv) { /* * Need to translate ERESTART to EINTR here? 
-- jake */ mtx_unlock(&devsoftc.mtx); return (rv); } } n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); devsoftc.queued--; mtx_unlock(&devsoftc.mtx); rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio); free(n1->dei_data, M_BUS); free(n1, M_BUS); return (rv); } static int devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { switch (cmd) { case FIONBIO: if (*(int*)data) devsoftc.nonblock = 1; else devsoftc.nonblock = 0; return (0); case FIOASYNC: if (*(int*)data) devsoftc.async = 1; else devsoftc.async = 0; return (0); case FIOSETOWN: return fsetown(*(int *)data, &devsoftc.sigio); case FIOGETOWN: *(int *)data = fgetown(&devsoftc.sigio); return (0); /* (un)Support for other fcntl() calls. */ case FIOCLEX: case FIONCLEX: case FIONREAD: default: break; } return (ENOTTY); } static int devpoll(struct cdev *dev, int events, struct thread *td) { int revents = 0; mtx_lock(&devsoftc.mtx); if (events & (POLLIN | POLLRDNORM)) { if (!TAILQ_EMPTY(&devsoftc.devq)) revents = events & (POLLIN | POLLRDNORM); else selrecord(td, &devsoftc.sel); } mtx_unlock(&devsoftc.mtx); return (revents); } static int devkqfilter(struct cdev *dev, struct knote *kn) { int error; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &devctl_rfiltops; knlist_add(&devsoftc.sel.si_note, kn, 0); error = 0; } else error = EINVAL; return (error); } static void filt_devctl_detach(struct knote *kn) { knlist_remove(&devsoftc.sel.si_note, kn, 0); } static int filt_devctl_read(struct knote *kn, long hint) { kn->kn_data = devsoftc.queued; return (kn->kn_data != 0); } /** * @brief Return whether the userland process is running */ boolean_t devctl_process_running(void) { return (devsoftc.inuse == 1); } /** * @brief Queue data to be read from the devctl device * * Generic interface to queue data to the devctl device. It is * assumed that @p data is properly formatted. It is further assumed * that @p data is allocated using the M_BUS malloc type. */ void devctl_queue_data_f(char *data, int flags) { struct dev_event_info *n1 = NULL, *n2 = NULL; if (strlen(data) == 0) goto out; if (devctl_queue_length == 0) goto out; n1 = malloc(sizeof(*n1), M_BUS, flags); if (n1 == NULL) goto out; n1->dei_data = data; mtx_lock(&devsoftc.mtx); if (devctl_queue_length == 0) { mtx_unlock(&devsoftc.mtx); free(n1->dei_data, M_BUS); free(n1, M_BUS); return; } /* Leave at least one spot in the queue... */ while (devsoftc.queued > devctl_queue_length - 1) { n2 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n2, dei_link); free(n2->dei_data, M_BUS); free(n2, M_BUS); devsoftc.queued--; } TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link); devsoftc.queued++; cv_broadcast(&devsoftc.cv); KNOTE_LOCKED(&devsoftc.sel.si_note, 0); mtx_unlock(&devsoftc.mtx); selwakeup(&devsoftc.sel); if (devsoftc.async && devsoftc.sigio != NULL) pgsigio(&devsoftc.sigio, SIGIO, 0); return; out: /* * We have to free data on all error paths since the caller * assumes it will be free'd when this item is dequeued. */ free(data, M_BUS); return; } void devctl_queue_data(char *data) { devctl_queue_data_f(data, M_NOWAIT); } /** * @brief Send a 'notification' to userland, using standard ways */ void devctl_notify_f(const char *system, const char *subsystem, const char *type, const char *data, int flags) { int len = 0; char *msg; if (system == NULL) return; /* BOGUS! Must specify system. */ if (subsystem == NULL) return; /* BOGUS! Must specify subsystem. */ if (type == NULL) return; /* BOGUS! Must specify type. 
*/ len += strlen(" system=") + strlen(system); len += strlen(" subsystem=") + strlen(subsystem); len += strlen(" type=") + strlen(type); /* add in the data message plus newline. */ if (data != NULL) len += strlen(data); len += 3; /* '!', '\n', and NUL */ msg = malloc(len, M_BUS, flags); if (msg == NULL) return; /* Drop it on the floor */ if (data != NULL) snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n", system, subsystem, type, data); else snprintf(msg, len, "!system=%s subsystem=%s type=%s\n", system, subsystem, type); devctl_queue_data_f(msg, flags); } void devctl_notify(const char *system, const char *subsystem, const char *type, const char *data) { devctl_notify_f(system, subsystem, type, data, M_NOWAIT); } /* * Common routine that tries to make sending messages as easy as possible. * We allocate memory for the data, copy strings into that, but do not * free it unless there's an error. The dequeue part of the driver should * free the data. We don't send data when the device is disabled. We do * send data, even when we have no listeners, because we wish to avoid * races relating to startup and restart of listening applications. * * devaddq is designed to string together the type of event, with the * object of that event, plus the plug and play info and location info * for that event. This is likely most useful for devices, but less * useful for other consumers of this interface. Those should use * the devctl_queue_data() interface instead. */ static void devaddq(const char *type, const char *what, device_t dev) { char *data = NULL; char *loc = NULL; char *pnp = NULL; const char *parstr; if (!devctl_queue_length)/* Rare race, but lost races safely discard */ return; data = malloc(1024, M_BUS, M_NOWAIT); if (data == NULL) goto bad; /* get the bus specific location of this device */ loc = malloc(1024, M_BUS, M_NOWAIT); if (loc == NULL) goto bad; *loc = '\0'; bus_child_location_str(dev, loc, 1024); /* Get the bus specific pnp info of this device */ pnp = malloc(1024, M_BUS, M_NOWAIT); if (pnp == NULL) goto bad; *pnp = '\0'; bus_child_pnpinfo_str(dev, pnp, 1024); /* Get the parent of this device, or / if high enough in the tree. */ if (device_get_parent(dev) == NULL) parstr = "."; /* Or '/' ? */ else parstr = device_get_nameunit(device_get_parent(dev)); /* String it all together. */ snprintf(data, 1024, "%s%s at %s %s on %s\n", type, what, loc, pnp, parstr); free(loc, M_BUS); free(pnp, M_BUS); devctl_queue_data(data); return; bad: free(pnp, M_BUS); free(loc, M_BUS); free(data, M_BUS); return; } /* * A device was added to the tree. We are called just after it successfully * attaches (that is, probe and attach success for this device). No call * is made if a device is merely parented into the tree. See devnomatch * if probe fails. If attach fails, no notification is sent (but maybe * we should have a different message for this). */ static void devadded(device_t dev) { devaddq("+", device_get_nameunit(dev), dev); } /* * A device was removed from the tree. We are called just before this * happens. */ static void devremoved(device_t dev) { devaddq("-", device_get_nameunit(dev), dev); } /* * Called when there's no match for this device. This is only called * the first time that no match happens, so we don't keep getting this * message. Should that prove to be undesirable, we can change it. * This is called when all drivers that can attach to a given bus * decline to accept this device. Other errors may not be detected. 
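/*
 * Illustrative sketch, not part of the patch: the shape of the strings that
 * devaddq() and devctl_notify_f() queue to /dev/devctl.  The device names
 * and values below are made up; only the two formats mirror the code.
 */
#include <stdio.h>

int
main(void)
{
	char msg[256];

	/* attach/detach/nomatch events use a '+', '-' or '?' prefix */
	snprintf(msg, sizeof(msg), "%s%s at %s %s on %s\n",
	    "+", "foo0", "port=0x60", "pnpinfo _HID=none", "bar0");
	fputs(msg, stdout);

	/* generic notifications from devctl_notify() */
	snprintf(msg, sizeof(msg), "!system=%s subsystem=%s type=%s %s\n",
	    "kern", "power", "resume", "acpi=1");
	fputs(msg, stdout);
	return (0);
}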
*/ static void devnomatch(device_t dev) { devaddq("?", "", dev); } static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS) { struct dev_event_info *n1; int dis, error; dis = devctl_queue_length == 0; error = sysctl_handle_int(oidp, &dis, 0, req); if (error || !req->newptr) return (error); mtx_lock(&devsoftc.mtx); if (dis) { while (!TAILQ_EMPTY(&devsoftc.devq)) { n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); free(n1->dei_data, M_BUS); free(n1, M_BUS); } devsoftc.queued = 0; devctl_queue_length = 0; } else { devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; } mtx_unlock(&devsoftc.mtx); return (0); } static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS) { struct dev_event_info *n1; int q, error; q = devctl_queue_length; error = sysctl_handle_int(oidp, &q, 0, req); if (error || !req->newptr) return (error); if (q < 0) return (EINVAL); mtx_lock(&devsoftc.mtx); devctl_queue_length = q; while (devsoftc.queued > devctl_queue_length) { n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); free(n1->dei_data, M_BUS); free(n1, M_BUS); devsoftc.queued--; } mtx_unlock(&devsoftc.mtx); return (0); } /* End of /dev/devctl code */ static TAILQ_HEAD(,device) bus_data_devices; static int bus_data_generation = 1; static kobj_method_t null_methods[] = { KOBJMETHOD_END }; DEFINE_CLASS(null, null_methods, 0); /* * Bus pass implementation */ static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes); int bus_current_pass = BUS_PASS_ROOT; /** * @internal * @brief Register the pass level of a new driver attachment * * Register a new driver attachment's pass level. If no driver * attachment with the same pass level has been added, then @p new * will be added to the global passes list. * * @param new the new driver attachment */ static void driver_register_pass(struct driverlink *new) { struct driverlink *dl; /* We only consider pass numbers during boot. */ if (bus_current_pass == BUS_PASS_DEFAULT) return; /* * Walk the passes list. If we already know about this pass * then there is nothing to do. If we don't, then insert this * driver link into the list. */ TAILQ_FOREACH(dl, &passes, passlink) { if (dl->pass < new->pass) continue; if (dl->pass == new->pass) return; TAILQ_INSERT_BEFORE(dl, new, passlink); return; } TAILQ_INSERT_TAIL(&passes, new, passlink); } /** * @brief Raise the current bus pass * * Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS() * method on the root bus to kick off a new device tree scan for each * new pass level that has at least one driver. */ void bus_set_pass(int pass) { struct driverlink *dl; if (bus_current_pass > pass) panic("Attempt to lower bus pass level"); TAILQ_FOREACH(dl, &passes, passlink) { /* Skip pass values below the current pass level. */ if (dl->pass <= bus_current_pass) continue; /* * Bail once we hit a driver with a pass level that is * too high. */ if (dl->pass > pass) break; /* * Raise the pass level to the next level and rescan * the tree. */ bus_current_pass = dl->pass; BUS_NEW_PASS(root_bus); } /* * If there isn't a driver registered for the requested pass, * then bus_current_pass might still be less than 'pass'. Set * it to 'pass' in that case. 
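/*
 * Illustrative sketch, not part of the patch: driver_register_pass() keeps
 * the pass list sorted and ignores duplicates so that bus_set_pass() can
 * walk it in ascending order.  A plain array stands in for the TAILQ here;
 * everything below is hypothetical.
 */
static int
sketch_insert_pass(int *passes, int npasses, int newpass)
{
	int i, j;

	for (i = 0; i < npasses; i++) {
		if (passes[i] == newpass)
			return (npasses);	/* pass already known */
		if (passes[i] > newpass)
			break;			/* insert before first larger pass */
	}
	/* shift the tail up one slot and place the new pass level */
	for (j = npasses; j > i; j--)
		passes[j] = passes[j - 1];
	passes[i] = newpass;
	return (npasses + 1);
}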
*/ if (bus_current_pass < pass) bus_current_pass = pass; KASSERT(bus_current_pass == pass, ("Failed to update bus pass level")); } /* * Devclass implementation */ static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); /** * @internal * @brief Find or create a device class * * If a device class with the name @p classname exists, return it, * otherwise if @p create is non-zero create and return a new device * class. * * If @p parentname is non-NULL, the parent of the devclass is set to * the devclass of that name. * * @param classname the devclass name to find or create * @param parentname the parent devclass name or @c NULL * @param create non-zero to create a devclass */ static devclass_t devclass_find_internal(const char *classname, const char *parentname, int create) { devclass_t dc; PDEBUG(("looking for %s", classname)); if (!classname) return (NULL); TAILQ_FOREACH(dc, &devclasses, link) { if (!strcmp(dc->name, classname)) break; } if (create && !dc) { PDEBUG(("creating %s", classname)); dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, M_BUS, M_NOWAIT | M_ZERO); if (!dc) return (NULL); dc->parent = NULL; dc->name = (char*) (dc + 1); strcpy(dc->name, classname); TAILQ_INIT(&dc->drivers); TAILQ_INSERT_TAIL(&devclasses, dc, link); bus_data_generation_update(); } /* * If a parent class is specified, then set that as our parent so * that this devclass will support drivers for the parent class as * well. If the parent class has the same name don't do this though * as it creates a cycle that can trigger an infinite loop in * device_probe_child() if a device exists for which there is no * suitable driver. */ if (parentname && dc && !dc->parent && strcmp(classname, parentname) != 0) { dc->parent = devclass_find_internal(parentname, NULL, TRUE); dc->parent->flags |= DC_HAS_CHILDREN; } return (dc); } /** * @brief Create a device class * * If a device class with the name @p classname exists, return it, * otherwise create and return a new device class. * * @param classname the devclass name to find or create */ devclass_t devclass_create(const char *classname) { return (devclass_find_internal(classname, NULL, TRUE)); } /** * @brief Find a device class * * If a device class with the name @p classname exists, return it, * otherwise return @c NULL. * * @param classname the devclass name to find */ devclass_t devclass_find(const char *classname) { return (devclass_find_internal(classname, NULL, FALSE)); } /** * @brief Register that a device driver has been added to a devclass * * Register that a device driver has been added to a devclass. This * is called by devclass_add_driver to accomplish the recursive * notification of all the children classes of dc, as well as dc. * Each layer will have BUS_DRIVER_ADDED() called for all instances of * the devclass. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param dc the devclass to edit * @param driver the driver that was just added */ static void devclass_driver_added(devclass_t dc, driver_t *driver) { devclass_t parent; int i; /* * Call BUS_DRIVER_ADDED for any existing busses in this class. */ for (i = 0; i < dc->maxunit; i++) if (dc->devices[i] && device_is_attached(dc->devices[i])) BUS_DRIVER_ADDED(dc->devices[i], driver); /* * Walk through the children classes. 
Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(dc->flags & DC_HAS_CHILDREN)) return; parent = dc; TAILQ_FOREACH(dc, &devclasses, link) { if (dc->parent == parent) devclass_driver_added(dc, driver); } } /** * @brief Add a device driver to a device class * * Add a device driver to a devclass. This is normally called * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of * all devices in the devclass will be called to allow them to attempt * to re-probe any unmatched children. * * @param dc the devclass to edit * @param driver the driver to register */ int devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp) { driverlink_t dl; const char *parentname; PDEBUG(("%s", DRIVERNAME(driver))); /* Don't allow invalid pass values. */ if (pass <= BUS_PASS_ROOT) return (EINVAL); dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO); if (!dl) return (ENOMEM); /* * Compile the driver's methods. Also increase the reference count * so that the class doesn't get freed when the last instance * goes. This means we can safely use static methods and avoids a * double-free in devclass_delete_driver. */ kobj_class_compile((kobj_class_t) driver); /* * If the driver has any base classes, make the * devclass inherit from the devclass of the driver's * first base class. This will allow the system to * search for drivers in both devclasses for children * of a device using this driver. */ if (driver->baseclasses) parentname = driver->baseclasses[0]->name; else parentname = NULL; *dcp = devclass_find_internal(driver->name, parentname, TRUE); dl->driver = driver; TAILQ_INSERT_TAIL(&dc->drivers, dl, link); driver->refs++; /* XXX: kobj_mtx */ dl->pass = pass; driver_register_pass(dl); devclass_driver_added(dc, driver); bus_data_generation_update(); return (0); } /** * @brief Register that a device driver has been deleted from a devclass * * Register that a device driver has been removed from a devclass. * This is called by devclass_delete_driver to accomplish the * recursive notification of all the children classes of busclass, as * well as busclass. Each layer will attempt to detach the driver * from any devices that are children of the bus's devclass. The function * will return an error if a device fails to detach. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param busclass the devclass of the parent bus * @param dc the devclass of the driver being deleted * @param driver the driver being deleted */ static int devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver) { devclass_t parent; device_t dev; int error, i; /* * Disassociate from any devices. We iterate through all the * devices in the devclass of the driver and detach any which are * using the driver and which have a parent in the devclass which * we are deleting from. * * Note that since a driver can be in multiple devclasses, we * should not detach devices which are not children of devices in * the affected devclass. 
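/*
 * Illustrative sketch, not part of the original source: devclass_add_driver()
 * is normally reached through DRIVER_MODULE(), as in this minimal driver
 * skeleton.  Everything named "foo" here is hypothetical.
 */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/module.h>

struct foo_softc {
	device_t	sc_dev;
};

static int
foo_probe(device_t dev)
{
	device_set_desc(dev, "example foo device");
	return (BUS_PROBE_DEFAULT);
}

static int
foo_attach(device_t dev)
{
	struct foo_softc *sc;

	sc = device_get_softc(dev);	/* zeroed softc of driver->size bytes */
	sc->sc_dev = dev;
	return (0);
}

static int
foo_detach(device_t dev)
{
	return (0);
}

static device_method_t foo_methods[] = {
	DEVMETHOD(device_probe,		foo_probe),
	DEVMETHOD(device_attach,	foo_attach),
	DEVMETHOD(device_detach,	foo_detach),
	DEVMETHOD_END
};

static driver_t foo_driver = {
	"foo",				/* devclass name */
	foo_methods,
	sizeof(struct foo_softc)
};

static devclass_t foo_devclass;

/*
 * Adds foo_driver to the "pci" devclass at the default bus pass; the
 * EARLY_DRIVER_MODULE() variant additionally takes a BUS_PASS_* constant,
 * which is what driver_register_pass() above records.
 */
DRIVER_MODULE(foo, pci, foo_driver, foo_devclass, NULL, NULL);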
*/ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_detach(dev)) != 0) return (error); BUS_PROBE_NOMATCH(dev->parent, dev); devnomatch(dev); dev->flags |= DF_DONENOMATCH; } } } /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(busclass->flags & DC_HAS_CHILDREN)) return (0); parent = busclass; TAILQ_FOREACH(busclass, &devclasses, link) { if (busclass->parent == parent) { error = devclass_driver_deleted(busclass, dc, driver); if (error) return (error); } } return (0); } /** * @brief Delete a device driver from a device class * * Delete a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_delete_driver() will first attempt to detach from each * device. If one of the detach calls fails, the driver will not be * deleted. * * @param dc the devclass to edit * @param driver the driver to unregister */ int devclass_delete_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } error = devclass_driver_deleted(busclass, dc, driver); if (error != 0) return (error); TAILQ_REMOVE(&busclass->drivers, dl, link); free(dl, M_BUS); /* XXX: kobj_mtx */ driver->refs--; if (driver->refs == 0) kobj_class_free((kobj_class_t) driver); bus_data_generation_update(); return (0); } /** * @brief Quiesces a set of device drivers from a device class * * Quiesce a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_quiesece_driver() will first attempt to quiesce each * device. * * @param dc the devclass to edit * @param driver the driver to unregister */ static int devclass_quiesce_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; device_t dev; int i; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } /* * Quiesce all devices. We iterate through all the devices in * the devclass of the driver and quiesce any which are using * the driver and which have a parent in the devclass which we * are quiescing. * * Note that since a driver can be in multiple devclasses, we * should not quiesce devices which are not children of * devices in the affected devclass. 
*/ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_quiesce(dev)) != 0) return (error); } } } return (0); } /** * @internal */ static driverlink_t devclass_find_driver_internal(devclass_t dc, const char *classname) { driverlink_t dl; PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); TAILQ_FOREACH(dl, &dc->drivers, link) { if (!strcmp(dl->driver->name, classname)) return (dl); } PDEBUG(("not found")); return (NULL); } /** * @brief Return the name of the devclass */ const char * devclass_get_name(devclass_t dc) { return (dc->name); } /** * @brief Find a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t devclass_get_device(devclass_t dc, int unit) { if (dc == NULL || unit < 0 || unit >= dc->maxunit) return (NULL); return (dc->devices[unit]); } /** * @brief Find the softc field of a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the softc field of the device with the given * unit number or @c NULL if there is no such * device */ void * devclass_get_softc(devclass_t dc, int unit) { device_t dev; dev = devclass_get_device(dc, unit); if (!dev) return (NULL); return (device_get_softc(dev)); } /** * @brief Get a list of devices in the devclass * * An array containing a list of all the devices in the given devclass * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP), even if @p *devcountp is 0. * * @param dc the devclass to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) { int count, i; device_t *list; count = devclass_get_count(dc); list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { list[count] = dc->devices[i]; count++; } } *devlistp = list; *devcountp = count; return (0); } /** * @brief Get a list of drivers in the devclass * * An array containing a list of pointers to all the drivers in the * given devclass is allocated and returned in @p *listp. The number * of drivers in the array is returned in @p *countp. The caller should * free the array using @c free(p, M_TEMP). 
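/*
 * Illustrative sketch, not part of the original source: snapshotting the
 * devices of a class with devclass_get_devices(); the returned array must
 * be freed with free(p, M_TEMP) as documented above.  "foo" is a
 * hypothetical class name.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/malloc.h>

static void
print_foo_devices(void)
{
	devclass_t dc;
	device_t *devs;
	int count, i;

	dc = devclass_find("foo");
	if (dc == NULL)
		return;
	if (devclass_get_devices(dc, &devs, &count) != 0)
		return;
	for (i = 0; i < count; i++)
		device_printf(devs[i], "unit %d\n", device_get_unit(devs[i]));
	free(devs, M_TEMP);
}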
* * @param dc the devclass to examine * @param listp gives location for array pointer return value * @param countp gives location for number of array elements * return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp) { driverlink_t dl; driver_t **list; int count; count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) count++; list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT); if (list == NULL) return (ENOMEM); count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) { list[count] = dl->driver; count++; } *listp = list; *countp = count; return (0); } /** * @brief Get the number of devices in a devclass * * @param dc the devclass to examine */ int devclass_get_count(devclass_t dc) { int count, i; count = 0; for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) count++; return (count); } /** * @brief Get the maximum unit number used in a devclass * * Note that this is one greater than the highest currently-allocated * unit. If a null devclass_t is passed in, -1 is returned to indicate * that not even the devclass has been allocated yet. * * @param dc the devclass to examine */ int devclass_get_maxunit(devclass_t dc) { if (dc == NULL) return (-1); return (dc->maxunit); } /** * @brief Find a free unit number in a devclass * * This function searches for the first unused unit number greater * that or equal to @p unit. * * @param dc the devclass to examine * @param unit the first unit number to check */ int devclass_find_free_unit(devclass_t dc, int unit) { if (dc == NULL) return (unit); while (unit < dc->maxunit && dc->devices[unit] != NULL) unit++; return (unit); } /** * @brief Set the parent of a devclass * * The parent class is normally initialised automatically by * DRIVER_MODULE(). * * @param dc the devclass to edit * @param pdc the new parent devclass */ void devclass_set_parent(devclass_t dc, devclass_t pdc) { dc->parent = pdc; } /** * @brief Get the parent of a devclass * * @param dc the devclass to examine */ devclass_t devclass_get_parent(devclass_t dc) { return (dc->parent); } struct sysctl_ctx_list * devclass_get_sysctl_ctx(devclass_t dc) { return (&dc->sysctl_ctx); } struct sysctl_oid * devclass_get_sysctl_tree(devclass_t dc) { return (dc->sysctl_tree); } /** * @internal * @brief Allocate a unit number * * On entry, @p *unitp is the desired unit number (or @c -1 if any * will do). The allocated unit number is returned in @p *unitp. * @param dc the devclass to allocate from * @param unitp points at the location for the allocated unit * number * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp) { const char *s; int unit = *unitp; PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); /* Ask the parent bus if it wants to wire this device. */ if (unit == -1) BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name, &unit); /* If we were given a wired unit number, check for existing device */ /* XXX imp XXX */ if (unit != -1) { if (unit >= 0 && unit < dc->maxunit && dc->devices[unit] != NULL) { if (bootverbose) printf("%s: %s%d already exists; skipping it\n", dc->name, dc->name, *unitp); return (EEXIST); } } else { /* Unwired device, find the next available slot for it */ unit = 0; for (unit = 0;; unit++) { /* If there is an "at" hint for a unit then skip it. 
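/*
 * Illustrative sketch, not part of the original source: the "at" hint
 * consulted by devclass_alloc_unit() below comes from the kernel
 * environment / device.hints, e.g.
 *
 *	hint.foo.0.at="isa0"
 *	hint.foo.0.flags="0x1"
 *
 * and a driver can read the same namespace with resource_string_value()
 * and resource_int_value().  "foo" is hypothetical.
 */
#include <sys/param.h>
#include <sys/bus.h>

static void
foo_read_hints(device_t dev)
{
	const char *at;
	int flags;

	if (resource_string_value(device_get_name(dev), device_get_unit(dev),
	    "at", &at) == 0)
		device_printf(dev, "wired at %s\n", at);
	if (resource_int_value(device_get_name(dev), device_get_unit(dev),
	    "flags", &flags) == 0)
		device_printf(dev, "hinted flags 0x%x\n", flags);
}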
*/ if (resource_string_value(dc->name, unit, "at", &s) == 0) continue; /* If this device slot is already in use, skip it. */ if (unit < dc->maxunit && dc->devices[unit] != NULL) continue; break; } } /* * We've selected a unit beyond the length of the table, so let's * extend the table to make room for all units up to and including * this one. */ if (unit >= dc->maxunit) { device_t *newlist, *oldlist; int newsize; oldlist = dc->devices; newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t)); newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT); if (!newlist) return (ENOMEM); if (oldlist != NULL) bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit); bzero(newlist + dc->maxunit, sizeof(device_t) * (newsize - dc->maxunit)); dc->devices = newlist; dc->maxunit = newsize; if (oldlist != NULL) free(oldlist, M_BUS); } PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); *unitp = unit; return (0); } /** * @internal * @brief Add a device to a devclass * * A unit number is allocated for the device (using the device's * preferred unit number if any) and the device is registered in the * devclass. This allows the device to be looked up by its unit * number, e.g. by decoding a dev_t minor number. * * @param dc the devclass to add to * @param dev the device to add * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_add_device(devclass_t dc, device_t dev) { int buflen, error; PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX); if (buflen < 0) return (ENOMEM); dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO); if (!dev->nameunit) return (ENOMEM); if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) { free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (error); } dc->devices[dev->unit] = dev; dev->devclass = dc; snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit); return (0); } /** * @internal * @brief Delete a device from a devclass * * The device is removed from the devclass's device list and its unit * number is freed. 
* @param dc the devclass to delete from * @param dev the device to delete * * @retval 0 success */ static int devclass_delete_device(devclass_t dc, device_t dev) { if (!dc || !dev) return (0); PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); if (dev->devclass != dc || dc->devices[dev->unit] != dev) panic("devclass_delete_device: inconsistent device class"); dc->devices[dev->unit] = NULL; if (dev->flags & DF_WILDCARD) dev->unit = -1; dev->devclass = NULL; free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (0); } /** * @internal * @brief Make a new device and add it as a child of @p parent * * @param parent the parent of the new device * @param name the devclass name of the new device or @c NULL * to leave the devclass unspecified * @parem unit the unit number of the new device of @c -1 to * leave the unit number unspecified * * @returns the new device */ static device_t make_device(device_t parent, const char *name, int unit) { device_t dev; devclass_t dc; PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit)); if (name) { dc = devclass_find_internal(name, NULL, TRUE); if (!dc) { printf("make_device: can't find device class %s\n", name); return (NULL); } } else { dc = NULL; } dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO); if (!dev) return (NULL); dev->parent = parent; TAILQ_INIT(&dev->children); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; dev->devclass = NULL; dev->unit = unit; dev->nameunit = NULL; dev->desc = NULL; dev->busy = 0; dev->devflags = 0; dev->flags = DF_ENABLED; dev->order = 0; if (unit == -1) dev->flags |= DF_WILDCARD; if (name) { dev->flags |= DF_FIXEDCLASS; if (devclass_add_device(dc, dev)) { kobj_delete((kobj_t) dev, M_BUS); return (NULL); } } dev->ivars = NULL; dev->softc = NULL; dev->state = DS_NOTPRESENT; TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink); bus_data_generation_update(); return (dev); } /** * @internal * @brief Print a description of a device. */ static int device_print_child(device_t dev, device_t child) { int retval = 0; if (device_is_alive(child)) retval += BUS_PRINT_CHILD(dev, child); else retval += device_printf(child, " not found\n"); return (retval); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with order zero. * * @param dev the device which will be the parent of the * new child device * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child(device_t dev, const char *name, int unit) { return (device_add_child_ordered(dev, 0, name, unit)); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with the same order. 
* * @param dev the device which will be the parent of the * new child device * @param order a value which is used to partially sort the * children of @p dev - devices created using * lower values of @p order appear first in @p * dev's list of children * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child_ordered(device_t dev, u_int order, const char *name, int unit) { device_t child; device_t place; PDEBUG(("%s at %s with order %u as unit %d", name, DEVICENAME(dev), order, unit)); KASSERT(name != NULL || unit == -1, ("child device with wildcard name and specific unit number")); child = make_device(dev, name, unit); if (child == NULL) return (child); child->order = order; TAILQ_FOREACH(place, &dev->children, link) { if (place->order > order) break; } if (place) { /* * The device 'place' is the first device whose order is * greater than the new child. */ TAILQ_INSERT_BEFORE(place, child, link); } else { /* * The new child's order is greater or equal to the order of * any existing device. Add the child to the tail of the list. */ TAILQ_INSERT_TAIL(&dev->children, child, link); } bus_data_generation_update(); return (child); } /** * @brief Delete a device * * This function deletes a device along with all of its children. If * the device currently has a driver attached to it, the device is * detached first using device_detach(). * * @param dev the parent device * @param child the device to delete * * @retval 0 success * @retval non-zero a unit error code describing the error */ int device_delete_child(device_t dev, device_t child) { int error; device_t grandchild; PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); /* remove children first */ while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) { error = device_delete_child(child, grandchild); if (error) return (error); } if ((error = device_detach(child)) != 0) return (error); if (child->devclass) devclass_delete_device(child->devclass, child); if (child->parent) BUS_CHILD_DELETED(dev, child); TAILQ_REMOVE(&dev->children, child, link); TAILQ_REMOVE(&bus_data_devices, child, devlink); kobj_delete((kobj_t) child, M_BUS); bus_data_generation_update(); return (0); } /** * @brief Delete all children devices of the given device, if any. * * This function deletes all children devices of the given device, if * any, using the device_delete_child() function for each device it * finds. If a child device cannot be deleted, this function will * return an error code. * * @param dev the parent device * * @retval 0 success * @retval non-zero a device would not detach */ int device_delete_children(device_t dev) { device_t child; int error; PDEBUG(("Deleting all children of %s", DEVICENAME(dev))); error = 0; while ((child = TAILQ_FIRST(&dev->children)) != NULL) { error = device_delete_child(dev, child); if (error) { PDEBUG(("Failed deleting %s", DEVICENAME(child))); break; } } return (error); } /** * @brief Find a device given a unit number * * This is similar to devclass_get_devices() but only searches for * devices which have @p dev as a parent. * * @param dev the parent device to search * @param unit the unit number to search for. If the unit is -1, * return the first child of @p dev which has name * @p classname (that is, the one with the lowest unit.) 
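/*
 * Illustrative sketch, not part of the original source: a typical
 * DEVICE_IDENTIFY() method that adds a child with device_find_child()
 * and BUS_ADD_CHILD() (which busses usually implement on top of
 * device_add_child_ordered() above).  "foo" is hypothetical.
 */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>

static void
foo_identify(driver_t *driver, device_t parent)
{
	/* Add at most one wildcard-unit "foo" child under this parent. */
	if (device_find_child(parent, "foo", -1) == NULL)
		BUS_ADD_CHILD(parent, 0, "foo", -1);
}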
* * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t device_find_child(device_t dev, const char *classname, int unit) { devclass_t dc; device_t child; dc = devclass_find(classname); if (!dc) return (NULL); if (unit != -1) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } else { for (unit = 0; unit < devclass_get_maxunit(dc); unit++) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } } return (NULL); } /** * @internal */ static driverlink_t first_matching_driver(devclass_t dc, device_t dev) { if (dev->devclass) return (devclass_find_driver_internal(dc, dev->devclass->name)); return (TAILQ_FIRST(&dc->drivers)); } /** * @internal */ static driverlink_t next_matching_driver(devclass_t dc, device_t dev, driverlink_t last) { if (dev->devclass) { driverlink_t dl; for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link)) if (!strcmp(dev->devclass->name, dl->driver->name)) return (dl); return (NULL); } return (TAILQ_NEXT(last, link)); } /** * @internal */ int device_probe_child(device_t dev, device_t child) { devclass_t dc; driverlink_t best = NULL; driverlink_t dl; int result, pri = 0; int hasclass = (child->devclass != NULL); GIANT_REQUIRED; dc = dev->devclass; if (!dc) panic("device_probe_child: parent device has no devclass"); /* * If the state is already probed, then return. However, don't * return if we can rebid this object. */ if (child->state == DS_ALIVE && (child->flags & DF_REBID) == 0) return (0); for (; dc; dc = dc->parent) { for (dl = first_matching_driver(dc, child); dl; dl = next_matching_driver(dc, child, dl)) { /* If this driver's pass is too high, then ignore it. */ if (dl->pass > bus_current_pass) continue; PDEBUG(("Trying %s", DRIVERNAME(dl->driver))); result = device_set_driver(child, dl->driver); if (result == ENOMEM) return (result); else if (result != 0) continue; if (!hasclass) { if (device_set_devclass(child, dl->driver->name) != 0) { char const * devname = device_get_name(child); if (devname == NULL) devname = "(unknown)"; printf("driver bug: Unable to set " "devclass (class: %s " "devname: %s)\n", dl->driver->name, devname); (void)device_set_driver(child, NULL); continue; } } /* Fetch any flags for the device before probing. */ resource_int_value(dl->driver->name, child->unit, "flags", &child->devflags); result = DEVICE_PROBE(child); /* Reset flags and devclass before the next probe. */ child->devflags = 0; if (!hasclass) (void)device_set_devclass(child, NULL); /* * If the driver returns SUCCESS, there can be * no higher match for this device. */ if (result == 0) { best = dl; pri = 0; break; } /* + * Probes that return BUS_PROBE_NOWILDCARD or lower + * only match on devices whose driver was explicitly + * specified. + */ + if (result <= BUS_PROBE_NOWILDCARD && + !(child->flags & DF_FIXEDCLASS)) { + result = ENXIO; + } + + /* * The driver returned an error so it * certainly doesn't match. */ if (result > 0) { (void)device_set_driver(child, NULL); continue; } /* * A priority lower than SUCCESS, remember the * best matching driver. Initialise the value * of pri for the first match. */ if (best == NULL || result > pri) { - /* - * Probes that return BUS_PROBE_NOWILDCARD - * or lower only match on devices whose - * driver was explicitly specified. 
- */ - if (result <= BUS_PROBE_NOWILDCARD && - !(child->flags & DF_FIXEDCLASS)) - continue; best = dl; pri = result; continue; } } /* * If we have an unambiguous match in this devclass, * don't look in the parent. */ if (best && pri == 0) break; } /* * If we found a driver, change state and initialise the devclass. */ /* XXX What happens if we rebid and got no best? */ if (best) { /* * If this device was attached, and we were asked to * rescan, and it is a different driver, then we have * to detach the old driver and reattach this new one. * Note, we don't have to check for DF_REBID here * because if the state is > DS_ALIVE, we know it must * be. * * This assumes that all DF_REBID drivers can have * their probe routine called at any time and that * they are idempotent as well as completely benign in * normal operations. * * We also have to make sure that the detach * succeeded, otherwise we fail the operation (or * maybe it should just fail silently? I'm torn). */ if (child->state > DS_ALIVE && best->driver != child->driver) if ((result = device_detach(dev)) != 0) return (result); /* Set the winning driver, devclass, and flags. */ if (!child->devclass) { result = device_set_devclass(child, best->driver->name); if (result != 0) return (result); } result = device_set_driver(child, best->driver); if (result != 0) return (result); resource_int_value(best->driver->name, child->unit, "flags", &child->devflags); if (pri < 0) { /* * A bit bogus. Call the probe method again to make * sure that we have the right description. */ DEVICE_PROBE(child); #if 0 child->flags |= DF_REBID; #endif } else child->flags &= ~DF_REBID; child->state = DS_ALIVE; bus_data_generation_update(); return (0); } return (ENXIO); } /** * @brief Return the parent of a device */ device_t device_get_parent(device_t dev) { return (dev->parent); } /** * @brief Get a list of children of a device * * An array containing a list of all the children of the given device * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP). * * @param dev the device to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int device_get_children(device_t dev, device_t **devlistp, int *devcountp) { int count; device_t child; device_t *list; count = 0; TAILQ_FOREACH(child, &dev->children, link) { count++; } if (count == 0) { *devlistp = NULL; *devcountp = 0; return (0); } list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; TAILQ_FOREACH(child, &dev->children, link) { list[count] = child; count++; } *devlistp = list; *devcountp = count; return (0); } /** * @brief Return the current driver for the device or @c NULL if there * is no driver currently attached */ driver_t * device_get_driver(device_t dev) { return (dev->driver); } /** * @brief Return the current devclass for the device or @c NULL if * there is none. */ devclass_t device_get_devclass(device_t dev) { return (dev->devclass); } /** * @brief Return the name of the device's devclass or @c NULL if there * is none. 
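/*
 * Illustrative sketch, not part of the original source: the hunk earlier
 * in device_probe_child() makes BUS_PROBE_NOWILDCARD (and lower) results
 * count as failures (ENXIO) unless the child was created with a fixed
 * class name (DF_FIXEDCLASS).  A probe method that should only match
 * explicitly added or hinted children returns it like this; "foo" is
 * hypothetical.
 */
#include <sys/param.h>
#include <sys/bus.h>

static int
foo_probe(device_t dev)
{
	/*
	 * Match only children created as "foo" (e.g. by a hint or an
	 * identify routine), never via wildcard probing of unknown
	 * children.
	 */
	device_set_desc(dev, "example fixed-class-only device");
	return (BUS_PROBE_NOWILDCARD);
}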
*/ const char * device_get_name(device_t dev) { if (dev != NULL && dev->devclass) return (devclass_get_name(dev->devclass)); return (NULL); } /** * @brief Return a string containing the device's devclass name * followed by an ascii representation of the device's unit number * (e.g. @c "foo2"). */ const char * device_get_nameunit(device_t dev) { return (dev->nameunit); } /** * @brief Return the device's unit number. */ int device_get_unit(device_t dev) { return (dev->unit); } /** * @brief Return the device's description string */ const char * device_get_desc(device_t dev) { return (dev->desc); } /** * @brief Return the device's flags */ uint32_t device_get_flags(device_t dev) { return (dev->devflags); } struct sysctl_ctx_list * device_get_sysctl_ctx(device_t dev) { return (&dev->sysctl_ctx); } struct sysctl_oid * device_get_sysctl_tree(device_t dev) { return (dev->sysctl_tree); } /** * @brief Print the name of the device followed by a colon and a space * * @returns the number of characters printed */ int device_print_prettyname(device_t dev) { const char *name = device_get_name(dev); if (name == NULL) return (printf("unknown: ")); return (printf("%s%d: ", name, device_get_unit(dev))); } /** * @brief Print the name of the device followed by a colon, a space * and the result of calling vprintf() with the value of @p fmt and * the following arguments. * * @returns the number of characters printed */ int device_printf(device_t dev, const char * fmt, ...) { va_list ap; int retval; retval = device_print_prettyname(dev); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } /** * @internal */ static void device_set_desc_internal(device_t dev, const char* desc, int copy) { if (dev->desc && (dev->flags & DF_DESCMALLOCED)) { free(dev->desc, M_BUS); dev->flags &= ~DF_DESCMALLOCED; dev->desc = NULL; } if (copy && desc) { dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT); if (dev->desc) { strcpy(dev->desc, desc); dev->flags |= DF_DESCMALLOCED; } } else { /* Avoid a -Wcast-qual warning */ dev->desc = (char *)(uintptr_t) desc; } bus_data_generation_update(); } /** * @brief Set the device's description * * The value of @c desc should be a string constant that will not * change (at least until the description is changed in a subsequent * call to device_set_desc() or device_set_desc_copy()). */ void device_set_desc(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, FALSE); } /** * @brief Set the device's description * * The string pointed to by @c desc is copied. Use this function if * the device description is generated, (e.g. with sprintf()). */ void device_set_desc_copy(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, TRUE); } /** * @brief Set the device's flags */ void device_set_flags(device_t dev, uint32_t flags) { dev->devflags = flags; } /** * @brief Return the device's softc field * * The softc is allocated and zeroed when a driver is attached, based * on the size field of the driver. */ void * device_get_softc(device_t dev) { return (dev->softc); } /** * @brief Set the device's softc field * * Most drivers do not need to use this since the softc is allocated * automatically when the driver is attached. 
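/*
 * Illustrative sketch, not part of the original source:
 * device_set_desc_copy() is the right call when the description is
 * generated at probe time, per the note above; a string constant would
 * instead use device_set_desc().  "bar" and its revision value are
 * hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>

static int
bar_probe(device_t dev)
{
	char desc[32];

	snprintf(desc, sizeof(desc), "example bar controller rev %d", 2);
	device_set_desc_copy(dev, desc);	/* string is copied by the bus code */
	return (BUS_PROBE_DEFAULT);
}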
*/ void device_set_softc(device_t dev, void *softc) { if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) free(dev->softc, M_BUS_SC); dev->softc = softc; if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Free claimed softc * * Most drivers do not need to use this since the softc is freed * automatically when the driver is detached. */ void device_free_softc(void *softc) { free(softc, M_BUS_SC); } /** * @brief Claim softc * * This function can be used to let the driver free the automatically * allocated softc using "device_free_softc()". This function is * useful when the driver is refcounting the softc and the softc * cannot be freed when the "device_detach" method is called. */ void device_claim_softc(device_t dev) { if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Get the device's ivars field * * The ivars field is used by the parent device to store per-device * state (e.g. the physical location of the device or a list of * resources). */ void * device_get_ivars(device_t dev) { KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)")); return (dev->ivars); } /** * @brief Set the device's ivars field */ void device_set_ivars(device_t dev, void * ivars) { KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)")); dev->ivars = ivars; } /** * @brief Return the device's state */ device_state_t device_get_state(device_t dev) { return (dev->state); } /** * @brief Set the DF_ENABLED flag for the device */ void device_enable(device_t dev) { dev->flags |= DF_ENABLED; } /** * @brief Clear the DF_ENABLED flag for the device */ void device_disable(device_t dev) { dev->flags &= ~DF_ENABLED; } /** * @brief Increment the busy counter for the device */ void device_busy(device_t dev) { if (dev->state < DS_ATTACHING) panic("device_busy: called for unattached device"); if (dev->busy == 0 && dev->parent) device_busy(dev->parent); dev->busy++; if (dev->state == DS_ATTACHED) dev->state = DS_BUSY; } /** * @brief Decrement the busy counter for the device */ void device_unbusy(device_t dev) { if (dev->busy != 0 && dev->state != DS_BUSY && dev->state != DS_ATTACHING) panic("device_unbusy: called for non-busy device %s", device_get_nameunit(dev)); dev->busy--; if (dev->busy == 0) { if (dev->parent) device_unbusy(dev->parent); if (dev->state == DS_BUSY) dev->state = DS_ATTACHED; } } /** * @brief Set the DF_QUIET flag for the device */ void device_quiet(device_t dev) { dev->flags |= DF_QUIET; } /** * @brief Clear the DF_QUIET flag for the device */ void device_verbose(device_t dev) { dev->flags &= ~DF_QUIET; } /** * @brief Return non-zero if the DF_QUIET flag is set on the device */ int device_is_quiet(device_t dev) { return ((dev->flags & DF_QUIET) != 0); } /** * @brief Return non-zero if the DF_ENABLED flag is set on the device */ int device_is_enabled(device_t dev) { return ((dev->flags & DF_ENABLED) != 0); } /** * @brief Return non-zero if the device was successfully probed */ int device_is_alive(device_t dev) { return (dev->state >= DS_ALIVE); } /** * @brief Return non-zero if the device currently has a driver * attached to it */ int device_is_attached(device_t dev) { return (dev->state >= DS_ATTACHED); } /** * @brief Set the devclass of a device * @see devclass_add_device(). 
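/*
 * Illustrative sketch, not part of the original source: device_busy() and
 * device_unbusy() are typically paired in a driver's open and close paths
 * so the device cannot be detached while it is in use.  The cdev layout
 * ("foo", si_drv1 holding the device_t) is an assumption of this sketch.
 */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>

static int
foo_open(struct cdev *cdev, int oflags, int devtype, struct thread *td)
{
	device_t dev = cdev->si_drv1;	/* stashed at make_dev() time */

	device_busy(dev);		/* state becomes DS_BUSY */
	return (0);
}

static int
foo_close(struct cdev *cdev, int fflag, int devtype, struct thread *td)
{
	device_t dev = cdev->si_drv1;

	device_unbusy(dev);		/* back to DS_ATTACHED when count reaches 0 */
	return (0);
}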
*/ int device_set_devclass(device_t dev, const char *classname) { devclass_t dc; int error; if (!classname) { if (dev->devclass) devclass_delete_device(dev->devclass, dev); return (0); } if (dev->devclass) { printf("device_set_devclass: device class already set\n"); return (EINVAL); } dc = devclass_find_internal(classname, NULL, TRUE); if (!dc) return (ENOMEM); error = devclass_add_device(dc, dev); bus_data_generation_update(); return (error); } /** * @brief Set the driver of a device * * @retval 0 success * @retval EBUSY the device already has a driver attached * @retval ENOMEM a memory allocation failure occurred */ int device_set_driver(device_t dev, driver_t *driver) { if (dev->state >= DS_ATTACHED) return (EBUSY); if (dev->driver == driver) return (0); if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) { free(dev->softc, M_BUS_SC); dev->softc = NULL; } device_set_desc(dev, NULL); kobj_delete((kobj_t) dev, NULL); dev->driver = driver; if (driver) { kobj_init((kobj_t) dev, (kobj_class_t) driver); if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) { dev->softc = malloc(driver->size, M_BUS_SC, M_NOWAIT | M_ZERO); if (!dev->softc) { kobj_delete((kobj_t) dev, NULL); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; return (ENOMEM); } } } else { kobj_init((kobj_t) dev, &null_class); } bus_data_generation_update(); return (0); } /** * @brief Probe a device, and return this status. * * This function is the core of the device autoconfiguration * system. Its purpose is to select a suitable driver for a device and * then call that driver to initialise the hardware appropriately. The * driver is selected by calling the DEVICE_PROBE() method of a set of * candidate drivers and then choosing the driver which returned the * best value. This driver is then attached to the device using * device_attach(). * * The set of suitable drivers is taken from the list of drivers in * the parent device's devclass. If the device was originally created * with a specific class name (see device_add_child()), only drivers * with that name are probed, otherwise all drivers in the devclass * are probed. If no drivers return successful probe values in the * parent devclass, the search continues in the parent of that * devclass (see devclass_get_parent()) if any. * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code * @retval -1 Device already attached */ int device_probe(device_t dev) { int error; GIANT_REQUIRED; if (dev->state >= DS_ALIVE && (dev->flags & DF_REBID) == 0) return (-1); if (!(dev->flags & DF_ENABLED)) { if (bootverbose && device_get_name(dev) != NULL) { device_print_prettyname(dev); printf("not probed (disabled)\n"); } return (-1); } if ((error = device_probe_child(dev->parent, dev)) != 0) { if (bus_current_pass == BUS_PASS_DEFAULT && !(dev->flags & DF_DONENOMATCH)) { BUS_PROBE_NOMATCH(dev->parent, dev); devnomatch(dev); dev->flags |= DF_DONENOMATCH; } return (error); } return (0); } /** * @brief Probe a device and attach a driver if possible * * calls device_probe() and attaches if that was successful. 
*/ int device_probe_and_attach(device_t dev) { int error; GIANT_REQUIRED; error = device_probe(dev); if (error == -1) return (0); else if (error != 0) return (error); CURVNET_SET_QUIET(vnet0); error = device_attach(dev); CURVNET_RESTORE(); return error; } /** * @brief Attach a device driver to a device * * This function is a wrapper around the DEVICE_ATTACH() driver * method. In addition to calling DEVICE_ATTACH(), it initialises the * device's sysctl tree, optionally prints a description of the device * and queues a notification event for user-based device management * services. * * Normally this function is only called internally from * device_probe_and_attach(). * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_attach(device_t dev) { uint64_t attachtime; int error; if (resource_disabled(dev->driver->name, dev->unit)) { device_disable(dev); if (bootverbose) device_printf(dev, "disabled via hints entry\n"); return (ENXIO); } device_sysctl_init(dev); if (!device_is_quiet(dev)) device_print_child(dev->parent, dev); attachtime = get_cyclecount(); dev->state = DS_ATTACHING; if ((error = DEVICE_ATTACH(dev)) != 0) { printf("device_attach: %s%d attach returned %d\n", dev->driver->name, dev->unit, error); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); KASSERT(dev->busy == 0, ("attach failed but busy")); dev->state = DS_NOTPRESENT; return (error); } attachtime = get_cyclecount() - attachtime; /* * 4 bits per device is a reasonable value for desktop and server * hardware with good get_cyclecount() implementations, but may * need to be adjusted on other platforms. */ #ifdef RANDOM_DEBUG printf("%s(): feeding %d bit(s) of entropy from %s%d\n", __func__, 4, dev->driver->name, dev->unit); #endif random_harvest(&attachtime, sizeof(attachtime), 4, RANDOM_ATTACH); device_sysctl_update(dev); if (dev->busy) dev->state = DS_BUSY; else dev->state = DS_ATTACHED; dev->flags &= ~DF_DONENOMATCH; devadded(dev); return (0); } /** * @brief Detach a driver from a device * * This function is a wrapper around the DEVICE_DETACH() driver * method. If the call to DEVICE_DETACH() succeeds, it calls * BUS_CHILD_DETACHED() for the parent of @p dev, queues a * notification event for user-based device management services and * cleans up the device's sysctl tree. * * @param dev the device to un-initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_detach(device_t dev) { int error; GIANT_REQUIRED; PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); if ((error = DEVICE_DETACH(dev)) != 0) return (error); devremoved(dev); if (!device_is_quiet(dev)) device_printf(dev, "detached\n"); if (dev->parent) BUS_CHILD_DETACHED(dev->parent, dev); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); dev->state = DS_NOTPRESENT; (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); return (0); } /** * @brief Tells a driver to quiesce itself. * * This function is a wrapper around the DEVICE_QUIESCE() driver * method. If the call to DEVICE_QUIESCE() succeeds. 
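/*
 * Illustrative sketch, not part of the original source: a conventional
 * DEVICE_DETACH() implementation for a bus-like driver, detaching and
 * deleting children before releasing its own resources, in line with
 * what device_detach() above expects.  All "foo" names are hypothetical.
 */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>

struct foo_softc {
	struct resource	*sc_mem;
	int		sc_mem_rid;
};

static int
foo_detach(device_t dev)
{
	struct foo_softc *sc = device_get_softc(dev);
	int error;

	/* Detach and then delete any children added in attach/identify. */
	error = bus_generic_detach(dev);
	if (error != 0)
		return (error);
	device_delete_children(dev);

	if (sc->sc_mem != NULL)
		bus_release_resource(dev, SYS_RES_MEMORY, sc->sc_mem_rid,
		    sc->sc_mem);
	return (0);
}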
* * @param dev the device to quiesce * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_quiesce(device_t dev) { PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); return (DEVICE_QUIESCE(dev)); } /** * @brief Notify a device of system shutdown * * This function calls the DEVICE_SHUTDOWN() driver method if the * device currently has an attached driver. * * @returns the value returned by DEVICE_SHUTDOWN() */ int device_shutdown(device_t dev) { if (dev->state < DS_ATTACHED) return (0); return (DEVICE_SHUTDOWN(dev)); } /** * @brief Set the unit number of a device * * This function can be used to override the unit number used for a * device (e.g. to wire a device to a pre-configured unit number). */ int device_set_unit(device_t dev, int unit) { devclass_t dc; int err; dc = device_get_devclass(dev); if (unit < dc->maxunit && dc->devices[unit]) return (EBUSY); err = devclass_delete_device(dc, dev); if (err) return (err); dev->unit = unit; err = devclass_add_device(dc, dev); if (err) return (err); bus_data_generation_update(); return (0); } /*======================================*/ /* * Some useful method implementations to make life easier for bus drivers. */ /** * @brief Initialise a resource list. * * @param rl the resource list to initialise */ void resource_list_init(struct resource_list *rl) { STAILQ_INIT(rl); } /** * @brief Reclaim memory used by a resource list. * * This function frees the memory for all resource entries on the list * (if any). * * @param rl the resource list to free */ void resource_list_free(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) panic("resource_list_free: resource entry is busy"); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } /** * @brief Add a resource entry. * * This function adds a resource entry using the given @p type, @p * start, @p end and @p count values. A rid value is chosen by * searching sequentially for the first unused rid starting at zero. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ int resource_list_add_next(struct resource_list *rl, int type, u_long start, u_long end, u_long count) { int rid; rid = 0; while (resource_list_find(rl, type, rid) != NULL) rid++; resource_list_add(rl, type, rid, start, end, count); return (rid); } /** * @brief Add or modify a resource entry. * * If an existing entry exists with the same type and rid, it will be * modified using the given values of @p start, @p end and @p * count. If no entry exists, a new one will be created using the * given values. The resource list entry that matches is then returned. * * @param rl the resource list to edit * @param type the resource entry type (e.g. 
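/*
 * Illustrative sketch, not part of the original source: a bus commonly
 * keeps a struct resource_list in each child's ivars and populates it as
 * children are added, so that resource_list_alloc()/release() can later
 * serve the child's requests.  "foo" names and the helper's parameters
 * are hypothetical.
 */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/malloc.h>
#include <sys/rman.h>
#include <machine/resource.h>

struct foo_devinfo {
	struct resource_list	fdi_rl;	/* resource list for this child */
};

static device_t
foo_add_child(device_t bus, const char *name, int unit,
    u_long memstart, u_long memsize, int irq)
{
	struct foo_devinfo *fdi;
	device_t child;

	child = device_add_child(bus, name, unit);
	if (child == NULL)
		return (NULL);
	fdi = malloc(sizeof(*fdi), M_DEVBUF, M_WAITOK | M_ZERO);
	resource_list_init(&fdi->fdi_rl);
	resource_list_add(&fdi->fdi_rl, SYS_RES_MEMORY, 0, memstart,
	    memstart + memsize - 1, memsize);
	resource_list_add(&fdi->fdi_rl, SYS_RES_IRQ, 0, irq, irq, 1);
	device_set_ivars(child, fdi);
	return (child);
}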
SYS_RES_MEMORY) * @param rid the resource identifier * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ struct resource_list_entry * resource_list_add(struct resource_list *rl, int type, int rid, u_long start, u_long end, u_long count) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (!rle) { rle = malloc(sizeof(struct resource_list_entry), M_BUS, M_NOWAIT); if (!rle) panic("resource_list_add: can't record entry"); STAILQ_INSERT_TAIL(rl, rle, link); rle->type = type; rle->rid = rid; rle->res = NULL; rle->flags = 0; } if (rle->res) panic("resource_list_add: resource entry is busy"); rle->start = start; rle->end = end; rle->count = count; return (rle); } /** * @brief Determine if a resource entry is busy. * * Returns true if a resource entry is busy meaning that it has an * associated resource that is not an unallocated "reserved" resource. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is busy, zero otherwise. */ int resource_list_busy(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle == NULL || rle->res == NULL) return (0); if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) { KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE), ("reserved resource is active")); return (0); } return (1); } /** * @brief Determine if a resource entry is reserved. * * Returns true if a resource entry is reserved meaning that it has an * associated "reserved" resource. The resource can either be * allocated or unallocated. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is reserved, zero otherwise. */ int resource_list_reserved(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle != NULL && rle->flags & RLE_RESERVED) return (1); return (0); } /** * @brief Find a resource entry by type and rid. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns the resource entry pointer or NULL if there is no such * entry. */ struct resource_list_entry * resource_list_find(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; STAILQ_FOREACH(rle, rl, link) { if (rle->type == type && rle->rid == rid) return (rle); } return (NULL); } /** * @brief Delete a resource entry. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier */ void resource_list_delete(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle = resource_list_find(rl, type, rid); if (rle) { if (rle->res != NULL) panic("resource_list_delete: resource has not been released"); STAILQ_REMOVE(rl, rle, resource_list_entry, link); free(rle, M_BUS); } } /** * @brief Allocate a reserved resource * * This can be used by busses to force the allocation of resources * that are always active in the system even if they are not allocated * by a driver (e.g. PCI BARs). This function is usually called when * adding a new child to the bus. The resource is allocated from the * parent bus when it is reserved. 
The resource list entry is marked * with RLE_RESERVED to note that it is a reserved resource. * * Subsequent attempts to allocate the resource with * resource_list_alloc() will succeed the first time and will set * RLE_ALLOCATED to note that it has been allocated. When a reserved * resource that has been allocated is released with * resource_list_release() the resource RLE_ALLOCATED is cleared, but * the actual resource remains allocated. The resource can be released to * the parent bus by calling resource_list_unreserve(). * * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device for which the resource is being reserved * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0UL for any start address * @param end hint at the end of the resource range - pass * @c ~0UL for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_reserve(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); struct resource *r; if (passthrough) panic( "resource_list_reserve() should only be called for direct children"); if (flags & RF_ACTIVE) panic( "resource_list_reserve() should only reserve inactive resources"); r = resource_list_alloc(rl, bus, child, type, rid, start, end, count, flags); if (r != NULL) { rle = resource_list_find(rl, type, *rid); rle->flags |= RLE_RESERVED; } return (r); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE() * * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list * and passing the allocation up to the parent of @p bus. This assumes * that the first entry of @c device_get_ivars(child) is a struct * resource_list. This also handles 'passthrough' allocations where a * child is a remote descendant of bus by passing the allocation up to * the parent of bus. * * Typically, a bus driver would store a list of child resources * somewhere in the child device's ivars (see device_get_ivars()) and * its implementation of BUS_ALLOC_RESOURCE() would find that list and * then call resource_list_alloc() to perform the allocation. 
* * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device which is requesting an allocation * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0UL for any start address * @param end hint at the end of the resource range - pass * @c ~0UL for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_alloc(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int isdefault = (start == 0UL && end == ~0UL); if (passthrough) { return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags)); } rle = resource_list_find(rl, type, *rid); if (!rle) return (NULL); /* no resource of that type/rid */ if (rle->res) { if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) return (NULL); if ((flags & RF_ACTIVE) && bus_activate_resource(child, type, *rid, rle->res) != 0) return (NULL); rle->flags |= RLE_ALLOCATED; return (rle->res); } device_printf(bus, "resource entry %#x type %d for child %s is busy\n", *rid, type, device_get_nameunit(child)); return (NULL); } if (isdefault) { start = rle->start; count = ulmax(count, rle->count); end = ulmax(rle->end, start + count - 1); } rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags); /* * Record the new range. */ if (rle->res) { rle->start = rman_get_start(rle->res); rle->end = rman_get_end(rle->res); rle->count = count; } return (rle->res); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE() * * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally * used with resource_list_alloc(). 
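/*
 * Illustrative sketch, not part of the original source: BUS_ALLOC_RESOURCE()
 * and BUS_RELEASE_RESOURCE() methods that simply delegate to the resource
 * list stored in the child's ivars, relying on the convention documented
 * above (the resource list is the first member of the ivars).  "foo" names
 * are hypothetical.
 */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/rman.h>

struct foo_devinfo {
	struct resource_list	fdi_rl;
};

static struct resource *
foo_alloc_resource(device_t bus, device_t child, int type, int *rid,
    u_long start, u_long end, u_long count, u_int flags)
{
	struct foo_devinfo *fdi = device_get_ivars(child);

	return (resource_list_alloc(&fdi->fdi_rl, bus, child, type, rid,
	    start, end, count, flags));
}

static int
foo_release_resource(device_t bus, device_t child, int type, int rid,
    struct resource *res)
{
	struct foo_devinfo *fdi = device_get_ivars(child);

	return (resource_list_release(&fdi->fdi_rl, bus, child, type, rid,
	    res));
}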
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device which is requesting a release * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_release(struct resource_list *rl, device_t bus, device_t child, int type, int rid, struct resource *res) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int error; if (passthrough) { return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res)); } rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_release: can't find resource"); if (!rle->res) panic("resource_list_release: resource entry is not busy"); if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) { if (rman_get_flags(res) & RF_ACTIVE) { error = bus_deactivate_resource(child, type, rid, res); if (error) return (error); } rle->flags &= ~RLE_ALLOCATED; return (0); } return (EINVAL); } error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res); if (error) return (error); rle->res = NULL; return (0); } /** * @brief Release all active resources of a given type * * Release all active resources of a specified type. This is intended * to be used to cleanup resources leaked by a driver after detach or * a failed attach. * * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose active resources are being released * @param type the type of resources to release * * @retval 0 success * @retval EBUSY at least one resource was active */ int resource_list_release_active(struct resource_list *rl, device_t bus, device_t child, int type) { struct resource_list_entry *rle; int error, retval; retval = 0; STAILQ_FOREACH(rle, rl, link) { if (rle->type != type) continue; if (rle->res == NULL) continue; if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) continue; retval = EBUSY; error = resource_list_release(rl, bus, child, type, rman_get_rid(rle->res), rle->res); if (error != 0) device_printf(bus, "Failed to release active resource: %d\n", error); } return (retval); } /** * @brief Fully release a reserved resource * * Fully releases a resource reserved via resource_list_reserve(). 
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose reserved resource is being released * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child, int type, int rid) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); if (passthrough) panic( "resource_list_unreserve() should only be called for direct children"); rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_unreserve: can't find resource"); if (!(rle->flags & RLE_RESERVED)) return (EINVAL); if (rle->flags & RLE_ALLOCATED) return (EBUSY); rle->flags &= ~RLE_RESERVED; return (resource_list_release(rl, bus, child, type, rid, rle->res)); } /** * @brief Print a description of resources in a resource list * * Print all resources of a specified type, for use in BUS_PRINT_CHILD(). * The name is printed if at least one resource of the given type is available. * The format is used to print resource start and end. * * @param rl the resource list to print * @param name the name of @p type, e.g. @c "memory" * @param type type type of resource entry to print * @param format printf(9) format string to print resource * start and end values * * @returns the number of characters printed */ int resource_list_print_type(struct resource_list *rl, const char *name, int type, const char *format) { struct resource_list_entry *rle; int printed, retval; printed = 0; retval = 0; /* Yes, this is kinda cheating */ STAILQ_FOREACH(rle, rl, link) { if (rle->type == type) { if (printed == 0) retval += printf(" %s ", name); else retval += printf(","); printed++; retval += printf(format, rle->start); if (rle->count > 1) { retval += printf("-"); retval += printf(format, rle->start + rle->count - 1); } } } return (retval); } /** * @brief Releases all the resources in a list. * * @param rl The resource list to purge. * * @returns nothing */ void resource_list_purge(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) bus_release_resource(rman_get_device(rle->res), rle->type, rle->rid, rle->res); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } device_t bus_generic_add_child(device_t dev, u_int order, const char *name, int unit) { return (device_add_child_ordered(dev, order, name, unit)); } /** * @brief Helper function for implementing DEVICE_PROBE() * * This function can be used to help implement the DEVICE_PROBE() for * a bus (i.e. a device which has other devices attached to it). It * calls the DEVICE_IDENTIFY() method of each driver in the device's * devclass. */ int bus_generic_probe(device_t dev) { devclass_t dc = dev->devclass; driverlink_t dl; TAILQ_FOREACH(dl, &dc->drivers, link) { /* * If this driver's pass is too high, then ignore it. * For most drivers in the default pass, this will * never be true. For early-pass drivers they will * only call the identify routines of eligible drivers * when this routine is called. Drivers for later * passes should have their identify routines called * on early-pass busses during BUS_NEW_PASS(). 
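/*
 * Illustrative sketch, not part of the original source: a BUS_PRINT_CHILD()
 * method assembled from the helpers above, printing the child's memory and
 * IRQ ranges.  "foo" names are hypothetical.
 */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>

struct foo_devinfo {
	struct resource_list	fdi_rl;
};

static int
foo_print_child(device_t bus, device_t child)
{
	struct foo_devinfo *fdi = device_get_ivars(child);
	int retval = 0;

	retval += bus_print_child_header(bus, child);
	retval += resource_list_print_type(&fdi->fdi_rl, "mem",
	    SYS_RES_MEMORY, "%#lx");
	retval += resource_list_print_type(&fdi->fdi_rl, "irq",
	    SYS_RES_IRQ, "%ld");
	retval += bus_print_child_footer(bus, child);
	return (retval);
}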
*/ if (dl->pass > bus_current_pass) continue; DEVICE_IDENTIFY(dl->driver, dev); } return (0); } /** * @brief Helper function for implementing DEVICE_ATTACH() * * This function can be used to help implement the DEVICE_ATTACH() for * a bus. It calls device_probe_and_attach() for each of the device's * children. */ int bus_generic_attach(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_probe_and_attach(child); } return (0); } /** * @brief Helper function for implementing DEVICE_DETACH() * * This function can be used to help implement the DEVICE_DETACH() for * a bus. It calls device_detach() for each of the device's * children. */ int bus_generic_detach(device_t dev) { device_t child; int error; if (dev->state != DS_ATTACHED) return (EBUSY); TAILQ_FOREACH(child, &dev->children, link) { if ((error = device_detach(child)) != 0) return (error); } return (0); } /** * @brief Helper function for implementing DEVICE_SHUTDOWN() * * This function can be used to help implement the DEVICE_SHUTDOWN() * for a bus. It calls device_shutdown() for each of the device's * children. */ int bus_generic_shutdown(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_shutdown(child); } return (0); } /** * @brief Helper function for implementing DEVICE_SUSPEND() * * This function can be used to help implement the DEVICE_SUSPEND() * for a bus. It calls DEVICE_SUSPEND() for each of the device's * children. If any call to DEVICE_SUSPEND() fails, the suspend * operation is aborted and any devices which were suspended are * resumed immediately by calling their DEVICE_RESUME() methods. */ int bus_generic_suspend(device_t dev) { int error; device_t child, child2; TAILQ_FOREACH(child, &dev->children, link) { error = DEVICE_SUSPEND(child); if (error) { for (child2 = TAILQ_FIRST(&dev->children); child2 && child2 != child; child2 = TAILQ_NEXT(child2, link)) DEVICE_RESUME(child2); return (error); } } return (0); } /** * @brief Helper function for implementing DEVICE_RESUME() * * This function can be used to help implement the DEVICE_RESUME() for * a bus. It calls DEVICE_RESUME() on each of the device's children. */ int bus_generic_resume(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { DEVICE_RESUME(child); /* if resume fails, there's nothing we can usefully do... */ } return (0); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the first part of the ascii representation of * @p child, including its name, unit and description (if any - see * device_set_desc()). * * @returns the number of characters printed */ int bus_print_child_header(device_t dev, device_t child) { int retval = 0; if (device_get_desc(child)) { retval += device_printf(child, "<%s>", device_get_desc(child)); } else { retval += printf("%s", device_get_nameunit(child)); } return (retval); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the last part of the ascii representation of * @p child, which consists of the string @c " on " followed by the * name and unit of the @p dev. * * @returns the number of characters printed */ int bus_print_child_footer(device_t dev, device_t child) { return (printf(" on %s\n", device_get_nameunit(dev))); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints out the VM domain for the given device. 
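 *
 * When a bus uses bus_generic_print_child() below, the resulting probe
 * line for a child in, say, domain 1 looks roughly like the following
 * (the device names are illustrative):
 *
 *	foo0: <Foo Controller> numa-domain 1 on bar0
 *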
* * @returns the number of characters printed */ int bus_print_child_domain(device_t dev, device_t child) { int domain; /* No domain? Don't print anything */ if (BUS_GET_DOMAIN(dev, child, &domain) != 0) return (0); return (printf(" numa-domain %d", domain)); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function simply calls bus_print_child_header() followed by * bus_print_child_footer(). * * @returns the number of characters printed */ int bus_generic_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += bus_print_child_domain(dev, child); retval += bus_print_child_footer(dev, child); return (retval); } /** * @brief Stub function for implementing BUS_READ_IVAR(). * * @returns ENOENT */ int bus_generic_read_ivar(device_t dev, device_t child, int index, uintptr_t * result) { return (ENOENT); } /** * @brief Stub function for implementing BUS_WRITE_IVAR(). * * @returns ENOENT */ int bus_generic_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { return (ENOENT); } /** * @brief Stub function for implementing BUS_GET_RESOURCE_LIST(). * * @returns NULL */ struct resource_list * bus_generic_get_resource_list(device_t dev, device_t child) { return (NULL); } /** * @brief Helper function for implementing BUS_DRIVER_ADDED(). * * This implementation of BUS_DRIVER_ADDED() simply calls the driver's * DEVICE_IDENTIFY() method to allow it to add new children to the bus * and then calls device_probe_and_attach() for each unattached child. */ void bus_generic_driver_added(device_t dev, driver_t *driver) { device_t child; DEVICE_IDENTIFY(driver, dev); TAILQ_FOREACH(child, &dev->children, link) { if (child->state == DS_NOTPRESENT || (child->flags & DF_REBID)) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_NEW_PASS(). * * This implementing of BUS_NEW_PASS() first calls the identify * routines for any drivers that probe at the current pass. Then it * walks the list of devices for this bus. If a device is already * attached, then it calls BUS_NEW_PASS() on that device. If the * device is not already attached, it attempts to attach a driver to * it. */ void bus_generic_new_pass(device_t dev) { driverlink_t dl; devclass_t dc; device_t child; dc = dev->devclass; TAILQ_FOREACH(dl, &dc->drivers, link) { if (dl->pass == bus_current_pass) DEVICE_IDENTIFY(dl->driver, dev); } TAILQ_FOREACH(child, &dev->children, link) { if (child->state >= DS_ATTACHED) BUS_NEW_PASS(child); else if (child->state == DS_NOTPRESENT) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_SETUP_INTR(). * * This simple implementation of BUS_SETUP_INTR() simply calls the * BUS_SETUP_INTR() method of the parent of @p dev. */ int bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_SETUP_INTR(dev->parent, child, irq, flags, filter, intr, arg, cookiep)); return (EINVAL); } /** * @brief Helper function for implementing BUS_TEARDOWN_INTR(). * * This simple implementation of BUS_TEARDOWN_INTR() simply calls the * BUS_TEARDOWN_INTR() method of the parent of @p dev. */ int bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, void *cookie) { /* Propagate up the bus hierarchy until someone handles it. 
*/ if (dev->parent) return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ADJUST_RESOURCE(). * * This simple implementation of BUS_ADJUST_RESOURCE() simply calls the * BUS_ADJUST_RESOURCE() method of the parent of @p dev. */ int bus_generic_adjust_resource(device_t dev, device_t child, int type, struct resource *r, u_long start, u_long end) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start, end)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the * BUS_ALLOC_RESOURCE() method of the parent of @p dev. */ struct resource * bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, start, end, count, flags)); return (NULL); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the * BUS_RELEASE_RESOURCE() method of the parent of @p dev. */ int bus_generic_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE(). * * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE(). * * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_deactivate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_BIND_INTR(). * * This simple implementation of BUS_BIND_INTR() simply calls the * BUS_BIND_INTR() method of the parent of @p dev. */ int bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_BIND_INTR(dev->parent, child, irq, cpu)); return (EINVAL); } /** * @brief Helper function for implementing BUS_CONFIG_INTR(). * * This simple implementation of BUS_CONFIG_INTR() simply calls the * BUS_CONFIG_INTR() method of the parent of @p dev. */ int bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig, enum intr_polarity pol) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DESCRIBE_INTR(). 
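 *
 * A hypothetical bridge driver can point most of its bus interface at
 * these pass-through helpers and only override the methods it really
 * needs; a minimal sketch (the 'foo' driver and softc are made up):
 *
 *	static device_method_t foo_methods[] = {
 *		DEVMETHOD(device_probe,		foo_probe),
 *		DEVMETHOD(device_attach,	foo_attach),
 *
 *		DEVMETHOD(bus_alloc_resource,	bus_generic_alloc_resource),
 *		DEVMETHOD(bus_release_resource,	bus_generic_release_resource),
 *		DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
 *		DEVMETHOD(bus_setup_intr,	bus_generic_setup_intr),
 *		DEVMETHOD(bus_teardown_intr,	bus_generic_teardown_intr),
 *
 *		DEVMETHOD_END
 *	};
 *	static driver_t foo_driver = {
 *		"foo", foo_methods, sizeof(struct foo_softc)
 *	};
 *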
* * This simple implementation of BUS_DESCRIBE_INTR() simply calls the * BUS_DESCRIBE_INTR() method of the parent of @p dev. */ int bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie, descr)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_DMA_TAG(). * * This simple implementation of BUS_GET_DMA_TAG() simply calls the * BUS_GET_DMA_TAG() method of the parent of @p dev. */ bus_dma_tag_t bus_generic_get_dma_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_DMA_TAG(dev->parent, child)); return (NULL); } /** * @brief Helper function for implementing BUS_GET_RESOURCE(). * * This implementation of BUS_GET_RESOURCE() uses the * resource_list_find() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * search. */ int bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid, u_long *startp, u_long *countp) { struct resource_list * rl = NULL; struct resource_list_entry * rle = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); rle = resource_list_find(rl, type, rid); if (!rle) return (ENOENT); if (startp) *startp = rle->start; if (countp) *countp = rle->count; return (0); } /** * @brief Helper function for implementing BUS_SET_RESOURCE(). * * This implementation of BUS_SET_RESOURCE() uses the * resource_list_add() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ int bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid, u_long start, u_long count) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); resource_list_add(rl, type, rid, start, (start + count - 1), count); return (0); } /** * @brief Helper function for implementing BUS_DELETE_RESOURCE(). * * This implementation of BUS_DELETE_RESOURCE() uses the * resource_list_delete() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ void bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return; resource_list_delete(rl, type, rid); return; } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This implementation of BUS_RELEASE_RESOURCE() uses the * resource_list_release() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. */ int bus_generic_rl_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child, type, rid, r)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); return (resource_list_release(rl, dev, child, type, rid, r)); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This implementation of BUS_ALLOC_RESOURCE() uses the * resource_list_alloc() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. 
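 *
 * A resource-list backed bus typically keeps a struct resource_list in
 * its per-child ivars, exports it via BUS_GET_RESOURCE_LIST(), and lets
 * the bus_generic_rl_*() helpers do the rest.  A minimal sketch (the
 * 'mybus' names and ivar layout are made up):
 *
 *	static struct resource_list *
 *	mybus_get_resource_list(device_t dev, device_t child)
 *	{
 *		struct mybus_devinfo *dinfo = device_get_ivars(child);
 *
 *		return (&dinfo->mb_resources);
 *	}
 *
 *	DEVMETHOD(bus_get_resource_list, mybus_get_resource_list),
 *	DEVMETHOD(bus_alloc_resource,	 bus_generic_rl_alloc_resource),
 *	DEVMETHOD(bus_release_resource,	 bus_generic_rl_release_resource),
 *	DEVMETHOD(bus_get_resource,	 bus_generic_rl_get_resource),
 *	DEVMETHOD(bus_set_resource,	 bus_generic_rl_set_resource),
 *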
*/ struct resource * bus_generic_rl_alloc_resource(device_t dev, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child, type, rid, start, end, count, flags)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (NULL); return (resource_list_alloc(rl, dev, child, type, rid, start, end, count, flags)); } /** * @brief Helper function for implementing BUS_CHILD_PRESENT(). * * This simple implementation of BUS_CHILD_PRESENT() simply calls the * BUS_CHILD_PRESENT() method of the parent of @p dev. */ int bus_generic_child_present(device_t dev, device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(dev), dev)); } int bus_generic_get_domain(device_t dev, device_t child, int *domain) { if (dev->parent) return (BUS_GET_DOMAIN(dev->parent, dev, domain)); return (ENOENT); } /* * Some convenience functions to make it easier for drivers to use the * resource-management functions. All these really do is hide the * indirection through the parent's method table, making for slightly * less-wordy code. In the future, it might make sense for this code * to maintain some sort of a list of resources allocated by each device. */ int bus_alloc_resources(device_t dev, struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) res[i] = NULL; for (i = 0; rs[i].type != -1; i++) { res[i] = bus_alloc_resource_any(dev, rs[i].type, &rs[i].rid, rs[i].flags); if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) { bus_release_resources(dev, rs, res); return (ENXIO); } } return (0); } void bus_release_resources(device_t dev, const struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) if (res[i] != NULL) { bus_release_resource( dev, rs[i].type, rs[i].rid, res[i]); res[i] = NULL; } } /** * @brief Wrapper function for BUS_ALLOC_RESOURCE(). * * This function simply calls the BUS_ALLOC_RESOURCE() method of the * parent of @p dev. */ struct resource * bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { if (dev->parent == NULL) return (NULL); return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, count, flags)); } /** * @brief Wrapper function for BUS_ADJUST_RESOURCE(). * * This function simply calls the BUS_ADJUST_RESOURCE() method of the * parent of @p dev. */ int bus_adjust_resource(device_t dev, int type, struct resource *r, u_long start, u_long end) { if (dev->parent == NULL) return (EINVAL); return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end)); } /** * @brief Wrapper function for BUS_ACTIVATE_RESOURCE(). * * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_activate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE(). * * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_RELEASE_RESOURCE(). * * This function simply calls the BUS_RELEASE_RESOURCE() method of the * parent of @p dev. 
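 *
 * In a leaf driver these wrappers, together with bus_alloc_resources()
 * and bus_release_resources() above, are usually driven from a
 * resource_spec table; for example (a sketch, with made-up 'foo' names):
 *
 *	static struct resource_spec foo_spec[] = {
 *		{ SYS_RES_MEMORY,	0,	RF_ACTIVE },
 *		{ SYS_RES_IRQ,		0,	RF_ACTIVE | RF_SHAREABLE },
 *		{ -1, 0, 0 }
 *	};
 *	static struct resource *foo_res[2];
 *
 *	if (bus_alloc_resources(dev, foo_spec, foo_res) != 0)
 *		return (ENXIO);
 *	...
 *	bus_release_resources(dev, foo_spec, foo_res);
 *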
*/ int bus_release_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_SETUP_INTR(). * * This function simply calls the BUS_SETUP_INTR() method of the * parent of @p dev. */ int bus_setup_intr(device_t dev, struct resource *r, int flags, driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep) { int error; if (dev->parent == NULL) return (EINVAL); error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler, arg, cookiep); if (error != 0) return (error); if (handler != NULL && !(flags & INTR_MPSAFE)) device_printf(dev, "[GIANT-LOCKED]\n"); return (0); } /** * @brief Wrapper function for BUS_TEARDOWN_INTR(). * * This function simply calls the BUS_TEARDOWN_INTR() method of the * parent of @p dev. */ int bus_teardown_intr(device_t dev, struct resource *r, void *cookie) { if (dev->parent == NULL) return (EINVAL); return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie)); } /** * @brief Wrapper function for BUS_BIND_INTR(). * * This function simply calls the BUS_BIND_INTR() method of the * parent of @p dev. */ int bus_bind_intr(device_t dev, struct resource *r, int cpu) { if (dev->parent == NULL) return (EINVAL); return (BUS_BIND_INTR(dev->parent, dev, r, cpu)); } /** * @brief Wrapper function for BUS_DESCRIBE_INTR(). * * This function first formats the requested description into a * temporary buffer and then calls the BUS_DESCRIBE_INTR() method of * the parent of @p dev. */ int bus_describe_intr(device_t dev, struct resource *irq, void *cookie, const char *fmt, ...) { va_list ap; char descr[MAXCOMLEN + 1]; if (dev->parent == NULL) return (EINVAL); va_start(ap, fmt); vsnprintf(descr, sizeof(descr), fmt, ap); va_end(ap); return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr)); } /** * @brief Wrapper function for BUS_SET_RESOURCE(). * * This function simply calls the BUS_SET_RESOURCE() method of the * parent of @p dev. */ int bus_set_resource(device_t dev, int type, int rid, u_long start, u_long count) { return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid, start, count)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev. */ int bus_get_resource(device_t dev, int type, int rid, u_long *startp, u_long *countp) { return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, startp, countp)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the start value. */ u_long bus_get_resource_start(device_t dev, int type, int rid) { u_long start, count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (start); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the count value. */ u_long bus_get_resource_count(device_t dev, int type, int rid) { u_long start, count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (count); } /** * @brief Wrapper function for BUS_DELETE_RESOURCE(). * * This function simply calls the BUS_DELETE_RESOURCE() method of the * parent of @p dev. 
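 *
 * The interrupt wrappers above are normally used together during attach
 * and detach; a hypothetical driver with softc 'sc' and handler
 * 'foo_intr' might do:
 *
 *	error = bus_setup_intr(dev, sc->irq_res,
 *	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, foo_intr, sc,
 *	    &sc->intr_cookie);
 *	if (error != 0)
 *		return (error);
 *	bus_describe_intr(dev, sc->irq_res, sc->intr_cookie, "%s", "foo");
 *	...
 *	bus_teardown_intr(dev, sc->irq_res, sc->intr_cookie);
 *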
*/ void bus_delete_resource(device_t dev, int type, int rid) { BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid); } /** * @brief Wrapper function for BUS_CHILD_PRESENT(). * * This function simply calls the BUS_CHILD_PRESENT() method of the * parent of @p dev. */ int bus_child_present(device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(child), child)); } /** * @brief Wrapper function for BUS_CHILD_PNPINFO_STR(). * * This function simply calls the BUS_CHILD_PNPINFO_STR() method of the * parent of @p dev. */ int bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen) { device_t parent; parent = device_get_parent(child); if (parent == NULL) { *buf = '\0'; return (0); } return (BUS_CHILD_PNPINFO_STR(parent, child, buf, buflen)); } /** * @brief Wrapper function for BUS_CHILD_LOCATION_STR(). * * This function simply calls the BUS_CHILD_LOCATION_STR() method of the * parent of @p dev. */ int bus_child_location_str(device_t child, char *buf, size_t buflen) { device_t parent; parent = device_get_parent(child); if (parent == NULL) { *buf = '\0'; return (0); } return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen)); } /** * @brief Wrapper function for BUS_GET_DMA_TAG(). * * This function simply calls the BUS_GET_DMA_TAG() method of the * parent of @p dev. */ bus_dma_tag_t bus_get_dma_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (NULL); return (BUS_GET_DMA_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_DOMAIN(). * * This function simply calls the BUS_GET_DOMAIN() method of the * parent of @p dev. */ int bus_get_domain(device_t dev, int *domain) { return (BUS_GET_DOMAIN(device_get_parent(dev), dev, domain)); } /* Resume all devices and then notify userland that we're up again. */ static int root_resume(device_t dev) { int error; error = bus_generic_resume(dev); if (error == 0) devctl_notify("kern", "power", "resume", NULL); return (error); } static int root_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += printf("\n"); return (retval); } static int root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* * If an interrupt mapping gets to here something bad has happened. */ panic("root_setup_intr"); } /* * If we get here, assume that the device is permanant and really is * present in the system. Removable bus drivers are expected to intercept * this call long before it gets here. We return -1 so that drivers that * really care can check vs -1 or some ERRNO returned higher in the food * chain. 
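 *
 * A hypothetical driver for removable hardware might use this from its
 * detach routine to decide whether the registers may still be touched
 * ('foo_stop_hardware' and 'sc' are made up for the sketch):
 *
 *	if (bus_child_present(dev) != 0)
 *		foo_stop_hardware(sc);
 *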
*/ static int root_child_present(device_t dev, device_t child) { return (-1); } static kobj_method_t root_methods[] = { /* Device interface */ KOBJMETHOD(device_shutdown, bus_generic_shutdown), KOBJMETHOD(device_suspend, bus_generic_suspend), KOBJMETHOD(device_resume, root_resume), /* Bus interface */ KOBJMETHOD(bus_print_child, root_print_child), KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar), KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar), KOBJMETHOD(bus_setup_intr, root_setup_intr), KOBJMETHOD(bus_child_present, root_child_present), KOBJMETHOD_END }; static driver_t root_driver = { "root", root_methods, 1, /* no softc */ }; device_t root_bus; devclass_t root_devclass; static int root_bus_module_handler(module_t mod, int what, void* arg) { switch (what) { case MOD_LOAD: TAILQ_INIT(&bus_data_devices); kobj_class_compile((kobj_class_t) &root_driver); root_bus = make_device(NULL, "root", 0); root_bus->desc = "System root bus"; kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver); root_bus->driver = &root_driver; root_bus->state = DS_ATTACHED; root_devclass = devclass_find_internal("root", NULL, FALSE); devinit(); return (0); case MOD_SHUTDOWN: device_shutdown(root_bus); return (0); default: return (EOPNOTSUPP); } return (0); } static moduledata_t root_bus_mod = { "rootbus", root_bus_module_handler, NULL }; DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); /** * @brief Automatically configure devices * * This function begins the autoconfiguration process by calling * device_probe_and_attach() for each child of the @c root0 device. */ void root_bus_configure(void) { PDEBUG((".")); /* Eventually this will be split up, but this is sufficient for now. */ bus_set_pass(BUS_PASS_DEFAULT); } /** * @brief Module handler for registering device drivers * * This module handler is used to automatically register device * drivers when modules are loaded. If @p what is MOD_LOAD, it calls * devclass_add_driver() for the driver described by the * driver_module_data structure pointed to by @p arg */ int driver_module_handler(module_t mod, int what, void *arg) { struct driver_module_data *dmd; devclass_t bus_devclass; kobj_class_t driver; int error, pass; dmd = (struct driver_module_data *)arg; bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE); error = 0; switch (what) { case MOD_LOAD: if (dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); pass = dmd->dmd_pass; driver = dmd->dmd_driver; PDEBUG(("Loading module: driver %s on bus %s (pass %d)", DRIVERNAME(driver), dmd->dmd_busname, pass)); error = devclass_add_driver(bus_devclass, driver, pass, dmd->dmd_devclass); break; case MOD_UNLOAD: PDEBUG(("Unloading module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_delete_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; case MOD_QUIESCE: PDEBUG(("Quiesce module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_quiesce_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; default: error = EOPNOTSUPP; break; } return (error); } /** * @brief Enumerate all hinted devices for this bus. * * Walks through the hints for this bus and calls the bus_hinted_child * routine for each one it fines. 
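 *
 * Hinted children come from /boot/device.hints (or a static hints file
 * compiled into the kernel); for example, lines such as the following
 * would cause an ISA bus to be asked to add a uart(4) child (the values
 * are illustrative):
 *
 *	hint.uart.2.at="isa0"
 *	hint.uart.2.port="0x3E8"
 *	hint.uart.2.irq="5"
 *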
It searches first for the specific * bus that's being probed for hinted children (eg isa0), and then for * generic children (eg isa). * * @param dev bus device to enumerate */ void bus_enumerate_hinted_children(device_t bus) { int i; const char *dname, *busname; int dunit; /* * enumerate all devices on the specific bus */ busname = device_get_nameunit(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); /* * and all the generic ones. */ busname = device_get_name(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); } #ifdef BUS_DEBUG /* the _short versions avoid iteration by not calling anything that prints * more than oneliners. I love oneliners. */ static void print_device_short(device_t dev, int indent) { if (!dev) return; indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n", dev->unit, dev->desc, (dev->parent? "":"no "), (TAILQ_EMPTY(&dev->children)? "no ":""), (dev->flags&DF_ENABLED? "enabled,":"disabled,"), (dev->flags&DF_FIXEDCLASS? "fixed,":""), (dev->flags&DF_WILDCARD? "wildcard,":""), (dev->flags&DF_DESCMALLOCED? "descmalloced,":""), (dev->flags&DF_REBID? "rebiddable,":""), (dev->ivars? "":"no "), (dev->softc? "":"no "), dev->busy)); } static void print_device(device_t dev, int indent) { if (!dev) return; print_device_short(dev, indent); indentprintf(("Parent:\n")); print_device_short(dev->parent, indent+1); indentprintf(("Driver:\n")); print_driver_short(dev->driver, indent+1); indentprintf(("Devclass:\n")); print_devclass_short(dev->devclass, indent+1); } void print_device_tree_short(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device_short(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree_short(child, indent+1); } } void print_device_tree(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree(child, indent+1); } } static void print_driver_short(driver_t *driver, int indent) { if (!driver) return; indentprintf(("driver %s: softc size = %zd\n", driver->name, driver->size)); } static void print_driver(driver_t *driver, int indent) { if (!driver) return; print_driver_short(driver, indent); } static void print_driver_list(driver_list_t drivers, int indent) { driverlink_t driver; TAILQ_FOREACH(driver, &drivers, link) { print_driver(driver->driver, indent); } } static void print_devclass_short(devclass_t dc, int indent) { if ( !dc ) return; indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit)); } static void print_devclass(devclass_t dc, int indent) { int i; if ( !dc ) return; print_devclass_short(dc, indent); indentprintf(("Drivers:\n")); print_driver_list(dc->drivers, indent+1); indentprintf(("Devices:\n")); for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) print_device(dc->devices[i], indent+1); } void print_devclass_list_short(void) { devclass_t dc; printf("Short listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass_short(dc, 0); } } void print_devclass_list(void) { devclass_t dc; printf("Full listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass(dc, 0); } } #endif /* * User-space access to the device tree. 
* * We implement a small set of nodes: * * hw.bus Single integer read method to obtain the * current generation count. * hw.bus.devices Reads the entire device tree in flat space. * hw.bus.rman Resource manager interface * * We might like to add the ability to scan devclasses and/or drivers to * determine what else is currently loaded/available. */ static int sysctl_bus(SYSCTL_HANDLER_ARGS) { struct u_businfo ubus; ubus.ub_version = BUS_USER_VERSION; ubus.ub_generation = bus_data_generation; return (SYSCTL_OUT(req, &ubus, sizeof(ubus))); } SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus, "bus-related data"); static int sysctl_devices(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; int index; struct device *dev; struct u_device udev; /* XXX this is a bit big */ int error; if (namelen != 2) return (EINVAL); if (bus_data_generation_check(name[0])) return (EINVAL); index = name[1]; /* * Scan the list of devices, looking for the requested index. */ TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (index-- == 0) break; } if (dev == NULL) return (ENOENT); /* * Populate the return array. */ bzero(&udev, sizeof(udev)); udev.dv_handle = (uintptr_t)dev; udev.dv_parent = (uintptr_t)dev->parent; if (dev->nameunit != NULL) strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name)); if (dev->desc != NULL) strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc)); if (dev->driver != NULL && dev->driver->name != NULL) strlcpy(udev.dv_drivername, dev->driver->name, sizeof(udev.dv_drivername)); bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo)); bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location)); udev.dv_devflags = dev->devflags; udev.dv_flags = dev->flags; udev.dv_state = dev->state; error = SYSCTL_OUT(req, &udev, sizeof(udev)); return (error); } SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices, "system device tree"); int bus_data_generation_check(int generation) { if (generation != bus_data_generation) return (1); /* XXX generate optimised lists here? */ return (0); } void bus_data_generation_update(void) { bus_data_generation++; } int bus_free_resource(device_t dev, int type, struct resource *r) { if (r == NULL) return (0); return (bus_release_resource(dev, type, rman_get_rid(r), r)); } Index: stable/10/usr.sbin/bhyve/acpi.c =================================================================== --- stable/10/usr.sbin/bhyve/acpi.c (revision 284898) +++ stable/10/usr.sbin/bhyve/acpi.c (revision 284899) @@ -1,1012 +1,1012 @@ /*- * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * bhyve ACPI table generator. * * Create the minimal set of ACPI tables required to boot FreeBSD (and * hopefully other o/s's) by writing out ASL template files for each of * the tables and the compiling them to AML with the Intel iasl compiler. * The AML files are then read into guest memory. * * The tables are placed in the guest's ROM area just below 1MB physical, * above the MPTable. * * Layout * ------ * RSDP -> 0xf2400 (36 bytes fixed) * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used) * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used) * MADT -> 0xf2500 (depends on #CPUs) * FADT -> 0xf2600 (268 bytes) * HPET -> 0xf2740 (56 bytes) * MCFG -> 0xf2780 (60 bytes) * FACS -> 0xf27C0 (64 bytes) * DSDT -> 0xf2800 (variable - can go up to 0x100000) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "acpi.h" #include "pci_emul.h" /* * Define the base address of the ACPI tables, and the offsets to * the individual tables */ #define BHYVE_ACPI_BASE 0xf2400 #define RSDT_OFFSET 0x040 #define XSDT_OFFSET 0x080 #define MADT_OFFSET 0x100 #define FADT_OFFSET 0x200 #define HPET_OFFSET 0x340 #define MCFG_OFFSET 0x380 #define FACS_OFFSET 0x3C0 #define DSDT_OFFSET 0x400 #define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX" #define BHYVE_ASL_SUFFIX ".aml" #define BHYVE_ASL_COMPILER "/usr/sbin/iasl" static int basl_keep_temps; static int basl_verbose_iasl; static int basl_ncpu; static uint32_t basl_acpi_base = BHYVE_ACPI_BASE; static uint32_t hpet_capabilities; /* * Contains the full pathname of the template to be passed * to mkstemp/mktemps(3) */ static char basl_template[MAXPATHLEN]; static char basl_stemplate[MAXPATHLEN]; /* * State for dsdt_line(), dsdt_indent(), and dsdt_unindent(). */ static FILE *dsdt_fp; static int dsdt_indent_level; static int dsdt_error; struct basl_fio { int fd; FILE *fp; char f_name[MAXPATHLEN]; }; #define EFPRINTF(...) 
\ err = fprintf(__VA_ARGS__); if (err < 0) goto err_exit; #define EFFLUSH(x) \ err = fflush(x); if (err != 0) goto err_exit; static int basl_fwrite_rsdp(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve RSDP template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 43\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0001]\t\tRevision : 02\n"); EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n", basl_acpi_base + RSDT_OFFSET); EFPRINTF(fp, "[0004]\t\tLength : 00000024\n"); EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n", basl_acpi_base + XSDT_OFFSET); EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n"); EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_rsdt(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve RSDT template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); /* Add in pointers to the MADT, FADT and HPET */ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n", basl_acpi_base + MADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n", basl_acpi_base + FADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n", basl_acpi_base + HPET_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", basl_acpi_base + MCFG_OFFSET); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_xsdt(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve XSDT template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); /* Add in pointers to the MADT, FADT and HPET */ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n", basl_acpi_base + MADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n", basl_acpi_base + FADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n", basl_acpi_base + HPET_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", basl_acpi_base + MCFG_OFFSET); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_madt(FILE *fp) { int err; int i; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve MADT template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID 
: \"BVMADT \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n"); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n"); EFPRINTF(fp, "\n"); /* Add a Processor Local APIC entry for each CPU */ for (i = 0; i < basl_ncpu; i++) { EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n"); EFPRINTF(fp, "[0001]\t\tLength : 08\n"); /* iasl expects hex values for the proc and apic id's */ EFPRINTF(fp, "[0001]\t\tProcessor ID : %02x\n", i); EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02x\n", i); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n"); EFPRINTF(fp, "\n"); } /* Always a single IOAPIC entry, with ID 0 */ EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n"); EFPRINTF(fp, "[0001]\t\tLength : 0C\n"); /* iasl expects a hex value for the i/o apic id */ EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02x\n", 0); EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n"); EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n"); EFPRINTF(fp, "\n"); /* Legacy IRQ0 is connected to pin 2 of the IOAPIC */ EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); EFPRINTF(fp, "[0001]\t\tBus : 00\n"); EFPRINTF(fp, "[0001]\t\tSource : 00\n"); EFPRINTF(fp, "[0004]\t\tInterrupt : 00000002\n"); EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); EFPRINTF(fp, "\t\t\tPolarity : 1\n"); EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); EFPRINTF(fp, "[0001]\t\tBus : 00\n"); EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT); EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT); EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n"); EFPRINTF(fp, "\t\t\tPolarity : 3\n"); EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n"); EFPRINTF(fp, "\n"); /* Local APIC NMI is connected to LINT 1 on all CPUs */ EFPRINTF(fp, "[0001]\t\tSubtable Type : 04\n"); EFPRINTF(fp, "[0001]\t\tLength : 06\n"); EFPRINTF(fp, "[0001]\t\tProcessorId : FF\n"); EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); EFPRINTF(fp, "\t\t\tPolarity : 1\n"); EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); EFPRINTF(fp, "[0001]\t\tInterrupt : 01\n"); EFPRINTF(fp, "\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_fadt(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve FADT template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n"); EFPRINTF(fp, "[0001]\t\tRevision : 05\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n", basl_acpi_base + FACS_OFFSET); EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n", basl_acpi_base + DSDT_OFFSET); EFPRINTF(fp, "[0001]\t\tModel : 01\n"); EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n"); EFPRINTF(fp, 
"[0002]\t\tSCI Interrupt : %04X\n", SCI_INT); EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n", SMI_CMD); EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n", BHYVE_ACPI_ENABLE); EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n", BHYVE_ACPI_DISABLE); EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n"); EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n"); EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n", PM1A_EVT_ADDR); EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n"); EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n", PM1A_CNT_ADDR); EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n"); EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n"); EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n", IO_PMTMR); EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n"); EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n"); EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n"); EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n"); EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n"); EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n"); EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n"); EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n"); EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n"); EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n"); EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n"); EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n"); EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n"); EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n"); EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n"); EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n"); EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n"); - EFPRINTF(fp, "[0001]\t\tRTC Century Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Century Index : 32\n"); EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n"); EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n"); EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n"); EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n"); EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n"); EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n"); EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n"); EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n"); EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n"); EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n"); EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n"); EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n"); EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n"); EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n"); EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n"); EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n"); EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n"); EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 1\n"); EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n"); EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n"); EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n"); EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n"); EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n"); EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n"); EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n"); EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n"); EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n"); EFPRINTF(fp, "\t\t\tHardware 
Reduced (V5) : 0\n"); EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tReset Register : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000CF9\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0001]\t\tValue to cause reset : 06\n"); EFPRINTF(fp, "[0002]\t\tARM Flags (decoded below): 0000\n"); EFPRINTF(fp, "\t\t\tPSCI Compliant : 0\n"); EFPRINTF(fp, "\t\t\tMust use HVC for PSCI : 0\n"); EFPRINTF(fp, "[0001]\t\tFADT Minor Revision : 01\n"); EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n", basl_acpi_base + FACS_OFFSET); EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n", basl_acpi_base + DSDT_OFFSET); EFPRINTF(fp, "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", PM1A_EVT_ADDR); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 10\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", PM1A_CNT_ADDR); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); /* Valid for bhyve */ EFPRINTF(fp, "[0012]\t\tPM Timer Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", IO_PMTMR); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tGPE1 Block 
: [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tSleep Control Register : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tSleep Status Register : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_hpet(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve HPET template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"HPET\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVHPET \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0004]\t\tTimer Block ID : %08X\n", hpet_capabilities); EFPRINTF(fp, "[0012]\t\tTimer Block Register : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 00 [SystemMemory]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 00000000FED00000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0001]\t\tHPET Number : 00\n"); EFPRINTF(fp, "[0002]\t\tMinimum Clock Ticks : 0000\n"); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); EFPRINTF(fp, "\t\t\t4K Page Protect : 1\n"); EFPRINTF(fp, "\t\t\t64K Page Protect : 0\n"); EFPRINTF(fp, "\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_mcfg(FILE *fp) { int err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve MCFG template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); EFFLUSH(fp); return 
(0); err_exit: return (errno); } static int basl_fwrite_facs(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve FACS template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n"); EFPRINTF(fp, "[0004]\t\tLength : 00000040\n"); EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n"); EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n"); EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n"); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n"); EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n"); EFPRINTF(fp, "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n"); EFPRINTF(fp, "[0001]\t\tVersion : 02\n"); EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n"); EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } /* * Helper routines for writing to the DSDT from other modules. */ void dsdt_line(const char *fmt, ...) { va_list ap; int err; if (dsdt_error != 0) return; if (strcmp(fmt, "") != 0) { if (dsdt_indent_level != 0) EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' '); va_start(ap, fmt); if (vfprintf(dsdt_fp, fmt, ap) < 0) goto err_exit; va_end(ap); } EFPRINTF(dsdt_fp, "\n"); return; err_exit: dsdt_error = errno; } void dsdt_indent(int levels) { dsdt_indent_level += levels; assert(dsdt_indent_level >= 0); } void dsdt_unindent(int levels) { assert(dsdt_indent_level >= levels); dsdt_indent_level -= levels; } void dsdt_fixed_ioport(uint16_t iobase, uint16_t length) { dsdt_line("IO (Decode16,"); dsdt_line(" 0x%04X, // Range Minimum", iobase); dsdt_line(" 0x%04X, // Range Maximum", iobase); dsdt_line(" 0x01, // Alignment"); dsdt_line(" 0x%02X, // Length", length); dsdt_line(" )"); } void dsdt_fixed_irq(uint8_t irq) { dsdt_line("IRQNoFlags ()"); dsdt_line(" {%d}", irq); } void dsdt_fixed_mem32(uint32_t base, uint32_t length) { dsdt_line("Memory32Fixed (ReadWrite,"); dsdt_line(" 0x%08X, // Address Base", base); dsdt_line(" 0x%08X, // Address Length", length); dsdt_line(" )"); } static int basl_fwrite_dsdt(FILE *fp) { int err; err = 0; dsdt_fp = fp; dsdt_error = 0; dsdt_indent_level = 0; dsdt_line("/*"); dsdt_line(" * bhyve DSDT template"); dsdt_line(" */"); dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2," "\"BHYVE \", \"BVDSDT \", 0x00000001)"); dsdt_line("{"); dsdt_line(" Name (_S5, Package ()"); dsdt_line(" {"); dsdt_line(" 0x05,"); dsdt_line(" Zero,"); dsdt_line(" })"); pci_write_dsdt(); dsdt_line(""); dsdt_line(" Scope (_SB.PC00)"); dsdt_line(" {"); dsdt_line(" Device (HPET)"); dsdt_line(" {"); dsdt_line(" Name (_HID, EISAID(\"PNP0103\"))"); dsdt_line(" Name (_UID, 0)"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(4); dsdt_fixed_mem32(0xFED00000, 0x400); dsdt_unindent(4); dsdt_line(" })"); dsdt_line(" }"); dsdt_line(" }"); dsdt_line("}"); if (dsdt_error != 0) return (dsdt_error); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_open(struct basl_fio *bf, int suffix) { int err; err = 0; if (suffix) { strncpy(bf->f_name, basl_stemplate, MAXPATHLEN); bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX)); } else { strncpy(bf->f_name, basl_template, MAXPATHLEN); bf->fd = mkstemp(bf->f_name); } if (bf->fd > 0) { bf->fp = fdopen(bf->fd, "w+"); if (bf->fp == NULL) { unlink(bf->f_name); close(bf->fd); } } else { err = 1; } return (err); } static void basl_close(struct basl_fio *bf) { if 
(!basl_keep_temps) unlink(bf->f_name); fclose(bf->fp); } static int basl_start(struct basl_fio *in, struct basl_fio *out) { int err; err = basl_open(in, 0); if (!err) { err = basl_open(out, 1); if (err) { basl_close(in); } } return (err); } static void basl_end(struct basl_fio *in, struct basl_fio *out) { basl_close(in); basl_close(out); } static int basl_load(struct vmctx *ctx, int fd, uint64_t off) { struct stat sb; void *gaddr; if (fstat(fd, &sb) < 0) return (errno); gaddr = paddr_guest2host(ctx, basl_acpi_base + off, sb.st_size); if (gaddr == NULL) return (EFAULT); if (read(fd, gaddr, sb.st_size) < 0) return (errno); return (0); } static int basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *), uint64_t offset) { struct basl_fio io[2]; static char iaslbuf[3*MAXPATHLEN + 10]; char *fmt; int err; err = basl_start(&io[0], &io[1]); if (!err) { err = (*fwrite_section)(io[0].fp); if (!err) { /* * iasl sends the results of the compilation to * stdout. Shut this down by using the shell to * redirect stdout to /dev/null, unless the user * has requested verbose output for debugging * purposes */ fmt = basl_verbose_iasl ? "%s -p %s %s" : "/bin/sh -c \"%s -p %s %s\" 1> /dev/null"; snprintf(iaslbuf, sizeof(iaslbuf), fmt, BHYVE_ASL_COMPILER, io[1].f_name, io[0].f_name); err = system(iaslbuf); if (!err) { /* * Copy the aml output file into guest * memory at the specified location */ err = basl_load(ctx, io[1].fd, offset); } } basl_end(&io[0], &io[1]); } return (err); } static int basl_make_templates(void) { const char *tmpdir; int err; int len; err = 0; /* * */ if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0' || (tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0') { tmpdir = _PATH_TMP; } len = strlen(tmpdir); if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) { strcpy(basl_template, tmpdir); while (len > 0 && basl_template[len - 1] == '/') len--; basl_template[len] = '/'; strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE); } else err = E2BIG; if (!err) { /* * len has been intialized (and maybe adjusted) above */ if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 + sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) { strcpy(basl_stemplate, tmpdir); basl_stemplate[len] = '/'; strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE); len = strlen(basl_stemplate); strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX); } else err = E2BIG; } return (err); } static struct { int (*wsect)(FILE *fp); uint64_t offset; } basl_ftables[] = { { basl_fwrite_rsdp, 0}, { basl_fwrite_rsdt, RSDT_OFFSET }, { basl_fwrite_xsdt, XSDT_OFFSET }, { basl_fwrite_madt, MADT_OFFSET }, { basl_fwrite_fadt, FADT_OFFSET }, { basl_fwrite_hpet, HPET_OFFSET }, { basl_fwrite_mcfg, MCFG_OFFSET }, { basl_fwrite_facs, FACS_OFFSET }, { basl_fwrite_dsdt, DSDT_OFFSET }, { NULL } }; int acpi_build(struct vmctx *ctx, int ncpu) { int err; int i; basl_ncpu = ncpu; err = vm_get_hpet_capabilities(ctx, &hpet_capabilities); if (err != 0) return (err); /* * For debug, allow the user to have iasl compiler output sent * to stdout rather than /dev/null */ if (getenv("BHYVE_ACPI_VERBOSE_IASL")) basl_verbose_iasl = 1; /* * Allow the user to keep the generated ASL files for debugging * instead of deleting them following use */ if (getenv("BHYVE_ACPI_KEEPTMPS")) basl_keep_temps = 1; i = 0; err = basl_make_templates(); /* * Run through all the ASL files, compiling them and * copying them into guest memory */ while (!err && basl_ftables[i].wsect != NULL) { err = basl_compile(ctx, basl_ftables[i].wsect, basl_ftables[i].offset); i++; } return (err); } Index: 
stable/10/usr.sbin/bhyve/bhyverun.c =================================================================== --- stable/10/usr.sbin/bhyve/bhyverun.c (revision 284898) +++ stable/10/usr.sbin/bhyve/bhyverun.c (revision 284899) @@ -1,880 +1,892 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "acpi.h" #include "inout.h" #include "dbgport.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); char *vmname; int guest_ncpus; char *guest_uuid_str; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; static int virtio_msix = 1; static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; static int acpi; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_bogus_switch; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; static void usage(int code) { fprintf(stderr, "Usage: %s [-abehuwxACHPWY] [-c vcpus] [-g ] [-l ]\n" " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" " -c: # cpus (default 1)\n" " -C: include guest memory in core file\n" " -e: exit on unhandled I/O access\n" " -g: gdb port\n" " -h: help\n" " -H: vmexit from the guest on hlt\n" " -l: LPC 
device configuration\n" " -m: memory size in MB\n" " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" " -u: RTC keeps UTC time\n" " -U: uuid\n" " -w: ignore unimplemented MSRs\n" " -W: force virtio to use single-vector MSI\n" " -x: local apic is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), ""); exit(code); } static int pincpu_parse(const char *opt) { int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0 || vcpu >= VM_MAXCPU) { fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", vcpu, VM_MAXCPU - 1); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } if (vcpumap[vcpu] == NULL) { if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { perror("malloc"); return (-1); } CPU_ZERO(vcpumap[vcpu]); } CPU_SET(pcpu, vcpumap[vcpu]); return (0); } void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } int fbsdrun_vmexit_on_pause(void) { return (guest_vmexit_on_pause); } int fbsdrun_vmexit_on_hlt(void) { return (guest_vmexit_on_hlt); } int fbsdrun_virtio_msix(void) { return (virtio_msix); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) { int error; assert(fromcpu == BSP); /* * The 'newcpu' must be activated in the context of 'fromcpu'. If * vm_activate_cpu() is delayed until newcpu's pthread starts running * then vmm.ko is out-of-sync with bhyve and this can create a race * with vm_suspend(). 
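 *
 * Editor's illustrative note (added, not part of the original
 * comment): the resulting AP bring-up order is therefore
 *
 *     vm_activate_cpu(ctx, newcpu);        -- in the BSP's context
 *     vmexit[newcpu].rip = rip;            -- start address for the AP
 *     pthread_create(..., fbsdrun_start_thread, ...);
 *     fbsdrun_start_thread() -> vm_loop(ctx, newcpu, rip) -> vm_run()
 *
 * so vmm.ko already knows about 'newcpu' before its vcpu thread runs.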
*/ error = vm_activate_cpu(ctx, newcpu); assert(error == 0); CPU_SET_ATOMIC(newcpu, &cpumask); /* * Set up the vmexit struct to allow execution to start * at the given RIP */ vmexit[newcpu].rip = rip; vmexit[newcpu].inst_length = 0; mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(1); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out, string; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; string = vme->u.inout.string; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme, strictio); if (error) { - fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out", - bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port); + fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", + in ? "in" : "out", + bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), + port, vmexit->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int newcpu; int retval = VMEXIT_CONTINUE; newcpu = spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); return (retval); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define EXIT_REASON_EPT_MISCONFIG 49 #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 #define VMCS_IDENT(x) ((x) | 0x80000000) static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); fprintf(stderr, 
"\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_mtrap++; return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { - int err; + int err, i; + struct vie *vie; + stats.vmexit_inst_emul++; + vie = &vmexit->u.inst_emul.vie; err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, - &vmexit->u.inst_emul.vie, &vmexit->u.inst_emul.paging); + vie, &vmexit->u.inst_emul.paging); if (err) { - if (err == EINVAL) { - fprintf(stderr, - "Failed to emulate instruction at 0x%lx\n", - vmexit->rip); - } else if (err == ESRCH) { + if (err == ESRCH) { fprintf(stderr, "Unhandled memory access to 0x%lx\n", vmexit->u.inst_emul.gpa); } + fprintf(stderr, "Failed to emulate instruction ["); + for (i = 0; i < vie->num_valid; i++) { + fprintf(stderr, "0x%02x%s", vie->inst[i], + i != (vie->num_valid - 1) ? 
" " : ""); + } + fprintf(stderr, "] at 0x%lx\n", vmexit->rip); return (VMEXIT_ABORT); } return (VMEXIT_CONTINUE); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { enum vm_suspend_how how; how = vmexit->u.suspended.how; fbsdrun_deletecpu(ctx, *pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) { int error, rc, prevcpu; enum vm_exitcode exitcode; cpuset_t active_cpus; if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); while (1) { error = vm_run(ctx, vcpu, &vmexit[vcpu]); if (error != 0) break; prevcpu = vcpu; exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(1); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(1); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. 
*/ if (error == 0) return (VM_MAXCPU); else return (1); } void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (fbsdrun_vmexit_on_hlt()) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (fbsdrun_vmexit_on_pause()) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (x2apic_mode) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); } int main(int argc, char *argv[]) { int c, error, gdb_port, err, bvmcons; int dump_guest_memory, max_vcpus, mptgen; int rtc_localtime; struct vmctx *ctx; uint64_t rip; size_t memsize; bvmcons = 0; dump_guest_memory = 0; progname = basename(argv[0]); gdb_port = 0; guest_ncpus = 1; memsize = 256 * MB; mptgen = 1; rtc_localtime = 1; while ((c = getopt(argc, argv, "abehuwxACHIPWYp:g:c:s:m:l:U:")) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; case 'A': acpi = 1; break; case 'b': bvmcons = 1; break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': guest_ncpus = atoi(optarg); break; case 'C': dump_guest_memory = 1; break; case 'g': gdb_port = atoi(optarg); break; case 'l': if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; case 's': if (pci_parse_slot(optarg) != 0) exit(1); else break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", optarg); break; case 'H': guest_vmexit_on_hlt = 1; break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. 
*/ break; case 'P': guest_vmexit_on_pause = 1; break; case 'e': strictio = 1; break; case 'u': rtc_localtime = 0; break; case 'U': guest_uuid_str = optarg; break; case 'w': strictmsr = 0; break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; case 'Y': mptgen = 0; break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc != 1) usage(1); vmname = argv[0]; ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); + exit(1); + } + + if (guest_ncpus < 1) { + fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(1); } fbsdrun_set_capabilities(ctx, BSP); if (dump_guest_memory) vm_set_memflags(ctx, VM_MEM_F_INCORE); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", err); exit(1); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(1); } init_mem(); init_inout(); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx, rtc_localtime); sci_init(ctx); /* * Exit if a device emulation finds an error in it's initilization */ if (init_pci(ctx) != 0) exit(1); if (gdb_port != 0) init_dbgport(gdb_port); if (bvmcons) init_bvmcons(); error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (mptgen) { error = mptable_build(ctx, guest_ncpus); if (error) exit(1); } error = smbios_build(ctx); assert(error == 0); if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, BSP, rip); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(1); } Index: stable/10/usr.sbin/bhyve/pci_emul.c =================================================================== --- stable/10/usr.sbin/bhyve/pci_emul.c (revision 284898) +++ stable/10/usr.sbin/bhyve/pci_emul.c (revision 284899) @@ -1,2089 +1,2100 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi.h" #include "bhyverun.h" #include "inout.h" #include "ioapic.h" #include "mem.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul -#define CFGWRITE(pi,off,val,b) \ -do { \ - if ((b) == 1) { \ - pci_set_cfgdata8((pi),(off),(val)); \ - } else if ((b) == 2) { \ - pci_set_cfgdata16((pi),(off),(val)); \ - } else { \ - pci_set_cfgdata32((pi),(off),(val)); \ - } \ -} while (0) - #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) struct funcinfo { char *fi_name; char *fi_param; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; int ii_pirq_pin; int ii_ioapic_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMBASE64 0xD000000000UL #define PCI_EMUL_MEMLIMIT64 0xFD00000000UL static struct pci_devemu *pci_emul_finddev(char *name); static void pci_lintr_route(struct pci_devinst *pi); static void pci_lintr_update(struct pci_devinst *pi); static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); +static __inline void +CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) +{ + + if (bytes == 1) + pci_set_cfgdata8(pi, coff, val); + else if (bytes == 2) + pci_set_cfgdata16(pi, coff, val); + else + pci_set_cfgdata32(pi, coff, val); +} + +static __inline uint32_t +CFGREAD(struct pci_devinst *pi, int coff, int bytes) +{ + + if (bytes == 1) + return (pci_get_cfgdata8(pi, coff)); + else if (bytes == 2) + return (pci_get_cfgdata16(pi, coff)); + else + return (pci_get_cfgdata32(pi, coff)); +} + /* * I/O access */ /* * Slot options are in the form: * * ::,[,] * [:],[,] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. 
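 *
 * (Editor's clarifying addition) pci_parse_slot() below tries the
 * bus:slot:func form first, then slot:func, then a bare slot number,
 * with omitted fields defaulting to 0.  So "4,virtio-net,tap0" means
 * bus 0, slot 4, function 0, while "1:4:2,virtio-net,tap1" means
 * bus 1, slot 4, function 2.
 *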
* Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt); } int pci_parse_slot(char *opt) { struct businfo *bi; struct slotinfo *si; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* :: */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* : */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } if (pci_businfo[bnum] == NULL) pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bnum]; si = &bi->slotinfo[snum]; if (si->si_funcs[fnum].fi_name != NULL) { fprintf(stderr, "pci slot %d:%d already occupied!\n", snum, fnum); goto done; } if (pci_emul_finddev(emul) == NULL) { fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n", snum, fnum, emul); goto done; } error = 0; si->si_funcs[fnum].fi_name = emul; si->si_funcs[fnum].fi_param = config; done: if (error) free(str); return (error); } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accomodate reads from * ddb. 
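/*
 * Editor's worked example (illustrative addition): assuming the usual
 * 16-byte MSI-X entries behind MSIX_TABLE_ENTRY_SIZE, a 4-byte read at
 * table offset 0x34 decodes below as
 *
 *     tab_index         = 0x34 / MSIX_TABLE_ENTRY_SIZE = 3
 *     msix_entry_offset = 0x34 % MSIX_TABLE_ENTRY_SIZE = 4
 *
 * i.e. the upper address dword of entry 3, while an offset past
 * table_count entries that still falls inside the PBA reads back as 0.
 */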
*/ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } static int pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && port >= pdi->pi_bar[i].addr && port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, offset, bytes); else (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, bytes, *eax); return (0); } } return (-1); } static int pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int) arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, 4, *val & 0xffffffff); (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, 4, *val >> 32); } else { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val); } } else { if (size == 8) { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, 4); *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset + 4, 4) << 32; } else { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. 
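/*
 * Editor's note (illustrative addition): the regions registered by the
 * function below are the ones handed out by pci_emul_alloc_resource()
 * above, which keeps BARs naturally aligned by rounding the running
 * base up to the BAR size.  For example, with *baseptr == 0x2100 a
 * 0x1000-byte request yields roundup2(0x2100, 0x1000) == 0x3000, so
 * *addr becomes 0x3000 and *baseptr advances to 0x4000 (assuming that
 * still fits under 'limit').
 */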
*/ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { int error; struct inout_port iop; struct mem_range mr; switch (pi->pi_bar[idx].type) { case PCIBAR_IO: bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); break; case PCIBAR_MEM32: case PCIBAR_MEM64: bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); break; default: error = EINVAL; break; } assert(error == 0); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. */ static void update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; if (pi->pi_bar[idx].type == PCIBAR_IO) decode = porten(pi); else decode = memen(pi); if (decode) unregister_bar(pi, idx); switch (type) { case PCIBAR_IO: case PCIBAR_MEM32: pi->pi_bar[idx].addr = addr; break; case PCIBAR_MEM64: pi->pi_bar[idx].addr &= ~0xffffffffUL; pi->pi_bar[idx].addr |= addr; break; case PCIBAR_MEMHI64: pi->pi_bar[idx].addr &= 0xffffffff; pi->pi_bar[idx].addr |= addr; break; default: assert(0); } if (decode) register_bar(pi, idx); } int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; assert(idx >= 0 && idx <= PCI_BARMAX); if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ /* Enforce minimum BAR sizes required by the PCI standard */ if (type == PCIBAR_IO) { if (size < 4) size = 4; } else { if (size < 16) size = 16; } switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless then allocation size is larger than some arbitrary * number (32MB currently). 
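/*
 * Editor's worked example (illustrative addition): with the policy
 * described above, a 16MB 64-bit BAR is still carved out of the 32-bit
 * window below PCI_EMUL_MEMLIMIT32, a 256MB one is placed in the
 * 64-bit window starting at PCI_EMUL_MEMBASE64 (0xD000000000), and
 * only an exactly 4GB request keeps the caller-supplied 'hostbase'.
 */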
*/ if (size > 32 * 1024 * 1024) { /* * XXX special case for device requiring peer-peer DMA */ if (size == 0x100000000UL) baseptr = &hostbase; else baseptr = &pci_emul_membase64; limit = PCI_EMUL_MEMLIMIT64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; break; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } register_bar(pdi, idx); return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pdi->pi_lintr.pirq_pin = 0; pdi->pi_lintr.ioapic_irq = 0; pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(ctx, pdi, fi->fi_param); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; CTASSERT(sizeof(struct msicap) == 14); /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct 
msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { CTASSERT(sizeof(struct msixcap) == 12); assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. * Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off, table_bar; off = offset - capoff; table_bar = pi->pi_msix.table_bar; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. 
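/*
 * Editor's worked example (illustrative addition): only the MME field
 * and the MSI enable bit are guest-writable here.  If the register
 * currently reads 0x0080 (64-bit capable, as set up by
 * pci_populate_msicap()) and the guest writes 0x0001, the stored value
 * becomes (0x0080 & ~rwmask) | (0x0001 & rwmask) == 0x0081: MSI is
 * enabled without disturbing the read-only capability bits.
 */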
*/ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } void pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; CTASSERT(sizeof(struct pciecap) == 60); if (type != PCIEM_TYPE_ROOT_PORT) return (-1); bzero(&pciecap, sizeof(pciecap)); pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT; pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. */ static void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) { int capid; uint8_t capoff, nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. */ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } capid = pci_get_cfgdata8(pi, capoff); switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. 
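/*
 * Editor's note (illustrative addition, refers to pci_emul_capwrite()
 * above): a 4-byte guest write of 0xDEAD0071 aimed at the very start
 * of a capability keeps the read-only ID/next-pointer pair intact --
 * the write is trimmed to 2 bytes, shifted to capoff + 2 and the data
 * becomes 0xDEAD before it reaches the per-capability handler.
 */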
*/ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int bytes, uint64_t *val, void *arg1, long arg2) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } #define BUSIO_ROUNDUP 32 #define BUSMEM_ROUNDUP (1024 * 1024) int init_pci(struct vmctx *ctx) { struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; size_t lowmem; int bus, slot, func; int error; pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = vm_get_lowmem_limit(ctx); pci_emul_membase64 = PCI_EMUL_MEMBASE64; for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; /* * Keep track of the i/o and memory resources allocated to * this bus. */ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_name == NULL) continue; pde = pci_emul_finddev(fi->fi_name); assert(pde != NULL); error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. */ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } lpc_pirq_routed(); /* * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, lowmem_limit) memory hole (may be absent) * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. 
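/*
 * Editor's worked example (illustrative addition): the "PCI extended
 * config window" above is the ECAM region decoded by
 * pci_emul_ecfg_handler().  An access at PCI_EMUL_ECFG_BASE + 0xA810
 * (guest physical 0xE000A810) unpacks as
 *
 *     coff = addr & 0xfff        = 0x810
 *     func = (addr >> 12) & 0x7  = 2
 *     slot = (addr >> 15) & 0x1f = 1
 *     bus  = (addr >> 20) & 0xff = 0
 *
 * i.e. bus 0, device 1, function 2, config offset 0x810, which
 * pci_cfgrw() then treats as an access beyond standard config space.
 */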
*/ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } static void pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { char *name; name = lpc_pirq_name(pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. */ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int count, func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. */ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Name (_ADR, Zero)"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); 
dsdt_line(" ,, , TypeStatic)"); /* mmio window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); count = pci_count_lintr(bus); if (count != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); done: dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = 
pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. */ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; /* * Attempt to allocate an I/O APIC pin for this intpin if one * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) ii->ii_ioapic_irq = ioapic_pci_alloc_irq(); assert(ii->ii_ioapic_irq > 0); /* * Attempt to allocate a PIRQ pin for this intpin if one is * not yet assigned. */ if (ii->ii_pirq_pin == 0) ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); } void pci_lintr_assert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == IDLE) { if (pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } else pi->pi_lintr.state = PENDING; } pthread_mutex_unlock(&pi->pi_lintr.lock); } void pci_lintr_deassert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED) { pi->pi_lintr.state = IDLE; pci_irq_deassert(pi); } else if (pi->pi_lintr.state == PENDING) pi->pi_lintr.state = IDLE; pthread_mutex_unlock(&pi->pi_lintr.lock); } static void pci_lintr_update(struct pci_devinst *pi) { pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { pci_irq_deassert(pi); pi->pi_lintr.state = PENDING; } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); } int pci_count_lintr(int bus) { int count, slot, pin; struct slotinfo *slotinfo; count = 0; if (pci_businfo[bus] != NULL) { for (slot = 0; slot < MAXSLOTS; slot++) { slotinfo = &pci_businfo[bus]->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { if (slotinfo->si_intpins[pin].ii_count != 0) count++; } } } return (count); } void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) { struct businfo *bi; struct slotinfo *si; struct intxinfo *ii; int slot, pin; if ((bi = pci_businfo[bus]) == NULL) return; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { ii = &si->si_intpins[pin]; if (ii->ii_count != 0) cb(bus, slot, pin + 1, ii->ii_pirq_pin, ii->ii_ioapic_irq, arg); } } } /* * Return 1 if the emulated device in 'slot' is a multi-function device. * Return 0 otherwise. 
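 *
 * Editor's note (illustrative addition): pci_emul_hdrtype_fixup() uses
 * this to patch the Header Type register (offset 0x0E) on the fly.
 * For an aligned 4-byte read at offset 0x0C the multi-function flag is
 * bit 7 of the third byte of the dword, hence the (PCIM_MFDEV << 16)
 * case; 1- and 2-byte reads that reach the fixup start at 0x0E itself,
 * so plain PCIM_MFDEV is masked in or out.
 *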
*/ static int pci_emul_is_mfdev(int bus, int slot) { struct businfo *bi; struct slotinfo *si; int f, numfuncs; numfuncs = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; for (f = 0; f < MAXFUNCS; f++) { if (si->si_funcs[f].fi_devi != NULL) { numfuncs++; } } } return (numfuncs > 1); } /* * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on * whether or not is a multi-function being emulated in the pci 'slot'. */ static void pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) { int mfdev; if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { mfdev = pci_emul_is_mfdev(bus, slot); switch (bytes) { case 1: case 2: *rv &= ~PCIM_MFDEV; if (mfdev) { *rv |= PCIM_MFDEV; } break; case 4: *rv &= ~(PCIM_MFDEV << 16); if (mfdev) { *rv |= (PCIM_MFDEV << 16); } break; } } } -static uint32_t -bits_changed(uint32_t old, uint32_t new, uint32_t mask) -{ - - return ((old ^ new) & mask); -} - static void -pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) +pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { - int i; - uint16_t old; + int i, rshift; + uint32_t cmd, cmd2, changed, old, readonly; + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ + /* - * The command register is at an offset of 4 bytes and thus the - * guest could write 1, 2 or 4 bytes starting at this offset. + * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. + * + * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are + * 'write 1 to clear'. However these bits are not set to '1' by + * any device emulation so it is simpler to treat them as readonly. */ + rshift = (coff & 0x3) * 8; + readonly = 0xFFFFF880 >> rshift; - old = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ - CFGWRITE(pi, PCIR_COMMAND, new, bytes); /* update config */ - new = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + old = CFGREAD(pi, coff, bytes); + new &= ~readonly; + new |= (old & readonly); + CFGWRITE(pi, coff, new, bytes); /* update config */ + cmd2 = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + changed = cmd ^ cmd2; + /* * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ for (i = 0; i <= PCI_BARMAX; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: break; case PCIBAR_IO: /* I/O address space decoding changed? */ - if (bits_changed(old, new, PCIM_CMD_PORTEN)) { + if (changed & PCIM_CMD_PORTEN) { if (porten(pi)) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ - if (bits_changed(old, new, PCIM_CMD_MEMEN)) { + if (changed & PCIM_CMD_MEMEN) { if (memen(pi)) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *eax) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; uint64_t addr, bar, mask; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; pi = si->si_funcs[func].fi_devi; } else pi = NULL; /* * Just return if there is no device at this slot:func or if the * the guest is doing an un-aligned access. 
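A minimal sketch (illustration only, not part of the patch) of the read-only mask arithmetic in pci_emul_cmdsts_write() above, assuming the standard config offsets PCIR_COMMAND = 0x04 and PCIR_STATUS = 0x06:

#include <stdint.h>
#include <stdio.h>

static uint32_t
cmdsts_readonly_mask(int coff)
{
	int rshift;

	/* Shift the dword-aligned mask down to the byte being written. */
	rshift = (coff & 0x3) * 8;
	return (0xFFFFF880 >> rshift);
}

int
main(void)
{
	/*
	 * coff 0x04 (command): 0xfffff880 -- the status half plus command
	 * bits 7 and 11-15 are preserved; bits 0-6 and 8-10 are writable.
	 * coff 0x06 (status):  0x0000ffff -- the whole status register is
	 * preserved, so the guest write leaves it untouched.
	 */
	printf("0x04 -> %#010x\n", (unsigned)cmdsts_readonly_mask(0x04));
	printf("0x06 -> %#010x\n", (unsigned)cmdsts_readonly_mask(0x06));
	return (0);
}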
*/ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || (coff & (bytes - 1)) != 0) { if (in) *eax = 0xffffffff; return; } /* * Ignore all writes beyond the standard config space and return all * ones on reads. */ if (coff >= PCI_REGMAX + 1) { if (in) { *eax = 0xffffffff; /* * Extended capabilities begin at offset 256 in config * space. Absence of extended capabilities is signaled * with all 0s in the extended capability header at * offset 256. */ if (coff <= PCI_REGMAX + 4) *eax = 0x00000000; } return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, eax); } else { needcfg = 1; } - if (needcfg) { - if (bytes == 1) - *eax = pci_get_cfgdata8(pi, coff); - else if (bytes == 2) - *eax = pci_get_cfgdata16(pi, coff); - else - *eax = pci_get_cfgdata32(pi, coff); - } + if (needcfg) + *eax = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) return; /* * Special handling for write to BAR registers */ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) return; idx = (coff - PCIR_BAR(0)) / 4; mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *eax & mask; addr &= 0xffff; bar = addr | PCIM_BAR_IO_SPACE; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*eax << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax); - } else if (coff == PCIR_COMMAND) { - pci_emul_cmdwrite(pi, *eax, bytes); + } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { + pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } } } static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 
0xffff : 0xff; return (0); } if (in) { x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; if (cfgenable) x |= CONF1_ENABLE; *eax = x; } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; cfgoff = x & PCI_REGMAX; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; cfgbus = (x >> 16) & PCI_BUSMAX; } return (0); } INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); static int pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int coff; assert(bytes == 1 || bytes == 2 || bytes == 4); coff = cfgoff + (port - CONF1_DATA_PORT); if (cfgenable) { pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax); } else { /* Ignore accesses to cfgdata if not enabled by cfgaddr */ if (in) *eax = 0xffffffff; } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { uint8_t ioregs[DIOSZ]; uint8_t memregs[DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->memregs[offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[offset] = value; } else { printf("diow: memw unknown size %d\n", size); } /* * magic interrupt ?? 
*/ } if (baridx > 1) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } if (size == 1) { value = sc->memregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 1) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, .pe_barread = pci_emul_dior }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ Index: stable/10/usr.sbin/bhyve/pci_virtio_block.c =================================================================== --- stable/10/usr.sbin/bhyve/pci_virtio_block.c (revision 284898) +++ stable/10/usr.sbin/bhyve/pci_virtio_block.c (revision 284899) @@ -1,411 +1,409 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "virtio.h" #include "block_if.h" #define VTBLK_RINGSZ 64 #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 /* Capability bits */ #define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ #define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ #define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ #define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ /* * Host capabilities */ #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; struct { uint16_t cylinders; uint8_t heads; uint8_t sectors; } vbc_geometry; uint32_t vbc_blk_size; struct { uint8_t physical_block_exp; uint8_t alignment_offset; uint16_t min_io_size; uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; } __packed; /* * Fixed-size block header */ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; /* * Debug printf */ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) printf params #define WPRINTF(params) printf params struct pci_vtblk_ioreq { struct blockif_req io_req; struct pci_vtblk_softc *io_sc; uint8_t *io_status; uint16_t io_idx; }; /* * Per-device softc */ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ pci_vtblk_cfgwrite, /* write PCI config */ NULL, /* apply negotiated features */ VTBLK_S_HOSTCAPS, /* our capabilities */ }; static void pci_vtblk_reset(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device reset requested !\n")); vi_reset_dev(&sc->vbsc_vs); } static void pci_vtblk_done(struct blockif_req *br, int err) { struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ if (err == EOPNOTSUPP || err == ENOSYS) *io->io_status = VTBLK_S_UNSUPP; else if (err != 0) *io->io_status = VTBLK_S_IOERR; else *io->io_status = VTBLK_S_OK; /* * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. 
*/ pthread_mutex_lock(&sc->vsc_mtx); vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; struct pci_vtblk_ioreq *io; int i, n; int err; ssize_t iolen; int writeop, type; off_t offset; struct iovec iov[BLOCKIF_IOV_MAX + 2]; uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); /* * The first descriptor will be the read-only fixed header, * and the last is for status (hence +2 above and below). * The remaining iov's are the actual data I/O vectors. * * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); io = &sc->vbsc_ios[idx]; assert((flags[0] & VRING_DESC_F_WRITE) == 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); vbh = iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; io->io_status = iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); /* * XXX * The guest should not be setting the BARRIER flag because * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE); iolen = 0; for (i = 1; i < n; i++) { /* * - write op implies read-only descriptor, * - read/ident op implies write-only descriptor, * therefore test the inverse of the descriptor bit * to the op. */ assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); iolen += iov[i].iov_len; } io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r", writeop ? "write" : "read/ident", iolen, i - 1, offset)); switch (type) { case VBH_OP_READ: err = blockif_read(sc->bc, &io->io_req); break; case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ /* S/n equal to buffer is not zero-terminated. 
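The descriptor-chain split performed by pci_vtblk_proc() above can be illustrated with a hypothetical four-descriptor read request (sketch only, not part of the patch):

#include <sys/uio.h>
#include <stdio.h>

/*
 * iov[0] is the 16-byte virtio_blk_hdr, iov[n-1] the 1-byte status, and
 * only the segments in between carry data, so br_iovcnt becomes n - 2.
 */
static size_t
vtblk_data_len(const struct iovec *iov, int n)
{
	size_t iolen = 0;
	int i;

	for (i = 1; i < n - 1; i++)
		iolen += iov[i].iov_len;
	return (iolen);
}

int
main(void)
{
	struct iovec iov[4] = {
		{ NULL, 16 },	/* virtio_blk_hdr, read-only to the device */
		{ NULL, 4096 },	/* data segment #1 */
		{ NULL, 4096 },	/* data segment #2 */
		{ NULL, 1 },	/* status byte, filled by pci_vtblk_done() */
	};

	printf("data bytes = %zu\n", vtblk_data_len(iov, 4));	/* 8192 */
	return (0);
}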
*/ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); pci_vtblk_done(&io->io_req, 0); return; default: pci_vtblk_done(&io->io_req, EOPNOTSUPP); return; } assert(err == 0); } static void pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); } static int pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { char bident[sizeof("XX:X:X")]; struct blockif_ctxt *bctxt; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; off_t size; int i, sectsz, sts, sto; if (opts == NULL) { printf("virtio-block: backing device required\n"); return (1); } /* * The supplied backing file has to exist */ snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); bctxt = blockif_open(opts, bident); if (bctxt == NULL) { perror("Could not open backing file"); return (1); } size = blockif_size(bctxt); sectsz = blockif_sectsz(bctxt); blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); sc->bc = bctxt; for (i = 0; i < VTBLK_RINGSZ; i++) { struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; io->io_req.br_callback = pci_vtblk_done; io->io_req.br_param = io; io->io_sc = sc; io->io_idx = i; } pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ /* * Create an identifier for the backing file. Use parts of the * md5 sum of the filename */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); MD5Final(digest, &mdctx); sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ sc->vbsc_cfg.vbc_seg_max = BLOCKIF_IOV_MAX; sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ sc->vbsc_cfg.vbc_geometry.heads = 0; sc->vbsc_cfg.vbc_geometry.sectors = 0; sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_topology.physical_block_exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; sc->vbsc_cfg.vbc_topology.alignment_offset = (sto != 0) ? ((sts - sto) / sectsz) : 0; sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; /* * Should we move some of this into virtio.c? Could * have the device, class, and subdev_0 as fields in * the virtio constants structure. 
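To make the topology computation above concrete, here is a worked sketch (illustration only) for a hypothetical backing device with 512-byte logical and 4096-byte physical sectors:

#include <strings.h>
#include <stdio.h>

int
main(void)
{
	int sectsz = 512;	/* logical sector size (vbc_blk_size) */
	int sts = 4096;		/* physical sector size from blockif_psectsz() */
	int sto = 0;		/* physical sector offset */
	int exp, align;

	exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
	align = (sto != 0) ? ((sts - sto) / sectsz) : 0;

	/* exp == 3: one physical sector spans 2^3 = 8 logical sectors. */
	printf("physical_block_exp=%d alignment_offset=%d\n", exp, align);
	return (0);
}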
*/ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); - pci_lintr_request(pi); - if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { blockif_close(sc->bc); free(sc); return (1); } vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } static int pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) { DPRINTF(("vtblk: write to readonly reg %d\n\r", offset)); return (1); } static int pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtblk_softc *sc = vsc; void *ptr; /* our caller has already verified offset and size */ ptr = (uint8_t *)&sc->vbsc_cfg + offset; memcpy(retval, ptr, size); return (0); } struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vblk); Index: stable/10/usr.sbin/bhyve/pci_virtio_net.c =================================================================== --- stable/10/usr.sbin/bhyve/pci_virtio_net.c (revision 284898) +++ stable/10/usr.sbin/bhyve/pci_virtio_net.c (revision 284899) @@ -1,731 +1,729 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "mevent.h" #include "virtio.h" #define VTNET_RINGSZ 1024 #define VTNET_MAXSEGS 32 /* * Host capabilities. Note that we only offer a few of these. 
*/ #define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ #define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ #define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ #define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ #define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ #define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ #define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ #define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ #define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ #define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ #define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ #define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ #define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ #define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ #define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ #define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ #define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ #define VIRTIO_NET_F_GUEST_ANNOUNCE \ (1 << 21) /* guest can send gratuitous pkts */ #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY) /* * PCI config-space "registers" */ struct virtio_net_config { uint8_t mac[6]; uint16_t status; } __packed; /* * Queue definitions. */ #define VTNET_RXQ 0 #define VTNET_TXQ 1 #define VTNET_CTLQ 2 /* NB: not yet supported */ #define VTNET_MAXQ 3 /* * Fixed network header size */ struct virtio_net_rxhdr { uint8_t vrh_flags; uint8_t vrh_gso_type; uint16_t vrh_hdr_len; uint16_t vrh_gso_size; uint16_t vrh_csum_start; uint16_t vrh_csum_offset; uint16_t vrh_bufs; } __packed; /* * Debug printf */ static int pci_vtnet_debug; #define DPRINTF(params) if (pci_vtnet_debug) printf params #define WPRINTF(params) printf params /* * Per-device softc */ struct pci_vtnet_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; struct mevent *vsc_mevp; int vsc_tapfd; int vsc_rx_ready; volatile int resetting; /* set and checked outside lock */ uint64_t vsc_features; /* negotiated features */ struct virtio_net_config vsc_config; pthread_mutex_t rx_mtx; int rx_in_progress; int rx_vhdrlen; int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); static void pci_vtnet_neg_features(void *, uint64_t); static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ sizeof(struct virtio_net_config), /* config reg size */ pci_vtnet_reset, /* reset */ NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ }; /* * If the transmit thread is active then stall until it is done. 
*/ static void pci_vtnet_txwait(struct pci_vtnet_softc *sc) { pthread_mutex_lock(&sc->tx_mtx); while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } pthread_mutex_unlock(&sc->tx_mtx); } /* * If the receive thread is active then stall until it is done. */ static void pci_vtnet_rxwait(struct pci_vtnet_softc *sc) { pthread_mutex_lock(&sc->rx_mtx); while (sc->rx_in_progress) { pthread_mutex_unlock(&sc->rx_mtx); usleep(10000); pthread_mutex_lock(&sc->rx_mtx); } pthread_mutex_unlock(&sc->rx_mtx); } static void pci_vtnet_reset(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device reset requested !\n")); sc->resetting = 1; /* * Wait for the transmit and receive threads to finish their * processing. */ pci_vtnet_txwait(sc); pci_vtnet_rxwait(sc); sc->vsc_rx_ready = 0; sc->rx_merge = 1; sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); /* now reset rings, MSI-X vectors, and negotiated capabilities */ vi_reset_dev(&sc->vsc_vs); sc->resetting = 0; } /* * Called to send a buffer chain out to the tap device */ static void pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int len) { static char pad[60]; /* all zero bytes */ if (sc->vsc_tapfd == -1) return; /* * If the length is < 60, pad out to that and add the * extra zero'd segment to the iov. It is guaranteed that * there is always an extra iov available by the caller. */ if (len < 60) { iov[iovcnt].iov_base = pad; iov[iovcnt].iov_len = 60 - len; iovcnt++; } (void) writev(sc->vsc_tapfd, iov, iovcnt); } /* * Called when there is read activity on the tap file descriptor. * Each buffer posted by the guest is assumed to be able to contain * an entire ethernet frame + rx header. * MP note: the dummybuf is only used for discarding frames, so there * is no need for it to be per-vtnet or locked. */ static uint8_t dummybuf[2048]; static __inline struct iovec * rx_iov_trim(struct iovec *iov, int *niov, int tlen) { struct iovec *riov; /* XXX short-cut: assume first segment is >= tlen */ assert(iov[0].iov_len >= tlen); iov[0].iov_len -= tlen; if (iov[0].iov_len == 0) { assert(*niov > 1); *niov -= 1; riov = &iov[1]; } else { iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen); riov = &iov[0]; } return (riov); } static void pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) { struct iovec iov[VTNET_MAXSEGS], *riov; struct vqueue_info *vq; void *vrx; int len, n; uint16_t idx; /* * Should never be called without a valid tap fd */ assert(sc->vsc_tapfd != -1); /* * But, will be called when the rx ring hasn't yet * been set up or the guest is resetting the device. */ if (!sc->vsc_rx_ready || sc->resetting) { /* * Drop the packet and try later. */ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); return; } /* * Check for available rx buffers */ vq = &sc->vsc_queues[VTNET_RXQ]; if (!vq_has_descs(vq)) { /* * Drop the packet and try later. Interrupt on * empty, if that's negotiated. */ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); vq_endchains(vq, 1); return; } do { /* * Get descriptor chain. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); /* * Get a pointer to the rx header, and use the * data immediately following it for the packet buffer. */ vrx = iov[0].iov_base; riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); len = readv(sc->vsc_tapfd, riov, n); if (len < 0 && errno == EWOULDBLOCK) { /* * No more packets, but still some avail ring * entries. Interrupt if needed/appropriate. 
*/ vq_retchain(vq); vq_endchains(vq, 0); return; } /* * The only valid field in the rx packet header is the * number of buffers if merged rx bufs were negotiated. */ memset(vrx, 0, sc->rx_vhdrlen); if (sc->rx_merge) { struct virtio_net_rxhdr *vrxh; vrxh = vrx; vrxh->vrh_bufs = 1; } /* * Release this chain and handle more chains. */ vq_relchain(vq, idx, len + sc->rx_vhdrlen); } while (vq_has_descs(vq)); /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ vq_endchains(vq, 1); } static void pci_vtnet_tap_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); sc->rx_in_progress = 1; pci_vtnet_tap_rx(sc); sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->rx_mtx); } static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * A qnotify means that the rx process can now begin */ if (sc->vsc_rx_ready == 0) { sc->vsc_rx_ready = 1; vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; } } static void pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) { struct iovec iov[VTNET_MAXSEGS + 1]; int i, n; int plen, tlen; uint16_t idx; /* * Obtain chain of descriptors. The first one is * really the header descriptor, so we need to sum * up two lengths: packet length and transfer length. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); plen = 0; tlen = iov[0].iov_len; for (i = 1; i < n; i++) { plen += iov[i].iov_len; tlen += iov[i].iov_len; } DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen); /* chain is processed, release it and set tlen */ vq_relchain(vq, idx, tlen); } static void pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * Any ring entries to process? */ if (!vq_has_descs(vq)) return; /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); } /* * Thread which will handle processing of TX desc */ static void * pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; int error; vq = &sc->vsc_queues[VTNET_TXQ]; /* * Let us wait till the tx queue pointers get initialised & * first tx signaled */ pthread_mutex_lock(&sc->tx_mtx); error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); for (;;) { /* note - tx mutex is locked here */ while (sc->resetting || !vq_has_descs(vq)) { vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; mb(); if (!sc->resetting && vq_has_descs(vq)) break; sc->tx_in_progress = 0; error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); } vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); do { /* * Run through entries, placing them into * iovecs and sending when an end-of-packet * is found */ pci_vtnet_proctx(sc, vq); } while (vq_has_descs(vq)); /* * Generate an interrupt if needed. 
*/ vq_endchains(vq, 1); pthread_mutex_lock(&sc->tx_mtx); } } #ifdef notyet static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!\n\r")); } #endif static int pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) { struct ether_addr *ea; char *tmpstr; char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; tmpstr = strsep(&mac_str,"="); if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { ea = ether_aton(mac_str); if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { fprintf(stderr, "Invalid MAC %s\n", mac_str); return (EINVAL); } else memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); } return (0); } static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { MD5_CTX mdctx; unsigned char digest[16]; char nstr[80]; char tname[MAXCOMLEN + 1]; struct pci_vtnet_softc *sc; char *devname; char *vtopts; int mac_provided; sc = calloc(1, sizeof(struct pci_vtnet_softc)); pthread_mutex_init(&sc->vsc_mtx, NULL); vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; #ifdef notyet sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif /* * Attempt to open the tap device and read the MAC address * if specified */ mac_provided = 0; sc->vsc_tapfd = -1; if (opts != NULL) { char tbuf[80]; int err; devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); if (vtopts != NULL) { err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac); if (err != 0) { free(devname); return (err); } mac_provided = 1; } strcpy(tbuf, "/dev/"); strlcat(tbuf, devname, sizeof(tbuf)); free(devname); sc->vsc_tapfd = open(tbuf, O_RDWR); if (sc->vsc_tapfd == -1) { WPRINTF(("open of tap device %s failed\n", tbuf)); } else { /* * Set non-blocking and register for read * notifications with the event loop */ int opt = 1; if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { WPRINTF(("tap device O_NONBLOCK failed\n")); close(sc->vsc_tapfd); sc->vsc_tapfd = -1; } sc->vsc_mevp = mevent_add(sc->vsc_tapfd, EVF_READ, pci_vtnet_tap_callback, sc); if (sc->vsc_mevp == NULL) { WPRINTF(("Could not register event\n")); close(sc->vsc_tapfd); sc->vsc_tapfd = -1; } } } /* * The default MAC address is the standard NetApp OUI of 00-a0-98, * followed by an MD5 of the PCI slot/func number and dev name */ if (!mac_provided) { snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, pi->pi_func, vmname); MD5Init(&mdctx); MD5Update(&mdctx, nstr, strlen(nstr)); MD5Final(digest, &mdctx); sc->vsc_config.mac[0] = 0x00; sc->vsc_config.mac[1] = 0xa0; sc->vsc_config.mac[2] = 0x98; sc->vsc_config.mac[3] = digest[0]; sc->vsc_config.mac[4] = digest[1]; sc->vsc_config.mac[5] = digest[2]; } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); - pci_lintr_request(pi); - /* Link is up if we managed to open tap device. 
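As an illustration (not part of the patch) of the default MAC derivation above, the sketch below hashes a hypothetical slot/function/VM-name string and prepends the 00:a0:98 OUI; it uses the same libmd MD5 routines as bhyve (link with -lmd on FreeBSD):

#include <sys/types.h>
#include <md5.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	MD5_CTX mdctx;
	unsigned char digest[16];
	unsigned char mac[6] = { 0x00, 0xa0, 0x98 };	/* NetApp OUI */
	char nstr[80];

	/* Hypothetical slot 3, function 0, VM named "examplevm". */
	snprintf(nstr, sizeof(nstr), "%d-%d-%s", 3, 0, "examplevm");
	MD5Init(&mdctx);
	MD5Update(&mdctx, nstr, strlen(nstr));
	MD5Final(digest, &mdctx);
	memcpy(&mac[3], digest, 3);

	printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
	    mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	return (0);
}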
*/ sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0); /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) return (1); /* use BAR 0 to map config regs in IO space */ vi_set_io_bar(&sc->vsc_vs, 0); sc->resetting = 0; sc->rx_merge = 1; sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_in_progress = 0; pthread_mutex_init(&sc->rx_mtx, NULL); /* * Initialize tx semaphore & spawn TX processing thread. * As of now, only one thread for TX desc processing is * spawned. */ sc->tx_in_progress = 0; pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->tx_tid, tname); return (0); } static int pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) { struct pci_vtnet_softc *sc = vsc; void *ptr; if (offset < 6) { assert(offset + size <= 6); /* * The driver is allowed to change the MAC address */ ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); } return (0); } static int pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtnet_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static void pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtnet_softc *sc = vsc; sc->vsc_features = negotiated_features; if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) { sc->rx_merge = 0; /* non-merge rx header is 2 bytes shorter */ sc->rx_vhdrlen -= 2; } } struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vnet); Index: stable/10/usr.sbin/bhyve/virtio.c =================================================================== --- stable/10/usr.sbin/bhyve/virtio.c (revision 284898) +++ stable/10/usr.sbin/bhyve/virtio.c (revision 284899) @@ -1,772 +1,777 @@ /*- * Copyright (c) 2013 Chris Torek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "virtio.h" /* * Functions for dealing with generalized "virtual devices" as * defined by */ /* * In case we decide to relax the "virtio softc comes at the * front of virtio-based device softc" constraint, let's use * this to convert. */ #define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and * the PCI emulation. */ void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues) { int i; /* vs and dev_softc addresses must match */ assert((void *)vs == dev_softc); vs->vs_vc = vc; vs->vs_pi = pi; pi->pi_arg = vs; vs->vs_queues = queues; for (i = 0; i < vc->vc_nvq; i++) { queues[i].vq_vs = vs; queues[i].vq_num = i; } } /* * Reset device (device-wide). This erases all queues, i.e., * all the queues become invalid (though we don't wipe out the * internal pointers, we just clear the VQ_ALLOC flag). * * It resets negotiated features to "none". * * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. */ void vi_reset_dev(struct virtio_softc *vs) { struct vqueue_info *vq; int i, nvq; if (vs->vs_mtx) assert(pthread_mutex_isowned_np(vs->vs_mtx)); nvq = vs->vs_vc->vc_nvq; for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } vs->vs_negotiated_caps = 0; vs->vs_curq = 0; /* vs->vs_status = 0; -- redundant */ if (vs->vs_isr) pci_lintr_deassert(vs->vs_pi); vs->vs_isr = 0; vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; } /* * Set I/O BAR (usually 0) to map PCI config registers. */ void vi_set_io_bar(struct virtio_softc *vs, int barnum) { size_t size; /* * ??? should we use CFG0 if MSI-X is disabled? * Existing code did not... */ size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); } /* * Initialize MSI-X vector capabilities if we're to use MSI-X, * or MSI capabilities if not. * * We assume we want one MSI-X vector per queue, here, plus one * for the config vec. */ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) { int nvec; if (use_msix) { vs->vs_flags |= VIRTIO_USE_MSIX; VS_LOCK(vs); vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ VS_UNLOCK(vs); nvec = vs->vs_vc->vc_nvq + 1; if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; + /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); + + /* Legacy interrupts are mandatory for virtio devices */ + pci_lintr_request(vs->vs_pi); + return (0); } /* * Initialize the currently-selected virtio queue (vs->vs_curq). * The guest just gave us a page frame number, from which we can * calculate the addresses of the queue. */ void vi_vq_init(struct virtio_softc *vs, uint32_t pfn) { struct vqueue_info *vq; uint64_t phys; size_t size; char *base; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_pfn = pfn; phys = (uint64_t)pfn << VRING_PFN; size = vring_size(vq->vq_qsize); base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); /* First page(s) are descriptors... */ vq->vq_desc = (struct virtio_desc *)base; base += vq->vq_qsize * sizeof(struct virtio_desc); /* ... 
immediately followed by "avail" ring (entirely uint16_t's) */ vq->vq_avail = (struct vring_avail *)base; base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* Then it's rounded up to the next page... */ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); /* ... and the last page(s) are the used ring. */ vq->vq_used = (struct vring_used *)base; /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; vq->vq_save_used = 0; } /* * Helper inline for vq_getchain(): record the i'th "real" * descriptor. */ static inline void _vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, struct iovec *iov, int n_iov, uint16_t *flags) { if (i >= n_iov) return; iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); iov[i].iov_len = vd->vd_len; if (flags != NULL) flags[i] = vd->vd_flags; } #define VQ_MAX_DESCRIPTORS 512 /* see below */ /* * Examine the chain of descriptors starting at the "next one" to * make sure that they describe a sensible request. If so, return * the number of "real" descriptors that would be needed/used in * acting on this request. This may be smaller than the number of * available descriptors, e.g., if there are two available but * they are two separate requests, this just returns 1. Or, it * may be larger: if there are indirect descriptors involved, * there may only be one descriptor available but it may be an * indirect pointing to eight more. We return 8 in this case, * i.e., we do not count the indirect descriptors, only the "real" * ones. * * Basically, this vets the vd_flags and vd_next field of each * descriptor and tells you how many are involved. Since some may * be indirect, this also needs the vmctx (in the pci_devinst * at vs->vs_pi) so that it can find indirect descriptors. * * As we process each descriptor, we copy and adjust it (guest to * host address wise, also using the vmtctx) into the given iov[] * array (of the given size). If the array overflows, we stop * placing values into the array but keep processing descriptors, * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. * So you, the caller, must not assume that iov[] is as big as the * return value (you can process the same thing twice to allocate * a larger iov array if needed, or supply a zero length to find * out how much space is needed). * * If you want to verify the WRITE flag on each descriptor, pass a * non-NULL "flags" pointer to an array of "uint16_t" of the same size * as n_iov and we'll copy each vd_flags field after unwinding any * indirects. * * If some descriptor(s) are invalid, this prints a diagnostic message * and returns -1. If no descriptors are ready now it simply returns 0. * * You are assumed to have done a vq_ring_ready() if needed (note * that vq_has_descs() does one). */ int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags) { int i; u_int ndesc, n_indir; u_int idx, next; volatile struct virtio_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; const char *name; vs = vq->vq_vs; name = vs->vs_vc->vc_name; /* * Note: it's the responsibility of the guest not to * update vq->vq_avail->va_idx until all of the descriptors * the guest has written are valid (including all their * vd_next fields and vd_flags). * * Compute (last_avail - va_idx) in integers mod 2**16. This is * the number of descriptors the device has made available * since the last time we updated vq->vq_last_avail. 
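The subtraction described here is done mod 2^16, so it keeps giving the right descriptor count across index wrap-around; a minimal sketch (illustration only, hypothetical index values):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t last_avail = 0xFFFE;	/* where the device left off */
	uint16_t va_idx = 0x0003;	/* guest has since published 5 chains */
	uint16_t ndesc;

	/* Unsigned subtraction, then truncate to 16 bits. */
	ndesc = (uint16_t)((unsigned int)va_idx - last_avail);
	printf("ndesc = %u\n", ndesc);	/* prints 5 despite the wrap */
	return (0);
}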
* * We just need to do the subtraction as an unsigned int, * then trim off excess bits. */ idx = vq->vq_last_avail; ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); if (ndesc == 0) return (0); if (ndesc > vq->vq_qsize) { /* XXX need better way to diagnose issues */ fprintf(stderr, "%s: ndesc (%u) out of range, driver confused?\r\n", name, (u_int)ndesc); return (-1); } /* * Now count/parse "involved" descriptors starting from * the head of the chain. * * To prevent loops, we could be more complicated and * check whether we're re-visiting a previously visited * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { if (next >= vq->vq_qsize) { fprintf(stderr, "%s: descriptor index %u out of range, " "driver confused?\r\n", name, next); return (-1); } vdir = &vq->vq_desc[next]; if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, n_iov, flags); i++; } else if ((vs->vs_negotiated_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { fprintf(stderr, "%s: descriptor has forbidden INDIRECT flag, " "driver confused?\r\n", name); return (-1); } else { n_indir = vdir->vd_len / 16; if ((vdir->vd_len & 0xf) || n_indir == 0) { fprintf(stderr, "%s: invalid indir len 0x%x, " "driver confused?\r\n", name, (u_int)vdir->vd_len); return (-1); } vindir = paddr_guest2host(ctx, vdir->vd_addr, vdir->vd_len); /* * Indirects start at the 0th, then follow * their own embedded "next"s until those run * out. Each one's indirect flag must be off * (we don't really have to check, could just * ignore errors...). */ next = 0; for (;;) { vp = &vindir[next]; if (vp->vd_flags & VRING_DESC_F_INDIRECT) { fprintf(stderr, "%s: indirect desc has INDIR flag," " driver confused?\r\n", name); return (-1); } _vq_record(i, vp, ctx, iov, n_iov, flags); if (++i > VQ_MAX_DESCRIPTORS) goto loopy; if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) break; next = vp->vd_next; if (next >= n_indir) { fprintf(stderr, "%s: invalid next %u > %u, " "driver confused?\r\n", name, (u_int)next, n_indir); return (-1); } } } if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) return (i); } loopy: fprintf(stderr, "%s: descriptor loop? count > %d - driver confused?\r\n", name, i); return (-1); } /* * Return the currently-first request chain back to the available queue. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void vq_retchain(struct vqueue_info *vq) { vq->vq_last_avail--; } /* * Return specified request chain to the guest, setting its I/O length * to the provided value. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { uint16_t uidx, mask; volatile struct vring_used *vuh; volatile struct virtio_used *vue; /* * Notes: * - mask is N-1 where N is a power of 2 so computes x % N * - vuh points to the "used" data shared with guest * - vue points to the "used" ring entry we want to update * - head is the same value we compute in vq_iovecs(). * * (I apologize for the two fields named vu_idx; the * virtio spec calls the one that vue points to, "id"...) 
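The used-ring update in vq_relchain() relies on the queue size being a power of two, so masking with qsize - 1 reduces the free-running 16-bit used index to a ring slot; a small sketch (illustration only, hypothetical values):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t qsize = 64;		/* e.g. VTBLK_RINGSZ */
	uint16_t mask = qsize - 1;
	uint16_t uidx = 130;		/* free-running used index */

	printf("slot = %u\n", uidx & mask);	/* 130 & 63 == 2 */
	return (0);
}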
*/ mask = vq->vq_qsize - 1; vuh = vq->vq_used; uidx = vuh->vu_idx; vue = &vuh->vu_ring[uidx++ & mask]; vue->vu_idx = idx; vue->vu_tlen = iolen; vuh->vu_idx = uidx; } /* * Driver has finished processing "available" chains and calling * vq_relchain on each one. If driver used all the available * chains, used_all should be set. * * If the "used" index moved we may need to inform the guest, i.e., * deliver an interrupt. Even if the used index did NOT move we * may need to deliver an interrupt, if the avail ring is empty and * we are supposed to interrupt on empty. * * Note that used_all_avail is provided by the caller because it's * a snapshot of the ring state when he decided to finish interrupt * processing -- it's possible that descriptors became available after * that point. (It's also typically a constant 1/True as well.) */ void vq_endchains(struct vqueue_info *vq, int used_all_avail) { struct virtio_softc *vs; uint16_t event_idx, new_idx, old_idx; int intr; /* * Interrupt generation: if we're using EVENT_IDX, * interrupt if we've crossed the event threshold. * Otherwise interrupt is generated if we added "used" entries, * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. * * In any case, though, if NOTIFY_ON_EMPTY is set and the * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; old_idx = vq->vq_save_used; vq->vq_save_used = new_idx = vq->vq_used->vu_idx; if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { event_idx = VQ_USED_EVENT_IDX(vq); /* * This calculation is per docs and the kernel * (see src/sys/dev/virtio/virtio_ring.h). */ intr = (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx); } else { intr = new_idx != old_idx && !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); } if (intr) vq_interrupt(vs, vq); } /* Note: these are in sorted order to make for a fast search */ static struct config_reg { uint16_t cr_offset; /* register offset */ uint8_t cr_size; /* size (bytes) */ uint8_t cr_ro; /* true => reg is read only */ const char *cr_name; /* name of reg */ } config_regs[] = { { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" }, { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" }, { VTCFG_R_PFN, 4, 0, "PFN" }, { VTCFG_R_QNUM, 2, 1, "QNUM" }, { VTCFG_R_QSEL, 2, 0, "QSEL" }, { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" }, { VTCFG_R_STATUS, 1, 0, "STATUS" }, { VTCFG_R_ISR, 1, 0, "ISR" }, { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" }, { VTCFG_R_QVEC, 2, 0, "QVEC" }, }; static inline struct config_reg * vi_find_cr(int offset) { u_int hi, lo, mid; struct config_reg *cr; lo = 0; hi = sizeof(config_regs) / sizeof(*config_regs) - 1; while (hi >= lo) { mid = (hi + lo) >> 1; cr = &config_regs[mid]; if (cr->cr_offset == offset) return (cr); if (cr->cr_offset < offset) lo = mid + 1; else hi = mid - 1; } return (NULL); } /* * Handle pci config space reads. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. 
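The EVENT_IDX decision in vq_endchains() above interrupts the guest only when the event index it published falls inside the batch of used entries just added; a minimal sketch (illustration only, hypothetical indices):

#include <stdint.h>
#include <stdio.h>

static int
need_intr(uint16_t event_idx, uint16_t new_idx, uint16_t old_idx)
{
	/* Same comparison as vq_endchains() and virtio_ring.h. */
	return ((uint16_t)(new_idx - event_idx - 1) <
	    (uint16_t)(new_idx - old_idx));
}

int
main(void)
{
	/* Used index moved from 10 to 14 in this pass. */
	printf("%d\n", need_intr(12, 14, 10));	/* 1: event idx was crossed */
	printf("%d\n", need_intr(14, 14, 10));	/* 0: guest wants a later one */
	return (0);
}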
*/ uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct virtio_softc *vs = pi->pi_arg; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; uint32_t value; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { return (pci_emul_msix_tread(pi, offset, size)); } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; if (size != 1 && size != 2 && size != 4) goto bad; if (pci_msix_enabled(pi)) virtio_config_size = VTCFG_R_CFG1; else virtio_config_size = VTCFG_R_CFG0; if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. * If that fails, fall into general code. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size) { if (cr != NULL) { /* offset must be OK, so size must be bad */ fprintf(stderr, "%s: read from %s: bad size %d\r\n", name, cr->cr_name, size); } else { fprintf(stderr, "%s: read from bad offset/size %jd/%d\r\n", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VTCFG_R_HOSTCAP: value = vc->vc_hv_caps; break; case VTCFG_R_GUESTCAP: value = vs->vs_negotiated_caps; break; case VTCFG_R_PFN: if (vs->vs_curq < vc->vc_nvq) value = vs->vs_queues[vs->vs_curq].vq_pfn; break; case VTCFG_R_QNUM: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_qsize : 0; break; case VTCFG_R_QSEL: value = vs->vs_curq; break; case VTCFG_R_QNOTIFY: value = 0; /* XXX */ break; case VTCFG_R_STATUS: value = vs->vs_status; break; case VTCFG_R_ISR: value = vs->vs_isr; vs->vs_isr = 0; /* a read clears this flag */ if (value) pci_lintr_deassert(pi); break; case VTCFG_R_CFGVEC: value = vs->vs_msix_cfg_idx; break; case VTCFG_R_QVEC: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_msix_idx : VIRTIO_MSI_NO_VECTOR; break; } done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); return (value); } /* * Handle pci config space writes. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. 
*/ void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct virtio_softc *vs = pi->pi_arg; struct vqueue_info *vq; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { pci_emul_msix_twrite(pi, offset, size, value); return; } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; if (size != 1 && size != 2 && size != 4) goto bad; if (pci_msix_enabled(pi)) virtio_config_size = VTCFG_R_CFG1; else virtio_config_size = VTCFG_R_CFG0; if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size || cr->cr_ro) { if (cr != NULL) { /* offset must be OK, wrong size and/or reg is R/O */ if (cr->cr_size != size) fprintf(stderr, "%s: write to %s: bad size %d\r\n", name, cr->cr_name, size); if (cr->cr_ro) fprintf(stderr, "%s: write to read-only reg %s\r\n", name, cr->cr_name); } else { fprintf(stderr, "%s: write to bad offset/size %jd/%d\r\n", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VTCFG_R_GUESTCAP: vs->vs_negotiated_caps = value & vc->vc_hv_caps; if (vc->vc_apply_features) (*vc->vc_apply_features)(DEV_SOFTC(vs), vs->vs_negotiated_caps); break; case VTCFG_R_PFN: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vi_vq_init(vs, value); break; case VTCFG_R_QSEL: /* * Note that the guest is allowed to select an * invalid queue; we just need to return a QNUM * of 0 while the bad queue is selected. */ vs->vs_curq = value; break; case VTCFG_R_QNOTIFY: if (value >= vc->vc_nvq) { fprintf(stderr, "%s: queue %d notify out of range\r\n", name, (int)value); goto done; } vq = &vs->vs_queues[value]; if (vq->vq_notify) (*vq->vq_notify)(DEV_SOFTC(vs), vq); else if (vc->vc_qnotify) (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); else fprintf(stderr, "%s: qnotify queue %d: missing vq/vc notify\r\n", name, (int)value); break; case VTCFG_R_STATUS: vs->vs_status = value; if (value == 0) (*vc->vc_reset)(DEV_SOFTC(vs)); break; case VTCFG_R_CFGVEC: vs->vs_msix_cfg_idx = value; break; case VTCFG_R_QVEC: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_msix_idx = value; break; } goto done; bad_qindex: fprintf(stderr, "%s: write config reg %s: curq %d >= max %d\r\n", name, cr->cr_name, vs->vs_curq, vc->vc_nvq); done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); } Index: stable/10/usr.sbin/bhyvectl/bhyvectl.c =================================================================== --- stable/10/usr.sbin/bhyvectl/bhyvectl.c (revision 284898) +++ stable/10/usr.sbin/bhyvectl/bhyvectl.c (revision 284899) @@ -1,2142 +1,2142 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "amd/vmcb.h" #include "intel/vmcs.h" #define MB (1UL << 20) #define GB (1UL << 30) #define REQ_ARG required_argument #define NO_ARG no_argument #define OPT_ARG optional_argument static const char *progname; static void usage(bool cpu_intel) { (void)fprintf(stderr, "Usage: %s --vm=\n" " [--cpu=]\n" " [--create]\n" " [--destroy]\n" " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" " [--get-desc-ds]\n" " [--set-desc-es]\n" " [--get-desc-es]\n" " [--set-desc-gs]\n" " [--get-desc-gs]\n" " [--set-desc-fs]\n" " [--get-desc-fs]\n" " [--set-desc-cs]\n" " [--get-desc-cs]\n" " [--set-desc-ss]\n" " [--get-desc-ss]\n" " [--set-desc-tr]\n" " [--get-desc-tr]\n" " [--set-desc-ldtr]\n" " [--get-desc-ldtr]\n" " [--set-desc-gdtr]\n" " [--get-desc-gdtr]\n" " [--set-desc-idtr]\n" " [--get-desc-idtr]\n" " [--run]\n" " [--capname=]\n" " [--getcap]\n" " [--setcap=<0|1>]\n" " [--desc-base=]\n" " [--desc-limit=]\n" " [--desc-access=]\n" " [--set-cr0=]\n" " [--get-cr0]\n" " [--set-cr3=]\n" " [--get-cr3]\n" " [--set-cr4=]\n" " [--get-cr4]\n" " [--set-dr7=]\n" " [--get-dr7]\n" " [--set-rsp=]\n" " [--get-rsp]\n" " [--set-rip=]\n" " [--get-rip]\n" " [--get-rax]\n" " [--set-rax=]\n" " [--get-rbx]\n" " [--get-rcx]\n" " [--get-rdx]\n" " [--get-rsi]\n" " [--get-rdi]\n" " [--get-rbp]\n" " [--get-r8]\n" " [--get-r9]\n" " [--get-r10]\n" " [--get-r11]\n" " [--get-r12]\n" " [--get-r13]\n" " [--get-r14]\n" " [--get-r15]\n" " [--set-rflags=]\n" " [--get-rflags]\n" " [--set-cs]\n" " [--get-cs]\n" " [--set-ds]\n" " [--get-ds]\n" " [--set-es]\n" " [--get-es]\n" " [--set-fs]\n" " [--get-fs]\n" " [--set-gs]\n" " [--get-gs]\n" " [--set-ss]\n" " [--get-ss]\n" " [--get-tr]\n" " [--get-ldtr]\n" " [--set-x2apic-state=]\n" " [--get-x2apic-state]\n" " [--unassign-pptdev=]\n" " [--set-mem=]\n" " [--get-lowmem]\n" " [--get-highmem]\n" " [--get-gpa-pmap]\n" " [--assert-lapic-lvt=]\n" " [--inject-nmi]\n" " [--force-reset]\n" " [--force-poweroff]\n" " [--get-rtc-time]\n" " [--set-rtc-time=]\n" " [--get-rtc-nvram]\n" " [--set-rtc-nvram=]\n" " [--rtc-nvram-offset=]\n" " [--get-active-cpus]\n" " [--get-suspended-cpus]\n" " [--get-intinfo]\n" " [--get-eptp]\n" " [--set-exception-bitmap]\n" " [--get-exception-bitmap]\n" " 
[--get-tsc-offset]\n" " [--get-guest-pat]\n" " [--get-io-bitmap-address]\n" " [--get-msr-bitmap]\n" " [--get-msr-bitmap-address]\n" " [--get-guest-sysenter]\n" " [--get-exit-reason]\n", progname); if (cpu_intel) { (void)fprintf(stderr, " [--get-vmcs-pinbased-ctls]\n" " [--get-vmcs-procbased-ctls]\n" " [--get-vmcs-procbased-ctls2]\n" " [--get-vmcs-entry-interruption-info]\n" " [--set-vmcs-entry-interruption-info=]\n" " [--get-vmcs-guest-physical-address\n" " [--get-vmcs-guest-linear-address\n" " [--get-vmcs-host-pat]\n" " [--get-vmcs-host-cr0]\n" " [--get-vmcs-host-cr3]\n" " [--get-vmcs-host-cr4]\n" " [--get-vmcs-host-rip]\n" " [--get-vmcs-host-rsp]\n" " [--get-vmcs-cr0-mask]\n" " [--get-vmcs-cr0-shadow]\n" " [--get-vmcs-cr4-mask]\n" " [--get-vmcs-cr4-shadow]\n" " [--get-vmcs-cr3-targets]\n" " [--get-vmcs-apic-access-address]\n" " [--get-vmcs-virtual-apic-address]\n" " [--get-vmcs-tpr-threshold]\n" " [--get-vmcs-vpid]\n" " [--get-vmcs-instruction-error]\n" " [--get-vmcs-exit-ctls]\n" " [--get-vmcs-entry-ctls]\n" " [--get-vmcs-link]\n" " [--get-vmcs-exit-qualification]\n" " [--get-vmcs-exit-interruption-info]\n" " [--get-vmcs-exit-interruption-error]\n" " [--get-vmcs-interruptibility]\n" ); } else { (void)fprintf(stderr, " [--get-vmcb-intercepts]\n" " [--get-vmcb-asid]\n" " [--get-vmcb-exit-details]\n" " [--get-vmcb-tlb-ctrl]\n" " [--get-vmcb-virq]\n" " [--get-avic-apic-bar]\n" " [--get-avic-backing-page]\n" " [--get-avic-table]\n" ); } exit(1); } static int get_rtc_time, set_rtc_time; static int get_rtc_nvram, set_rtc_nvram; static int rtc_nvram_offset; static uint8_t rtc_nvram_value; static time_t rtc_secs; static int get_stats, getcap, setcap, capval, get_gpa_pmap; static int inject_nmi, assert_lapic_lvt; static int force_reset, force_poweroff; static const char *capname; static int create, destroy, get_lowmem, get_highmem; static int get_intinfo; static int get_active_cpus, get_suspended_cpus; static uint64_t memsize; static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; static int set_efer, get_efer; static int set_dr7, get_dr7; static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; static int set_rax, get_rax; static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp; static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15; static int set_desc_ds, get_desc_ds; static int set_desc_es, get_desc_es; static int set_desc_fs, get_desc_fs; static int set_desc_gs, get_desc_gs; static int set_desc_cs, get_desc_cs; static int set_desc_ss, get_desc_ss; static int set_desc_gdtr, get_desc_gdtr; static int set_desc_idtr, get_desc_idtr; static int set_desc_tr, get_desc_tr; static int set_desc_ldtr, get_desc_ldtr; static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; static int set_x2apic_state, get_x2apic_state; enum x2apic_state x2apic_state; static int unassign_pptdev, bus, slot, func; static int run; /* * VMCB specific. 
*/ static int get_vmcb_intercept, get_vmcb_exit_details, get_vmcb_tlb_ctrl; static int get_vmcb_virq, get_avic_table; /* * VMCS-specific fields */ static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2; static int get_eptp, get_io_bitmap, get_tsc_offset; static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info; static int get_vmcs_interruptibility; uint32_t vmcs_entry_interruption_info; static int get_vmcs_gpa, get_vmcs_gla; static int get_exception_bitmap, set_exception_bitmap, exception_bitmap; static int get_cr0_mask, get_cr0_shadow; static int get_cr4_mask, get_cr4_shadow; static int get_cr3_targets; static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; static int get_msr_bitmap, get_msr_bitmap_address; static int get_vpid_asid; static int get_inst_err, get_exit_ctls, get_entry_ctls; static int get_host_cr0, get_host_cr3, get_host_cr4; static int get_host_rip, get_host_rsp; static int get_guest_pat, get_host_pat; static int get_guest_sysenter, get_vmcs_link; static int get_exit_reason, get_vmcs_exit_qualification; static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; static uint64_t desc_base; static uint32_t desc_limit, desc_access; static int get_all; static void dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) { printf("vm exit[%d]\n", vcpu); printf("\trip\t\t0x%016lx\n", vmexit->rip); printf("\tinst_length\t%d\n", vmexit->inst_length); switch (vmexit->exitcode) { case VM_EXITCODE_INOUT: printf("\treason\t\tINOUT\n"); printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); printf("\tflags\t\t%s%s\n", vmexit->u.inout.string ? "STRING " : "", vmexit->u.inout.rep ? "REP " : ""); printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); break; case VM_EXITCODE_VMX: printf("\treason\t\tVMX\n"); printf("\tstatus\t\t%d\n", vmexit->u.vmx.status); printf("\texit_reason\t0x%08x (%u)\n", vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); break; case VM_EXITCODE_SVM: printf("\treason\t\tSVM\n"); printf("\texit_reason\t\t%#lx\n", vmexit->u.svm.exitcode); printf("\texitinfo1\t\t%#lx\n", vmexit->u.svm.exitinfo1); printf("\texitinfo2\t\t%#lx\n", vmexit->u.svm.exitinfo2); break; default: printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); break; } } /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000 #define MSR_AMD6TH_END 0xC0001FFF /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000 #define MSR_AMD7TH_END 0xC0011FFF static const char * msr_name(uint32_t msr) { static char buf[32]; switch(msr) { case MSR_TSC: return ("MSR_TSC"); case MSR_EFER: return ("MSR_EFER"); case MSR_STAR: return ("MSR_STAR"); case MSR_LSTAR: return ("MSR_LSTAR"); case MSR_CSTAR: return ("MSR_CSTAR"); case MSR_SF_MASK: return ("MSR_SF_MASK"); case MSR_FSBASE: return ("MSR_FSBASE"); case MSR_GSBASE: return ("MSR_GSBASE"); case MSR_KGSBASE: return ("MSR_KGSBASE"); case MSR_SYSENTER_CS_MSR: return ("MSR_SYSENTER_CS_MSR"); case MSR_SYSENTER_ESP_MSR: return ("MSR_SYSENTER_ESP_MSR"); case MSR_SYSENTER_EIP_MSR: return ("MSR_SYSENTER_EIP_MSR"); case MSR_PAT: return ("MSR_PAT"); } snprintf(buf, sizeof(buf), "MSR %#08x", msr); return (buf); } static inline void print_msr_pm(uint64_t 
msr, int vcpu, int readable, int writeable) { if (readable || writeable) { printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, readable ? 'R' : '-', writeable ? 'W' : '-'); } } /* * Reference APM vol2, section 15.11 MSR Intercepts. */ static void dump_amd_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 4; bit = (msr % 4) * 2; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 2048; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); /* MSR 0xC0010000 to 0xC0011FF is only for AMD */ byte += 4096; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, writeable); } } /* * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address */ static void dump_intel_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 8; bit = msr & 0x7; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 1024; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); } } static int dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) { int error, fd, map_size; const char *bitmap; error = -1; bitmap = MAP_FAILED; fd = open("/dev/mem", O_RDONLY, 0); if (fd < 0) { perror("Couldn't open /dev/mem"); goto done; } if (cpu_intel) map_size = PAGE_SIZE; else map_size = 2 * PAGE_SIZE; bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); if (bitmap == MAP_FAILED) { perror("mmap failed"); goto done; } if (cpu_intel) dump_intel_msr_pm(bitmap, vcpu); else dump_amd_msr_pm(bitmap, vcpu); error = 0; done: if (bitmap != MAP_FAILED) munmap((void *)bitmap, map_size); if (fd >= 0) close(fd); return (error); } static int vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) { return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); } static int vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) { return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); } static int vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, uint64_t *ret_val) { return (vm_get_register(ctx, vcpu, VMCB_ACCESS(off, bytes), ret_val)); } static int vm_set_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, uint64_t val) { return (vm_set_register(ctx, vcpu, VMCB_ACCESS(off, bytes), val)); } enum { VMNAME = 1000, /* avoid collision with return values from getopt */ VCPU, SET_MEM, SET_EFER, SET_CR0, SET_CR3, SET_CR4, SET_DR7, SET_RSP, SET_RIP, SET_RAX, SET_RFLAGS, DESC_BASE, DESC_LIMIT, DESC_ACCESS, SET_CS, SET_DS, SET_ES, SET_FS, SET_GS, SET_SS, SET_TR, SET_LDTR, SET_X2APIC_STATE, SET_EXCEPTION_BITMAP, SET_VMCS_ENTRY_INTERRUPTION_INFO, SET_CAP, CAPNAME, UNASSIGN_PPTDEV, GET_GPA_PMAP, ASSERT_LAPIC_LVT, SET_RTC_TIME, SET_RTC_NVRAM, RTC_NVRAM_OFFSET, }; static void 
print_cpus(const char *banner, const cpuset_t *cpus) { int i, first; first = 1; printf("%s:\t", banner); if (!CPU_EMPTY(cpus)) { for (i = 0; i < CPU_SETSIZE; i++) { if (CPU_ISSET(i, cpus)) { printf("%s%d", first ? " " : ", ", i); first = 0; } } } else printf(" (none)"); printf("\n"); } static void print_intinfo(const char *banner, uint64_t info) { int type; printf("%s:\t", banner); if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; switch (type) { case VM_INTINFO_HWINTR: printf("extint"); break; case VM_INTINFO_NMI: printf("nmi"); break; case VM_INTINFO_SWINTR: printf("swint"); break; default: printf("exception"); break; } printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); if (info & VM_INTINFO_DEL_ERRCODE) printf(" errcode %#x", (u_int)(info >> 32)); } else { printf("n/a"); } printf("\n"); } static bool cpu_vendor_intel(void) { u_int regs[4]; char cpu_vendor[13]; do_cpuid(0, regs); ((u_int *)&cpu_vendor)[0] = regs[1]; ((u_int *)&cpu_vendor)[1] = regs[3]; ((u_int *)&cpu_vendor)[2] = regs[2]; cpu_vendor[12] = '\0'; if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { return (false); } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { return (true); } else { fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); exit(1); } } static int get_all_registers(struct vmctx *ctx, int vcpu) { uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer; uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; uint64_t r8, r9, r10, r11, r12, r13, r14, r15; - int error; + int error = 0; - if (get_efer || get_all) { + if (!error && (get_efer || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); if (error == 0) printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); } if (!error && (get_cr0 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); if (error == 0) printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); } if (!error && (get_cr3 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); if (error == 0) printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); } if (!error && (get_cr4 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); if (error == 0) printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); } if (!error && (get_dr7 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7); if (error == 0) printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7); } if (!error && (get_rsp || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp); if (error == 0) printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } if (!error && (get_rip || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); if (error == 0) printf("rip[%d]\t\t0x%016lx\n", vcpu, rip); } if (!error && (get_rax || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax); if (error == 0) printf("rax[%d]\t\t0x%016lx\n", vcpu, rax); } if (!error && (get_rbx || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx); if (error == 0) printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx); } if (!error && (get_rcx || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx); if (error == 0) printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx); } if (!error && (get_rdx || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx); if (error == 0) printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx); } if (!error && (get_rsi || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi); if (error == 0) printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi); } if (!error && (get_rdi || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi); if (error 
== 0) printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi); } if (!error && (get_rbp || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp); if (error == 0) printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp); } if (!error && (get_r8 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8); if (error == 0) printf("r8[%d]\t\t0x%016lx\n", vcpu, r8); } if (!error && (get_r9 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9); if (error == 0) printf("r9[%d]\t\t0x%016lx\n", vcpu, r9); } if (!error && (get_r10 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10); if (error == 0) printf("r10[%d]\t\t0x%016lx\n", vcpu, r10); } if (!error && (get_r11 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11); if (error == 0) printf("r11[%d]\t\t0x%016lx\n", vcpu, r11); } if (!error && (get_r12 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12); if (error == 0) printf("r12[%d]\t\t0x%016lx\n", vcpu, r12); } if (!error && (get_r13 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13); if (error == 0) printf("r13[%d]\t\t0x%016lx\n", vcpu, r13); } if (!error && (get_r14 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14); if (error == 0) printf("r14[%d]\t\t0x%016lx\n", vcpu, r14); } if (!error && (get_r15 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15); if (error == 0) printf("r15[%d]\t\t0x%016lx\n", vcpu, r15); } if (!error && (get_rflags || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error == 0) printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); } return (error); } static int get_all_segments(struct vmctx *ctx, int vcpu) { - int error; uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + int error = 0; - if (get_desc_ds || get_all) { + if (!error && (get_desc_ds || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_es || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_fs || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gs || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_ss || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_cs || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_tr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && 
(get_desc_ldtr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gdtr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", vcpu, desc_base, desc_limit); } } if (!error && (get_desc_idtr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("idtr[%d]\t\t0x%016lx/0x%08x\n", vcpu, desc_base, desc_limit); } } if (!error && (get_cs || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs); if (error == 0) printf("cs[%d]\t\t0x%04lx\n", vcpu, cs); } if (!error && (get_ds || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds); if (error == 0) printf("ds[%d]\t\t0x%04lx\n", vcpu, ds); } if (!error && (get_es || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es); if (error == 0) printf("es[%d]\t\t0x%04lx\n", vcpu, es); } if (!error && (get_fs || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs); if (error == 0) printf("fs[%d]\t\t0x%04lx\n", vcpu, fs); } if (!error && (get_gs || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs); if (error == 0) printf("gs[%d]\t\t0x%04lx\n", vcpu, gs); } if (!error && (get_ss || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss); if (error == 0) printf("ss[%d]\t\t0x%04lx\n", vcpu, ss); } if (!error && (get_tr || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr); if (error == 0) printf("tr[%d]\t\t0x%04lx\n", vcpu, tr); } if (!error && (get_ldtr || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr); if (error == 0) printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); } return (error); } static int get_misc_vmcs(struct vmctx *ctx, int vcpu) { uint64_t ctl, cr0, cr3, cr4, rsp, rip, pat, addr, u64; - int error; - - if (get_cr0_mask || get_all) { + int error = 0; + + if (!error && (get_cr0_mask || get_all)) { uint64_t cr0mask; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask); if (error == 0) printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask); } if (!error && (get_cr0_shadow || get_all)) { uint64_t cr0shadow; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW, &cr0shadow); if (error == 0) printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow); } if (!error && (get_cr4_mask || get_all)) { uint64_t cr4mask; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask); if (error == 0) printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask); } if (!error && (get_cr4_shadow || get_all)) { uint64_t cr4shadow; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW, &cr4shadow); if (error == 0) printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow); } if (!error && (get_cr3_targets || get_all)) { uint64_t target_count, target_addr; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, &target_count); if (error == 0) { printf("cr3_target_count[%d]\t0x%016lx\n", vcpu, target_count); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0, &target_addr); if (error == 0) { printf("cr3_target0[%d]\t\t0x%016lx\n", vcpu, target_addr); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1, &target_addr); if (error == 0) { printf("cr3_target1[%d]\t\t0x%016lx\n", vcpu, target_addr); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2, &target_addr); if (error 
== 0) { printf("cr3_target2[%d]\t\t0x%016lx\n", vcpu, target_addr); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3, &target_addr); if (error == 0) { printf("cr3_target3[%d]\t\t0x%016lx\n", vcpu, target_addr); } } if (!error && (get_pinbased_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); if (error == 0) printf("pinbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_procbased_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_PRI_PROC_BASED_CTLS, &ctl); if (error == 0) printf("procbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_procbased_ctls2 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_SEC_PROC_BASED_CTLS, &ctl); if (error == 0) printf("procbased_ctls2[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_vmcs_gla || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_LINEAR_ADDRESS, &u64); if (error == 0) printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); } if (!error && (get_vmcs_gpa || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_PHYSICAL_ADDRESS, &u64); if (error == 0) printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); } if (!error && (get_vmcs_entry_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); if (error == 0) { printf("entry_interruption_info[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_tpr_threshold || get_all)) { uint64_t threshold; error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, &threshold); if (error == 0) printf("tpr_threshold[%d]\t0x%016lx\n", vcpu, threshold); } if (!error && (get_inst_err || get_all)) { uint64_t insterr; error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, &insterr); if (error == 0) { printf("instruction_error[%d]\t0x%016lx\n", vcpu, insterr); } } if (!error && (get_exit_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); if (error == 0) printf("exit_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_entry_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); if (error == 0) printf("entry_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_host_pat || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat); if (error == 0) printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); } if (!error && (get_host_cr0 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); if (error == 0) printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0); } if (!error && (get_host_cr3 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3); if (error == 0) printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3); } if (!error && (get_host_cr4 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4); if (error == 0) printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4); } if (!error && (get_host_rip || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip); if (error == 0) printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip); } if (!error && (get_host_rsp || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp); if (error == 0) printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } if (!error && (get_vmcs_link || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); if (error == 0) printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); } if (!error && (get_vmcs_exit_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); if (error == 0) { printf("vmcs_exit_interruption_info[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && 
(get_vmcs_exit_interruption_error || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, &u64); if (error == 0) { printf("vmcs_exit_interruption_error[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_interruptibility || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_INTERRUPTIBILITY, &u64); if (error == 0) { printf("vmcs_guest_interruptibility[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_exit_qualification || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, &u64); if (error == 0) printf("vmcs_exit_qualification[%d]\t0x%016lx\n", vcpu, u64); } return (error); } static int get_misc_vmcb(struct vmctx *ctx, int vcpu) { uint64_t ctl, addr; - int error; + int error = 0; - if (get_vmcb_intercept || get_all) { + if (!error && (get_vmcb_intercept || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_CR_INTERCEPT, 4, &ctl); if (error == 0) printf("cr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_DR_INTERCEPT, 4, &ctl); if (error == 0) printf("dr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &ctl); if (error == 0) printf("exc_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST1_INTERCEPT, 4, &ctl); if (error == 0) printf("inst1_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST2_INTERCEPT, 4, &ctl); if (error == 0) printf("inst2_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); } if (!error && (get_vmcb_tlb_ctrl || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TLB_CTRL, 4, &ctl); if (error == 0) printf("TLB ctrl[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_vmcb_exit_details || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO1, 8, &ctl); if (error == 0) printf("exitinfo1[%d]\t0x%016lx\n", vcpu, ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO2, 8, &ctl); if (error == 0) printf("exitinfo2[%d]\t0x%016lx\n", vcpu, ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINTINFO, 8, &ctl); if (error == 0) printf("exitintinfo[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_vmcb_virq || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_VIRQ, 8, &ctl); if (error == 0) printf("v_irq/tpr[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_apic_access_addr || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_BAR, 8, &addr); if (error == 0) printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpu, addr); } if (!error && (get_virtual_apic_addr || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PAGE, 8, &addr); if (error == 0) printf("AVIC backing page[%d]\t0x%016lx\n", vcpu, addr); } if (!error && (get_avic_table || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_LT, 8, &addr); if (error == 0) printf("AVIC logical table[%d]\t0x%016lx\n", vcpu, addr); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PT, 8, &addr); if (error == 0) printf("AVIC physical table[%d]\t0x%016lx\n", vcpu, addr); } return (error); } static struct option * setup_options(bool cpu_intel) { const struct option common_opts[] = { { "vm", REQ_ARG, 0, VMNAME }, { "cpu", REQ_ARG, 0, VCPU }, { "set-mem", REQ_ARG, 0, SET_MEM }, { "set-efer", REQ_ARG, 0, SET_EFER }, { "set-cr0", REQ_ARG, 0, SET_CR0 }, { "set-cr3", REQ_ARG, 0, SET_CR3 }, { "set-cr4", REQ_ARG, 0, SET_CR4 }, { "set-dr7", REQ_ARG, 0, SET_DR7 }, { "set-rsp", REQ_ARG, 0, SET_RSP }, { "set-rip", REQ_ARG, 0, SET_RIP }, { "set-rax", 
REQ_ARG, 0, SET_RAX }, { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, { "desc-base", REQ_ARG, 0, DESC_BASE }, { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, { "desc-access",REQ_ARG, 0, DESC_ACCESS }, { "set-cs", REQ_ARG, 0, SET_CS }, { "set-ds", REQ_ARG, 0, SET_DS }, { "set-es", REQ_ARG, 0, SET_ES }, { "set-fs", REQ_ARG, 0, SET_FS }, { "set-gs", REQ_ARG, 0, SET_GS }, { "set-ss", REQ_ARG, 0, SET_SS }, { "set-tr", REQ_ARG, 0, SET_TR }, { "set-ldtr", REQ_ARG, 0, SET_LDTR }, { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, { "set-exception-bitmap", REQ_ARG, 0, SET_EXCEPTION_BITMAP }, { "capname", REQ_ARG, 0, CAPNAME }, { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, { "setcap", REQ_ARG, 0, SET_CAP }, { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME }, { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, { "getcap", NO_ARG, &getcap, 1 }, { "get-stats", NO_ARG, &get_stats, 1 }, { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, { "get-desc-es",NO_ARG, &get_desc_es, 1 }, { "set-desc-es",NO_ARG, &set_desc_es, 1 }, { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, { "get-lowmem", NO_ARG, &get_lowmem, 1 }, { "get-highmem",NO_ARG, &get_highmem, 1 }, { "get-efer", NO_ARG, &get_efer, 1 }, { "get-cr0", NO_ARG, &get_cr0, 1 }, { "get-cr3", NO_ARG, &get_cr3, 1 }, { "get-cr4", NO_ARG, &get_cr4, 1 }, { "get-dr7", NO_ARG, &get_dr7, 1 }, { "get-rsp", NO_ARG, &get_rsp, 1 }, { "get-rip", NO_ARG, &get_rip, 1 }, { "get-rax", NO_ARG, &get_rax, 1 }, { "get-rbx", NO_ARG, &get_rbx, 1 }, { "get-rcx", NO_ARG, &get_rcx, 1 }, { "get-rdx", NO_ARG, &get_rdx, 1 }, { "get-rsi", NO_ARG, &get_rsi, 1 }, { "get-rdi", NO_ARG, &get_rdi, 1 }, { "get-rbp", NO_ARG, &get_rbp, 1 }, { "get-r8", NO_ARG, &get_r8, 1 }, { "get-r9", NO_ARG, &get_r9, 1 }, { "get-r10", NO_ARG, &get_r10, 1 }, { "get-r11", NO_ARG, &get_r11, 1 }, { "get-r12", NO_ARG, &get_r12, 1 }, { "get-r13", NO_ARG, &get_r13, 1 }, { "get-r14", NO_ARG, &get_r14, 1 }, { "get-r15", NO_ARG, &get_r15, 1 }, { "get-rflags", NO_ARG, &get_rflags, 1 }, { "get-cs", NO_ARG, &get_cs, 1 }, { "get-ds", NO_ARG, &get_ds, 1 }, { "get-es", NO_ARG, &get_es, 1 }, { "get-fs", NO_ARG, &get_fs, 1 }, { "get-gs", NO_ARG, &get_gs, 1 }, { "get-ss", NO_ARG, &get_ss, 1 }, { "get-tr", NO_ARG, &get_tr, 1 }, { "get-ldtr", NO_ARG, &get_ldtr, 1 }, { "get-eptp", NO_ARG, &get_eptp, 1 }, { "get-exception-bitmap", NO_ARG, &get_exception_bitmap, 1 }, { "get-io-bitmap-address", NO_ARG, &get_io_bitmap, 1 }, { "get-tsc-offset", NO_ARG, &get_tsc_offset, 1 }, { "get-msr-bitmap", NO_ARG, &get_msr_bitmap, 1 }, { "get-msr-bitmap-address", NO_ARG, &get_msr_bitmap_address, 1 }, { "get-guest-pat", NO_ARG, 
&get_guest_pat, 1 }, { "get-guest-sysenter", NO_ARG, &get_guest_sysenter, 1 }, { "get-exit-reason", NO_ARG, &get_exit_reason, 1 }, { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, { "get-all", NO_ARG, &get_all, 1 }, { "run", NO_ARG, &run, 1 }, { "create", NO_ARG, &create, 1 }, { "destroy", NO_ARG, &destroy, 1 }, { "inject-nmi", NO_ARG, &inject_nmi, 1 }, { "force-reset", NO_ARG, &force_reset, 1 }, { "force-poweroff", NO_ARG, &force_poweroff, 1 }, { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, { "get-intinfo", NO_ARG, &get_intinfo, 1 }, }; const struct option intel_opts[] = { { "get-vmcs-pinbased-ctls", NO_ARG, &get_pinbased_ctls, 1 }, { "get-vmcs-procbased-ctls", NO_ARG, &get_procbased_ctls, 1 }, { "get-vmcs-procbased-ctls2", NO_ARG, &get_procbased_ctls2, 1 }, { "get-vmcs-guest-linear-address", NO_ARG, &get_vmcs_gla, 1 }, { "get-vmcs-guest-physical-address", NO_ARG, &get_vmcs_gpa, 1 }, { "get-vmcs-entry-interruption-info", NO_ARG, &get_vmcs_entry_interruption_info, 1}, { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, { "get-vmcs-cr4-shadow", NO_ARG, &get_cr4_shadow, 1 }, { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1 }, { "get-vmcs-tpr-threshold", NO_ARG, &get_tpr_threshold, 1 }, { "get-vmcs-vpid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, { "get-vmcs-entry-ctls", NO_ARG, &get_entry_ctls, 1 }, { "get-vmcs-instruction-error", NO_ARG, &get_inst_err, 1 }, { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, { "get-vmcs-host-cr0", NO_ARG, &get_host_cr0, 1 }, { "set-vmcs-entry-interruption-info", REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, { "get-vmcs-exit-qualification", NO_ARG, &get_vmcs_exit_qualification, 1 }, { "get-vmcs-interruptibility", NO_ARG, &get_vmcs_interruptibility, 1 }, { "get-vmcs-exit-interruption-error", NO_ARG, &get_vmcs_exit_interruption_error, 1 }, { "get-vmcs-exit-interruption-info", NO_ARG, &get_vmcs_exit_interruption_info, 1 }, { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, { "get-vmcs-host-cr3", NO_ARG, &get_host_cr3, 1 }, { "get-vmcs-host-cr4", NO_ARG, &get_host_cr4, 1 }, { "get-vmcs-host-rip", NO_ARG, &get_host_rip, 1 }, { "get-vmcs-host-rsp", NO_ARG, &get_host_rsp, 1 }, { "get-apic-access-address", NO_ARG, &get_apic_access_addr, 1}, { "get-virtual-apic-address", NO_ARG, &get_virtual_apic_addr, 1} }; const struct option amd_opts[] = { { "get-vmcb-intercepts", NO_ARG, &get_vmcb_intercept, 1 }, { "get-vmcb-asid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcb-exit-details", NO_ARG, &get_vmcb_exit_details, 1 }, { "get-vmcb-tlb-ctrl", NO_ARG, &get_vmcb_tlb_ctrl, 1 }, { "get-vmcb-virq", NO_ARG, &get_vmcb_virq, 1 }, { "get-avic-apic-bar", NO_ARG, &get_apic_access_addr, 1 }, { "get-avic-backing-page", NO_ARG, &get_virtual_apic_addr, 1 }, { "get-avic-table", NO_ARG, &get_avic_table, 1 } }; const struct option null_opt = { NULL, 0, NULL, 0 }; struct option *all_opts; char *cp; int optlen; optlen = sizeof(common_opts); if (cpu_intel) optlen += sizeof(intel_opts); else optlen += sizeof(amd_opts); optlen += sizeof(null_opt); all_opts = malloc(optlen); cp = (char *)all_opts; memcpy(cp, common_opts, sizeof(common_opts)); cp += sizeof(common_opts); if (cpu_intel) { memcpy(cp, intel_opts, sizeof(intel_opts)); cp += sizeof(intel_opts); } else { memcpy(cp, amd_opts, sizeof(amd_opts)); cp += sizeof(amd_opts); } memcpy(cp, &null_opt, sizeof(null_opt)); cp += 
sizeof(null_opt); return (all_opts); } static const char * wday_str(int idx) { static const char *weekdays[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; if (idx >= 0 && idx < 7) return (weekdays[idx]); else return ("UNK"); } static const char * mon_str(int idx) { static const char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; if (idx >= 0 && idx < 12) return (months[idx]); else return ("UNK"); } int main(int argc, char *argv[]) { char *vmname; int error, ch, vcpu, ptenum; vm_paddr_t gpa, gpa_pmap; size_t len; struct vm_exit vmexit; uint64_t rax, cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; struct vmctx *ctx; int wired; cpuset_t cpus; bool cpu_intel; uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; struct tm tm; struct option *opts; cpu_intel = cpu_vendor_intel(); opts = setup_options(cpu_intel); vcpu = 0; vmname = NULL; assert_lapic_lvt = -1; progname = basename(argv[0]); while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { switch (ch) { case 0: break; case VMNAME: vmname = optarg; break; case VCPU: vcpu = atoi(optarg); break; case SET_MEM: memsize = atoi(optarg) * MB; memsize = roundup(memsize, 2 * MB); break; case SET_EFER: efer = strtoul(optarg, NULL, 0); set_efer = 1; break; case SET_CR0: cr0 = strtoul(optarg, NULL, 0); set_cr0 = 1; break; case SET_CR3: cr3 = strtoul(optarg, NULL, 0); set_cr3 = 1; break; case SET_CR4: cr4 = strtoul(optarg, NULL, 0); set_cr4 = 1; break; case SET_DR7: dr7 = strtoul(optarg, NULL, 0); set_dr7 = 1; break; case SET_RSP: rsp = strtoul(optarg, NULL, 0); set_rsp = 1; break; case SET_RIP: rip = strtoul(optarg, NULL, 0); set_rip = 1; break; case SET_RAX: rax = strtoul(optarg, NULL, 0); set_rax = 1; break; case SET_RFLAGS: rflags = strtoul(optarg, NULL, 0); set_rflags = 1; break; case DESC_BASE: desc_base = strtoul(optarg, NULL, 0); break; case DESC_LIMIT: desc_limit = strtoul(optarg, NULL, 0); break; case DESC_ACCESS: desc_access = strtoul(optarg, NULL, 0); break; case SET_CS: cs = strtoul(optarg, NULL, 0); set_cs = 1; break; case SET_DS: ds = strtoul(optarg, NULL, 0); set_ds = 1; break; case SET_ES: es = strtoul(optarg, NULL, 0); set_es = 1; break; case SET_FS: fs = strtoul(optarg, NULL, 0); set_fs = 1; break; case SET_GS: gs = strtoul(optarg, NULL, 0); set_gs = 1; break; case SET_SS: ss = strtoul(optarg, NULL, 0); set_ss = 1; break; case SET_TR: tr = strtoul(optarg, NULL, 0); set_tr = 1; break; case SET_LDTR: ldtr = strtoul(optarg, NULL, 0); set_ldtr = 1; break; case SET_X2APIC_STATE: x2apic_state = strtol(optarg, NULL, 0); set_x2apic_state = 1; break; case SET_EXCEPTION_BITMAP: exception_bitmap = strtoul(optarg, NULL, 0); set_exception_bitmap = 1; break; case SET_VMCS_ENTRY_INTERRUPTION_INFO: vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); set_vmcs_entry_interruption_info = 1; break; case SET_CAP: capval = strtoul(optarg, NULL, 0); setcap = 1; break; case SET_RTC_TIME: rtc_secs = strtoul(optarg, NULL, 0); set_rtc_time = 1; break; case SET_RTC_NVRAM: rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); set_rtc_nvram = 1; break; case RTC_NVRAM_OFFSET: rtc_nvram_offset = strtoul(optarg, NULL, 0); break; case GET_GPA_PMAP: gpa_pmap = strtoul(optarg, NULL, 0); get_gpa_pmap = 1; break; case CAPNAME: capname = optarg; break; case UNASSIGN_PPTDEV: unassign_pptdev = 1; if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) usage(cpu_intel); break; case ASSERT_LAPIC_LVT: assert_lapic_lvt = atoi(optarg); break; default: 
usage(cpu_intel); } } argc -= optind; argv += optind; if (vmname == NULL) usage(cpu_intel); error = 0; if (!error && create) error = vm_create(vmname); if (!error) { ctx = vm_open(vmname); if (ctx == NULL) { printf("VM:%s is not created.\n", vmname); exit (1); } } if (!error && memsize) error = vm_setup_memory(ctx, memsize, VM_MMAP_NONE); if (!error && set_efer) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); if (!error && set_cr0) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); if (!error && set_cr3) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); if (!error && set_cr4) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); if (!error && set_dr7) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); if (!error && set_rsp) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); if (!error && set_rip) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); if (!error && set_rax) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); if (!error && set_rflags) { error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); } if (!error && set_desc_ds) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); } if (!error && set_desc_es) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); } if (!error && set_desc_ss) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); } if (!error && set_desc_cs) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); } if (!error && set_desc_fs) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); } if (!error && set_desc_gs) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); } if (!error && set_desc_tr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, desc_base, desc_limit, desc_access); } if (!error && set_desc_ldtr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); } if (!error && set_desc_gdtr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, 0); } if (!error && set_desc_idtr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, 0); } if (!error && set_cs) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); if (!error && set_ds) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); if (!error && set_es) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); if (!error && set_fs) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); if (!error && set_gs) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); if (!error && set_ss) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); if (!error && set_tr) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); if (!error && set_ldtr) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); if (!error && set_x2apic_state) error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); if (!error && unassign_pptdev) error = vm_unassign_pptdev(ctx, bus, slot, func); if (!error && set_exception_bitmap) { if (cpu_intel) error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, exception_bitmap); else error = vm_set_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, exception_bitmap); } if (!error && cpu_intel && set_vmcs_entry_interruption_info) { error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, vmcs_entry_interruption_info); } if (!error && inject_nmi) { error = vm_inject_nmi(ctx, vcpu); } if (!error && assert_lapic_lvt != -1) { error = 
vm_lapic_local_irq(ctx, vcpu, assert_lapic_lvt); } if (!error && (get_lowmem || get_all)) { gpa = 0; error = vm_get_memory_seg(ctx, gpa, &len, &wired); if (error == 0) printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len, wired ? " wired" : ""); } if (!error && (get_highmem || get_all)) { gpa = 4 * GB; error = vm_get_memory_seg(ctx, gpa, &len, &wired); if (error == 0) printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len, wired ? " wired" : ""); } if (!error) error = get_all_registers(ctx, vcpu); if (!error) error = get_all_segments(ctx, vcpu); if (!error) { if (cpu_intel) error = get_misc_vmcs(ctx, vcpu); else error = get_misc_vmcb(ctx, vcpu); } if (!error && (get_x2apic_state || get_all)) { error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); if (error == 0) printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); } if (!error && (get_eptp || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_NPT_BASE, 8, &eptp); if (error == 0) printf("%s[%d]\t\t0x%016lx\n", cpu_intel ? "eptp" : "rvi/npt", vcpu, eptp); } if (!error && (get_exception_bitmap || get_all)) { if(cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, &bm); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &bm); if (error == 0) printf("exception_bitmap[%d]\t%#lx\n", vcpu, bm); } if (!error && (get_io_bitmap || get_all)) { if (cpu_intel) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); if (error == 0) printf("io_bitmap_a[%d]\t%#lx\n", vcpu, bm); error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm); if (error == 0) printf("io_bitmap_b[%d]\t%#lx\n", vcpu, bm); } else { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_IO_PERM, 8, &bm); if (error == 0) printf("io_bitmap[%d]\t%#lx\n", vcpu, bm); } } if (!error && (get_tsc_offset || get_all)) { uint64_t tscoff; if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TSC_OFFSET, 8, &tscoff); if (error == 0) printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); } if (!error && (get_msr_bitmap_address || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_MSR_PERM, 8, &addr); if (error == 0) printf("msr_bitmap[%d]\t\t%#lx\n", vcpu, addr); } if (!error && (get_msr_bitmap || get_all)) { if (cpu_intel) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); } else { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_MSR_PERM, 8, &addr); } if (error == 0) error = dump_msr_bitmap(vcpu, addr, cpu_intel); } if (!error && (get_vpid_asid || get_all)) { uint64_t vpid; if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, 4, &vpid); if (error == 0) printf("%s[%d]\t\t0x%04lx\n", cpu_intel ? 
"vpid" : "asid", vcpu, vpid); } if (!error && (get_guest_pat || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_GUEST_PAT, 8, &pat); if (error == 0) printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); } if (!error && (get_guest_sysenter || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_SYSENTER_CS, &cs); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_SYSENTER_CS, 8, &cs); if (error == 0) printf("guest_sysenter_cs[%d]\t%#lx\n", vcpu, cs); if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_SYSENTER_ESP, 8, &rsp); if (error == 0) printf("guest_sysenter_sp[%d]\t%#lx\n", vcpu, rsp); if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_SYSENTER_EIP, &rip); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_SYSENTER_EIP, 8, &rip); if (error == 0) printf("guest_sysenter_ip[%d]\t%#lx\n", vcpu, rip); } if (!error && (get_exit_reason || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXIT_REASON, 8, &u64); if (error == 0) printf("exit_reason[%d]\t%#lx\n", vcpu, u64); } if (!error && setcap) { int captype; captype = vm_capability_name2type(capname); error = vm_set_capability(ctx, vcpu, captype, capval); if (error != 0 && errno == ENOENT) printf("Capability \"%s\" is not available\n", capname); } if (!error && get_gpa_pmap) { error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); if (error == 0) { printf("gpa %#lx:", gpa_pmap); pte = &pteval[0]; while (ptenum-- > 0) printf(" %#lx", *pte++); printf("\n"); } } if (!error && set_rtc_nvram) error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); if (!error && (get_rtc_nvram || get_all)) { error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); if (error == 0) { printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, rtc_nvram_value); } } if (!error && set_rtc_time) error = vm_rtc_settime(ctx, rtc_secs); if (!error && (get_rtc_time || get_all)) { error = vm_rtc_gettime(ctx, &rtc_secs); if (error == 0) { gmtime_r(&rtc_secs, &tm); printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, 1900 + tm.tm_year); } } if (!error && (getcap || get_all)) { int captype, val, getcaptype; if (getcap && capname) getcaptype = vm_capability_name2type(capname); else getcaptype = -1; for (captype = 0; captype < VM_CAP_MAX; captype++) { if (getcaptype >= 0 && captype != getcaptype) continue; error = vm_get_capability(ctx, vcpu, captype, &val); if (error == 0) { printf("Capability \"%s\" is %s on vcpu %d\n", vm_capability_type2name(captype), val ? 
"set" : "not set", vcpu); } else if (errno == ENOENT) { error = 0; printf("Capability \"%s\" is not available\n", vm_capability_type2name(captype)); } else { break; } } } if (!error && (get_active_cpus || get_all)) { error = vm_active_cpus(ctx, &cpus); if (!error) print_cpus("active cpus", &cpus); } if (!error && (get_suspended_cpus || get_all)) { error = vm_suspended_cpus(ctx, &cpus); if (!error) print_cpus("suspended cpus", &cpus); } if (!error && (get_intinfo || get_all)) { error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); if (!error) { print_intinfo("pending", info[0]); print_intinfo("current", info[1]); } } if (!error && (get_stats || get_all)) { int i, num_stats; uint64_t *stats; struct timeval tv; const char *desc; stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); if (stats != NULL) { printf("vcpu%d stats:\n", vcpu); for (i = 0; i < num_stats; i++) { desc = vm_get_stat_desc(ctx, i); printf("%-40s\t%ld\n", desc, stats[i]); } } } if (!error && run) { error = vm_run(ctx, vcpu, &vmexit); if (error == 0) dump_vm_run_exitcode(&vmexit, vcpu); else printf("vm_run error %d\n", error); } if (!error && force_reset) error = vm_suspend(ctx, VM_SUSPEND_RESET); if (!error && force_poweroff) error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); if (error) printf("errno = %d\n", errno); if (!error && destroy) vm_destroy(ctx); free (opts); exit(error); } Index: stable/10 =================================================================== --- stable/10 (revision 284898) +++ stable/10 (revision 284899) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r279444,279683,279925,280447,280725,280775,280929,280968,281145,281542,281559,281561,281611-281612,281630,281879,281946,281987,282206