Index: stable/10/sys/amd64/include/vmm.h
===================================================================
--- stable/10/sys/amd64/include/vmm.h	(revision 276348)
+++ stable/10/sys/amd64/include/vmm.h	(revision 276349)
@@ -1,616 +1,618 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _VMM_H_
 #define	_VMM_H_
 
 #include <x86/segments.h>
 
 enum vm_suspend_how {
 	VM_SUSPEND_NONE,
 	VM_SUSPEND_RESET,
 	VM_SUSPEND_POWEROFF,
 	VM_SUSPEND_HALT,
 	VM_SUSPEND_TRIPLEFAULT,
 	VM_SUSPEND_LAST
 };
 
 /*
  * Identifiers for architecturally defined registers.
  */
 enum vm_reg_name {
 	VM_REG_GUEST_RAX,
 	VM_REG_GUEST_RBX,
 	VM_REG_GUEST_RCX,
 	VM_REG_GUEST_RDX,
 	VM_REG_GUEST_RSI,
 	VM_REG_GUEST_RDI,
 	VM_REG_GUEST_RBP,
 	VM_REG_GUEST_R8,
 	VM_REG_GUEST_R9,
 	VM_REG_GUEST_R10,
 	VM_REG_GUEST_R11,
 	VM_REG_GUEST_R12,
 	VM_REG_GUEST_R13,
 	VM_REG_GUEST_R14,
 	VM_REG_GUEST_R15,
 	VM_REG_GUEST_CR0,
 	VM_REG_GUEST_CR3,
 	VM_REG_GUEST_CR4,
 	VM_REG_GUEST_DR7,
 	VM_REG_GUEST_RSP,
 	VM_REG_GUEST_RIP,
 	VM_REG_GUEST_RFLAGS,
 	VM_REG_GUEST_ES,
 	VM_REG_GUEST_CS,
 	VM_REG_GUEST_SS,
 	VM_REG_GUEST_DS,
 	VM_REG_GUEST_FS,
 	VM_REG_GUEST_GS,
 	VM_REG_GUEST_LDTR,
 	VM_REG_GUEST_TR,
 	VM_REG_GUEST_IDTR,
 	VM_REG_GUEST_GDTR,
 	VM_REG_GUEST_EFER,
 	VM_REG_GUEST_CR2,
 	VM_REG_GUEST_PDPTE0,
 	VM_REG_GUEST_PDPTE1,
 	VM_REG_GUEST_PDPTE2,
 	VM_REG_GUEST_PDPTE3,
+	VM_REG_GUEST_INTR_SHADOW,
 	VM_REG_LAST
 };
 
 enum x2apic_state {
 	X2APIC_DISABLED,
 	X2APIC_ENABLED,
 	X2APIC_STATE_LAST
 };
 
 #define	VM_INTINFO_VECTOR(info)	((info) & 0xff)
 #define	VM_INTINFO_DEL_ERRCODE	0x800
 #define	VM_INTINFO_RSVD		0x7ffff000
 #define	VM_INTINFO_VALID	0x80000000
 #define	VM_INTINFO_TYPE		0x700
 #define	VM_INTINFO_HWINTR	(0 << 8)
 #define	VM_INTINFO_NMI		(2 << 8)
 #define	VM_INTINFO_HWEXCEPTION	(3 << 8)
 #define	VM_INTINFO_SWINTR	(4 << 8)
 
 #ifdef _KERNEL
 
 #define	VM_MAX_NAMELEN	32
 
 struct vm;
 struct vm_exception;
 struct vm_memory_segment;
 struct seg_desc;
 struct vm_exit;
 struct vm_run;
 struct vhpet;
 struct vioapic;
 struct vlapic;
 struct vmspace;
 struct vm_object;
 struct vm_guest_paging;
 struct pmap;
 
 typedef int	(*vmm_init_func_t)(int ipinum);
 typedef int	(*vmm_cleanup_func_t)(void);
 typedef void	(*vmm_resume_func_t)(void);
 typedef void *	(*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
 typedef int	(*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
 				  struct pmap *pmap, void *rendezvous_cookie,
 				  void *suspend_cookie);
 typedef void	(*vmi_cleanup_func_t)(void *vmi);
 typedef int	(*vmi_get_register_t)(void *vmi, int vcpu, int num,
 				      uint64_t *retval);
 typedef int	(*vmi_set_register_t)(void *vmi, int vcpu, int num,
 				      uint64_t val);
 typedef int	(*vmi_get_desc_t)(void *vmi, int vcpu, int num,
 				  struct seg_desc *desc);
 typedef int	(*vmi_set_desc_t)(void *vmi, int vcpu, int num,
 				  struct seg_desc *desc);
 typedef int	(*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
 typedef int	(*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
 typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
 typedef void	(*vmi_vmspace_free)(struct vmspace *vmspace);
 typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
 typedef void	(*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
 
 struct vmm_ops {
 	vmm_init_func_t		init;		/* module wide initialization */
 	vmm_cleanup_func_t	cleanup;
 	vmm_resume_func_t	resume;
 
 	vmi_init_func_t		vminit;		/* vm-specific initialization */
 	vmi_run_func_t		vmrun;
 	vmi_cleanup_func_t	vmcleanup;
 	vmi_get_register_t	vmgetreg;
 	vmi_set_register_t	vmsetreg;
 	vmi_get_desc_t		vmgetdesc;
 	vmi_set_desc_t		vmsetdesc;
 	vmi_get_cap_t		vmgetcap;
 	vmi_set_cap_t		vmsetcap;
 	vmi_vmspace_alloc	vmspace_alloc;
 	vmi_vmspace_free	vmspace_free;
 	vmi_vlapic_init		vlapic_init;
 	vmi_vlapic_cleanup	vlapic_cleanup;
 };
 
 extern struct vmm_ops vmm_ops_intel;
 extern struct vmm_ops vmm_ops_amd;
 
 int vm_create(const char *name, struct vm **retvm);
 void vm_destroy(struct vm *vm);
 int vm_reinit(struct vm *vm);
 const char *vm_name(struct vm *vm);
 int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
 int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
 int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
 void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
 		  void **cookie);
 void vm_gpa_release(void *cookie);
 int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
 	      struct vm_memory_segment *seg);
 int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
 		  vm_offset_t *offset, struct vm_object **object);
 boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
 int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
 int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
 int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 		    struct seg_desc *ret_desc);
 int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 		    struct seg_desc *desc);
 int vm_run(struct vm *vm, struct vm_run *vmrun);
 int vm_suspend(struct vm *vm, enum vm_suspend_how how);
 int vm_inject_nmi(struct vm *vm, int vcpu);
 int vm_nmi_pending(struct vm *vm, int vcpuid);
 void vm_nmi_clear(struct vm *vm, int vcpuid);
 int vm_inject_extint(struct vm *vm, int vcpu);
 int vm_extint_pending(struct vm *vm, int vcpuid);
 void vm_extint_clear(struct vm *vm, int vcpuid);
-uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
 struct vlapic *vm_lapic(struct vm *vm, int cpu);
 struct vioapic *vm_ioapic(struct vm *vm);
 struct vhpet *vm_hpet(struct vm *vm);
 int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
 int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
 int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
 int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
 int vm_apicid2vcpuid(struct vm *vm, int apicid);
 int vm_activate_cpu(struct vm *vm, int vcpu);
 cpuset_t vm_active_cpus(struct vm *vm);
 cpuset_t vm_suspended_cpus(struct vm *vm);
 struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
 void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
 
 /*
  * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
  * The rendezvous 'func(arg)' is not allowed to do anything that will
  * cause the thread to be put to sleep.
  *
  * If the rendezvous is being initiated from a vcpu context then the
  * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
  *
  * The caller cannot hold any locks when initiating the rendezvous.
  *
  * The implementation of this API may cause vcpus other than those specified
  * by 'dest' to be stalled. The caller should not rely on any vcpus making
  * forward progress when the rendezvous is in progress.
  */
 typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
 void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg);
 
 static __inline int
 vcpu_rendezvous_pending(void *rendezvous_cookie)
 {
 
 	return (*(uintptr_t *)rendezvous_cookie != 0);
 }
 
 static __inline int
 vcpu_suspended(void *suspend_cookie)
 {
 
 	return (*(int *)suspend_cookie);
 }
 
 /*
  * Return 1 if device indicated by bus/slot/func is supposed to be a
  * pci passthrough device.
  *
  * Return 0 otherwise.
  */
 int vmm_is_pptdev(int bus, int slot, int func);
 
 void *vm_iommu_domain(struct vm *vm);
 
 enum vcpu_state {
 	VCPU_IDLE,
 	VCPU_FROZEN,
 	VCPU_RUNNING,
 	VCPU_SLEEPING,
 };
 
 int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
     bool from_idle);
 enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);
 
 static int __inline
 vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
 {
 	return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
 }
 
 #ifdef _SYS_PROC_H_
 static int __inline
 vcpu_should_yield(struct vm *vm, int vcpu)
 {
 	return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED));
 }
 #endif
 
 void *vcpu_stats(struct vm *vm, int vcpu);
 void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
 struct vmspace *vm_get_vmspace(struct vm *vm);
 int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
 int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
 struct vatpic *vm_atpic(struct vm *vm);
 struct vatpit *vm_atpit(struct vm *vm);
 
 /*
  * Inject exception 'vme' into the guest vcpu. This function returns 0 on
  * success and non-zero on failure.
  *
  * Wrapper functions like 'vm_inject_gp()' should be preferred to calling
  * this function directly because they enforce the trap-like or fault-like
  * behavior of an exception.
  *
  * This function should only be called in the context of the thread that is
  * executing this vcpu.
  */
 int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
 
 /*
  * This function is called after a VM-exit that occurred during exception or
  * interrupt delivery through the IDT. The format of 'intinfo' is described
  * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
  *
  * If a VM-exit handler completes the event delivery successfully then it
  * should call vm_exit_intinfo() to extinguish the pending event. For e.g.,
  * if the task switch emulation is triggered via a task gate then it should
  * call this function with 'intinfo=0' to indicate that the external event
  * is not pending anymore.
  *
  * Return value is 0 on success and non-zero on failure.
  */
 int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
 
 /*
  * This function is called before every VM-entry to retrieve a pending
  * event that should be injected into the guest. This function combines
  * nested events into a double or triple fault.
  *
  * Returns 0 if there are no events that need to be injected into the guest
  * and non-zero otherwise.
  */
 int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
 
 int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
 
 enum vm_reg_name vm_segment_name(int seg_encoding);
 
 struct vm_copyinfo {
 	uint64_t	gpa;
 	size_t		len;
 	void		*hva;
 	void		*cookie;
 };
 
 /*
  * Set up 'copyinfo[]' to copy to/from guest linear address space starting
  * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
  * a copyin or PROT_WRITE for a copyout. 
  *
  * Returns 0 on success.
  * Returns 1 if an exception was injected into the guest.
  * Returns -1 otherwise.
  *
  * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
  * the return value is 0. The 'copyinfo[]' resources should be freed by calling
  * 'vm_copy_teardown()' after the copy is done.
  */
 int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
     int num_copyinfo);
 void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     int num_copyinfo);
 void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     void *kaddr, size_t len);
 void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
     struct vm_copyinfo *copyinfo, size_t len);
 #endif	/* KERNEL */
 
 #define	VM_MAXCPU	16			/* maximum virtual cpus */
 
 /*
  * Identifiers for optional vmm capabilities
  */
 enum vm_cap_type {
 	VM_CAP_HALT_EXIT,
 	VM_CAP_MTRAP_EXIT,
 	VM_CAP_PAUSE_EXIT,
 	VM_CAP_UNRESTRICTED_GUEST,
 	VM_CAP_ENABLE_INVPCID,
 	VM_CAP_MAX
 };
 
 enum vm_intr_trigger {
 	EDGE_TRIGGER,
 	LEVEL_TRIGGER
 };
 	
 /*
  * The 'access' field has the format specified in Table 21-2 of the Intel
  * Architecture Manual vol 3b.
  *
  * XXX The contents of the 'access' field are architecturally defined except
  * bit 16 - Segment Unusable.
  */
 struct seg_desc {
 	uint64_t	base;
 	uint32_t	limit;
 	uint32_t	access;
 };
 #define	SEG_DESC_TYPE(access)		((access) & 0x001f)
 #define	SEG_DESC_DPL(access)		(((access) >> 5) & 0x3)
 #define	SEG_DESC_PRESENT(access)	(((access) & 0x0080) ? 1 : 0)
 #define	SEG_DESC_DEF32(access)		(((access) & 0x4000) ? 1 : 0)
 #define	SEG_DESC_GRANULARITY(access)	(((access) & 0x8000) ? 1 : 0)
 #define	SEG_DESC_UNUSABLE(access)	(((access) & 0x10000) ? 1 : 0)
 
 enum vm_cpu_mode {
 	CPU_MODE_REAL,
 	CPU_MODE_PROTECTED,
 	CPU_MODE_COMPATIBILITY,		/* IA-32E mode (CS.L = 0) */
 	CPU_MODE_64BIT,			/* IA-32E mode (CS.L = 1) */
 };
 
 enum vm_paging_mode {
 	PAGING_MODE_FLAT,
 	PAGING_MODE_32,
 	PAGING_MODE_PAE,
 	PAGING_MODE_64,
 };
 
 struct vm_guest_paging {
 	uint64_t	cr3;
 	int		cpl;
 	enum vm_cpu_mode cpu_mode;
 	enum vm_paging_mode paging_mode;
 };
 
 /*
  * The data structures 'vie' and 'vie_op' are meant to be opaque to the
  * consumers of instruction decoding. The only reason why their contents
  * need to be exposed is because they are part of the 'vm_exit' structure.
  */
 struct vie_op {
 	uint8_t		op_byte;	/* actual opcode byte */
 	uint8_t		op_type;	/* type of operation (e.g. MOV) */
 	uint16_t	op_flags;
 };
 
 #define	VIE_INST_SIZE	15
 struct vie {
 	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
 	uint8_t		num_valid;		/* size of the instruction */
 	uint8_t		num_processed;
 
 	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
 	uint8_t		rex_w:1,		/* REX prefix */
 			rex_r:1,
 			rex_x:1,
 			rex_b:1,
 			rex_present:1,
 			opsize_override:1,	/* Operand size override */
 			addrsize_override:1;	/* Address size override */
 
 	uint8_t		mod:2,			/* ModRM byte */
 			reg:4,
 			rm:4;
 
 	uint8_t		ss:2,			/* SIB byte */
 			index:4,
 			base:4;
 
 	uint8_t		disp_bytes;
 	uint8_t		imm_bytes;
 
 	uint8_t		scale;
 	int		base_register;		/* VM_REG_GUEST_xyz */
 	int		index_register;		/* VM_REG_GUEST_xyz */
 
 	int64_t		displacement;		/* optional addr displacement */
 	int64_t		immediate;		/* optional immediate operand */
 
 	uint8_t		decoded;	/* set to 1 if successfully decoded */
 
 	struct vie_op	op;			/* opcode description */
 };
 
 enum vm_exitcode {
 	VM_EXITCODE_INOUT,
 	VM_EXITCODE_VMX,
 	VM_EXITCODE_BOGUS,
 	VM_EXITCODE_RDMSR,
 	VM_EXITCODE_WRMSR,
 	VM_EXITCODE_HLT,
 	VM_EXITCODE_MTRAP,
 	VM_EXITCODE_PAUSE,
 	VM_EXITCODE_PAGING,
 	VM_EXITCODE_INST_EMUL,
 	VM_EXITCODE_SPINUP_AP,
 	VM_EXITCODE_DEPRECATED1,	/* used to be SPINDOWN_CPU */
 	VM_EXITCODE_RENDEZVOUS,
 	VM_EXITCODE_IOAPIC_EOI,
 	VM_EXITCODE_SUSPENDED,
 	VM_EXITCODE_INOUT_STR,
 	VM_EXITCODE_TASK_SWITCH,
+	VM_EXITCODE_MONITOR,
+	VM_EXITCODE_MWAIT,
 	VM_EXITCODE_MAX
 };
 
 struct vm_inout {
 	uint16_t	bytes:3;	/* 1 or 2 or 4 */
 	uint16_t	in:1;
 	uint16_t	string:1;
 	uint16_t	rep:1;
 	uint16_t	port;
 	uint32_t	eax;		/* valid for out */
 };
 
 struct vm_inout_str {
 	struct vm_inout	inout;		/* must be the first element */
 	struct vm_guest_paging paging;
 	uint64_t	rflags;
 	uint64_t	cr0;
 	uint64_t	index;
 	uint64_t	count;		/* rep=1 (%rcx), rep=0 (1) */
 	int		addrsize;
 	enum vm_reg_name seg_name;
 	struct seg_desc seg_desc;
 };
 
 enum task_switch_reason {
 	TSR_CALL,
 	TSR_IRET,
 	TSR_JMP,
 	TSR_IDT_GATE,	/* task gate in IDT */
 };
 
 struct vm_task_switch {
 	uint16_t	tsssel;		/* new TSS selector */
 	int		ext;		/* task switch due to external event */
 	uint32_t	errcode;
 	int		errcode_valid;	/* push 'errcode' on the new stack */
 	enum task_switch_reason reason;
 	struct vm_guest_paging paging;
 };
 
 struct vm_exit {
 	enum vm_exitcode	exitcode;
 	int			inst_length;	/* 0 means unknown */
 	uint64_t		rip;
 	union {
 		struct vm_inout	inout;
 		struct vm_inout_str inout_str;
 		struct {
 			uint64_t	gpa;
 			int		fault_type;
 		} paging;
 		struct {
 			uint64_t	gpa;
 			uint64_t	gla;
 			int		cs_d;		/* CS.D */
 			struct vm_guest_paging paging;
 			struct vie	vie;
 		} inst_emul;
 		/*
 		 * VMX specific payload. Used when there is no "better"
 		 * exitcode to represent the VM-exit.
 		 */
 		struct {
 			int		status;		/* vmx inst status */
 			/*
 			 * 'exit_reason' and 'exit_qualification' are valid
 			 * only if 'status' is zero.
 			 */
 			uint32_t	exit_reason;
 			uint64_t	exit_qualification;
 			/*
 			 * 'inst_error' and 'inst_type' are valid
 			 * only if 'status' is non-zero.
 			 */
 			int		inst_type;
 			int		inst_error;
 		} vmx;
 		struct {
 			uint32_t	code;		/* ecx value */
 			uint64_t	wval;
 		} msr;
 		struct {
 			int		vcpu;
 			uint64_t	rip;
 		} spinup_ap;
 		struct {
 			uint64_t	rflags;
 		} hlt;
 		struct {
 			int		vector;
 		} ioapic_eoi;
 		struct {
 			enum vm_suspend_how how;
 		} suspended;
 		struct vm_task_switch task_switch;
 	} u;
 };
 
 /* APIs to inject faults into the guest */
 void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
     int errcode);
 
 static __inline void
 vm_inject_ud(void *vm, int vcpuid)
 {
 	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
 }
 
 static __inline void
 vm_inject_gp(void *vm, int vcpuid)
 {
 	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
 }
 
 static __inline void
 vm_inject_ac(void *vm, int vcpuid, int errcode)
 {
 	vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
 }
 
 static __inline void
 vm_inject_ss(void *vm, int vcpuid, int errcode)
 {
 	vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
 }
 
 void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
 
 #endif	/* _VMM_H_ */
Index: stable/10/sys/amd64/vmm/vmm_msr.c
===================================================================
--- stable/10/sys/amd64/vmm/vmm_msr.c	(revision 276348)
+++ stable/10/sys/amd64/vmm/vmm_msr.c	(nonexistent)
@@ -1,273 +0,0 @@
-/*-
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/smp.h>
-
-#include <machine/specialreg.h>
-
-#include <machine/vmm.h>
-#include "vmm_lapic.h"
-#include "vmm_msr.h"
-
-#define	VMM_MSR_F_EMULATE	0x01
-#define	VMM_MSR_F_READONLY	0x02
-#define VMM_MSR_F_INVALID	0x04  /* guest_msr_valid() can override this */
-
-struct vmm_msr {
-	int		num;
-	int		flags;
-	uint64_t	hostval;
-};
-
-static struct vmm_msr vmm_msr[] = {
-	{ MSR_LSTAR,	0 },
-	{ MSR_CSTAR,	0 },
-	{ MSR_STAR,	0 },
-	{ MSR_SF_MASK,	0 },
-	{ MSR_PAT,      VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID },
-	{ MSR_BIOS_SIGN,VMM_MSR_F_EMULATE },
-	{ MSR_MCG_CAP,	VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
-	{ MSR_IA32_PLATFORM_ID, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
-	{ MSR_IA32_MISC_ENABLE, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
-};
-
-#define	vmm_msr_num	(sizeof(vmm_msr) / sizeof(vmm_msr[0]))
-CTASSERT(VMM_MSR_NUM >= vmm_msr_num);
-
-#define	readonly_msr(idx)	\
-	((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)
-
-#define	emulated_msr(idx)	\
-	((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
-
-#define invalid_msr(idx)	\
-	((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0)
-
-void
-vmm_msr_init(void)
-{
-	int i;
-
-	for (i = 0; i < vmm_msr_num; i++) {
-		if (emulated_msr(i))
-			continue;
-		/*
-		 * XXX this assumes that the value of the host msr does not
-		 * change after we have cached it.
-		 */
-		vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
-	}
-}
-
-void
-guest_msrs_init(struct vm *vm, int cpu)
-{
-	int i;
-	uint64_t *guest_msrs, misc;
-
-	guest_msrs = vm_guest_msrs(vm, cpu);
-	
-	for (i = 0; i < vmm_msr_num; i++) {
-		switch (vmm_msr[i].num) {
-		case MSR_LSTAR:
-		case MSR_CSTAR:
-		case MSR_STAR:
-		case MSR_SF_MASK:
-		case MSR_BIOS_SIGN:
-		case MSR_MCG_CAP:
-			guest_msrs[i] = 0;
-			break;
-		case MSR_PAT:
-			guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK)      |
-				PAT_VALUE(1, PAT_WRITE_THROUGH)   |
-				PAT_VALUE(2, PAT_UNCACHED)        |
-				PAT_VALUE(3, PAT_UNCACHEABLE)     |
-				PAT_VALUE(4, PAT_WRITE_BACK)      |
-				PAT_VALUE(5, PAT_WRITE_THROUGH)   |
-				PAT_VALUE(6, PAT_UNCACHED)        |
-				PAT_VALUE(7, PAT_UNCACHEABLE);
-			break;
-		case MSR_IA32_MISC_ENABLE:
-			misc = rdmsr(MSR_IA32_MISC_ENABLE);
-			/*
-			 * Set mandatory bits
-			 *  11:   branch trace disabled
-			 *  12:   PEBS unavailable
-			 * Clear unsupported features
-			 *  16:   SpeedStep enable
-			 *  18:   enable MONITOR FSM
-                         */
-			misc |= (1 << 12) | (1 << 11);
-			misc &= ~((1 << 18) | (1 << 16));
-			guest_msrs[i] = misc;
-			break;
-		case MSR_IA32_PLATFORM_ID:
-			guest_msrs[i] = 0;
-			break;
-		default:
-			panic("guest_msrs_init: missing initialization for msr "
-			      "0x%0x", vmm_msr[i].num);
-		}
-	}
-}
-
-static int
-msr_num_to_idx(u_int num)
-{
-	int i;
-
-	for (i = 0; i < vmm_msr_num; i++)
-		if (vmm_msr[i].num == num)
-			return (i);
-
-	return (-1);
-}
-
-int
-emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val, bool *retu)
-{
-	int idx;
-	uint64_t *guest_msrs;
-
-	if (lapic_msr(num))
-		return (lapic_wrmsr(vm, cpu, num, val, retu));
-
-	idx = msr_num_to_idx(num);
-	if (idx < 0 || invalid_msr(idx))
-		return (EINVAL);
-
-	if (!readonly_msr(idx)) {
-		guest_msrs = vm_guest_msrs(vm, cpu);
-
-		/* Stash the value */
-		guest_msrs[idx] = val;
-
-		/* Update processor state for non-emulated MSRs */
-		if (!emulated_msr(idx))
-			wrmsr(vmm_msr[idx].num, val);
-	}
-
-	return (0);
-}
-
-int
-emulate_rdmsr(struct vm *vm, int cpu, u_int num, bool *retu)
-{
-	int error, idx;
-	uint32_t eax, edx;
-	uint64_t result, *guest_msrs;
-
-	if (lapic_msr(num)) {
-		error = lapic_rdmsr(vm, cpu, num, &result, retu);
-		goto done;
-	}
-
-	idx = msr_num_to_idx(num);
-	if (idx < 0 || invalid_msr(idx)) {
-		error = EINVAL;
-		goto done;
-	}
-
-	guest_msrs = vm_guest_msrs(vm, cpu);
-	result = guest_msrs[idx];
-
-	/*
-	 * If this is not an emulated msr register make sure that the processor
-	 * state matches our cached state.
-	 */
-	if (!emulated_msr(idx) && (rdmsr(num) != result)) {
-		panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
-		      "(0x%016lx) and actual (0x%016lx) values", num,
-		      result, rdmsr(num));
-	}
-
-	error = 0;
-
-done:
-	if (error == 0) {
-		eax = result;
-		edx = result >> 32;
-		error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
-		if (error)
-			panic("vm_set_register(rax) error %d", error);
-		error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
-		if (error)
-			panic("vm_set_register(rdx) error %d", error);
-	}
-	return (error);
-}
-
-void
-restore_guest_msrs(struct vm *vm, int cpu)
-{
-	int i;
-	uint64_t *guest_msrs;
-
-	guest_msrs = vm_guest_msrs(vm, cpu);
-
-	for (i = 0; i < vmm_msr_num; i++) {
-		if (emulated_msr(i))
-			continue;
-		else
-			wrmsr(vmm_msr[i].num, guest_msrs[i]);
-	}
-}
-
-void
-restore_host_msrs(struct vm *vm, int cpu)
-{
-	int i;
-
-	for (i = 0; i < vmm_msr_num; i++) {
-		if (emulated_msr(i))
-			continue;
-		else
-			wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
-	}
-}
-
-/*
- * Must be called by the CPU-specific code before any guests are
- * created
- */
-void
-guest_msr_valid(int msr)
-{
-	int i;
-
-	for (i = 0; i < vmm_msr_num; i++) {
-		if (vmm_msr[i].num == msr && invalid_msr(i)) {
-			vmm_msr[i].flags &= ~VMM_MSR_F_INVALID;
-		}
-	}
-}

Property changes on: stable/10/sys/amd64/vmm/vmm_msr.c
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: stable/10/sys/amd64/vmm/vmm_msr.h
===================================================================
--- stable/10/sys/amd64/vmm/vmm_msr.h	(revision 276348)
+++ stable/10/sys/amd64/vmm/vmm_msr.h	(nonexistent)
@@ -1,44 +0,0 @@
-/*-
- * Copyright (c) 2011 NetApp, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef	_VMM_MSR_H_
-#define	_VMM_MSR_H_
-
-#define	VMM_MSR_NUM	16
-struct vm;
-
-void	vmm_msr_init(void);
-int	emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val,
-	    bool *retu);
-int	emulate_rdmsr(struct vm *vm, int vcpu, u_int msr, bool *retu);
-void	guest_msrs_init(struct vm *vm, int cpu);
-void	guest_msr_valid(int msr);
-void	restore_host_msrs(struct vm *vm, int cpu);
-void	restore_guest_msrs(struct vm *vm, int cpu);
-
-#endif

Property changes on: stable/10/sys/amd64/vmm/vmm_msr.h
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: stable/10/sys/amd64/vmm/intel/ept.c
===================================================================
--- stable/10/sys/amd64/vmm/intel/ept.c	(revision 276348)
+++ stable/10/sys/amd64/vmm/intel/ept.c	(revision 276349)
@@ -1,207 +1,206 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 
 #include <machine/vmm.h>
 
 #include "vmx_cpufunc.h"
 #include "vmm_ipi.h"
-#include "vmx_msr.h"
 #include "ept.h"
 
 #define	EPT_SUPPORTS_EXEC_ONLY(cap)	((cap) & (1UL << 0))
 #define	EPT_PWL4(cap)			((cap) & (1UL << 6))
 #define	EPT_MEMORY_TYPE_WB(cap)		((cap) & (1UL << 14))
 #define	EPT_PDE_SUPERPAGE(cap)		((cap) & (1UL << 16))	/* 2MB pages */
 #define	EPT_PDPTE_SUPERPAGE(cap)	((cap) & (1UL << 17))	/* 1GB pages */
 #define	INVEPT_SUPPORTED(cap)		((cap) & (1UL << 20))
 #define	AD_BITS_SUPPORTED(cap)		((cap) & (1UL << 21))
 #define	INVVPID_SUPPORTED(cap)		((cap) & (1UL << 32))
 
 #define	INVVPID_ALL_TYPES_MASK		0xF0000000000UL
 #define	INVVPID_ALL_TYPES_SUPPORTED(cap)	\
 	(((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
 
 #define	INVEPT_ALL_TYPES_MASK		0x6000000UL
 #define	INVEPT_ALL_TYPES_SUPPORTED(cap)		\
 	(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
 
 #define	EPT_PWLEVELS		4		/* page walk levels */
 #define	EPT_ENABLE_AD_BITS	(1 << 6)
 
 SYSCTL_DECL(_hw_vmm);
 SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL);
 
 static int ept_enable_ad_bits;
 
 static int ept_pmap_flags;
 SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD,
     &ept_pmap_flags, 0, NULL);
 
 int
 ept_init(int ipinum)
 {
 	int use_hw_ad_bits, use_superpages, use_exec_only;
 	uint64_t cap;
 
 	cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
 
 	/*
 	 * Verify that:
 	 * - page walk length is 4 steps
 	 * - extended page tables can be laid out in write-back memory
 	 * - invvpid instruction with all possible types is supported
 	 * - invept instruction with all possible types is supported
 	 */
 	if (!EPT_PWL4(cap) ||
 	    !EPT_MEMORY_TYPE_WB(cap) ||
 	    !INVVPID_SUPPORTED(cap) ||
 	    !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
 	    !INVEPT_SUPPORTED(cap) ||
 	    !INVEPT_ALL_TYPES_SUPPORTED(cap))
 		return (EINVAL);
 
 	ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK;
 
 	use_superpages = 1;
 	TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
 	if (use_superpages && EPT_PDE_SUPERPAGE(cap))
 		ept_pmap_flags |= PMAP_PDE_SUPERPAGE;	/* 2MB superpage */
 
 	use_hw_ad_bits = 1;
 	TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
 	if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
 		ept_enable_ad_bits = 1;
 	else
 		ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
 
 	use_exec_only = 1;
 	TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
 	if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
 		ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
 
 	return (0);
 }
 
 #if 0
 static void
 ept_dump(uint64_t *ptp, int nlevels)
 {
 	int i, t, tabs;
 	uint64_t *ptpnext, ptpval;
 
 	if (--nlevels < 0)
 		return;
 
 	tabs = 3 - nlevels;
 	for (t = 0; t < tabs; t++)
 		printf("\t");
 	printf("PTP = %p\n", ptp);
 
 	for (i = 0; i < 512; i++) {
 		ptpval = ptp[i];
 
 		if (ptpval == 0)
 			continue;
 		
 		for (t = 0; t < tabs; t++)
 			printf("\t");
 		printf("%3d 0x%016lx\n", i, ptpval);
 
 		if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
 			ptpnext = (uint64_t *)
 				  PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
 			ept_dump(ptpnext, nlevels);
 		}
 	}
 }
 #endif
 
 static void
 invept_single_context(void *arg)
 {
 	struct invept_desc desc = *(struct invept_desc *)arg;
 
 	invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
 }
 
 void
 ept_invalidate_mappings(u_long eptp)
 {
 	struct invept_desc invept_desc = { 0 };
 
 	invept_desc.eptp = eptp;
 
 	smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
 }
 
 static int
 ept_pinit(pmap_t pmap)
 {
 
 	return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags));
 }
 
 struct vmspace *
 ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
 {
 
 	return (vmspace_alloc(min, max, ept_pinit));
 }
 
 void
 ept_vmspace_free(struct vmspace *vmspace)
 {
 
 	vmspace_free(vmspace);
 }
 
 uint64_t
 eptp(uint64_t pml4)
 {
 	uint64_t eptp_val;
 
 	eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;
 	if (ept_enable_ad_bits)
 		eptp_val |= EPT_ENABLE_AD_BITS;
 
 	return (eptp_val);
 }
Index: stable/10/sys/amd64/vmm/intel/vmcs.h
===================================================================
--- stable/10/sys/amd64/vmm/intel/vmcs.h	(revision 276348)
+++ stable/10/sys/amd64/vmm/intel/vmcs.h	(revision 276349)
@@ -1,396 +1,401 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _VMCS_H_
 #define	_VMCS_H_
 
 #ifdef _KERNEL
 struct vmcs {
 	uint32_t	identifier;
 	uint32_t	abort_code;
 	char		_impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
 };
 CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
 
 /* MSR save region is composed of an array of 'struct msr_entry' */
 struct msr_entry {
 	uint32_t	index;
 	uint32_t	reserved;
 	uint64_t	val;
 
 };
 
 int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
 int	vmcs_init(struct vmcs *vmcs);
 int	vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv);
 int	vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val);
 int	vmcs_getdesc(struct vmcs *vmcs, int running, int ident,
 		     struct seg_desc *desc);
 int	vmcs_setdesc(struct vmcs *vmcs, int running, int ident,
 		     struct seg_desc *desc);
 
+/*
+ * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h
+ */
+#ifdef _VMX_CPUFUNC_H_
 static __inline uint64_t
 vmcs_read(uint32_t encoding)
 {
 	int error;
 	uint64_t val;
 
 	error = vmread(encoding, &val);
 	KASSERT(error == 0, ("vmcs_read(%u) error %d", encoding, error));
 	return (val);
 }
 
 static __inline void
 vmcs_write(uint32_t encoding, uint64_t val)
 {
 	int error;
 
 	error = vmwrite(encoding, val);
 	KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error));
 }
+#endif	/* _VMX_CPUFUNC_H_ */
 
 #define	vmexit_instruction_length()	vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
 #define	vmcs_guest_rip()		vmcs_read(VMCS_GUEST_RIP)
 #define	vmcs_instruction_error()	vmcs_read(VMCS_INSTRUCTION_ERROR)
 #define	vmcs_exit_reason()		(vmcs_read(VMCS_EXIT_REASON) & 0xffff)
 #define	vmcs_exit_qualification()	vmcs_read(VMCS_EXIT_QUALIFICATION)
 #define	vmcs_guest_cr3()		vmcs_read(VMCS_GUEST_CR3)
 #define	vmcs_gpa()			vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
 #define	vmcs_gla()			vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
 #define	vmcs_idt_vectoring_info()	vmcs_read(VMCS_IDT_VECTORING_INFO)
 #define	vmcs_idt_vectoring_err()	vmcs_read(VMCS_IDT_VECTORING_ERROR)
 
 #endif	/* _KERNEL */
 
 #define	VMCS_INITIAL			0xffffffffffffffff
 
 #define	VMCS_IDENT(encoding)		((encoding) | 0x80000000)
 /*
  * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
  */
 #define	VMCS_INVALID_ENCODING		0xffffffff
 
 /* 16-bit control fields */
 #define	VMCS_VPID			0x00000000
 #define	VMCS_PIR_VECTOR			0x00000002
 
 /* 16-bit guest-state fields */
 #define	VMCS_GUEST_ES_SELECTOR		0x00000800
 #define	VMCS_GUEST_CS_SELECTOR		0x00000802
 #define	VMCS_GUEST_SS_SELECTOR		0x00000804
 #define	VMCS_GUEST_DS_SELECTOR		0x00000806
 #define	VMCS_GUEST_FS_SELECTOR		0x00000808
 #define	VMCS_GUEST_GS_SELECTOR		0x0000080A
 #define	VMCS_GUEST_LDTR_SELECTOR	0x0000080C
 #define	VMCS_GUEST_TR_SELECTOR		0x0000080E
 #define	VMCS_GUEST_INTR_STATUS		0x00000810
 
 /* 16-bit host-state fields */
 #define	VMCS_HOST_ES_SELECTOR		0x00000C00
 #define	VMCS_HOST_CS_SELECTOR		0x00000C02
 #define	VMCS_HOST_SS_SELECTOR		0x00000C04
 #define	VMCS_HOST_DS_SELECTOR		0x00000C06
 #define	VMCS_HOST_FS_SELECTOR		0x00000C08
 #define	VMCS_HOST_GS_SELECTOR		0x00000C0A
 #define	VMCS_HOST_TR_SELECTOR		0x00000C0C
 
 /* 64-bit control fields */
 #define	VMCS_IO_BITMAP_A		0x00002000
 #define	VMCS_IO_BITMAP_B		0x00002002
 #define	VMCS_MSR_BITMAP			0x00002004
 #define	VMCS_EXIT_MSR_STORE		0x00002006
 #define	VMCS_EXIT_MSR_LOAD		0x00002008
 #define	VMCS_ENTRY_MSR_LOAD		0x0000200A
 #define	VMCS_EXECUTIVE_VMCS		0x0000200C
 #define	VMCS_TSC_OFFSET			0x00002010
 #define	VMCS_VIRTUAL_APIC		0x00002012
 #define	VMCS_APIC_ACCESS		0x00002014
 #define	VMCS_PIR_DESC			0x00002016
 #define	VMCS_EPTP			0x0000201A
 #define	VMCS_EOI_EXIT0			0x0000201C
 #define	VMCS_EOI_EXIT1			0x0000201E
 #define	VMCS_EOI_EXIT2			0x00002020
 #define	VMCS_EOI_EXIT3			0x00002022
 #define	VMCS_EOI_EXIT(vector)		(VMCS_EOI_EXIT0 + ((vector) / 64) * 2)
 
 /* 64-bit read-only fields */
 #define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400
 
 /* 64-bit guest-state fields */
 #define	VMCS_LINK_POINTER		0x00002800
 #define	VMCS_GUEST_IA32_DEBUGCTL	0x00002802
 #define	VMCS_GUEST_IA32_PAT		0x00002804
 #define	VMCS_GUEST_IA32_EFER		0x00002806
 #define	VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
 #define	VMCS_GUEST_PDPTE0		0x0000280A
 #define	VMCS_GUEST_PDPTE1		0x0000280C
 #define	VMCS_GUEST_PDPTE2		0x0000280E
 #define	VMCS_GUEST_PDPTE3		0x00002810
 
 /* 64-bit host-state fields */
 #define	VMCS_HOST_IA32_PAT		0x00002C00
 #define	VMCS_HOST_IA32_EFER		0x00002C02
 #define	VMCS_HOST_IA32_PERF_GLOBAL_CTRL	0x00002C04
 
 /* 32-bit control fields */
 #define	VMCS_PIN_BASED_CTLS		0x00004000
 #define	VMCS_PRI_PROC_BASED_CTLS	0x00004002
 #define	VMCS_EXCEPTION_BITMAP		0x00004004
 #define	VMCS_PF_ERROR_MASK		0x00004006
 #define	VMCS_PF_ERROR_MATCH		0x00004008
 #define	VMCS_CR3_TARGET_COUNT		0x0000400A
 #define	VMCS_EXIT_CTLS			0x0000400C
 #define	VMCS_EXIT_MSR_STORE_COUNT	0x0000400E
 #define	VMCS_EXIT_MSR_LOAD_COUNT	0x00004010
 #define	VMCS_ENTRY_CTLS			0x00004012
 #define	VMCS_ENTRY_MSR_LOAD_COUNT	0x00004014
 #define	VMCS_ENTRY_INTR_INFO		0x00004016
 #define	VMCS_ENTRY_EXCEPTION_ERROR	0x00004018
 #define	VMCS_ENTRY_INST_LENGTH		0x0000401A
 #define	VMCS_TPR_THRESHOLD		0x0000401C
 #define	VMCS_SEC_PROC_BASED_CTLS	0x0000401E
 #define	VMCS_PLE_GAP			0x00004020
 #define	VMCS_PLE_WINDOW			0x00004022
 
 /* 32-bit read-only data fields */
 #define	VMCS_INSTRUCTION_ERROR		0x00004400
 #define	VMCS_EXIT_REASON		0x00004402
 #define	VMCS_EXIT_INTR_INFO		0x00004404
 #define	VMCS_EXIT_INTR_ERRCODE		0x00004406
 #define	VMCS_IDT_VECTORING_INFO		0x00004408
 #define	VMCS_IDT_VECTORING_ERROR	0x0000440A
 #define	VMCS_EXIT_INSTRUCTION_LENGTH	0x0000440C
 #define	VMCS_EXIT_INSTRUCTION_INFO	0x0000440E
 
 /* 32-bit guest-state fields */
 #define	VMCS_GUEST_ES_LIMIT		0x00004800
 #define	VMCS_GUEST_CS_LIMIT		0x00004802
 #define	VMCS_GUEST_SS_LIMIT		0x00004804
 #define	VMCS_GUEST_DS_LIMIT		0x00004806
 #define	VMCS_GUEST_FS_LIMIT		0x00004808
 #define	VMCS_GUEST_GS_LIMIT		0x0000480A
 #define	VMCS_GUEST_LDTR_LIMIT		0x0000480C
 #define	VMCS_GUEST_TR_LIMIT		0x0000480E
 #define	VMCS_GUEST_GDTR_LIMIT		0x00004810
 #define	VMCS_GUEST_IDTR_LIMIT		0x00004812
 #define	VMCS_GUEST_ES_ACCESS_RIGHTS	0x00004814
 #define	VMCS_GUEST_CS_ACCESS_RIGHTS	0x00004816
 #define	VMCS_GUEST_SS_ACCESS_RIGHTS	0x00004818
 #define	VMCS_GUEST_DS_ACCESS_RIGHTS	0x0000481A
 #define	VMCS_GUEST_FS_ACCESS_RIGHTS	0x0000481C
 #define	VMCS_GUEST_GS_ACCESS_RIGHTS	0x0000481E
 #define	VMCS_GUEST_LDTR_ACCESS_RIGHTS	0x00004820
 #define	VMCS_GUEST_TR_ACCESS_RIGHTS	0x00004822
 #define	VMCS_GUEST_INTERRUPTIBILITY	0x00004824
 #define	VMCS_GUEST_ACTIVITY		0x00004826
 #define VMCS_GUEST_SMBASE		0x00004828
 #define	VMCS_GUEST_IA32_SYSENTER_CS	0x0000482A
 #define	VMCS_PREEMPTION_TIMER_VALUE	0x0000482E
 
 /* 32-bit host state fields */
 #define	VMCS_HOST_IA32_SYSENTER_CS	0x00004C00
 
 /* Natural Width control fields */
 #define	VMCS_CR0_MASK			0x00006000
 #define	VMCS_CR4_MASK			0x00006002
 #define	VMCS_CR0_SHADOW			0x00006004
 #define	VMCS_CR4_SHADOW			0x00006006
 #define	VMCS_CR3_TARGET0		0x00006008
 #define	VMCS_CR3_TARGET1		0x0000600A
 #define	VMCS_CR3_TARGET2		0x0000600C
 #define	VMCS_CR3_TARGET3		0x0000600E
 
 /* Natural Width read-only fields */
 #define	VMCS_EXIT_QUALIFICATION		0x00006400
 #define	VMCS_IO_RCX			0x00006402
 #define	VMCS_IO_RSI			0x00006404
 #define	VMCS_IO_RDI			0x00006406
 #define	VMCS_IO_RIP			0x00006408
 #define	VMCS_GUEST_LINEAR_ADDRESS	0x0000640A
 
 /* Natural Width guest-state fields */
 #define	VMCS_GUEST_CR0			0x00006800
 #define	VMCS_GUEST_CR3			0x00006802
 #define	VMCS_GUEST_CR4			0x00006804
 #define	VMCS_GUEST_ES_BASE		0x00006806
 #define	VMCS_GUEST_CS_BASE		0x00006808
 #define	VMCS_GUEST_SS_BASE		0x0000680A
 #define	VMCS_GUEST_DS_BASE		0x0000680C
 #define	VMCS_GUEST_FS_BASE		0x0000680E
 #define	VMCS_GUEST_GS_BASE		0x00006810
 #define	VMCS_GUEST_LDTR_BASE		0x00006812
 #define	VMCS_GUEST_TR_BASE		0x00006814
 #define	VMCS_GUEST_GDTR_BASE		0x00006816
 #define	VMCS_GUEST_IDTR_BASE		0x00006818
 #define	VMCS_GUEST_DR7			0x0000681A
 #define	VMCS_GUEST_RSP			0x0000681C
 #define	VMCS_GUEST_RIP			0x0000681E
 #define	VMCS_GUEST_RFLAGS		0x00006820
 #define	VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
 #define	VMCS_GUEST_IA32_SYSENTER_ESP	0x00006824
 #define	VMCS_GUEST_IA32_SYSENTER_EIP	0x00006826
 
 /* Natural Width host-state fields */
 #define	VMCS_HOST_CR0			0x00006C00
 #define	VMCS_HOST_CR3			0x00006C02
 #define	VMCS_HOST_CR4			0x00006C04
 #define	VMCS_HOST_FS_BASE		0x00006C06
 #define	VMCS_HOST_GS_BASE		0x00006C08
 #define	VMCS_HOST_TR_BASE		0x00006C0A
 #define	VMCS_HOST_GDTR_BASE		0x00006C0C
 #define	VMCS_HOST_IDTR_BASE		0x00006C0E
 #define	VMCS_HOST_IA32_SYSENTER_ESP	0x00006C10
 #define	VMCS_HOST_IA32_SYSENTER_EIP	0x00006C12
 #define	VMCS_HOST_RSP			0x00006C14
 #define	VMCS_HOST_RIP			0x00006c16
 
 /*
  * VM instruction error numbers
  */
 #define	VMRESUME_WITH_NON_LAUNCHED_VMCS	5
 
 /*
  * VMCS exit reasons
  */
 #define EXIT_REASON_EXCEPTION		0
 #define EXIT_REASON_EXT_INTR		1
 #define EXIT_REASON_TRIPLE_FAULT	2
 #define EXIT_REASON_INIT		3
 #define EXIT_REASON_SIPI		4
 #define EXIT_REASON_IO_SMI		5
 #define EXIT_REASON_SMI			6
 #define EXIT_REASON_INTR_WINDOW		7
 #define EXIT_REASON_NMI_WINDOW		8
 #define EXIT_REASON_TASK_SWITCH		9
 #define EXIT_REASON_CPUID		10
 #define EXIT_REASON_GETSEC		11
 #define EXIT_REASON_HLT			12
 #define EXIT_REASON_INVD		13
 #define EXIT_REASON_INVLPG		14
 #define EXIT_REASON_RDPMC		15
 #define EXIT_REASON_RDTSC		16
 #define EXIT_REASON_RSM			17
 #define EXIT_REASON_VMCALL		18
 #define EXIT_REASON_VMCLEAR		19
 #define EXIT_REASON_VMLAUNCH		20
 #define EXIT_REASON_VMPTRLD		21
 #define EXIT_REASON_VMPTRST		22
 #define EXIT_REASON_VMREAD		23
 #define EXIT_REASON_VMRESUME		24
 #define EXIT_REASON_VMWRITE		25
 #define EXIT_REASON_VMXOFF		26
 #define EXIT_REASON_VMXON		27
 #define EXIT_REASON_CR_ACCESS		28
 #define EXIT_REASON_DR_ACCESS		29
 #define EXIT_REASON_INOUT		30
 #define EXIT_REASON_RDMSR		31
 #define EXIT_REASON_WRMSR		32
 #define EXIT_REASON_INVAL_VMCS		33
 #define EXIT_REASON_INVAL_MSR		34
 #define EXIT_REASON_MWAIT		36
 #define EXIT_REASON_MTF			37
 #define EXIT_REASON_MONITOR		39
 #define EXIT_REASON_PAUSE		40
 #define EXIT_REASON_MCE			41
 #define EXIT_REASON_TPR			43
 #define EXIT_REASON_APIC_ACCESS		44
 #define	EXIT_REASON_VIRTUALIZED_EOI	45
 #define EXIT_REASON_GDTR_IDTR		46
 #define EXIT_REASON_LDTR_TR		47
 #define EXIT_REASON_EPT_FAULT		48
 #define EXIT_REASON_EPT_MISCONFIG	49
 #define EXIT_REASON_INVEPT		50
 #define EXIT_REASON_RDTSCP		51
 #define EXIT_REASON_VMX_PREEMPT		52
 #define EXIT_REASON_INVVPID		53
 #define EXIT_REASON_WBINVD		54
 #define EXIT_REASON_XSETBV		55
 #define	EXIT_REASON_APIC_WRITE		56
 
 /*
  * NMI unblocking due to IRET.
  *
  * Applies to VM-exits due to hardware exception or EPT fault.
  */
 #define	EXIT_QUAL_NMIUDTI	(1 << 12)
 /*
  * VMCS interrupt information fields
  */
 #define	VMCS_INTR_VALID		(1U << 31)
 #define	VMCS_INTR_T_MASK	0x700		/* Interruption-info type */
 #define	VMCS_INTR_T_HWINTR	(0 << 8)
 #define	VMCS_INTR_T_NMI		(2 << 8)
 #define	VMCS_INTR_T_HWEXCEPTION	(3 << 8)
 #define	VMCS_INTR_T_SWINTR	(4 << 8)
 #define	VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8)
 #define	VMCS_INTR_T_SWEXCEPTION	(6 << 8)
 #define	VMCS_INTR_DEL_ERRCODE	(1 << 11)
 
 /*
  * VMCS IDT-Vectoring information fields
  */
 #define	VMCS_IDT_VEC_VALID		(1U << 31)
 #define	VMCS_IDT_VEC_ERRCODE_VALID	(1 << 11)
 
 /*
  * VMCS Guest interruptibility field
  */
 #define	VMCS_INTERRUPTIBILITY_STI_BLOCKING	(1 << 0)
 #define	VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING	(1 << 1)
 #define	VMCS_INTERRUPTIBILITY_SMI_BLOCKING	(1 << 2)
 #define	VMCS_INTERRUPTIBILITY_NMI_BLOCKING	(1 << 3)
 
 /*
  * Exit qualification for EXIT_REASON_INVAL_VMCS
  */
 #define	EXIT_QUAL_NMI_WHILE_STI_BLOCKING	3
 
 /*
  * Exit qualification for EPT violation
  */
 #define	EPT_VIOLATION_DATA_READ		(1UL << 0)
 #define	EPT_VIOLATION_DATA_WRITE	(1UL << 1)
 #define	EPT_VIOLATION_INST_FETCH	(1UL << 2)
 #define	EPT_VIOLATION_GPA_READABLE	(1UL << 3)
 #define	EPT_VIOLATION_GPA_WRITEABLE	(1UL << 4)
 #define	EPT_VIOLATION_GPA_EXECUTABLE	(1UL << 5)
 #define	EPT_VIOLATION_GLA_VALID		(1UL << 7)
 #define	EPT_VIOLATION_XLAT_VALID	(1UL << 8)
 
 /*
  * Exit qualification for APIC-access VM exit
  */
 #define	APIC_ACCESS_OFFSET(qual)	((qual) & 0xFFF)
 #define	APIC_ACCESS_TYPE(qual)		(((qual) >> 12) & 0xF)
 
 /*
  * Exit qualification for APIC-write VM exit
  */
 #define	APIC_WRITE_OFFSET(qual)		((qual) & 0xFFF)
 
 #endif
Index: stable/10/sys/amd64/vmm/intel/vmx.c
===================================================================
--- stable/10/sys/amd64/vmm/intel/vmx.c	(revision 276348)
+++ stable/10/sys/amd64/vmm/intel/vmx.c	(revision 276349)
@@ -1,3308 +1,3336 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/psl.h>
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/segments.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/vmparam.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
+#include "vmm_lapic.h"
 #include "vmm_host.h"
 #include "vmm_ioport.h"
 #include "vmm_ipi.h"
-#include "vmm_msr.h"
 #include "vmm_ktr.h"
 #include "vmm_stat.h"
 #include "vatpic.h"
 #include "vlapic.h"
 #include "vlapic_priv.h"
 
-#include "vmx_msr.h"
 #include "ept.h"
 #include "vmx_cpufunc.h"
 #include "vmx.h"
+#include "vmx_msr.h"
 #include "x86.h"
 #include "vmx_controls.h"
 
 #define	PINBASED_CTLS_ONE_SETTING					\
 	(PINBASED_EXTINT_EXITING	|				\
 	 PINBASED_NMI_EXITING		|				\
 	 PINBASED_VIRTUAL_NMI)
 #define	PINBASED_CTLS_ZERO_SETTING	0
 
 #define PROCBASED_CTLS_WINDOW_SETTING					\
 	(PROCBASED_INT_WINDOW_EXITING	|				\
 	 PROCBASED_NMI_WINDOW_EXITING)
 
 #define	PROCBASED_CTLS_ONE_SETTING 					\
 	(PROCBASED_SECONDARY_CONTROLS	|				\
+	 PROCBASED_MWAIT_EXITING	|				\
+	 PROCBASED_MONITOR_EXITING	|				\
 	 PROCBASED_IO_EXITING		|				\
 	 PROCBASED_MSR_BITMAPS		|				\
 	 PROCBASED_CTLS_WINDOW_SETTING	|				\
 	 PROCBASED_CR8_LOAD_EXITING	|				\
 	 PROCBASED_CR8_STORE_EXITING)
 #define	PROCBASED_CTLS_ZERO_SETTING	\
 	(PROCBASED_CR3_LOAD_EXITING |	\
 	PROCBASED_CR3_STORE_EXITING |	\
 	PROCBASED_IO_BITMAPS)
 
 #define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
 #define	PROCBASED_CTLS2_ZERO_SETTING	0
 
-#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT					\
+#define	VM_EXIT_CTLS_ONE_SETTING					\
 	(VM_EXIT_HOST_LMA			|			\
 	VM_EXIT_SAVE_EFER			|			\
-	VM_EXIT_LOAD_EFER)
-
-#define	VM_EXIT_CTLS_ONE_SETTING					\
-	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT       	|			\
+	VM_EXIT_LOAD_EFER			|			\
 	VM_EXIT_ACKNOWLEDGE_INTERRUPT		|			\
 	VM_EXIT_SAVE_PAT			|			\
 	VM_EXIT_LOAD_PAT)
+
 #define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS
 
-#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER
+#define	VM_ENTRY_CTLS_ONE_SETTING	(VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT)
 
-#define	VM_ENTRY_CTLS_ONE_SETTING					\
-	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT     	|			\
-	VM_ENTRY_LOAD_PAT)
 #define	VM_ENTRY_CTLS_ZERO_SETTING					\
 	(VM_ENTRY_LOAD_DEBUG_CONTROLS		|			\
 	VM_ENTRY_INTO_SMM			|			\
 	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
 
-#define	guest_msr_rw(vmx, msr) \
-	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
-
-#define	guest_msr_ro(vmx, msr) \
-    msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
-
 #define	HANDLED		1
 #define	UNHANDLED	0
 
 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
 
 SYSCTL_DECL(_hw_vmm);
 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
 
 int vmxon_enabled[MAXCPU];
 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
 
 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
 static uint32_t exit_ctls, entry_ctls;
 
 static uint64_t cr0_ones_mask, cr0_zeros_mask;
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
 	     &cr0_ones_mask, 0, NULL);
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
 	     &cr0_zeros_mask, 0, NULL);
 
 static uint64_t cr4_ones_mask, cr4_zeros_mask;
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
 	     &cr4_ones_mask, 0, NULL);
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
 	     &cr4_zeros_mask, 0, NULL);
 
 static int vmx_initialized;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
 	   &vmx_initialized, 0, "Intel VMX initialized");
 
 /*
  * Optional capabilities
  */
 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
 
-static int vmx_patmsr;
-SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0,
-    "PAT MSR saved and restored in VCMS");
-
 static int cap_halt_exit;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
     "HLT triggers a VM-exit");
 
 static int cap_pause_exit;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
     0, "PAUSE triggers a VM-exit");
 
 static int cap_unrestricted_guest;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
     &cap_unrestricted_guest, 0, "Unrestricted guests");
 
 static int cap_monitor_trap;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
     &cap_monitor_trap, 0, "Monitor trap flag");
 
 static int cap_invpcid;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
     0, "Guests are allowed to use INVPCID");
 
 static int virtual_interrupt_delivery;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
 
 static int posted_interrupts;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
     &posted_interrupts, 0, "APICv posted interrupt support");
 
 static int pirvec;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
     &pirvec, 0, "APICv posted interrupt vector");
 
 static struct unrhdr *vpid_unr;
 static u_int vpid_alloc_failed;
 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
 	    &vpid_alloc_failed, 0, NULL);
 
 /*
  * Use the last page below 4GB as the APIC access address. This address is
  * occupied by the boot firmware so it is guaranteed that it will not conflict
  * with a page in system memory.
  */
 #define	APIC_ACCESS_ADDRESS	0xFFFFF000
 
 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
+static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
 static void vmx_inject_pir(struct vlapic *vlapic);
 
 #ifdef KTR
 static const char *
 exit_reason_to_str(int reason)
 {
 	static char reasonbuf[32];
 
 	switch (reason) {
 	case EXIT_REASON_EXCEPTION:
 		return "exception";
 	case EXIT_REASON_EXT_INTR:
 		return "extint";
 	case EXIT_REASON_TRIPLE_FAULT:
 		return "triplefault";
 	case EXIT_REASON_INIT:
 		return "init";
 	case EXIT_REASON_SIPI:
 		return "sipi";
 	case EXIT_REASON_IO_SMI:
 		return "iosmi";
 	case EXIT_REASON_SMI:
 		return "smi";
 	case EXIT_REASON_INTR_WINDOW:
 		return "intrwindow";
 	case EXIT_REASON_NMI_WINDOW:
 		return "nmiwindow";
 	case EXIT_REASON_TASK_SWITCH:
 		return "taskswitch";
 	case EXIT_REASON_CPUID:
 		return "cpuid";
 	case EXIT_REASON_GETSEC:
 		return "getsec";
 	case EXIT_REASON_HLT:
 		return "hlt";
 	case EXIT_REASON_INVD:
 		return "invd";
 	case EXIT_REASON_INVLPG:
 		return "invlpg";
 	case EXIT_REASON_RDPMC:
 		return "rdpmc";
 	case EXIT_REASON_RDTSC:
 		return "rdtsc";
 	case EXIT_REASON_RSM:
 		return "rsm";
 	case EXIT_REASON_VMCALL:
 		return "vmcall";
 	case EXIT_REASON_VMCLEAR:
 		return "vmclear";
 	case EXIT_REASON_VMLAUNCH:
 		return "vmlaunch";
 	case EXIT_REASON_VMPTRLD:
 		return "vmptrld";
 	case EXIT_REASON_VMPTRST:
 		return "vmptrst";
 	case EXIT_REASON_VMREAD:
 		return "vmread";
 	case EXIT_REASON_VMRESUME:
 		return "vmresume";
 	case EXIT_REASON_VMWRITE:
 		return "vmwrite";
 	case EXIT_REASON_VMXOFF:
 		return "vmxoff";
 	case EXIT_REASON_VMXON:
 		return "vmxon";
 	case EXIT_REASON_CR_ACCESS:
 		return "craccess";
 	case EXIT_REASON_DR_ACCESS:
 		return "draccess";
 	case EXIT_REASON_INOUT:
 		return "inout";
 	case EXIT_REASON_RDMSR:
 		return "rdmsr";
 	case EXIT_REASON_WRMSR:
 		return "wrmsr";
 	case EXIT_REASON_INVAL_VMCS:
 		return "invalvmcs";
 	case EXIT_REASON_INVAL_MSR:
 		return "invalmsr";
 	case EXIT_REASON_MWAIT:
 		return "mwait";
 	case EXIT_REASON_MTF:
 		return "mtf";
 	case EXIT_REASON_MONITOR:
 		return "monitor";
 	case EXIT_REASON_PAUSE:
 		return "pause";
 	case EXIT_REASON_MCE:
 		return "mce";
 	case EXIT_REASON_TPR:
 		return "tpr";
 	case EXIT_REASON_APIC_ACCESS:
 		return "apic-access";
 	case EXIT_REASON_GDTR_IDTR:
 		return "gdtridtr";
 	case EXIT_REASON_LDTR_TR:
 		return "ldtrtr";
 	case EXIT_REASON_EPT_FAULT:
 		return "eptfault";
 	case EXIT_REASON_EPT_MISCONFIG:
 		return "eptmisconfig";
 	case EXIT_REASON_INVEPT:
 		return "invept";
 	case EXIT_REASON_RDTSCP:
 		return "rdtscp";
 	case EXIT_REASON_VMX_PREEMPT:
 		return "vmxpreempt";
 	case EXIT_REASON_INVVPID:
 		return "invvpid";
 	case EXIT_REASON_WBINVD:
 		return "wbinvd";
 	case EXIT_REASON_XSETBV:
 		return "xsetbv";
 	case EXIT_REASON_APIC_WRITE:
 		return "apic-write";
 	default:
 		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
 		return (reasonbuf);
 	}
 }
 #endif	/* KTR */
 
 static int
 vmx_allow_x2apic_msrs(struct vmx *vmx)
 {
 	int i, error;
 
 	error = 0;
 
 	/*
 	 * Allow readonly access to the following x2APIC MSRs from the guest.
 	 */
 	error += guest_msr_ro(vmx, MSR_APIC_ID);
 	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
 	error += guest_msr_ro(vmx, MSR_APIC_LDR);
 	error += guest_msr_ro(vmx, MSR_APIC_SVR);
 
 	for (i = 0; i < 8; i++)
 		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
 
 	for (i = 0; i < 8; i++)
 		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
 	
 	for (i = 0; i < 8; i++)
 		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
 
 	error += guest_msr_ro(vmx, MSR_APIC_ESR);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
 	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
 	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
 	error += guest_msr_ro(vmx, MSR_APIC_ICR);
 
 	/*
 	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
 	 *
 	 * These registers get special treatment described in the section
 	 * "Virtualizing MSR-Based APIC Accesses".
 	 */
 	error += guest_msr_rw(vmx, MSR_APIC_TPR);
 	error += guest_msr_rw(vmx, MSR_APIC_EOI);
 	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
 
 	return (error);
 }
 
 u_long
 vmx_fix_cr0(u_long cr0)
 {
 
 	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
 }
 
 u_long
 vmx_fix_cr4(u_long cr4)
 {
 
 	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
 }
 
 static void
 vpid_free(int vpid)
 {
 	if (vpid < 0 || vpid > 0xffff)
 		panic("vpid_free: invalid vpid %d", vpid);
 
 	/*
 	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
 	 * the unit number allocator.
 	 */
 
 	if (vpid > VM_MAXCPU)
 		free_unr(vpid_unr, vpid);
 }
 
 static void
 vpid_alloc(uint16_t *vpid, int num)
 {
 	int i, x;
 
 	if (num <= 0 || num > VM_MAXCPU)
 		panic("invalid number of vpids requested: %d", num);
 
 	/*
 	 * If the "enable vpid" execution control is not enabled then the
 	 * VPID is required to be 0 for all vcpus.
 	 */
 	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
 		for (i = 0; i < num; i++)
 			vpid[i] = 0;
 		return;
 	}
 
 	/*
 	 * Allocate a unique VPID for each vcpu from the unit number allocator.
 	 */
 	for (i = 0; i < num; i++) {
 		x = alloc_unr(vpid_unr);
 		if (x == -1)
 			break;
 		else
 			vpid[i] = x;
 	}
 
 	if (i < num) {
 		atomic_add_int(&vpid_alloc_failed, 1);
 
 		/*
 		 * If the unit number allocator does not have enough unique
 		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
 		 *
 		 * These VPIDs are not be unique across VMs but this does not
 		 * affect correctness because the combined mappings are also
 		 * tagged with the EP4TA which is unique for each VM.
 		 *
 		 * It is still sub-optimal because the invvpid will invalidate
 		 * combined mappings for a particular VPID across all EP4TAs.
 		 */
 		while (i-- > 0)
 			vpid_free(vpid[i]);
 
 		for (i = 0; i < num; i++)
 			vpid[i] = i + 1;
 	}
 }
 
 static void
 vpid_init(void)
 {
 	/*
 	 * VPID 0 is required when the "enable VPID" execution control is
 	 * disabled.
 	 *
 	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
 	 * unit number allocator does not have sufficient unique VPIDs to
 	 * satisfy the allocation.
 	 *
 	 * The remaining VPIDs are managed by the unit number allocator.
 	 */
 	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
 }
 
 static void
-msr_save_area_init(struct msr_entry *g_area, int *g_count)
-{
-	int cnt;
-
-	static struct msr_entry guest_msrs[] = {
-		{ MSR_KGSBASE, 0, 0 },
-	};
-
-	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
-	if (cnt > GUEST_MSR_MAX_ENTRIES)
-		panic("guest msr save area overrun");
-	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
-	*g_count = cnt;
-}
-
-static void
 vmx_disable(void *arg __unused)
 {
 	struct invvpid_desc invvpid_desc = { 0 };
 	struct invept_desc invept_desc = { 0 };
 
 	if (vmxon_enabled[curcpu]) {
 		/*
 		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
 		 *
 		 * VMXON or VMXOFF are not required to invalidate any TLB
 		 * caching structures. This prevents potential retention of
 		 * cached information in the TLB between distinct VMX episodes.
 		 */
 		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
 		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
 		vmxoff();
 	}
 	load_cr4(rcr4() & ~CR4_VMXE);
 }
 
 static int
 vmx_cleanup(void)
 {
 	
 	if (pirvec != 0)
 		vmm_ipi_free(pirvec);
 
 	if (vpid_unr != NULL) {
 		delete_unrhdr(vpid_unr);
 		vpid_unr = NULL;
 	}
 
 	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
 
 	return (0);
 }
 
 static void
 vmx_enable(void *arg __unused)
 {
 	int error;
 	uint64_t feature_control;
 
 	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
 	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
 	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
 		wrmsr(MSR_IA32_FEATURE_CONTROL,
 		    feature_control | IA32_FEATURE_CONTROL_VMX_EN |
 		    IA32_FEATURE_CONTROL_LOCK);
 	}
 
 	load_cr4(rcr4() | CR4_VMXE);
 
 	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
 	error = vmxon(vmxon_region[curcpu]);
 	if (error == 0)
 		vmxon_enabled[curcpu] = 1;
 }
 
 static void
 vmx_restore(void)
 {
 
 	if (vmxon_enabled[curcpu])
 		vmxon(vmxon_region[curcpu]);
 }
 
 static int
 vmx_init(int ipinum)
 {
 	int error, use_tpr_shadow;
 	uint64_t basic, fixed0, fixed1, feature_control;
 	uint32_t tmp, procbased2_vid_bits;
 
 	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
 	if (!(cpu_feature2 & CPUID2_VMX)) {
 		printf("vmx_init: processor does not support VMX operation\n");
 		return (ENXIO);
 	}
 
 	/*
 	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
 	 * are set (bits 0 and 2 respectively).
 	 */
 	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
 	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
 	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
 		printf("vmx_init: VMX operation disabled by BIOS\n");
 		return (ENXIO);
 	}
 
 	/*
 	 * Verify capabilities MSR_VMX_BASIC:
 	 * - bit 54 indicates support for INS/OUTS decoding
 	 */
 	basic = rdmsr(MSR_VMX_BASIC);
 	if ((basic & (1UL << 54)) == 0) {
 		printf("vmx_init: processor does not support desired basic "
 		    "capabilities\n");
 		return (EINVAL);
 	}
 
 	/* Check support for primary processor-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 			       MSR_VMX_TRUE_PROCBASED_CTLS,
 			       PROCBASED_CTLS_ONE_SETTING,
 			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
 	if (error) {
 		printf("vmx_init: processor does not support desired primary "
 		       "processor-based controls\n");
 		return (error);
 	}
 
 	/* Clear the processor-based ctl bits that are set on demand */
 	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
 
 	/* Check support for secondary processor-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 			       MSR_VMX_PROCBASED_CTLS2,
 			       PROCBASED_CTLS2_ONE_SETTING,
 			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
 	if (error) {
 		printf("vmx_init: processor does not support desired secondary "
 		       "processor-based controls\n");
 		return (error);
 	}
 
 	/* Check support for VPID */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
 			       PROCBASED2_ENABLE_VPID, 0, &tmp);
 	if (error == 0)
 		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
 
 	/* Check support for pin-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 			       MSR_VMX_TRUE_PINBASED_CTLS,
 			       PINBASED_CTLS_ONE_SETTING,
 			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
 	if (error) {
 		printf("vmx_init: processor does not support desired "
 		       "pin-based controls\n");
 		return (error);
 	}
 
 	/* Check support for VM-exit controls */
-	vmx_patmsr = 1;
 	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
 			       VM_EXIT_CTLS_ONE_SETTING,
 			       VM_EXIT_CTLS_ZERO_SETTING,
 			       &exit_ctls);
 	if (error) {
-		/* Try again without the PAT MSR bits */
-		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
-				       MSR_VMX_TRUE_EXIT_CTLS,
-				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
-				       VM_EXIT_CTLS_ZERO_SETTING,
-				       &exit_ctls);
-		if (error) {
-			printf("vmx_init: processor does not support desired "
-			       "exit controls\n");
-			return (error);
-		} else {
-			if (bootverbose)
-				printf("vmm: PAT MSR access not supported\n");
-			guest_msr_valid(MSR_PAT);
-			vmx_patmsr = 0;
-		}
+		printf("vmx_init: processor does not support desired "
+		    "exit controls\n");
+		return (error);
 	}
 
 	/* Check support for VM-entry controls */
-	if (vmx_patmsr) {
-		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
-				       MSR_VMX_TRUE_ENTRY_CTLS,
-				       VM_ENTRY_CTLS_ONE_SETTING,
-				       VM_ENTRY_CTLS_ZERO_SETTING,
-				       &entry_ctls);
-	} else {
-		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
-				       MSR_VMX_TRUE_ENTRY_CTLS,
-				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
-				       VM_ENTRY_CTLS_ZERO_SETTING,
-				       &entry_ctls);
-	}
-
+	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
+	    VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
+	    &entry_ctls);
 	if (error) {
 		printf("vmx_init: processor does not support desired "
-		       "entry controls\n");
-		       return (error);
+		    "entry controls\n");
+		return (error);
 	}
 
 	/*
 	 * Check support for optional features by testing them
 	 * as individual bits
 	 */
 	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 					MSR_VMX_TRUE_PROCBASED_CTLS,
 					PROCBASED_HLT_EXITING, 0,
 					&tmp) == 0);
 
 	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 					MSR_VMX_PROCBASED_CTLS,
 					PROCBASED_MTF, 0,
 					&tmp) == 0);
 
 	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 					 MSR_VMX_TRUE_PROCBASED_CTLS,
 					 PROCBASED_PAUSE_EXITING, 0,
 					 &tmp) == 0);
 
 	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 					MSR_VMX_PROCBASED_CTLS2,
 					PROCBASED2_UNRESTRICTED_GUEST, 0,
 				        &tmp) == 0);
 
 	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
 	    &tmp) == 0);
 
 	/*
 	 * Check support for virtual interrupt delivery.
 	 */
 	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
 	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
 	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
 	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
 
 	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
 	    &tmp) == 0);
 
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
 	    procbased2_vid_bits, 0, &tmp);
 	if (error == 0 && use_tpr_shadow) {
 		virtual_interrupt_delivery = 1;
 		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
 		    &virtual_interrupt_delivery);
 	}
 
 	if (virtual_interrupt_delivery) {
 		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
 		procbased_ctls2 |= procbased2_vid_bits;
 		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
 
 		/*
 		 * No need to emulate accesses to %CR8 if virtual
 		 * interrupt delivery is enabled.
 		 */
 		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
 		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
 
 		/*
 		 * Check for Posted Interrupts only if Virtual Interrupt
 		 * Delivery is enabled.
 		 */
 		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
 		    &tmp);
 		if (error == 0) {
 			pirvec = vmm_ipi_alloc();
 			if (pirvec == 0) {
 				if (bootverbose) {
 					printf("vmx_init: unable to allocate "
 					    "posted interrupt vector\n");
 				}
 			} else {
 				posted_interrupts = 1;
 				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
 				    &posted_interrupts);
 			}
 		}
 	}
 
 	if (posted_interrupts)
 		    pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
 
 	/* Initialize EPT */
 	error = ept_init(ipinum);
 	if (error) {
 		printf("vmx_init: ept initialization failed (%d)\n", error);
 		return (error);
 	}
 
 	/*
 	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
 	 */
 	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
 	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
 	cr0_ones_mask = fixed0 & fixed1;
 	cr0_zeros_mask = ~fixed0 & ~fixed1;
 
 	/*
 	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
 	 * if unrestricted guest execution is allowed.
 	 */
 	if (cap_unrestricted_guest)
 		cr0_ones_mask &= ~(CR0_PG | CR0_PE);
 
 	/*
 	 * Do not allow the guest to set CR0_NW or CR0_CD.
 	 */
 	cr0_zeros_mask |= (CR0_NW | CR0_CD);
 
 	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
 	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
 	cr4_ones_mask = fixed0 & fixed1;
 	cr4_zeros_mask = ~fixed0 & ~fixed1;
 
 	vpid_init();
 
+	vmx_msr_init();
+
 	/* enable VMX operation */
 	smp_rendezvous(NULL, vmx_enable, NULL, NULL);
 
 	vmx_initialized = 1;
 
 	return (0);
 }
 
 static void
 vmx_trigger_hostintr(int vector)
 {
 	uintptr_t func;
 	struct gate_descriptor *gd;
 
 	gd = &idt[vector];
 
 	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
 	    "invalid vector %d", vector));
 	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
 	    vector));
 	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
 	    "has invalid type %d", vector, gd->gd_type));
 	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
 	    "has invalid dpl %d", vector, gd->gd_dpl));
 	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
 	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
 	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
 	    "IST %d", vector, gd->gd_ist));
 
 	func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
 	vmx_call_isr(func);
 }
 
 static int
 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
 {
 	int error, mask_ident, shadow_ident;
 	uint64_t mask_value;
 
 	if (which != 0 && which != 4)
 		panic("vmx_setup_cr_shadow: unknown cr%d", which);
 
 	if (which == 0) {
 		mask_ident = VMCS_CR0_MASK;
 		mask_value = cr0_ones_mask | cr0_zeros_mask;
 		shadow_ident = VMCS_CR0_SHADOW;
 	} else {
 		mask_ident = VMCS_CR4_MASK;
 		mask_value = cr4_ones_mask | cr4_zeros_mask;
 		shadow_ident = VMCS_CR4_SHADOW;
 	}
 
 	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
 	if (error)
 		return (error);
 
 	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
 	if (error)
 		return (error);
 
 	return (0);
 }
 #define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
 #define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))
 
 static void *
 vmx_vminit(struct vm *vm, pmap_t pmap)
 {
 	uint16_t vpid[VM_MAXCPU];
-	int i, error, guest_msr_count;
+	int i, error;
 	struct vmx *vmx;
 	struct vmcs *vmcs;
 
 	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
 	if ((uintptr_t)vmx & PAGE_MASK) {
 		panic("malloc of struct vmx not aligned on %d byte boundary",
 		      PAGE_SIZE);
 	}
 	vmx->vm = vm;
 
 	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
 
 	/*
 	 * Clean up EPTP-tagged guest physical and combined mappings
 	 *
 	 * VMX transitions are not required to invalidate any guest physical
 	 * mappings. So, it may be possible for stale guest physical mappings
 	 * to be present in the processor TLBs.
 	 *
 	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
 	 */
 	ept_invalidate_mappings(vmx->eptp);
 
 	msr_bitmap_initialize(vmx->msr_bitmap);
 
 	/*
 	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
 	 * The guest FSBASE and GSBASE are saved and restored during
 	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
 	 * always restored from the vmcs host state area on vm-exit.
 	 *
 	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
 	 * how they are saved/restored so can be directly accessed by the
 	 * guest.
 	 *
-	 * Guest KGSBASE is saved and restored in the guest MSR save area.
-	 * Host KGSBASE is restored before returning to userland from the pcb.
-	 * There will be a window of time when we are executing in the host
-	 * kernel context with a value of KGSBASE from the guest. This is ok
-	 * because the value of KGSBASE is inconsequential in kernel context.
-	 *
 	 * MSR_EFER is saved and restored in the guest VMCS area on a
 	 * VM exit and entry respectively. It is also restored from the
 	 * host VMCS area on a VM exit.
 	 *
+	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
+	 * and entry respectively. It is also restored from the host VMCS
+	 * area on a VM exit.
+	 *
 	 * The TSC MSR is exposed read-only. Writes are disallowed as that
 	 * will impact the host TSC.
 	 * XXX Writes would be implemented with a wrmsr trap, and
 	 * then modifying the TSC offset in the VMCS.
 	 */
 	if (guest_msr_rw(vmx, MSR_GSBASE) ||
 	    guest_msr_rw(vmx, MSR_FSBASE) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
-	    guest_msr_rw(vmx, MSR_KGSBASE) ||
 	    guest_msr_rw(vmx, MSR_EFER) ||
+	    guest_msr_rw(vmx, MSR_PAT) ||
 	    guest_msr_ro(vmx, MSR_TSC))
 		panic("vmx_vminit: error setting guest msr access");
 
-	/*
-	 * MSR_PAT is saved and restored in the guest VMCS are on a VM exit
-	 * and entry respectively. It is also restored from the host VMCS
-	 * area on a VM exit. However, if running on a system with no
-	 * MSR_PAT save/restore support, leave access disabled so accesses
-	 * will be trapped.
-	 */
-	if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))
-		panic("vmx_vminit: error setting guest pat msr access");
-
 	vpid_alloc(vpid, VM_MAXCPU);
 
 	if (virtual_interrupt_delivery) {
 		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
 		    APIC_ACCESS_ADDRESS);
 		/* XXX this should really return an error to the caller */
 		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
 	}
 
 	for (i = 0; i < VM_MAXCPU; i++) {
 		vmcs = &vmx->vmcs[i];
 		vmcs->identifier = vmx_revision();
 		error = vmclear(vmcs);
 		if (error != 0) {
 			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
 			      error, i);
 		}
 
+		vmx_msr_guest_init(vmx, i);
+
 		error = vmcs_init(vmcs);
 		KASSERT(error == 0, ("vmcs_init error %d", error));
 
 		VMPTRLD(vmcs);
 		error = 0;
 		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
 		error += vmwrite(VMCS_EPTP, vmx->eptp);
 		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
 		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
 		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
 		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
 		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
 		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
 		error += vmwrite(VMCS_VPID, vpid[i]);
 		if (virtual_interrupt_delivery) {
 			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
 			error += vmwrite(VMCS_VIRTUAL_APIC,
 			    vtophys(&vmx->apic_page[i]));
 			error += vmwrite(VMCS_EOI_EXIT0, 0);
 			error += vmwrite(VMCS_EOI_EXIT1, 0);
 			error += vmwrite(VMCS_EOI_EXIT2, 0);
 			error += vmwrite(VMCS_EOI_EXIT3, 0);
 		}
 		if (posted_interrupts) {
 			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
 			error += vmwrite(VMCS_PIR_DESC,
 			    vtophys(&vmx->pir_desc[i]));
 		}
 		VMCLEAR(vmcs);
 		KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
 
 		vmx->cap[i].set = 0;
 		vmx->cap[i].proc_ctls = procbased_ctls;
 		vmx->cap[i].proc_ctls2 = procbased_ctls2;
 
 		vmx->state[i].lastcpu = NOCPU;
 		vmx->state[i].vpid = vpid[i];
 
-		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
-
-		error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
-		    guest_msr_count);
-		if (error != 0)
-			panic("vmcs_set_msr_save error %d", error);
-
 		/*
 		 * Set up the CR0/4 shadows, and init the read shadow
 		 * to the power-on register value from the Intel Sys Arch.
 		 *  CR0 - 0x60000010
 		 *  CR4 - 0
 		 */
 		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
 		if (error != 0)
 			panic("vmx_setup_cr0_shadow %d", error);
 
 		error = vmx_setup_cr4_shadow(vmcs, 0);
 		if (error != 0)
 			panic("vmx_setup_cr4_shadow %d", error);
 
 		vmx->ctx[i].pmap = pmap;
 	}
 
 	return (vmx);
 }
 
 static int
 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
 {
 	int handled, func;
 	
 	func = vmxctx->guest_rax;
 
 	handled = x86_emulate_cpuid(vm, vcpu,
 				    (uint32_t*)(&vmxctx->guest_rax),
 				    (uint32_t*)(&vmxctx->guest_rbx),
 				    (uint32_t*)(&vmxctx->guest_rcx),
 				    (uint32_t*)(&vmxctx->guest_rdx));
 	return (handled);
 }
 
 static __inline void
 vmx_run_trace(struct vmx *vmx, int vcpu)
 {
 #ifdef KTR
 	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
 #endif
 }
 
 static __inline void
 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
 	       int handled)
 {
 #ifdef KTR
 	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
 		 handled ? "handled" : "unhandled",
 		 exit_reason_to_str(exit_reason), rip);
 #endif
 }
 
 static __inline void
 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
 {
 #ifdef KTR
 	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
 #endif
 }
 
 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
 
 /*
  * Invalidate guest mappings identified by its vpid from the TLB.
  */
 static __inline void
 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
 {
 	struct vmxstate *vmxstate;
 	struct invvpid_desc invvpid_desc;
 
 	vmxstate = &vmx->state[vcpu];
 	if (vmxstate->vpid == 0)
 		return;
 
 	if (!running) {
 		/*
 		 * Set the 'lastcpu' to an invalid host cpu.
 		 *
 		 * This will invalidate TLB entries tagged with the vcpu's
 		 * vpid the next time it runs via vmx_set_pcpu_defaults().
 		 */
 		vmxstate->lastcpu = NOCPU;
 		return;
 	}
 
 	KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
 	    "critical section", __func__, vcpu));
 
 	/*
 	 * Invalidate all mappings tagged with 'vpid'
 	 *
 	 * We do this because this vcpu was executing on a different host
 	 * cpu when it last ran. We do not track whether it invalidated
 	 * mappings associated with its 'vpid' during that run. So we must
 	 * assume that the mappings associated with 'vpid' on 'curcpu' are
 	 * stale and invalidate them.
 	 *
 	 * Note that we incur this penalty only when the scheduler chooses to
 	 * move the thread associated with this vcpu between host cpus.
 	 *
 	 * Note also that this will invalidate mappings tagged with 'vpid'
 	 * for "all" EP4TAs.
 	 */
 	if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
 		invvpid_desc._res1 = 0;
 		invvpid_desc._res2 = 0;
 		invvpid_desc.vpid = vmxstate->vpid;
 		invvpid_desc.linear_addr = 0;
 		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
 		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
 	} else {
 		/*
 		 * The invvpid can be skipped if an invept is going to
 		 * be performed before entering the guest. The invept
 		 * will invalidate combined mappings tagged with
 		 * 'vmx->eptp' for all vpids.
 		 */
 		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
 	}
 }
 
 static void
 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
 {
 	struct vmxstate *vmxstate;
 
 	vmxstate = &vmx->state[vcpu];
 	if (vmxstate->lastcpu == curcpu)
 		return;
 
 	vmxstate->lastcpu = curcpu;
 
 	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
 
 	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
 	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
 	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
 	vmx_invvpid(vmx, vcpu, pmap, 1);
 }
 
 /*
  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
  */
 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
 
 static void __inline
 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
 		vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
 		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 		VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
 	}
 }
 
 static void __inline
 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
 	    ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 	VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
 }
 
 static void __inline
 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
 		vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
 		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 		VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
 	}
 }
 
 static void __inline
 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
 	    ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
 }
 
 #define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
 #define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
 
 static void
 vmx_inject_nmi(struct vmx *vmx, int vcpu)
 {
 	uint32_t gi, info;
 
 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
 	    "interruptibility-state %#x", gi));
 
 	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 	KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
 	    "VM-entry interruption information %#x", info));
 
 	/*
 	 * Inject the virtual NMI. The vector must be the NMI IDT entry
 	 * or the VMCS entry check will fail.
 	 */
 	info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
 	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 
 	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
 
 	/* Clear the request */
 	vm_nmi_clear(vmx->vm, vcpu);
 }
 
 static void
 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
 {
 	int vector, need_nmi_exiting, extint_pending;
 	uint64_t rflags, entryinfo;
 	uint32_t gi, info;
 
 	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
 		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
 		    "intinfo is not valid: %#lx", __func__, entryinfo));
 
 		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
 		     "pending exception: %#lx/%#x", __func__, entryinfo, info));
 
 		info = entryinfo;
 		vector = info & 0xff;
 		if (vector == IDT_BP || vector == IDT_OF) {
 			/*
 			 * VT-x requires #BP and #OF to be injected as software
 			 * exceptions.
 			 */
 			info &= ~VMCS_INTR_T_MASK;
 			info |= VMCS_INTR_T_SWEXCEPTION;
 		}
 
 		if (info & VMCS_INTR_DEL_ERRCODE)
 			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
 
 		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 	}
 
 	if (vm_nmi_pending(vmx->vm, vcpu)) {
 		/*
 		 * If there are no conditions blocking NMI injection then
 		 * inject it directly here otherwise enable "NMI window
 		 * exiting" to inject it as soon as we can.
 		 *
 		 * We also check for STI_BLOCKING because some implementations
 		 * don't allow NMI injection in this case. If we are running
 		 * on a processor that doesn't have this restriction it will
 		 * immediately exit and the NMI will be injected in the
 		 * "NMI window exiting" handler.
 		 */
 		need_nmi_exiting = 1;
 		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
 			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 			if ((info & VMCS_INTR_VALID) == 0) {
 				vmx_inject_nmi(vmx, vcpu);
 				need_nmi_exiting = 0;
 			} else {
 				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
 				    "due to VM-entry intr info %#x", info);
 			}
 		} else {
 			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
 			    "Guest Interruptibility-state %#x", gi);
 		}
 
 		if (need_nmi_exiting)
 			vmx_set_nmi_window_exiting(vmx, vcpu);
 	}
 
 	extint_pending = vm_extint_pending(vmx->vm, vcpu);
 
 	if (!extint_pending && virtual_interrupt_delivery) {
 		vmx_inject_pir(vlapic);
 		return;
 	}
 
 	/*
 	 * If interrupt-window exiting is already in effect then don't bother
 	 * checking for pending interrupts. This is just an optimization and
 	 * not needed for correctness.
 	 */
 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
 		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
 		    "pending int_window_exiting");
 		return;
 	}
 
 	if (!extint_pending) {
 		/* Ask the local apic for a vector to inject */
 		if (!vlapic_pending_intr(vlapic, &vector))
 			return;
 
 		/*
 		 * From the Intel SDM, Volume 3, Section "Maskable
 		 * Hardware Interrupts":
 		 * - maskable interrupt vectors [16,255] can be delivered
 		 *   through the local APIC.
 		*/
 		KASSERT(vector >= 16 && vector <= 255,
 		    ("invalid vector %d from local APIC", vector));
 	} else {
 		/* Ask the legacy pic for a vector to inject */
 		vatpic_pending_intr(vmx->vm, &vector);
 
 		/*
 		 * From the Intel SDM, Volume 3, Section "Maskable
 		 * Hardware Interrupts":
 		 * - maskable interrupt vectors [0,255] can be delivered
 		 *   through the INTR pin.
 		 */
 		KASSERT(vector >= 0 && vector <= 255,
 		    ("invalid vector %d from INTR", vector));
 	}
 
 	/* Check RFLAGS.IF and the interruptibility state of the guest */
 	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 	if ((rflags & PSL_I) == 0) {
 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 		    "rflags %#lx", vector, rflags);
 		goto cantinject;
 	}
 
 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	if (gi & HWINTR_BLOCKING) {
 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 		    "Guest Interruptibility-state %#x", vector, gi);
 		goto cantinject;
 	}
 
 	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 	if (info & VMCS_INTR_VALID) {
 		/*
 		 * This is expected and could happen for multiple reasons:
 		 * - A vectoring VM-entry was aborted due to astpending
 		 * - A VM-exit happened during event injection.
 		 * - An exception was injected above.
 		 * - An NMI was injected above or after "NMI window exiting"
 		 */
 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 		    "VM-entry intr info %#x", vector, info);
 		goto cantinject;
 	}
 
 	/* Inject the interrupt */
 	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
 	info |= vector;
 	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 
 	if (!extint_pending) {
 		/* Update the Local APIC ISR */
 		vlapic_intr_accepted(vlapic, vector);
 	} else {
 		vm_extint_clear(vmx->vm, vcpu);
 		vatpic_intr_accepted(vmx->vm, vector);
 
 		/*
 		 * After we accepted the current ExtINT the PIC may
 		 * have posted another one.  If that is the case, set
 		 * the Interrupt Window Exiting execution control so
 		 * we can inject that one too.
 		 *
 		 * Also, interrupt window exiting allows us to inject any
 		 * pending APIC vector that was preempted by the ExtINT
 		 * as soon as possible. This applies both for the software
 		 * emulated vlapic and the hardware assisted virtual APIC.
 		 */
 		vmx_set_int_window_exiting(vmx, vcpu);
 	}
 
 	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
 
 	return;
 
 cantinject:
 	/*
 	 * Set the Interrupt Window Exiting execution control so we can inject
 	 * the interrupt as soon as blocking condition goes away.
 	 */
 	vmx_set_int_window_exiting(vmx, vcpu);
 }
 
 /*
  * If the Virtual NMIs execution control is '1' then the logical processor
  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
  * the VMCS. An IRET instruction in VMX non-root operation will remove any
  * virtual-NMI blocking.
  *
  * This unblocking occurs even if the IRET causes a fault. In this case the
  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
  */
 static void
 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
 {
 	uint32_t gi;
 
 	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 }
 
 static void
 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
 {
 	uint32_t gi;
 
 	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 }
 
 static void
 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
 {
 	uint32_t gi;
 
 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
 	    ("NMI blocking is not in effect %#x", gi));
 }
 
 /*
  * Emulate an XSETBV executed by the guest.
  *
  * The requested value (%edx:%eax) is checked against the host's XSAVE
  * limits and against the consistency rules for the individual feature
  * bits before it is loaded.  A request that fails a check is answered by
  * injecting an exception into the guest.  Always returns HANDLED.
  */
 static int
 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 {
 	const struct xsave_limits *xlim;
 	struct vmxctx *ctx;
 	uint64_t newval;
 
 	ctx = &vmx->ctx[vcpu];
 	xlim = vmm_get_xsave_limits();
 
 	/*
 	 * There is no CPL check here: the processor raises a GP# fault on
 	 * its own when xsetbv is executed with CPL != 0, so that fault
 	 * does not have to be emulated.
 	 */
 
 	/* %rcx selects the register; only xcr0 is supported. */
 	if (ctx->guest_rcx != 0)
 		goto gpfault;
 
 	/* xcr0 is only handled if both host and guest have XSAVE enabled. */
 	if (!xlim->xsave_enabled ||
 	    (vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE) == 0) {
 		vm_inject_ud(vmx->vm, vcpu);
 		return (HANDLED);
 	}
 
 	newval = ctx->guest_rdx << 32 | (ctx->guest_rax & 0xffffffff);
 
 	/* No bit outside of what the host allows may be requested. */
 	if ((newval & ~xlim->xcr0_allowed) != 0)
 		goto gpfault;
 
 	/* The x87 bit has to be set. */
 	if ((newval & XFEATURE_ENABLED_X87) == 0)
 		goto gpfault;
 
 	/* AVX (YMM_Hi128) requires SSE. */
 	if ((newval & XFEATURE_ENABLED_AVX) != 0 &&
 	    (newval & XFEATURE_AVX) != XFEATURE_AVX)
 		goto gpfault;
 
 	/*
 	 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
 	 * ZMM_Hi256, and Hi16_ZMM.
 	 */
 	if ((newval & XFEATURE_AVX512) != 0 &&
 	    (newval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
 	    (XFEATURE_AVX512 | XFEATURE_AVX))
 		goto gpfault;
 
 	/*
 	 * Intel MPX: the two bound register state flags must be either
 	 * both set or both clear.
 	 */
 	if (!(newval & XFEATURE_ENABLED_BNDREGS) !=
 	    !(newval & XFEATURE_ENABLED_BNDCSR))
 		goto gpfault;
 
 	/*
 	 * This runs "inside" vmrun() with the guest's FPU state, so
 	 * modifying xcr0 directly modifies the guest's xcr0, not the
 	 * host's.
 	 */
 	load_xcr(0, newval);
 	return (HANDLED);
 
 gpfault:
 	vm_inject_gp(vmx->vm, vcpu);
 	return (HANDLED);
 }
 
 /*
  * Fetch the current value of a guest general purpose register.
  *
  * 'ident' is a register number in the range 0-15.  Register 4 (%rsp) is
  * read from the VMCS; all others come from the saved vcpu context.  Any
  * other 'ident' is a panic.
  */
 static uint64_t
 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
 {
 	const struct vmxctx *ctx = &vmx->ctx[vcpu];
 	uint64_t val;
 
 	switch (ident) {
 	case 0:
 		val = ctx->guest_rax;
 		break;
 	case 1:
 		val = ctx->guest_rcx;
 		break;
 	case 2:
 		val = ctx->guest_rdx;
 		break;
 	case 3:
 		val = ctx->guest_rbx;
 		break;
 	case 4:
 		val = vmcs_read(VMCS_GUEST_RSP);
 		break;
 	case 5:
 		val = ctx->guest_rbp;
 		break;
 	case 6:
 		val = ctx->guest_rsi;
 		break;
 	case 7:
 		val = ctx->guest_rdi;
 		break;
 	case 8:
 		val = ctx->guest_r8;
 		break;
 	case 9:
 		val = ctx->guest_r9;
 		break;
 	case 10:
 		val = ctx->guest_r10;
 		break;
 	case 11:
 		val = ctx->guest_r11;
 		break;
 	case 12:
 		val = ctx->guest_r12;
 		break;
 	case 13:
 		val = ctx->guest_r13;
 		break;
 	case 14:
 		val = ctx->guest_r14;
 		break;
 	case 15:
 		val = ctx->guest_r15;
 		break;
 	default:
 		panic("invalid vmx register %d", ident);
 	}
 
 	return (val);
 }
 
 /*
  * Store 'regval' into a guest general purpose register.
  *
  * 'ident' is a register number in the range 0-15.  Register 4 (%rsp) is
  * written to the VMCS; all others are written to the saved vcpu context.
  * Any other 'ident' is a panic.
  */
 static void
 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
 {
 	struct vmxctx *ctx = &vmx->ctx[vcpu];
 
 	/* %rsp is the one register here that is kept in the VMCS. */
 	if (ident == 4) {
 		vmcs_write(VMCS_GUEST_RSP, regval);
 		return;
 	}
 
 	switch (ident) {
 	case 0:
 		ctx->guest_rax = regval;
 		return;
 	case 1:
 		ctx->guest_rcx = regval;
 		return;
 	case 2:
 		ctx->guest_rdx = regval;
 		return;
 	case 3:
 		ctx->guest_rbx = regval;
 		return;
 	case 5:
 		ctx->guest_rbp = regval;
 		return;
 	case 6:
 		ctx->guest_rsi = regval;
 		return;
 	case 7:
 		ctx->guest_rdi = regval;
 		return;
 	case 8:
 		ctx->guest_r8 = regval;
 		return;
 	case 9:
 		ctx->guest_r9 = regval;
 		return;
 	case 10:
 		ctx->guest_r10 = regval;
 		return;
 	case 11:
 		ctx->guest_r11 = regval;
 		return;
 	case 12:
 		ctx->guest_r12 = regval;
 		return;
 	case 13:
 		ctx->guest_r13 = regval;
 		return;
 	case 14:
 		ctx->guest_r14 = regval;
 		return;
 	case 15:
 		ctx->guest_r15 = regval;
 		return;
 	default:
 		panic("invalid vmx register %d", ident);
 	}
 }
 
 /*
  * Emulate a guest access to %cr0 described by 'exitqual'.
  *
  * Only "mov to %cr0" is emulated; anything else returns UNHANDLED.  The
  * value written by the guest goes into the CR0 read shadow unmodified,
  * while the real guest %cr0 additionally has the cr0_ones_mask bits set
  * and the cr0_zeros_mask bits cleared.
  */
 static int
 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 {
 	uint64_t efer, newcr0;
 
 	/* We only handle mov to %cr0 at this time */
 	if ((exitqual & 0xf0) != 0x00)
 		return (UNHANDLED);
 
 	/* Bits 11:8 of the qualification select the source register. */
 	newcr0 = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 
 	vmcs_write(VMCS_CR0_SHADOW, newcr0);
 	vmcs_write(VMCS_GUEST_CR0,
 	    (newcr0 | cr0_ones_mask) & ~cr0_zeros_mask);
 
 	if ((newcr0 & CR0_PG) == 0)
 		return (HANDLED);
 
 	/*
 	 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
 	 * the "IA-32e mode guest" bit in VM-entry control must be
 	 * equal.
 	 */
 	efer = vmcs_read(VMCS_GUEST_IA32_EFER);
 	if ((efer & EFER_LME) == 0)
 		return (HANDLED);
 
 	vmcs_write(VMCS_GUEST_IA32_EFER, efer | EFER_LMA);
 	vmcs_write(VMCS_ENTRY_CTLS,
 	    vmcs_read(VMCS_ENTRY_CTLS) | VM_ENTRY_GUEST_LMA);
 
 	return (HANDLED);
 }
 
 /*
  * Emulate a guest access to %cr4 described by 'exitqual'.
  *
  * Only "mov to %cr4" is emulated; anything else returns UNHANDLED.  The
  * value written by the guest goes into the CR4 read shadow unmodified,
  * while the real guest %cr4 additionally has the cr4_ones_mask bits set
  * and the cr4_zeros_mask bits cleared.
  */
 static int
 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 {
 	uint64_t newcr4;
 
 	/* We only handle mov to %cr4 at this time */
 	if ((exitqual & 0xf0) != 0x00)
 		return (UNHANDLED);
 
 	/* Bits 11:8 of the qualification select the source register. */
 	newcr4 = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 
 	vmcs_write(VMCS_CR4_SHADOW, newcr4);
 	vmcs_write(VMCS_GUEST_CR4,
 	    (newcr4 | cr4_ones_mask) & ~cr4_zeros_mask);
 
 	return (HANDLED);
 }
 
 /*
  * Emulate a guest "mov" to or from %cr8 by forwarding it to the vcpu's
  * virtual local APIC.  Other kinds of %cr8 access return UNHANDLED.
  */
 static int
 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 {
 	struct vlapic *lapic;
 	int gpr;
 
 	/* We only handle mov %cr8 to/from a register at this time. */
 	if ((exitqual & 0xe0) != 0x00)
 		return (UNHANDLED);
 
 	lapic = vm_lapic(vmx->vm, vcpu);
 	gpr = (exitqual >> 8) & 0xf;
 
 	if ((exitqual & 0x10) == 0) {
 		/* Register -> %cr8. */
 		vlapic_set_cr8(lapic, vmx_get_guest_reg(vmx, vcpu, gpr));
 	} else {
 		/* %cr8 -> register. */
 		vmx_set_guest_reg(vmx, vcpu, gpr, vlapic_get_cr8(lapic));
 	}
 
 	return (HANDLED);
 }
 
 /*
  * Return the guest's current privilege level.
  *
  * Per section "Guest Register State" in the Intel SDM, CPL = SS.DPL, so
  * the value is taken from the DPL field (bits 6:5) of the guest SS
  * access rights.
  */
 static int
 vmx_cpl(void)
 {
 	uint32_t ss_attr, dpl;
 
 	ss_attr = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
 	dpl = (ss_attr >> 5) & 0x3;
 	return (dpl);
 }
 
 /*
  * Classify the guest's current operating mode from EFER.LMA, the 'L' bit
  * of the CS access rights and CR0.PE, all read from the VMCS.
  */
 static enum vm_cpu_mode
 vmx_cpu_mode(void)
 {
 	uint32_t cs_attr;
 
 	if ((vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) == 0) {
 		/* Not in long mode: CR0.PE tells protected from real. */
 		if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE)
 			return (CPU_MODE_PROTECTED);
 		return (CPU_MODE_REAL);
 	}
 
 	/* Long mode is active: CS.L = 1 selects the 64-bit sub-mode. */
 	cs_attr = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 	return ((cs_attr & 0x2000) ? CPU_MODE_64BIT : CPU_MODE_COMPATIBILITY);
 }
 
 /*
  * Classify the guest's paging mode from CR0.PG, CR4.PAE and EFER.LME,
  * consulted in that order.
  */
 static enum vm_paging_mode
 vmx_paging_mode(void)
 {
 
 	if ((vmcs_read(VMCS_GUEST_CR0) & CR0_PG) == 0) {
 		/* Paging is turned off. */
 		return (PAGING_MODE_FLAT);
 	}
 	if ((vmcs_read(VMCS_GUEST_CR4) & CR4_PAE) == 0) {
 		/* Paging without PAE. */
 		return (PAGING_MODE_32);
 	}
 	return ((vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) ?
 	    PAGING_MODE_64 : PAGING_MODE_PAE);
 }
 
 static uint64_t
 inout_str_index(struct vmx *vmx, int vcpuid, int in)
 {
 	uint64_t val;
 	int error;
 	enum vm_reg_name reg;
 
 	reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
 	error = vmx_getreg(vmx, vcpuid, reg, &val);
 	KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
 	return (val);
 }
 
 static uint64_t
 inout_str_count(struct vmx *vmx, int vcpuid, int rep)
 {
 	uint64_t val;
 	int error;
 
 	if (rep) {
 		error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
 		KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
 	} else {
 		val = 1;
 	}
 	return (val);
 }
 
 /*
  * Decode the address size, in bytes, of a string I/O instruction from
  * bits 9:7 of the VM-exit instruction information: encodings 0, 1 and 2
  * stand for 16, 32 and 64 bit addresses.  Any other encoding is a panic.
  */
 static int
 inout_str_addrsize(uint32_t inst_info)
 {
 	uint32_t enc;
 
 	enc = (inst_info >> 7) & 0x7;
 	if (enc > 2)
 		panic("%s: invalid size encoding %d", __func__, enc);
 
 	/* 0 -> 2 bytes, 1 -> 4 bytes, 2 -> 8 bytes */
 	return (2 << enc);
 }
 
 /*
  * Fill in the segment name and segment descriptor of 'vis' for a string
  * I/O instruction.  For an "in" the segment is always %es; for an "out"
  * it is decoded from bits 17:15 of the instruction information.
  */
 static void
 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
     struct vm_inout_str *vis)
 {
 	int error, segidx;
 
 	if (!in) {
 		segidx = (inst_info >> 15) & 0x7;
 		vis->seg_name = vm_segment_name(segidx);
 	} else {
 		vis->seg_name = VM_REG_GUEST_ES;
 	}
 
 	error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
 	KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
 
 	/* XXX modify svm.c to update bit 16 of seg_desc.access (unusable) */
 }
 
 /*
  * Snapshot the guest's paging related state (%cr3, CPL, cpu mode and
  * paging mode) into 'paging'.
  */
 static void
 vmx_paging_info(struct vm_guest_paging *paging)
 {
 
 	/* The individual fields are independent of one another. */
 	paging->cr3 = vmcs_guest_cr3();
 	paging->paging_mode = vmx_paging_mode();
 	paging->cpu_mode = vmx_cpu_mode();
 	paging->cpl = vmx_cpl();
 }
 
 /*
  * Turn 'vmexit' into an instruction emulation exit for the access at
  * guest physical address 'gpa' / guest linear address 'gla', recording
  * the guest paging state and the default operand size bit of %cs.
  */
 static void
 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
 {
 	struct vm_guest_paging *pg;
 	uint32_t cs_attr;
 
 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 	vmexit->u.inst_emul.gpa = gpa;
 	vmexit->u.inst_emul.gla = gla;
 
 	pg = &vmexit->u.inst_emul.paging;
 	vmx_paging_info(pg);
 
 	/* CS.D is only reported for protected and compatibility mode. */
 	if (pg->cpu_mode == CPU_MODE_PROTECTED ||
 	    pg->cpu_mode == CPU_MODE_COMPATIBILITY) {
 		cs_attr = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 		vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(cs_attr);
 	} else {
 		vmexit->u.inst_emul.cs_d = 0;
 	}
 }
 
 /*
  * Translate an EPT violation exit qualification into a VM_PROT_* fault
  * type.  A data write takes precedence over an instruction fetch; if
  * neither bit is set the fault is reported as a read.
  */
 static int
 ept_fault_type(uint64_t ept_qual)
 {
 
 	if (ept_qual & EPT_VIOLATION_DATA_WRITE)
 		return (VM_PROT_WRITE);
 	if (ept_qual & EPT_VIOLATION_INST_FETCH)
 		return (VM_PROT_EXECUTE);
 	return (VM_PROT_READ);
 }
 
 /*
  * Decide whether an EPT violation qualifies for instruction emulation.
  * Returns TRUE only for a data read or write whose guest-physical address
  * is the translation of a valid guest-linear address.
  */
 static boolean_t
 ept_emulation_fault(uint64_t ept_qual)
 {
 
 	/* EPT fault on an instruction fetch doesn't make sense here */
 	if ((ept_qual & EPT_VIOLATION_INST_FETCH) != 0)
 		return (FALSE);
 
 	/* EPT fault must be a read fault or a write fault */
 	if ((ept_qual &
 	    (EPT_VIOLATION_DATA_READ | EPT_VIOLATION_DATA_WRITE)) == 0)
 		return (FALSE);
 
 	/*
 	 * The EPT violation must have been caused by accessing a
 	 * guest-physical address that is a translation of a guest-linear
 	 * address.
 	 */
 	if ((ept_qual & EPT_VIOLATION_GLA_VALID) != 0 &&
 	    (ept_qual & EPT_VIOLATION_XLAT_VALID) != 0)
 		return (TRUE);
 
 	return (FALSE);
 }
 
 /*
  * Return 1 if the "virtualize APIC accesses" secondary processor-based
  * control is set for 'vcpuid', and 0 otherwise.
  */
 static __inline int
 apic_access_virtualization(struct vmx *vmx, int vcpuid)
 {
 
 	return ((vmx->cap[vcpuid].proc_ctls2 &
 	    PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0);
 }
 
 /*
  * Return 1 if the "virtualize x2APIC mode" secondary processor-based
  * control is set for 'vcpuid', and 0 otherwise.
  */
 static __inline int
 x2apic_virtualization(struct vmx *vmx, int vcpuid)
 {
 
 	return ((vmx->cap[vcpuid].proc_ctls2 &
 	    PROCBASED2_VIRTUALIZE_X2APIC_MODE) != 0);
 }
 
 /*
  * Process an APIC-write VM exit by invoking the vlapic write handler for
  * the register at the offset encoded in 'qual'.
  *
  * Returns HANDLED when the write was fully dealt with, and UNHANDLED for
  * an unexpected offset, a failed ICR write, or an ICR write whose handler
  * set its 'retu' output flag.
  */
 static int
 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
     uint64_t qual)
 {
 	uint32_t *regs, vec;
 	int err, off;
 	bool to_user;
 
 	off = APIC_WRITE_OFFSET(qual);
 
 	if (!apic_access_virtualization(vmx, vcpuid)) {
 		/*
 		 * In general there should not be any APIC write VM-exits
 		 * unless APIC-access virtualization is enabled.
 		 *
 		 * However self-IPI virtualization can legitimately trigger
 		 * an APIC-write VM-exit so treat it specially.
 		 */
 		if (!x2apic_virtualization(vmx, vcpuid) ||
 		    off != APIC_OFFSET_SELF_IPI)
 			return (UNHANDLED);
 
 		regs = (uint32_t *)(vlapic->apic_page);
 		vec = regs[APIC_OFFSET_SELF_IPI / 4];
 		vlapic_self_ipi_handler(vlapic, vec);
 		return (HANDLED);
 	}
 
 	switch (off) {
 	case APIC_OFFSET_ID:
 		vlapic_id_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_LDR:
 		vlapic_ldr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_DFR:
 		vlapic_dfr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_SVR:
 		vlapic_svr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_ESR:
 		vlapic_esr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_ICR_LOW:
 		to_user = false;
 		err = vlapic_icrlo_write_handler(vlapic, &to_user);
 		if (err != 0 || to_user)
 			return (UNHANDLED);
 		break;
 	case APIC_OFFSET_CMCI_LVT:
 	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
 		vlapic_lvt_write_handler(vlapic, off);
 		break;
 	case APIC_OFFSET_TIMER_ICR:
 		vlapic_icrtmr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_TIMER_DCR:
 		vlapic_dcr_write_handler(vlapic);
 		break;
 	default:
 		/* A write to any other offset is not expected. */
 		return (UNHANDLED);
 	}
 
 	return (HANDLED);
 }
 
 /*
  * Return true if APIC-access virtualization is enabled for 'vcpuid' and
  * 'gpa' lies within the page that starts at DEFAULT_APIC_BASE.
  */
 static bool
 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
 {
 
 	return (apic_access_virtualization(vmx, vcpuid) &&
 	    gpa >= DEFAULT_APIC_BASE &&
 	    gpa < DEFAULT_APIC_BASE + PAGE_SIZE);
 }
 
 /*
  * Process an APIC-access VM exit.
  *
  * If the access is one of the expected kinds, 'vmexit' is converted into
  * an instruction emulation exit for the corresponding APIC register.  The
  * return value is UNHANDLED in every case (see the comment at the end of
  * the function).
  */
 static int
 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 {
 	uint64_t qual;
 	int type, offset, allowed;
 
 	if (!apic_access_virtualization(vmx, vcpuid))
 		return (UNHANDLED);
 
 	qual = vmexit->u.vmx.exit_qualification;
 	type = APIC_ACCESS_TYPE(qual);
 	offset = APIC_ACCESS_OFFSET(qual);
 
 	/*
 	 * Access type 0 is a data read and access type 1 is a data write.
 	 * No other access type is expected at any offset.
 	 */
 	switch (offset) {
 	case APIC_OFFSET_APR:
 	case APIC_OFFSET_PPR:
 	case APIC_OFFSET_RRR:
 	case APIC_OFFSET_CMCI_LVT:
 	case APIC_OFFSET_TIMER_CCR:
 		/* Both read and write data access is expected here. */
 		allowed = (type == 0 || type == 1);
 		break;
 	case APIC_OFFSET_VER:
 	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
 	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
 	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
 		/* Only write data access is expected here. */
 		allowed = (type == 1);
 		break;
 	default:
 		allowed = 0;
 		break;
 	}
 
 	if (allowed) {
 		vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
 		    VIE_INVALID_GLA);
 	}
 
 	/*
 	 * Regardless of whether the APIC-access is allowed this handler
 	 * always returns UNHANDLED:
 	 * - if the access is allowed then it is handled by emulating the
 	 *   instruction that caused the VM-exit (outside the critical section)
 	 * - if the access is not allowed then it will be converted to an
 	 *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
 	 */
 	return (UNHANDLED);
 }
 
 static enum task_switch_reason
 vmx_task_switch_reason(uint64_t qual)
 {
 	int reason;
 
 	reason = (qual >> 30) & 0x3;
 	switch (reason) {
 	case 0:
 		return (TSR_CALL);
 	case 1:
 		return (TSR_IRET);
 	case 2:
 		return (TSR_JMP);
 	case 3:
 		return (TSR_IDT_GATE);
 	default:
 		panic("%s: invalid reason %d", __func__, reason);
 	}
 }
 
 static int
+emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
+{
+	int error;
+
+	if (lapic_msr(num))
+		error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
+	else
+		error = vmx_wrmsr(vmx, vcpuid, num, val, retu);
+
+	return (error);
+}
+
+/*
+ * Emulate a guest read of MSR 'num'.  MSRs claimed by lapic_msr() are
+ * read through lapic_rdmsr() and all others through vmx_rdmsr().  When
+ * the read succeeds, the low and high 32 bits of the result are stored
+ * in the guest's %rax and %rdx respectively.
+ */
+static int
+emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
+{
+	struct vmxctx *ctx;
+	uint64_t msrval;
+	uint32_t lo, hi;
+	int error;
+
+	error = lapic_msr(num) ?
+	    lapic_rdmsr(vmx->vm, vcpuid, num, &msrval, retu) :
+	    vmx_rdmsr(vmx, vcpuid, num, &msrval, retu);
+	if (error != 0)
+		return (error);
+
+	ctx = &vmx->ctx[vcpuid];
+
+	lo = msrval;
+	error = vmxctx_setreg(ctx, VM_REG_GUEST_RAX, lo);
+	KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
+
+	hi = msrval >> 32;
+	error = vmxctx_setreg(ctx, VM_REG_GUEST_RDX, hi);
+	KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
+
+	return (error);
+}
+
 /*
  * Decode and process a single VM exit for 'vcpu'.
  *
  * The exit reason and exit qualification are taken from 'vmexit', which
  * is then filled in with an exitcode and any collateral that goes with
  * it.  The return value is non-zero when the exit was dealt with here
  * and UNHANDLED when 'vmexit' describes an exit that still has to be
  * processed by the caller.
  *
  * Several cases below "return (1)" directly.  Those bypass the common
  * tail of the function and therefore do not advance the guest %rip.
  */
+static int
 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 {
 	int error, handled, in;
 	struct vmxctx *vmxctx;
 	struct vlapic *vlapic;
 	struct vm_inout_str *vis;
 	struct vm_task_switch *ts;
 	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
 	uint32_t intr_type, reason;
 	uint64_t exitintinfo, qual, gpa;
 	bool retu;
 
 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
 
 	handled = UNHANDLED;
 	vmxctx = &vmx->ctx[vcpu];
 
 	qual = vmexit->u.vmx.exit_qualification;
 	reason = vmexit->u.vmx.exit_reason;
 	/* BOGUS marks the exit as not (yet) claimed by any case below. */
 	vmexit->exitcode = VM_EXITCODE_BOGUS;
 
 	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
 
 	/*
 	 * VM exits that can be triggered during event delivery need to
 	 * be handled specially by re-injecting the event if the IDT
 	 * vectoring information field's valid bit is set.
 	 *
 	 * See "Information for VM Exits During Event Delivery" in Intel SDM
 	 * for details.
 	 */
 	idtvec_info = vmcs_idt_vectoring_info();
 	if (idtvec_info & VMCS_IDT_VEC_VALID) {
 		idtvec_info &= ~(1 << 12); /* clear undefined bit */
 		/* The error code, when valid, goes in the upper 32 bits. */
 		exitintinfo = idtvec_info;
 		if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 			idtvec_err = vmcs_idt_vectoring_err();
 			exitintinfo |= (uint64_t)idtvec_err << 32;
 		}
 		error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
 		KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
 		    __func__, error));
 
 		/*
 		 * If 'virtual NMIs' are being used and the VM-exit
 		 * happened while injecting an NMI during the previous
 		 * VM-entry, then clear "blocking by NMI" in the
 		 * Guest Interruptibility-State so the NMI can be
 		 * reinjected on the subsequent VM-entry.
 		 *
 		 * However, if the NMI was being delivered through a task
 		 * gate, then the new task must start execution with NMIs
 		 * blocked so don't clear NMI blocking in this case.
 		 */
 		intr_type = idtvec_info & VMCS_INTR_T_MASK;
 		if (intr_type == VMCS_INTR_T_NMI) {
 			if (reason != EXIT_REASON_TASK_SWITCH)
 				vmx_clear_nmi_blocking(vmx, vcpu);
 			else
 				vmx_assert_nmi_blocking(vmx, vcpu);
 		}
 
 		/*
 		 * Update VM-entry instruction length if the event being
 		 * delivered was a software interrupt or software exception.
 		 */
 		if (intr_type == VMCS_INTR_T_SWINTR ||
 		    intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
 		    intr_type == VMCS_INTR_T_SWEXCEPTION) {
 			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
 		}
 	}
 
 	switch (reason) {
 	case EXIT_REASON_TASK_SWITCH:
 		ts = &vmexit->u.task_switch;
 		ts->tsssel = qual & 0xffff;
 		ts->reason = vmx_task_switch_reason(qual);
 		ts->ext = 0;
 		ts->errcode_valid = 0;
 		vmx_paging_info(&ts->paging);
 		/*
 		 * If the task switch was due to a CALL, JMP, IRET, software
 		 * interrupt (INT n) or software exception (INT3, INTO),
 		 * then the saved %rip references the instruction that caused
 		 * the task switch. The instruction length field in the VMCS
 		 * is valid in this case.
 		 *
 		 * In all other cases (e.g., NMI, hardware exception) the
 		 * saved %rip is one that would have been saved in the old TSS
 		 * had the task switch completed normally so the instruction
 		 * length field is not needed in this case and is explicitly
 		 * set to 0.
 		 */
 		if (ts->reason == TSR_IDT_GATE) {
 			KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
 			    ("invalid idtvec_info %#x for IDT task switch",
 			    idtvec_info));
 			intr_type = idtvec_info & VMCS_INTR_T_MASK;
 			if (intr_type != VMCS_INTR_T_SWINTR &&
 			    intr_type != VMCS_INTR_T_SWEXCEPTION &&
 			    intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
 				/* Task switch triggered by external event */
 				ts->ext = 1;
 				vmexit->inst_length = 0;
 				if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 					ts->errcode_valid = 1;
 					ts->errcode = vmcs_idt_vectoring_err();
 				}
 			}
 		}
 		vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
 		VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
 		    "%s errcode 0x%016lx", ts->reason, ts->tsssel,
 		    ts->ext ? "external" : "internal",
 		    ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
 		break;
 	case EXIT_REASON_CR_ACCESS:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
 		/*
 		 * The low four bits of the qualification select the control
 		 * register.  Registers other than %cr0, %cr4 and %cr8 are
 		 * left unhandled.
 		 */
 		switch (qual & 0xf) {
 		case 0:
 			handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
 			break;
 		case 4:
 			handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
 			break;
 		case 8:
 			handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
 			break;
 		}
 		break;
 	case EXIT_REASON_RDMSR:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
 		retu = false;
 		/* The MSR number is the low 32 bits of the guest's %rcx. */
 		ecx = vmxctx->guest_rcx;
 		VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
-		error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
+		error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
 		if (error) {
 			vmexit->exitcode = VM_EXITCODE_RDMSR;
 			vmexit->u.msr.code = ecx;
 		} else if (!retu) {
 			handled = HANDLED;
 		} else {
 			/* Return to userspace with a valid exitcode */
 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
-			    ("emulate_wrmsr retu with bogus exitcode"));
+			    ("emulate_rdmsr retu with bogus exitcode"));
 		}
 		break;
 	case EXIT_REASON_WRMSR:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
 		retu = false;
 		/* MSR number in %ecx, value to be written in %edx:%eax. */
 		eax = vmxctx->guest_rax;
 		ecx = vmxctx->guest_rcx;
 		edx = vmxctx->guest_rdx;
 		VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
 		    ecx, (uint64_t)edx << 32 | eax);
-		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
+		error = emulate_wrmsr(vmx, vcpu, ecx,
 		    (uint64_t)edx << 32 | eax, &retu);
 		if (error) {
 			vmexit->exitcode = VM_EXITCODE_WRMSR;
 			vmexit->u.msr.code = ecx;
 			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
 		} else if (!retu) {
 			handled = HANDLED;
 		} else {
 			/* Return to userspace with a valid exitcode */
 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 			    ("emulate_wrmsr retu with bogus exitcode"));
 		}
 		break;
 	case EXIT_REASON_HLT:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
 		vmexit->exitcode = VM_EXITCODE_HLT;
 		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 		break;
 	case EXIT_REASON_MTF:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
 		vmexit->exitcode = VM_EXITCODE_MTRAP;
 		break;
 	case EXIT_REASON_PAUSE:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
 		vmexit->exitcode = VM_EXITCODE_PAUSE;
 		break;
 	case EXIT_REASON_INTR_WINDOW:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
 		vmx_clear_int_window_exiting(vmx, vcpu);
 		return (1);
 	case EXIT_REASON_EXT_INTR:
 		/*
 		 * External interrupts serve only to cause VM exits and allow
 		 * the host interrupt handler to run.
 		 *
 		 * If this external interrupt triggers a virtual interrupt
 		 * to a VM, then that state will be recorded by the
 		 * host interrupt handler in the VM's softc. We will inject
 		 * this virtual interrupt during the subsequent VM enter.
 		 */
 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 
 		/*
 		 * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
 		 * This appears to be a bug in VMware Fusion?
 		 */
 		if (!(intr_info & VMCS_INTR_VALID))
 			return (1);
 		KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
 		    (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
 		    ("VM exit interruption info invalid: %#x", intr_info));
 		vmx_trigger_hostintr(intr_info & 0xff);
 
 		/*
 		 * This is special. We want to treat this as an 'handled'
 		 * VM-exit but not increment the instruction pointer.
 		 */
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
 		return (1);
 	case EXIT_REASON_NMI_WINDOW:
 		/* Exit to allow the pending virtual NMI to be injected */
 		if (vm_nmi_pending(vmx->vm, vcpu))
 			vmx_inject_nmi(vmx, vcpu);
 		vmx_clear_nmi_window_exiting(vmx, vcpu);
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
 		return (1);
 	case EXIT_REASON_INOUT:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
 		vmexit->exitcode = VM_EXITCODE_INOUT;
 		/*
 		 * Decode the exit qualification: access size minus one in
 		 * bits 2:0, direction in bit 3, string in bit 4, rep in
 		 * bit 5 and the port number starting at bit 16.
 		 */
 		vmexit->u.inout.bytes = (qual & 0x7) + 1;
 		vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
 		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
 		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
 		vmexit->u.inout.port = (uint16_t)(qual >> 16);
 		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
 		if (vmexit->u.inout.string) {
 			/* String I/O is reported under its own exitcode. */
 			inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
 			vmexit->exitcode = VM_EXITCODE_INOUT_STR;
 			vis = &vmexit->u.inout_str;
 			vmx_paging_info(&vis->paging);
 			vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 			vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
 			vis->index = inout_str_index(vmx, vcpu, in);
 			vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
 			vis->addrsize = inout_str_addrsize(inst_info);
 			inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
 		}
 		break;
 	case EXIT_REASON_CPUID:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
 		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
 		break;
 	case EXIT_REASON_EXCEPTION:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
 		    ("VM exit interruption info invalid: %#x", intr_info));
 
 		/*
 		 * If Virtual NMIs control is 1 and the VM-exit is due to a
 		 * fault encountered during the execution of IRET then we must
 		 * restore the state of "virtual-NMI blocking" before resuming
 		 * the guest.
 		 *
 		 * See "Resuming Guest Software after Handling an Exception".
 		 * See "Information for VM Exits Due to Vectored Events".
 		 */
 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 		    (intr_info & 0xff) != IDT_DF &&
 		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
 			vmx_restore_nmi_blocking(vmx, vcpu);
 
 		/*
 		 * The NMI has already been handled in vmx_exit_handle_nmi().
 		 */
 		if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
 			return (1);
 		break;
 	case EXIT_REASON_EPT_FAULT:
 		/*
 		 * If 'gpa' lies within the address space allocated to
 		 * memory then this must be a nested page fault otherwise
 		 * this must be an instruction that accesses MMIO space.
 		 */
 		gpa = vmcs_gpa();
 		if (vm_mem_allocated(vmx->vm, gpa) ||
 		    apic_access_fault(vmx, vcpu, gpa)) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->u.paging.gpa = gpa;
 			vmexit->u.paging.fault_type = ept_fault_type(qual);
 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
 		} else if (ept_emulation_fault(qual)) {
 			vmexit_inst_emul(vmexit, gpa, vmcs_gla());
 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
 		}
 		/*
 		 * If Virtual NMIs control is 1 and the VM-exit is due to an
 		 * EPT fault during the execution of IRET then we must restore
 		 * the state of "virtual-NMI blocking" before resuming.
 		 *
 		 * See description of "NMI unblocking due to IRET" in
 		 * "Exit Qualification for EPT Violations".
 		 */
 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 		    (qual & EXIT_QUAL_NMIUDTI) != 0)
 			vmx_restore_nmi_blocking(vmx, vcpu);
 		break;
 	case EXIT_REASON_VIRTUALIZED_EOI:
 		vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
 		vmexit->u.ioapic_eoi.vector = qual & 0xFF;
 		vmexit->inst_length = 0;	/* trap-like */
 		break;
 	case EXIT_REASON_APIC_ACCESS:
 		handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
 		break;
 	case EXIT_REASON_APIC_WRITE:
 		/*
 		 * APIC-write VM exit is trap-like so the %rip is already
 		 * pointing to the next instruction.
 		 */
 		vmexit->inst_length = 0;
 		vlapic = vm_lapic(vmx->vm, vcpu);
 		handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
 		break;
 	case EXIT_REASON_XSETBV:
 		handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
 		break;
 	/*
 	 * MONITOR and MWAIT are not emulated here: each is reported with a
 	 * dedicated exitcode and 'handled' stays UNHANDLED.
 	 */
+	case EXIT_REASON_MONITOR:
+		vmexit->exitcode = VM_EXITCODE_MONITOR;
+		break;
+	case EXIT_REASON_MWAIT:
+		vmexit->exitcode = VM_EXITCODE_MWAIT;
+		break;
 	default:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
 		break;
 	}
 
 	if (handled) {
 		/*
 		 * It is possible that control is returned to userland
 		 * even though we were able to handle the VM exit in the
 		 * kernel.
 		 *
 		 * In such a case we want to make sure that the userland
 		 * restarts guest execution at the instruction *after*
 		 * the one we just processed. Therefore we update the
 		 * guest rip in the VMCS and in 'vmexit'.
 		 */
 		vmexit->rip += vmexit->inst_length;
 		vmexit->inst_length = 0;
 		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
 	} else {
 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
 			/*
 			 * If this VM exit was not claimed by anybody then
 			 * treat it as a generic VMX exit.
 			 */
 			vmexit->exitcode = VM_EXITCODE_VMX;
 			vmexit->u.vmx.status = VM_SUCCESS;
 			vmexit->u.vmx.inst_type = 0;
 			vmexit->u.vmx.inst_error = 0;
 		} else {
 			/*
 			 * The exitcode and collateral have been populated.
 			 * The VM exit will be processed further in userland.
 			 */
 		}
 	}
 	return (handled);
 }
 
 /*
  * Populate 'vmexit' after vmx_enter_guest() failed with 'rc' instead of
  * producing a regular VM exit.
  *
  * The exit is reported as VM_EXITCODE_VMX carrying the failure status
  * saved in 'vmxctx', the VM-instruction error read from the VMCS and the
  * failing instruction type ('rc').  The exit reason and qualification do
  * not apply and are set to all ones.  An 'rc' other than the three known
  * error codes is a panic.
  */
 static __inline void
 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
 {
 
 	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
 	    ("vmx_exit_inst_error: invalid inst_fail_status %d",
 	    vmxctx->inst_fail_status));
 
 	/* No guest instruction was executed, so there is nothing to skip. */
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_VMX;
 	vmexit->u.vmx.status = vmxctx->inst_fail_status;
 	vmexit->u.vmx.inst_error = vmcs_instruction_error();
 	vmexit->u.vmx.exit_reason = ~0;
 	vmexit->u.vmx.exit_qualification = ~0;
 
 	switch (rc) {
 	case VMX_VMRESUME_ERROR:
 	case VMX_VMLAUNCH_ERROR:
 	case VMX_INVEPT_ERROR:
 		vmexit->u.vmx.inst_type = rc;
 		break;
 	default:
 		/* Name this function correctly in the diagnostic. */
 		panic("vmx_exit_inst_error: vmx_enter_guest returned %d", rc);
 	}
 }
 
 /*
  * When the NMI-exiting VM execution control is '1', an NMI that arrives
  * in non-root operation causes a VM-exit.  NMI blocking is in effect at
  * that point, so it is enough to vector to the host's NMI handler with a
  * software interrupt.
  *
  * This has to happen before maskable interrupts are enabled: otherwise
  * the "iret" issued by an interrupt handler would incorrectly clear NMI
  * blocking.
  */
 static __inline void
 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 {
 	uint32_t info;
 
 	KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
 
 	/* Only exception exits can carry an NMI. */
 	if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
 		return;
 
 	info = vmcs_read(VMCS_EXIT_INTR_INFO);
 	KASSERT((info & VMCS_INTR_VALID) != 0,
 	    ("VM exit interruption info invalid: %#x", info));
 
 	if ((info & VMCS_INTR_T_MASK) != VMCS_INTR_T_NMI)
 		return;
 
 	KASSERT((info & 0xff) == IDT_NMI,
 	    ("VM exit due to NMI has invalid vector: %#x", info));
 	VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
 	__asm __volatile("int $2");
 }
 
 /*
  * Run 'vcpu' in the guest, starting at 'startrip'.
  *
  * The guest is entered repeatedly for as long as each VM exit is handled
  * by vmx_exit_process().  The loop ends when an exit is left unhandled,
  * when a VMX instruction fails, or when a pending suspend, rendezvous or
  * yield request is noticed before entering the guest.  The details of
  * the final exit are left in the vcpu's 'struct vm_exit'.
  *
  * Always returns 0.
  */
 static int
 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
     void *rendezvous_cookie, void *suspend_cookie)
 {
 	int rc, handled, launched;
 	struct vmx *vmx;
 	struct vm *vm;
 	struct vmxctx *vmxctx;
 	struct vmcs *vmcs;
 	struct vm_exit *vmexit;
 	struct vlapic *vlapic;
 	uint64_t rip;
 	uint32_t exit_reason;
 
 	vmx = arg;
 	vm = vmx->vm;
 	vmcs = &vmx->vmcs[vcpu];
 	vmxctx = &vmx->ctx[vcpu];
 	vlapic = vm_lapic(vm, vcpu);
 	vmexit = vm_exitinfo(vm, vcpu);
 	launched = 0;
 
 	KASSERT(vmxctx->pmap == pmap,
 	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
 
 	/*
 	 * NOTE(review): presumably switches guest MSR state in; it is
 	 * paired with vmx_msr_guest_exit() at the bottom -- confirm against
 	 * the definitions, which are not in this part of the file.
 	 */
+	vmx_msr_guest_enter(vmx, vcpu);
+
 	VMPTRLD(vmcs);
 
 	/*
 	 * XXX
 	 * We do this every time because we may setup the virtual machine
 	 * from a different process than the one that actually runs it.
 	 *
 	 * If the life of a virtual machine was spent entirely in the context
 	 * of a single process we could do this once in vmx_vminit().
 	 */
 	vmcs_write(VMCS_HOST_CR3, rcr3());
 
 	vmcs_write(VMCS_GUEST_RIP, startrip);
 	vmx_set_pcpu_defaults(vmx, vcpu, pmap);
 	do {
 		handled = UNHANDLED;
 
 		/*
 		 * Interrupts are disabled from this point on until the
 		 * guest starts executing. This is done for the following
 		 * reasons:
 		 *
 		 * If an AST is asserted on this thread after the check below,
 		 * then the IPI_AST notification will not be lost, because it
 		 * will cause a VM exit due to external interrupt as soon as
 		 * the guest state is loaded.
 		 *
 		 * A posted interrupt after 'vmx_inject_interrupts()' will
 		 * not be "lost" because it will be held pending in the host
 		 * APIC because interrupts are disabled. The pending interrupt
 		 * will be recognized as soon as the guest state is loaded.
 		 *
 		 * The same reasoning applies to the IPI generated by
 		 * pmap_invalidate_ept().
 		 */
 		disable_intr();
 		vmx_inject_interrupts(vmx, vcpu, vlapic);
 
 		/*
 		 * Check for vcpu suspension after injecting events because
 		 * vmx_inject_interrupts() can suspend the vcpu due to a
 		 * triple fault.
 		 */
 		if (vcpu_suspended(suspend_cookie)) {
 			enable_intr();
 			vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
 			break;
 		}
 
 		if (vcpu_rendezvous_pending(rendezvous_cookie)) {
 			enable_intr();
 			vm_exit_rendezvous(vmx->vm, vcpu, vmcs_guest_rip());
 			break;
 		}
 
 		/*
 		 * Unlike the two cases above, a yield leaves the loop with
 		 * 'handled' set to HANDLED.
 		 */
 		if (vcpu_should_yield(vm, vcpu)) {
 			enable_intr();
 			vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());
 			vmx_astpending_trace(vmx, vcpu, vmexit->rip);
 			handled = HANDLED;
 			break;
 		}
 
 		vmx_run_trace(vmx, vcpu);
 		rc = vmx_enter_guest(vmxctx, vmx, launched);
 
 		/* Collect some information for VM exit processing */
 		vmexit->rip = rip = vmcs_guest_rip();
 		vmexit->inst_length = vmexit_instruction_length();
 		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
 		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
 
 		if (rc == VMX_GUEST_VMEXIT) {
 			/*
 			 * NMI handling has to run while interrupts are still
 			 * disabled; see vmx_exit_handle_nmi().
 			 */
 			vmx_exit_handle_nmi(vmx, vcpu, vmexit);
 			enable_intr();
 			handled = vmx_exit_process(vmx, vcpu, vmexit);
 		} else {
 			enable_intr();
 			vmx_exit_inst_error(vmxctx, rc, vmexit);
 		}
 		/*
 		 * NOTE(review): 'launched' is only consumed by
 		 * vmx_enter_guest(); presumably it selects VMRESUME over
 		 * VMLAUNCH for every entry after the first -- confirm there.
 		 */
 		launched = 1;
 		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
 	} while (handled);
 
 	/*
 	 * If a VM exit has been handled then the exitcode must be BOGUS
 	 * If a VM exit is not handled then the exitcode must not be BOGUS
 	 */
 	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
 	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
 		panic("Mismatch between handled (%d) and exitcode (%d)",
 		      handled, vmexit->exitcode);
 	}
 
 	if (!handled)
 		vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
 
 	VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
 	    vmexit->exitcode);
 
 	VMCLEAR(vmcs);
+	vmx_msr_guest_exit(vmx, vcpu);
+
 	return (0);
 }
 
 static void
 vmx_vmcleanup(void *arg)
 {
 	int i;
 	struct vmx *vmx = arg;
 
 	if (apic_access_virtualization(vmx, 0))
 		vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 
 	for (i = 0; i < VM_MAXCPU; i++)
 		vpid_free(vmx->state[i].vpid);
 
 	free(vmx, M_VMX);
 
 	return;
 }
 
 static register_t *
 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
 {
 
 	switch (reg) {
 	case VM_REG_GUEST_RAX:
 		return (&vmxctx->guest_rax);
 	case VM_REG_GUEST_RBX:
 		return (&vmxctx->guest_rbx);
 	case VM_REG_GUEST_RCX:
 		return (&vmxctx->guest_rcx);
 	case VM_REG_GUEST_RDX:
 		return (&vmxctx->guest_rdx);
 	case VM_REG_GUEST_RSI:
 		return (&vmxctx->guest_rsi);
 	case VM_REG_GUEST_RDI:
 		return (&vmxctx->guest_rdi);
 	case VM_REG_GUEST_RBP:
 		return (&vmxctx->guest_rbp);
 	case VM_REG_GUEST_R8:
 		return (&vmxctx->guest_r8);
 	case VM_REG_GUEST_R9:
 		return (&vmxctx->guest_r9);
 	case VM_REG_GUEST_R10:
 		return (&vmxctx->guest_r10);
 	case VM_REG_GUEST_R11:
 		return (&vmxctx->guest_r11);
 	case VM_REG_GUEST_R12:
 		return (&vmxctx->guest_r12);
 	case VM_REG_GUEST_R13:
 		return (&vmxctx->guest_r13);
 	case VM_REG_GUEST_R14:
 		return (&vmxctx->guest_r14);
 	case VM_REG_GUEST_R15:
 		return (&vmxctx->guest_r15);
 	case VM_REG_GUEST_CR2:
 		return (&vmxctx->guest_cr2);
 	default:
 		break;
 	}
 	return (NULL);
 }
 
 static int
 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
 {
 	register_t *regp;
 
 	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
 		*retval = *regp;
 		return (0);
 	} else
 		return (EINVAL);
 }
 
 static int
 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
 {
 	register_t *regp;
 
 	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
 		*regp = val;
 		return (0);
 	} else
 		return (EINVAL);
 }
 
 static int
+vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
+{
+	uint64_t gi;	/* may be unset if vmcs_getreg() fails */
+	int error;
+
+	error = vmcs_getreg(&vmx->vmcs[vcpu], running,
+	    VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
+	*retval = (error == 0 && (gi & HWINTR_BLOCKING) != 0) ? 1 : 0;
+	return (error);
+}
+
+static int
+vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
+{
+	struct vmcs *vmcs = &vmx->vmcs[vcpu];
+	const int ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
+	uint64_t gi;
+	int error;
+
+	/*
+	 * Only clearing the interrupt shadow is supported: a non-zero 'val'
+	 * is rejected with EINVAL without touching the VMCS.
+	 */
+	if (val != 0) {
+		error = EINVAL;
+	} else {
+		/* Read-modify-write; the write is skipped if the read fails. */
+		error = vmcs_getreg(vmcs, running, ident, &gi);
+		if (error == 0) {
+			gi &= ~HWINTR_BLOCKING;
+			error = vmcs_setreg(vmcs, running, ident, gi);
+		}
+	}
+
+	VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
+	    error ? "failed" : "succeeded");
+	return (error);
+}
+
+static int
 vmx_shadow_reg(int reg)
 {
 	int shreg;
 
 	shreg = -1;
 
 	switch (reg) {
 	case VM_REG_GUEST_CR0:
 		shreg = VMCS_CR0_SHADOW;
                 break;
         case VM_REG_GUEST_CR4:
 		shreg = VMCS_CR4_SHADOW;
 		break;
 	default:
 		break;
 	}
 
 	return (shreg);
 }
 
 static int
 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
 {
 	int running, hostcpu;
 	struct vmx *vmx = arg;
 
 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 	if (running && hostcpu != curcpu)
 		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
 
+	if (reg == VM_REG_GUEST_INTR_SHADOW)
+		return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
+
 	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
 		return (0);
 
 	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
 }
 
 static int
 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
 {
 	int error, hostcpu, running, shadow;
 	uint64_t ctls;
 	pmap_t pmap;
 	struct vmx *vmx = arg;
 
 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 	if (running && hostcpu != curcpu)
 		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	if (reg == VM_REG_GUEST_INTR_SHADOW)
+		return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
 
 	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
 		return (0);
 
 	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
 
 	if (error == 0) {
 		/*
 		 * If the "load EFER" VM-entry control is 1 then the
 		 * value of EFER.LMA must be identical to "IA-32e mode guest"
 		 * bit in the VM-entry control.
 		 */
 		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
 		    (reg == VM_REG_GUEST_EFER)) {
 			vmcs_getreg(&vmx->vmcs[vcpu], running,
 				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
 			if (val & EFER_LMA)
 				ctls |= VM_ENTRY_GUEST_LMA;
 			else
 				ctls &= ~VM_ENTRY_GUEST_LMA;
 			vmcs_setreg(&vmx->vmcs[vcpu], running,
 				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
 		}
 
 		shadow = vmx_shadow_reg(reg);
 		if (shadow > 0) {
 			/*
 			 * Store the unmodified value in the shadow
 			 */			
 			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
 				    VMCS_IDENT(shadow), val);
 		}
 
 		if (reg == VM_REG_GUEST_CR3) {
 			/*
 			 * Invalidate the guest vcpu's TLB mappings to emulate
 			 * the behavior of updating %cr3.
 			 *
 			 * XXX the processor retains global mappings when %cr3
 			 * is updated but vmx_invvpid() does not.
 			 */
 			pmap = vmx->ctx[vcpu].pmap;
 			vmx_invvpid(vmx, vcpu, pmap, running);
 		}
 	}
 
 	return (error);
 }
 
 static int
 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 {
 	int hostcpu, running;
 	struct vmx *vmx = arg;
 
 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 	if (running && hostcpu != curcpu)
 		panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
 
 	return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
 }
 
 static int
 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 {
 	int hostcpu, running;
 	struct vmx *vmx = arg;
 
 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 	if (running && hostcpu != curcpu)
 		panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
 
 	return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
 }
 
 static int
 vmx_getcap(void *arg, int vcpu, int type, int *retval)
 {
 	struct vmx *vmx = arg;
 	int vcap;
 	int ret;
 
 	ret = ENOENT;
 
 	vcap = vmx->cap[vcpu].set;
 
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		if (cap_halt_exit)
 			ret = 0;
 		break;
 	case VM_CAP_PAUSE_EXIT:
 		if (cap_pause_exit)
 			ret = 0;
 		break;
 	case VM_CAP_MTRAP_EXIT:
 		if (cap_monitor_trap)
 			ret = 0;
 		break;
 	case VM_CAP_UNRESTRICTED_GUEST:
 		if (cap_unrestricted_guest)
 			ret = 0;
 		break;
 	case VM_CAP_ENABLE_INVPCID:
 		if (cap_invpcid)
 			ret = 0;
 		break;
 	default:
 		break;
 	}
 
 	if (ret == 0)
 		*retval = (vcap & (1 << type)) ? 1 : 0;
 
 	return (ret);
 }
 
 static int
 vmx_setcap(void *arg, int vcpu, int type, int val)
 {
 	struct vmx *vmx = arg;
 	struct vmcs *vmcs = &vmx->vmcs[vcpu];
 	uint32_t baseval;
 	uint32_t *pptr;
 	int error;
 	int flag;
 	int reg;
 	int retval;
 
 	retval = ENOENT;
 	pptr = NULL;
 
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		if (cap_halt_exit) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls;
 			baseval = *pptr;
 			flag = PROCBASED_HLT_EXITING;
 			reg = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_MTRAP_EXIT:
 		if (cap_monitor_trap) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls;
 			baseval = *pptr;
 			flag = PROCBASED_MTF;
 			reg = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_PAUSE_EXIT:
 		if (cap_pause_exit) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls;
 			baseval = *pptr;
 			flag = PROCBASED_PAUSE_EXITING;
 			reg = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_UNRESTRICTED_GUEST:
 		if (cap_unrestricted_guest) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls2;
 			baseval = *pptr;
 			flag = PROCBASED2_UNRESTRICTED_GUEST;
 			reg = VMCS_SEC_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_ENABLE_INVPCID:
 		if (cap_invpcid) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls2;
 			baseval = *pptr;
 			flag = PROCBASED2_ENABLE_INVPCID;
 			reg = VMCS_SEC_PROC_BASED_CTLS;
 		}
 		break;
 	default:
 		break;
 	}
 
 	if (retval == 0) {
 		if (val) {
 			baseval |= flag;
 		} else {
 			baseval &= ~flag;
 		}
 		VMPTRLD(vmcs);
 		error = vmwrite(reg, baseval);
 		VMCLEAR(vmcs);
 
 		if (error) {
 			retval = error;
 		} else {
 			/*
 			 * Update optional stored flags, and record
 			 * setting
 			 */
 			if (pptr != NULL) {
 				*pptr = baseval;
 			}
 
 			if (val) {
 				vmx->cap[vcpu].set |= (1 << type);
 			} else {
 				vmx->cap[vcpu].set &= ~(1 << type);
 			}
 		}
 	}
 
         return (retval);
 }
 
 struct vlapic_vtx {
 	struct vlapic	vlapic;
 	struct pir_desc	*pir_desc;
 	struct vmx	*vmx;
 };
 
 #define	VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)	\
 do {									\
 	VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",	\
 	    level ? "level" : "edge", vector);				\
 	VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);	\
 	VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);	\
 	VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);	\
 	VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);	\
 	VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
 } while (0)
 
 /*
  * vlapic->ops handlers that utilize the APICv hardware assist described in
  * Chapter 29 of the Intel SDM.
  */
 static int
 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct pir_desc *pir_desc;
 	uint64_t mask;
 	int idx, notify;
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	pir_desc = vlapic_vtx->pir_desc;
 
 	/*
 	 * Keep track of interrupt requests in the PIR descriptor. This is
 	 * because the virtual APIC page pointed to by the VMCS cannot be
 	 * modified if the vcpu is running.
 	 */
 	idx = vector / 64;
 	mask = 1UL << (vector % 64);
 	atomic_set_long(&pir_desc->pir[idx], mask);
 	notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
 
 	VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
 	    level, "vmx_set_intr_ready");
 	return (notify);
 }
 
 static int
 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct pir_desc *pir_desc;
 	struct LAPIC *lapic;
 	uint64_t pending, pirval;
 	uint32_t ppr, vpr;
 	int i;
 
 	/*
 	 * This function is only expected to be called from the 'HLT' exit
 	 * handler which does not care about the vector that is pending.
 	 */
 	KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	pir_desc = vlapic_vtx->pir_desc;
 
 	pending = atomic_load_acq_long(&pir_desc->pending);
 	if (!pending)
 		return (0);	/* common case */
 
 	/*
 	 * If there is an interrupt pending then it will be recognized only
 	 * if its priority is greater than the processor priority.
 	 *
 	 * Special case: if the processor priority is zero then any pending
 	 * interrupt will be recognized.
 	 */
 	lapic = vlapic->apic_page;
 	ppr = lapic->ppr & 0xf0;
 	if (ppr == 0)
 		return (1);
 
 	VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
 	    lapic->ppr);
 
 	for (i = 3; i >= 0; i--) {
 		pirval = pir_desc->pir[i];
 		if (pirval != 0) {
 			vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
 			return (vpr > ppr);
 		}
 	}
 	return (0);
 }
 
 static void
 vmx_intr_accepted(struct vlapic *vlapic, int vector)
 {
 
 	panic("vmx_intr_accepted: not expected to be called");
 }
 
 static void
 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct vmx *vmx;
 	struct vmcs *vmcs;
 	uint64_t mask, val;
 
 	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
 	KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
 	    ("vmx_set_tmr: vcpu cannot be running"));
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	vmx = vlapic_vtx->vmx;
 	vmcs = &vmx->vmcs[vlapic->vcpuid];
 	mask = 1UL << (vector % 64);
 
 	VMPTRLD(vmcs);
 	val = vmcs_read(VMCS_EOI_EXIT(vector));
 	if (level)
 		val |= mask;
 	else
 		val &= ~mask;
 	vmcs_write(VMCS_EOI_EXIT(vector), val);
 	VMCLEAR(vmcs);
 }
 
 static void
 vmx_enable_x2apic_mode(struct vlapic *vlapic)
 {
 	struct vmx *vmx;
 	struct vmcs *vmcs;
 	uint32_t proc_ctls2;
 	int vcpuid, error;
 
 	vcpuid = vlapic->vcpuid;
 	vmx = ((struct vlapic_vtx *)vlapic)->vmx;
 	vmcs = &vmx->vmcs[vcpuid];
 
 	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
 	KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
 	    ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
 
 	proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
 	proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
 	vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
 
 	VMPTRLD(vmcs);
 	vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
 	VMCLEAR(vmcs);
 
 	if (vlapic->vcpuid == 0) {
 		/*
 		 * The nested page table mappings are shared by all vcpus
 		 * so unmap the APIC access page just once.
 		 */
 		error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 		KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
 		    __func__, error));
 
 		/*
 		 * The MSR bitmap is shared by all vcpus so modify it only
 		 * once in the context of vcpu 0.
 		 */
 		error = vmx_allow_x2apic_msrs(vmx);
 		KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
 		    __func__, error));
 	}
 }
 
 static void
 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
 {
 
 	ipi_cpu(hostcpu, pirvec);
 }
 
 /*
  * Transfer the pending interrupts in the PIR descriptor to the IRR
  * in the virtual APIC page.
  */
 static void
 vmx_inject_pir(struct vlapic *vlapic)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct pir_desc *pir_desc;
 	struct LAPIC *lapic;
 	uint64_t val, pirval;
 	int rvi, pirbase = -1;
 	uint16_t intr_status_old, intr_status_new;
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	pir_desc = vlapic_vtx->pir_desc;
 	if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
 		VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 		    "no posted interrupt pending");
 		return;
 	}
 
 	pirval = 0;
 	pirbase = -1;
 	lapic = vlapic->apic_page;
 
 	val = atomic_readandclear_long(&pir_desc->pir[0]);
 	if (val != 0) {
 		lapic->irr0 |= val;
 		lapic->irr1 |= val >> 32;
 		pirbase = 0;
 		pirval = val;
 	}
 
 	val = atomic_readandclear_long(&pir_desc->pir[1]);
 	if (val != 0) {
 		lapic->irr2 |= val;
 		lapic->irr3 |= val >> 32;
 		pirbase = 64;
 		pirval = val;
 	}
 
 	val = atomic_readandclear_long(&pir_desc->pir[2]);
 	if (val != 0) {
 		lapic->irr4 |= val;
 		lapic->irr5 |= val >> 32;
 		pirbase = 128;
 		pirval = val;
 	}
 
 	val = atomic_readandclear_long(&pir_desc->pir[3]);
 	if (val != 0) {
 		lapic->irr6 |= val;
 		lapic->irr7 |= val >> 32;
 		pirbase = 192;
 		pirval = val;
 	}
 
 	VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
 
 	/*
 	 * Update RVI so the processor can evaluate pending virtual
 	 * interrupts on VM-entry.
 	 *
 	 * It is possible for pirval to be 0 here, even though the
 	 * pending bit has been set. The scenario is:
 	 * CPU-Y is sending a posted interrupt to CPU-X, which
 	 * is running a guest and processing posted interrupts in h/w.
 	 * CPU-X will eventually exit and the state seen in s/w is
 	 * the pending bit set, but no PIR bits set.
 	 *
 	 *      CPU-X                      CPU-Y
 	 *   (vm running)                (host running)
 	 *   rx posted interrupt
 	 *   CLEAR pending bit
 	 *				 SET PIR bit
 	 *   READ/CLEAR PIR bits
 	 *				 SET pending bit
 	 *   (vm exit)
 	 *   pending bit set, PIR 0
 	 */
 	if (pirval != 0) {
 		rvi = pirbase + flsl(pirval) - 1;
 		intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
 		intr_status_new = (intr_status_old & 0xFF00) | rvi;
 		if (intr_status_new > intr_status_old) {
 			vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
 			VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 			    "guest_intr_status changed from 0x%04x to 0x%04x",
 			    intr_status_old, intr_status_new);
 		}
 	}
 }
 
 static struct vlapic *
 vmx_vlapic_init(void *arg, int vcpuid)
 {
 	struct vmx *vmx;
 	struct vlapic *vlapic;
 	struct vlapic_vtx *vlapic_vtx;
 	
 	vmx = arg;
 
 	vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
 	vlapic->vm = vmx->vm;
 	vlapic->vcpuid = vcpuid;
 	vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
 	vlapic_vtx->vmx = vmx;
 
 	if (virtual_interrupt_delivery) {
 		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
 		vlapic->ops.pending_intr = vmx_pending_intr;
 		vlapic->ops.intr_accepted = vmx_intr_accepted;
 		vlapic->ops.set_tmr = vmx_set_tmr;
 		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
 	}
 
 	if (posted_interrupts)
 		vlapic->ops.post_intr = vmx_post_intr;
 
 	vlapic_init(vlapic);
 
 	return (vlapic);
 }
 
 static void
 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
 {
 
 	vlapic_cleanup(vlapic);
 	free(vlapic, M_VLAPIC);
 }
 
 struct vmm_ops vmm_ops_intel = {
 	vmx_init,
 	vmx_cleanup,
 	vmx_restore,
 	vmx_vminit,
 	vmx_run,
 	vmx_vmcleanup,
 	vmx_getreg,
 	vmx_setreg,
 	vmx_getdesc,
 	vmx_setdesc,
 	vmx_getcap,
 	vmx_setcap,
 	ept_vmspace_alloc,
 	ept_vmspace_free,
 	vmx_vlapic_init,
 	vmx_vlapic_cleanup,
 };
Index: stable/10/sys/amd64/vmm/intel/vmx.h
===================================================================
--- stable/10/sys/amd64/vmm/intel/vmx.h	(revision 276348)
+++ stable/10/sys/amd64/vmm/intel/vmx.h	(revision 276349)
@@ -1,131 +1,138 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _VMX_H_
 #define	_VMX_H_
 
 #include "vmcs.h"
 
 struct pmap;
 
-#define	GUEST_MSR_MAX_ENTRIES	64		/* arbitrary */
-
 struct vmxctx {
 	register_t	guest_rdi;		/* Guest state */
 	register_t	guest_rsi;
 	register_t	guest_rdx;
 	register_t	guest_rcx;
 	register_t	guest_r8;
 	register_t	guest_r9;
 	register_t	guest_rax;
 	register_t	guest_rbx;
 	register_t	guest_rbp;
 	register_t	guest_r10;
 	register_t	guest_r11;
 	register_t	guest_r12;
 	register_t	guest_r13;
 	register_t	guest_r14;
 	register_t	guest_r15;
 	register_t	guest_cr2;
 
 	register_t	host_r15;		/* Host state */
 	register_t	host_r14;
 	register_t	host_r13;
 	register_t	host_r12;
 	register_t	host_rbp;
 	register_t	host_rsp;
 	register_t	host_rbx;
 	/*
 	 * XXX todo debug registers and fpu state
 	 */
 
 	int		inst_fail_status;
 
 	/*
 	 * The pmap needs to be deactivated in vmx_enter_guest()
 	 * so keep a copy of the 'pmap' in each vmxctx.
 	 */
 	struct pmap	*pmap;
 };
 
 struct vmxcap {
 	int	set;
 	uint32_t proc_ctls;
 	uint32_t proc_ctls2;
 };
 
 struct vmxstate {
 	int	lastcpu;	/* host cpu that this 'vcpu' last ran on */
 	uint16_t vpid;
 };
 
 struct apic_page {
 	uint32_t reg[PAGE_SIZE / 4];
 };
 CTASSERT(sizeof(struct apic_page) == PAGE_SIZE);
 
 /* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */
 struct pir_desc {
 	uint64_t	pir[4];
 	uint64_t	pending;
 	uint64_t	unused[3];
 } __aligned(64);
 CTASSERT(sizeof(struct pir_desc) == 64);
 
+/* Index into the 'guest_msrs[]' array */
+enum {
+	IDX_MSR_LSTAR,
+	IDX_MSR_CSTAR,
+	IDX_MSR_STAR,
+	IDX_MSR_SF_MASK,
+	IDX_MSR_KGSBASE,
+	GUEST_MSR_NUM		/* must be the last enumeration */
+};
+
 /* virtual machine softc */
 struct vmx {
 	struct vmcs	vmcs[VM_MAXCPU];	/* one vmcs per virtual cpu */
 	struct apic_page apic_page[VM_MAXCPU];	/* one apic page per vcpu */
 	char		msr_bitmap[PAGE_SIZE];
 	struct pir_desc	pir_desc[VM_MAXCPU];
-	struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+	uint64_t	guest_msrs[VM_MAXCPU][GUEST_MSR_NUM];
 	struct vmxctx	ctx[VM_MAXCPU];
 	struct vmxcap	cap[VM_MAXCPU];
 	struct vmxstate	state[VM_MAXCPU];
 	uint64_t	eptp;
 	struct vm	*vm;
 	long		eptgen[MAXCPU];		/* cached pmap->pm_eptgen */
 };
 CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
 CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
-CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
 CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0);
 
 #define	VMX_GUEST_VMEXIT	0
 #define	VMX_VMRESUME_ERROR	1
 #define	VMX_VMLAUNCH_ERROR	2
 #define	VMX_INVEPT_ERROR	3
 int	vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched);
 void	vmx_call_isr(uintptr_t entry);
 
 u_long	vmx_fix_cr0(u_long cr0);
 u_long	vmx_fix_cr4(u_long cr4);
 
 extern char	vmx_exit_guest[];
 
 #endif
Index: stable/10/sys/amd64/vmm/intel/vmx_msr.c
===================================================================
--- stable/10/sys/amd64/vmm/intel/vmx_msr.c	(revision 276348)
+++ stable/10/sys/amd64/vmm/intel/vmx_msr.c	(revision 276349)
@@ -1,173 +1,388 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/cpuset.h>
 
+#include <machine/clock.h>
 #include <machine/cpufunc.h>
+#include <machine/md_var.h>
 #include <machine/specialreg.h>
+#include <machine/vmm.h>
 
+#include "vmx.h"
 #include "vmx_msr.h"
 
 static boolean_t
 vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
 {
 
 	if (msr_val & (1UL << (bitpos + 32)))
 		return (TRUE);
 	else
 		return (FALSE);
 }
 
 static boolean_t
 vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
 {
 
 	if ((msr_val & (1UL << bitpos)) == 0)
 		return (TRUE);
 	else
 		return (FALSE);
 }
 
 uint32_t
 vmx_revision(void)
 {
 
 	return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
 }
 
 /*
  * Generate a bitmask to be used for the VMCS execution control fields.
  *
  * The caller specifies what bits should be set to one in 'ones_mask'
  * and what bits should be set to zero in 'zeros_mask'. The don't-care
  * bits are set to the default value. The default values are obtained
  * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
  * VMX Capabilities".
  *
  * Returns zero on success and non-zero on error.
  */
 int
 vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
 	       uint32_t zeros_mask, uint32_t *retval)
 {
 	int i;
 	uint64_t val, trueval;
 	boolean_t true_ctls_avail, one_allowed, zero_allowed;
 
 	/* We cannot ask the same bit to be set to both '1' and '0' */
 	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
 		return (EINVAL);
 
 	if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
 		true_ctls_avail = TRUE;
 	else
 		true_ctls_avail = FALSE;
 
 	val = rdmsr(ctl_reg);
 	if (true_ctls_avail)
 		trueval = rdmsr(true_ctl_reg);		/* step c */
 	else
 		trueval = val;				/* step a */
 
 	for (i = 0; i < 32; i++) {
 		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
 		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
 
 		KASSERT(one_allowed || zero_allowed,
 			("invalid zero/one setting for bit %d of ctl 0x%0x, "
 			 "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
 
 		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
 			if (ones_mask & (1 << i))
 				return (EINVAL);
 			*retval &= ~(1 << i);
 		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
 			if (zeros_mask & (1 << i))
 				return (EINVAL);
 			*retval |= 1 << i;
 		} else {
 			if (zeros_mask & (1 << i))	/* b(ii),c(ii) */
 				*retval &= ~(1 << i);
 			else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
 				*retval |= 1 << i;
 			else if (!true_ctls_avail)
 				*retval &= ~(1 << i);	/* b(iii) */
 			else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
 				*retval &= ~(1 << i);
 			else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
 				*retval |= 1 << i;
 			else {
 				panic("vmx_set_ctlreg: unable to determine "
 				      "correct value of ctl bit %d for msr "
 				      "0x%0x and true msr 0x%0x", i, ctl_reg,
 				      true_ctl_reg);
 			}
 		}
 	}
 
 	return (0);
 }
 
 void
 msr_bitmap_initialize(char *bitmap)
 {
 
 	memset(bitmap, 0xff, PAGE_SIZE);
 }
 
 int
 msr_bitmap_change_access(char *bitmap, u_int msr, int access)
 {
 	int byte, bit;
 
 	if (msr <= 0x00001FFF)
 		byte = msr / 8;
 	else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
 		byte = 1024 + (msr - 0xC0000000) / 8;
 	else
 		return (EINVAL);
 
 	bit = msr & 0x7;
 
 	if (access & MSR_BITMAP_ACCESS_READ)
 		bitmap[byte] &= ~(1 << bit);
 	else
 		bitmap[byte] |= 1 << bit;
 
 	byte += 2048;
 	if (access & MSR_BITMAP_ACCESS_WRITE)
 		bitmap[byte] &= ~(1 << bit);
 	else
 		bitmap[byte] |= 1 << bit;
 
 	return (0);
+}
+
+static uint64_t misc_enable;
+static uint64_t platform_info;
+static uint64_t turbo_ratio_limit;
+static uint64_t host_msrs[GUEST_MSR_NUM];
+
+static bool
+nehalem_cpu(void)
+{
+	u_int model;
+
+	/*
+	 * Report whether the host 'cpu_id' is family 6 with one of the
+	 * Nehalem model numbers listed below. These family:model numbers
+	 * are documented in Section 35.5, Intel SDM dated Feb 2014.
+	 */
+	if (CPUID_TO_FAMILY(cpu_id) != 0x6)
+		return (false);
+
+	model = CPUID_TO_MODEL(cpu_id);
+	switch (model) {
+	case 0x1A:
+	case 0x1E:
+	case 0x1F:
+	case 0x2E:
+		return (true);
+	default:
+		return (false);
+	}
+}
+
+static bool
+westmere_cpu(void)
+{
+	u_int model;
+
+	/*
+	 * Report whether the host 'cpu_id' is family 6 with one of the
+	 * Westmere model numbers listed below. These family:model numbers
+	 * are documented in Section 35.6, Intel SDM dated Feb 2014.
+	 */
+	if (CPUID_TO_FAMILY(cpu_id) != 0x6)
+		return (false);
+
+	model = CPUID_TO_MODEL(cpu_id);
+	switch (model) {
+	case 0x25:
+	case 0x2C:
+		return (true);
+	default:
+		return (false);
+	}
+}
+
+void
+vmx_msr_init(void)
+{
+	uint64_t bus_freq, ratio;
+	int i;
+
+	/*
+	 * Cache the host values restored by vmx_msr_guest_exit(). This is
+	 * safe: they don't change based on curcpu, curproc or curthread.
+	 */
+	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
+	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
+	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
+	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
+
+	/*
+	 * Initialize emulated MSRs
+	 */
+	misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
+	/*
+	 * Set mandatory bits
+	 *  11:   branch trace disabled
+	 *  12:   PEBS unavailable
+	 * Clear unsupported features
+	 *  16:   SpeedStep enable
+	 *  18:   enable MONITOR FSM
+	 */
+	misc_enable |= (1 << 12) | (1 << 11);
+	misc_enable &= ~((1 << 18) | (1 << 16));
+
+	if (nehalem_cpu() || westmere_cpu())
+		bus_freq = 133330000;		/* 133.33 MHz */
+	else
+		bus_freq = 100000000;		/* 100 MHz */
+
+	/*
+	 * XXXtime
+	 * The ratio should really be based on the virtual TSC frequency as
+	 * opposed to the host TSC.
+	 */
+	ratio = (tsc_freq / bus_freq) & 0xff;
+
+	/*
+	 * The register definition is based on the micro-architecture
+	 * but the following bits are always the same:
+	 * [15:8]  Maximum Non-Turbo Ratio
+	 * [28]    Programmable Ratio Limit for Turbo Mode
+	 * [29]    Programmable TDC-TDP Limit for Turbo Mode
+	 * [47:40] Maximum Efficiency Ratio
+	 *
+	 * The other bits can be safely set to 0 on all
+	 * micro-architectures up to Haswell.
+	 */
+	platform_info = (ratio << 8) | (ratio << 40);
+
+	/*
+	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
+	 * dependent on the maximum cores per package supported by the micro-
+	 * architecture. For example, Westmere supports 6 cores per package
+	 * and uses the low 48 bits. Sandy Bridge supports 8 cores per package
+	 * and uses up all 64 bits.
+	 *
+	 * However, the unused bits are reserved so we pretend that all bits
+	 * in this MSR are valid and store 'ratio' in each of its 8 bytes.
+	 */
+	for (i = 0; i < 8; i++)
+		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
+}
+
+void
+vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
+{
+	/*
+	 * The permissions bitmap is shared between all vcpus so initialize it
+	 * once when initializing the vBSP.
+	 */
+	if (vcpuid == 0) {
+		guest_msr_rw(vmx, MSR_LSTAR);
+		guest_msr_rw(vmx, MSR_CSTAR);
+		guest_msr_rw(vmx, MSR_STAR);
+		guest_msr_rw(vmx, MSR_SF_MASK);
+		guest_msr_rw(vmx, MSR_KGSBASE);
+	}
+	return;
+}
+
+void
+vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
+{
+	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
+
+	/* Load the vcpu's saved MSR values; no host MSR is saved here */
+	wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
+	wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
+	wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
+	wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
+	wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
+}
+
+void
+vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
+{
+	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
+
+	/* Save guest MSRs */
+	guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
+	guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
+	guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
+	guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
+	guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);
+
+	/* Restore host MSRs */
+	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
+	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
+	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
+	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
+
+	/* MSR_KGSBASE will be restored on the way back to userspace */
+}
+
+int
+vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
+{
+	uint64_t result;
+
+	switch (num) {
+	case MSR_IA32_MISC_ENABLE:
+		result = misc_enable;
+		break;
+	case MSR_PLATFORM_INFO:
+		result = platform_info;
+		break;
+	case MSR_TURBO_RATIO_LIMIT:
+	case MSR_TURBO_RATIO_LIMIT1:
+		result = turbo_ratio_limit;
+		break;
+	default:
+		return (EINVAL);	/* not emulated here; '*val' untouched */
+	}
+	*val = result;
+	return (0);
+}
+
+int
+vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
+{
+	int error = 0;
+
+	switch (num) {
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return (error);
 }
Index: stable/10/sys/amd64/vmm/intel/vmx_msr.h
===================================================================
--- stable/10/sys/amd64/vmm/intel/vmx_msr.h	(revision 276348)
+++ stable/10/sys/amd64/vmm/intel/vmx_msr.h	(revision 276349)
@@ -1,55 +1,70 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _VMX_MSR_H_
 #define	_VMX_MSR_H_
 
+struct vmx;
+
+void vmx_msr_init(void);
+void vmx_msr_guest_init(struct vmx *vmx, int vcpuid);
+void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid);
+void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid);
+int vmx_rdmsr(struct vmx *, int vcpuid, u_int num, uint64_t *val, bool *retu);
+int vmx_wrmsr(struct vmx *, int vcpuid, u_int num, uint64_t val, bool *retu);
+
 uint32_t vmx_revision(void);
 
 int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
 		   uint32_t zeros_mask, uint32_t *retval);
 
 /*
  * According to Section 21.10.4 "Software Access to Related Structures",
  * changes to data structures pointed to by the VMCS must be made only when
  * there is no logical processor with a current VMCS that points to the
  * data structure.
  *
  * This pretty much limits us to configuring the MSR bitmap before VMCS
  * initialization for SMP VMs. Unless of course we do it the hard way - which
  * would involve some form of synchronization between the vcpus to vmclear
  * all VMCSs' that point to the bitmap.
  */
 #define	MSR_BITMAP_ACCESS_NONE	0x0
 #define	MSR_BITMAP_ACCESS_READ	0x1
 #define	MSR_BITMAP_ACCESS_WRITE	0x2
 #define	MSR_BITMAP_ACCESS_RW	(MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
 void	msr_bitmap_initialize(char *bitmap);
 int	msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+
+#define	guest_msr_rw(vmx, msr) \
+    msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define	guest_msr_ro(vmx, msr) \
+    msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
 
 #endif
Index: stable/10/sys/amd64/vmm/io/vatpic.c
===================================================================
--- stable/10/sys/amd64/vmm/io/vatpic.c	(revision 276348)
+++ stable/10/sys/amd64/vmm/io/vatpic.c	(revision 276349)
@@ -1,736 +1,742 @@
 /*-
  * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/queue.h>
 #include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/systm.h>
 
 #include <x86/apicreg.h>
 #include <dev/ic/i8259.h>
 
 #include <machine/vmm.h>
 
 #include "vmm_ktr.h"
 #include "vmm_lapic.h"
 #include "vioapic.h"
 #include "vatpic.h"
 
 static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)");
 
 #define	VATPIC_LOCK(vatpic)		mtx_lock_spin(&((vatpic)->mtx))
 #define	VATPIC_UNLOCK(vatpic)		mtx_unlock_spin(&((vatpic)->mtx))
 #define	VATPIC_LOCKED(vatpic)		mtx_owned(&((vatpic)->mtx))
 
 enum irqstate {
 	IRQSTATE_ASSERT,
 	IRQSTATE_DEASSERT,
 	IRQSTATE_PULSE
 };
 
 struct atpic {
 	bool		ready;
 	int		icw_num;
 	int		rd_cmd_reg;
 
 	bool		aeoi;
 	bool		poll;
 	bool		rotate;
 	bool		sfn;		/* special fully-nested mode */
 
 	int		irq_base;
 	uint8_t		request;	/* Interrupt Request Register (IIR) */
 	uint8_t		service;	/* Interrupt Service (ISR) */
 	uint8_t		mask;		/* Interrupt Mask Register (IMR) */
 
 	int		acnt[8];	/* sum of pin asserts and deasserts */
 	int		priority;	/* current pin priority */
 
 	bool		intr_raised;
 };
 
 struct vatpic {
 	struct vm	*vm;
 	struct mtx	mtx;
 	struct atpic	atpic[2];
 	uint8_t		elc[2];
 };
 
 #define	VATPIC_CTR0(vatpic, fmt)					\
 	VM_CTR0((vatpic)->vm, fmt)
 
 #define	VATPIC_CTR1(vatpic, fmt, a1)					\
 	VM_CTR1((vatpic)->vm, fmt, a1)
 
 #define	VATPIC_CTR2(vatpic, fmt, a1, a2)				\
 	VM_CTR2((vatpic)->vm, fmt, a1, a2)
 
 #define	VATPIC_CTR3(vatpic, fmt, a1, a2, a3)				\
 	VM_CTR3((vatpic)->vm, fmt, a1, a2, a3)
 
 #define	VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4)			\
 	VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4)
 
 static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate);
 
 static __inline int
 vatpic_get_highest_isrpin(struct atpic *atpic)
 {
 	int bit, pin;
 	int i;
 
 	for (i = 0; i <= 7; i++) {
 		pin = ((i + 7 - atpic->priority) & 0x7);
                 bit = (1 << pin);
 
 		if (atpic->service & bit)
 			return (pin);
 	}
 
 	return (-1);
 }
 
 static __inline int
 vatpic_get_highest_irrpin(struct atpic *atpic)
 {
 	int serviced;
 	int bit, pin;
 	int i, j;
 
 	/*
 	 * In 'Special Fully-Nested Mode' when an interrupt request from
 	 * a slave is in service, the slave is not locked out from the
 	 * master's priority logic.
 	 */
 	serviced = atpic->service;
 	if (atpic->sfn)
 		serviced &= ~(1 << 2);
 
 	for (i = 0; i <= 7; i++) {
 		pin = ((i + 7 - atpic->priority) & 0x7);
 		bit = (1 << pin);
 		if (serviced & bit)
 			break;
 	}
 
 	for (j = 0; j < i; j++) {
 		pin = ((j + 7 - atpic->priority) & 0x7);
 		bit = (1 << pin);
 		if (atpic->request & bit && (~atpic->mask & bit))
 			return (pin);
 	}
 
 	return (-1);
 }
 
 static void
 vatpic_notify_intr(struct vatpic *vatpic)
 {
 	struct atpic *atpic;
 	int pin;
 
 	KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked"));
 
 	/*
 	 * First check the slave.
 	 */
 	atpic = &vatpic->atpic[1];
 	if (!atpic->intr_raised &&
 	    (pin = vatpic_get_highest_irrpin(atpic)) != -1) {
 		VATPIC_CTR4(vatpic, "atpic slave notify pin = %d "
 		    "(imr 0x%x irr 0x%x isr 0x%x)", pin,
 		    atpic->mask, atpic->request, atpic->service);
 
 		/*
 		 * Cascade the request from the slave to the master.
 		 */
 		atpic->intr_raised = true;
 		vatpic_set_pinstate(vatpic, 2, true);
 		vatpic_set_pinstate(vatpic, 2, false);
 	} else {
 		VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts "
 		    "(imr 0x%x irr 0x%x isr 0x%x)",
 		    atpic->mask, atpic->request, atpic->service);
 	}
 
 	/*
 	 * Then check the master.
 	 */
 	atpic = &vatpic->atpic[0];
 	if (!atpic->intr_raised &&
 	    (pin = vatpic_get_highest_irrpin(atpic)) != -1) {
 		VATPIC_CTR4(vatpic, "atpic master notify pin = %d "
 		    "(imr 0x%x irr 0x%x isr 0x%x)", pin,
 		    atpic->mask, atpic->request, atpic->service);
 
 		/*
 		 * From Section 3.6.2, "Interrupt Modes", in the
 		 * MPtable Specification, Version 1.4
 		 *
 		 * PIC interrupts are routed to both the Local APIC
 		 * and the I/O APIC to support operation in 1 of 3
 		 * modes.
 		 *
 		 * 1. Legacy PIC Mode: the PIC effectively bypasses
 		 * all APIC components.  In this mode the local APIC is
 		 * disabled and LINT0 is reconfigured as INTR to
 		 * deliver the PIC interrupt directly to the CPU.
 		 *
 		 * 2. Virtual Wire Mode: the APIC is treated as a
 		 * virtual wire which delivers interrupts from the PIC
 		 * to the CPU.  In this mode LINT0 is programmed as
 		 * ExtINT to indicate that the PIC is the source of
 		 * the interrupt.
 		 *
 		 * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are
 		 * fielded by the I/O APIC and delivered to the appropriate
 		 * CPU.  In this mode the I/O APIC input 0 is programmed
 		 * as ExtINT to indicate that the PIC is the source of the
 		 * interrupt.
 		 */
 		atpic->intr_raised = true;
 		lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0);
 		vioapic_pulse_irq(vatpic->vm, 0);
 	} else {
 		VATPIC_CTR3(vatpic, "atpic master no eligible interrupts "
 		    "(imr 0x%x irr 0x%x isr 0x%x)",
 		    atpic->mask, atpic->request, atpic->service);
 	}
 }
 
 static int
 vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
 {
 	VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val);
 
 	atpic->ready = false;
 
 	atpic->icw_num = 1;
 	atpic->mask = 0;
 	atpic->priority = 0;
 	atpic->rd_cmd_reg = 0;
 
 	if ((val & ICW1_SNGL) != 0) {
 		VATPIC_CTR0(vatpic, "vatpic cascade mode required");
 		return (-1);
 	}
 
 	if ((val & ICW1_IC4) == 0) {
 		VATPIC_CTR0(vatpic, "vatpic icw4 required");
 		return (-1);
 	}
 
 	atpic->icw_num++;
 
 	return (0);
 }
 
 static int
 vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
 {
 	VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val);
 
 	atpic->irq_base = val & 0xf8;
 
 	atpic->icw_num++;
 
 	return (0);
 }
 
 static int
 vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
 {
 	VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val);
 
 	atpic->icw_num++;
 
 	return (0);
 }
 
 static int
 vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
 {
 	VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val);
 
 	if ((val & ICW4_8086) == 0) {
 		VATPIC_CTR0(vatpic, "vatpic microprocessor mode required");
 		return (-1);
 	}
 
 	if ((val & ICW4_AEOI) != 0)
 		atpic->aeoi = true;
 
 	atpic->icw_num = 0;
 	atpic->ready = true;
 
 	return (0);
 }
 
 static int
 vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
 {
 	VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val);
 
 	atpic->mask = val & 0xff;
 
 	return (0);
 }
 
 static int
 vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
 {
 	VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val);
 
 	atpic->rotate = ((val & OCW2_R) != 0);
 
 	if ((val & OCW2_EOI) != 0) {
 		int isr_bit;
 
 		if ((val & OCW2_SL) != 0) {
 			/* specific EOI */
 			isr_bit = val & 0x7;
 		} else {
 			/* non-specific EOI */
 			isr_bit = vatpic_get_highest_isrpin(atpic);
 		}
 
 		if (isr_bit != -1) {
 			atpic->service &= ~(1 << isr_bit);
 
 			if (atpic->rotate)
 				atpic->priority = isr_bit;
 		}
 	} else if ((val & OCW2_SL) != 0 && atpic->rotate == true) {
 		/* specific priority */
 		atpic->priority = val & 0x7;
 	}
 
 	return (0);
 }
 
 static int
 vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
 {
 	VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val);
 
 	atpic->poll = ((val & OCW3_P) != 0);
 
 	if (val & OCW3_RR) {
 		/* read register command */
 		atpic->rd_cmd_reg = val & OCW3_RIS;
 	}
 
 	return (0);
 }
 
 static void
 vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate)
 {
 	struct atpic *atpic;
 	int oldcnt, newcnt;
 	bool level;
 
 	KASSERT(pin >= 0 && pin < 16,
 	    ("vatpic_set_pinstate: invalid pin number %d", pin));
 	KASSERT(VATPIC_LOCKED(vatpic),
 	    ("vatpic_set_pinstate: vatpic is not locked"));
 
 	atpic = &vatpic->atpic[pin >> 3];
 
 	oldcnt = atpic->acnt[pin & 0x7];
 	if (newstate)
 		atpic->acnt[pin & 0x7]++;
 	else
 		atpic->acnt[pin & 0x7]--;
 	newcnt = atpic->acnt[pin & 0x7];
 
 	if (newcnt < 0) {
 		VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt);
 	}
 
 	level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0);
 
 	if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) {
 		/* rising edge or level */
 		VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin);
 		atpic->request |= (1 << (pin & 0x7));
 	} else if (oldcnt == 1 && newcnt == 0) {
 		/* falling edge */
 		VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin);
 	} else {
 		VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d",
 		    pin, newstate ? "asserted" : "deasserted", newcnt);
 	}
 
 	vatpic_notify_intr(vatpic);
 }
 
 static int
 vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate)
 {
 	struct vatpic *vatpic;
 	struct atpic *atpic;
 
 	if (irq < 0 || irq > 15)
 		return (EINVAL);
 
 	vatpic = vm_atpic(vm);
 	atpic = &vatpic->atpic[irq >> 3];
 
 	if (atpic->ready == false)
 		return (0);
 
 	VATPIC_LOCK(vatpic);
 	switch (irqstate) {
 	case IRQSTATE_ASSERT:
 		vatpic_set_pinstate(vatpic, irq, true);
 		break;
 	case IRQSTATE_DEASSERT:
 		vatpic_set_pinstate(vatpic, irq, false);
 		break;
 	case IRQSTATE_PULSE:
 		vatpic_set_pinstate(vatpic, irq, true);
 		vatpic_set_pinstate(vatpic, irq, false);
 		break;
 	default:
 		panic("vatpic_set_irqstate: invalid irqstate %d", irqstate);
 	}
 	VATPIC_UNLOCK(vatpic);
 
 	return (0);
 }
 
 int
 vatpic_assert_irq(struct vm *vm, int irq)
 {
 	return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT));
 }
 
 int
 vatpic_deassert_irq(struct vm *vm, int irq)
 {
 	return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT));
 }
 
 int
 vatpic_pulse_irq(struct vm *vm, int irq)
 {
 	return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE));
 }
 
 int
 vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger)
 {
 	struct vatpic *vatpic;
 
 	if (irq < 0 || irq > 15)
 		return (EINVAL);
 
 	/*
 	 * See comment in vatpic_elc_handler.  These IRQs must be
 	 * edge triggered.
 	 */
 	if (trigger == LEVEL_TRIGGER) {
 		switch (irq) {
 		case 0:
 		case 1:
 		case 2:
 		case 8:
 		case 13:
 			return (EINVAL);
 		}
 	}
 
 	vatpic = vm_atpic(vm);
 
 	VATPIC_LOCK(vatpic);
 
 	if (trigger == LEVEL_TRIGGER)
 		vatpic->elc[irq >> 3] |=  1 << (irq & 0x7);
 	else
 		vatpic->elc[irq >> 3] &=  ~(1 << (irq & 0x7));
 
 	VATPIC_UNLOCK(vatpic);
 
 	return (0);
 }
 
 void
 vatpic_pending_intr(struct vm *vm, int *vecptr)
 {
 	struct vatpic *vatpic;
 	struct atpic *atpic;
 	int pin;
 
 	vatpic = vm_atpic(vm);
 
 	atpic = &vatpic->atpic[0];
 
 	VATPIC_LOCK(vatpic);
 
 	pin = vatpic_get_highest_irrpin(atpic);
-	if (pin == -1)
-		pin = 7;
 	if (pin == 2) {
 		atpic = &vatpic->atpic[1];
 		pin = vatpic_get_highest_irrpin(atpic);
 	}
 
+	/*
+	 * If there are no pins active at this moment then return the spurious
+	 * interrupt vector instead.
+	 */
+	if (pin == -1)
+		pin = 7;
+
+	KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin));
 	*vecptr = atpic->irq_base + pin;
 
 	VATPIC_UNLOCK(vatpic);
 }
 
 static void
 vatpic_pin_accepted(struct atpic *atpic, int pin)
 {
 	atpic->intr_raised = false;
 
 	if (atpic->acnt[pin] == 0)
 		atpic->request &= ~(1 << pin);
 
 	if (atpic->aeoi == true) {
 		if (atpic->rotate == true)
 			atpic->priority = pin;
 	} else {
 		atpic->service |= (1 << pin);
 	}
 }
 
 void
 vatpic_intr_accepted(struct vm *vm, int vector)
 {
 	struct vatpic *vatpic;
 	int pin;
 
 	vatpic = vm_atpic(vm);
 
 	VATPIC_LOCK(vatpic);
 
 	pin = vector & 0x7;
 
 	if ((vector & ~0x7) == vatpic->atpic[1].irq_base) {
 		vatpic_pin_accepted(&vatpic->atpic[1], pin);
 		/*
 		 * If this vector originated from the slave,
 		 * accept the cascaded interrupt too.
 		 */
 		vatpic_pin_accepted(&vatpic->atpic[0], 2);
 	} else {
 		vatpic_pin_accepted(&vatpic->atpic[0], pin);
 	}
 
 	vatpic_notify_intr(vatpic);
 
 	VATPIC_UNLOCK(vatpic);
 }
 
 static int
 vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port,
 	    int bytes, uint32_t *eax)
 {
 	VATPIC_LOCK(vatpic);
 
 	if (atpic->poll) {
 		VATPIC_CTR0(vatpic, "vatpic polled mode not supported");
 		VATPIC_UNLOCK(vatpic);
 		return (-1);
 	} else {
 		if (port & ICU_IMR_OFFSET) {
 			/* read interrrupt mask register */
 			*eax = atpic->mask;
 		} else {
 			if (atpic->rd_cmd_reg == OCW3_RIS) {
 				/* read interrupt service register */
 				*eax = atpic->service;
 			} else {
 				/* read interrupt request register */
 				*eax = atpic->request;
 			}
 		}
 	}
 
 	VATPIC_UNLOCK(vatpic);
 
 	return (0);
 
 }
 
 static int
 vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port,
     int bytes, uint32_t *eax)
 {
 	int error;
 	uint8_t val;
 
 	error = 0;
 	val = *eax;
 
 	VATPIC_LOCK(vatpic);
 
 	if (port & ICU_IMR_OFFSET) {
 		switch (atpic->icw_num) {
 		case 2:
 			error = vatpic_icw2(vatpic, atpic, val);
 			break;
 		case 3:
 			error = vatpic_icw3(vatpic, atpic, val);
 			break;
 		case 4:
 			error = vatpic_icw4(vatpic, atpic, val);
 			break;
 		default:
 			error = vatpic_ocw1(vatpic, atpic, val);
 			break;
 		}
 	} else {
 		if (val & (1 << 4))
 			error = vatpic_icw1(vatpic, atpic, val);
 
 		if (atpic->ready) {
 			if (val & (1 << 3))
 				error = vatpic_ocw3(vatpic, atpic, val);
 			else
 				error = vatpic_ocw2(vatpic, atpic, val);
 		}
 	}
 
 	if (atpic->ready)
 		vatpic_notify_intr(vatpic);
 
 	VATPIC_UNLOCK(vatpic);
 
 	return (error);
 }
 
 int
 vatpic_master_handler(void *vm, int vcpuid, bool in, int port, int bytes,
     uint32_t *eax)
 {
 	struct vatpic *vatpic;
 	struct atpic *atpic;
 
 	vatpic = vm_atpic(vm);
 	atpic = &vatpic->atpic[0];
 
 	if (bytes != 1)
 		return (-1);
  
 	if (in) {
 		return (vatpic_read(vatpic, atpic, in, port, bytes, eax));
 	}
  
 	return (vatpic_write(vatpic, atpic, in, port, bytes, eax));
 }
 
 int
 vatpic_slave_handler(void *vm, int vcpuid, bool in, int port, int bytes,
     uint32_t *eax)
 {
 	struct vatpic *vatpic;
 	struct atpic *atpic;
 
 	vatpic = vm_atpic(vm);
 	atpic = &vatpic->atpic[1];
 
 	if (bytes != 1)
 		return (-1);
 
 	if (in) {
 		return (vatpic_read(vatpic, atpic, in, port, bytes, eax));
 	}
 
 	return (vatpic_write(vatpic, atpic, in, port, bytes, eax));
 }
 
 int
 vatpic_elc_handler(void *vm, int vcpuid, bool in, int port, int bytes,
     uint32_t *eax)
 {
 	struct vatpic *vatpic;
 	bool is_master;
 
 	vatpic = vm_atpic(vm);
 	is_master = (port == IO_ELCR1);
 
 	if (bytes != 1)
 		return (-1);
 
 	VATPIC_LOCK(vatpic);
 
 	if (in) {
 		if (is_master)
 			*eax = vatpic->elc[0];
 		else
 			*eax = vatpic->elc[1];
 	} else {
 		/*
 		 * For the master PIC the cascade channel (IRQ2), the
 		 * heart beat timer (IRQ0), and the keyboard
 		 * controller (IRQ1) cannot be programmed for level
 		 * mode.
 		 *
 		 * For the slave PIC the real time clock (IRQ8) and
 		 * the floating point error interrupt (IRQ13) cannot
 		 * be programmed for level mode.
 		 */
 		if (is_master)
 			vatpic->elc[0] = (*eax & 0xf8);
 		else
 			vatpic->elc[1] = (*eax & 0xde);
 	}
 
 	VATPIC_UNLOCK(vatpic);
 
 	return (0);
 }
 
 struct vatpic *
 vatpic_init(struct vm *vm)
 {
 	struct vatpic *vatpic;
 
 	vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO);
 	vatpic->vm = vm;
 
 	mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN);
 
 	return (vatpic);
 }
 
 void
 vatpic_cleanup(struct vatpic *vatpic)
 {
 	free(vatpic, M_VATPIC);
 }
Index: stable/10/sys/amd64/vmm/io/vlapic.c
===================================================================
--- stable/10/sys/amd64/vmm/io/vlapic.c	(revision 276348)
+++ stable/10/sys/amd64/vmm/io/vlapic.c	(revision 276349)
@@ -1,1652 +1,1653 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 
 #include <x86/specialreg.h>
 #include <x86/apicreg.h>
 
 #include <machine/clock.h>
 #include <machine/smp.h>
 
 #include <machine/vmm.h>
 
 #include "vmm_ipi.h"
 #include "vmm_lapic.h"
 #include "vmm_ktr.h"
 #include "vmm_stat.h"
 
 #include "vlapic.h"
 #include "vlapic_priv.h"
 #include "vioapic.h"
 
 #define	PRIO(x)			((x) >> 4)
 
 #define VLAPIC_VERSION		(16)
 
 #define	x2apic(vlapic)	(((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
 
 /*
  * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the
  * vlapic_callout_handler() and vcpu accesses to:
  * - timer_freq_bt, timer_period_bt, timer_fire_bt
  * - timer LVT register
  */
 #define	VLAPIC_TIMER_LOCK(vlapic)	mtx_lock_spin(&((vlapic)->timer_mtx))
 #define	VLAPIC_TIMER_UNLOCK(vlapic)	mtx_unlock_spin(&((vlapic)->timer_mtx))
 #define	VLAPIC_TIMER_LOCKED(vlapic)	mtx_owned(&((vlapic)->timer_mtx))
 
 /*
  * APIC timer frequency:
  * - arbitrary but chosen to be in the ballpark of contemporary hardware.
  * - power-of-two to avoid loss of precision when converted to a bintime.
  */
 #define VLAPIC_BUS_FREQ		(128 * 1024 * 1024)
 
 static __inline uint32_t
 vlapic_get_id(struct vlapic *vlapic)
 {
 
 	if (x2apic(vlapic))
 		return (vlapic->vcpuid);
 	else
 		return (vlapic->vcpuid << 24);
 }
 
 static uint32_t
 x2apic_ldr(struct vlapic *vlapic)
 {
 	int apicid;
 	uint32_t ldr;
 
 	apicid = vlapic_get_id(vlapic);
 	ldr = 1 << (apicid & 0xf);
 	ldr |= (apicid & 0xffff0) << 12;
 	return (ldr);
 }
 
 void
 vlapic_dfr_write_handler(struct vlapic *vlapic)
 {
 	struct LAPIC *lapic;
 
 	lapic = vlapic->apic_page;
 	if (x2apic(vlapic)) {
 		VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x",
 		    lapic->dfr);
 		lapic->dfr = 0;
 		return;
 	}
 
 	lapic->dfr &= APIC_DFR_MODEL_MASK;
 	lapic->dfr |= APIC_DFR_RESERVED;
 
 	if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT)
 		VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model");
 	else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER)
 		VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model");
 	else
 		VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr);
 }
 
 void
 vlapic_ldr_write_handler(struct vlapic *vlapic)
 {
 	struct LAPIC *lapic;
 
 	lapic = vlapic->apic_page;
 
 	/* LDR is read-only in x2apic mode */
 	if (x2apic(vlapic)) {
 		VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x",
 		    lapic->ldr);
 		lapic->ldr = x2apic_ldr(vlapic);
 	} else {
 		lapic->ldr &= ~APIC_LDR_RESERVED;
 		VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
 	}
 }
 
 void
 vlapic_id_write_handler(struct vlapic *vlapic)
 {
 	struct LAPIC *lapic;
 	
 	/*
 	 * We don't allow the ID register to be modified so reset it back to
 	 * its default value.
 	 */
 	lapic = vlapic->apic_page;
 	lapic->id = vlapic_get_id(vlapic);
 }
 
 static int
 vlapic_timer_divisor(uint32_t dcr)
 {
 	switch (dcr & 0xB) {
 	case APIC_TDCR_1:
 		return (1);
 	case APIC_TDCR_2:
 		return (2);
 	case APIC_TDCR_4:
 		return (4);
 	case APIC_TDCR_8:
 		return (8);
 	case APIC_TDCR_16:
 		return (16);
 	case APIC_TDCR_32:
 		return (32);
 	case APIC_TDCR_64:
 		return (64);
 	case APIC_TDCR_128:
 		return (128);
 	default:
 		panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
 	}
 }
 
 #if 0
 static inline void
 vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
 {
 	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
 	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
 	    *lvt & APIC_LVTT_M);
 }
 #endif
 
 static uint32_t
 vlapic_get_ccr(struct vlapic *vlapic)
 {
 	struct bintime bt_now, bt_rem;
 	struct LAPIC *lapic;
 	uint32_t ccr;
 	
 	ccr = 0;
 	lapic = vlapic->apic_page;
 
 	VLAPIC_TIMER_LOCK(vlapic);
 	if (callout_active(&vlapic->callout)) {
 		/*
 		 * If the timer is scheduled to expire in the future then
 		 * compute the value of 'ccr' based on the remaining time.
 		 */
 		binuptime(&bt_now);
 		if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) {
 			bt_rem = vlapic->timer_fire_bt;
 			bintime_sub(&bt_rem, &bt_now);
 			ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt);
 			ccr += bt_rem.frac / vlapic->timer_freq_bt.frac;
 		}
 	}
 	KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, "
 	    "icr_timer is %#x", ccr, lapic->icr_timer));
 	VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x",
 	    ccr, lapic->icr_timer);
 	VLAPIC_TIMER_UNLOCK(vlapic);
 	return (ccr);
 }
 
 void
 vlapic_dcr_write_handler(struct vlapic *vlapic)
 {
 	struct LAPIC *lapic;
 	int divisor;
 	
 	lapic = vlapic->apic_page;
 	VLAPIC_TIMER_LOCK(vlapic);
 
 	divisor = vlapic_timer_divisor(lapic->dcr_timer);
 	VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d",
 	    lapic->dcr_timer, divisor);
 
 	/*
 	 * Update the timer frequency and the timer period.
 	 *
 	 * XXX changes to the frequency divider will not take effect until
 	 * the timer is reloaded.
 	 */
 	FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt);
 	vlapic->timer_period_bt = vlapic->timer_freq_bt;
 	bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);
 
 	VLAPIC_TIMER_UNLOCK(vlapic);
 }
 
 void
 vlapic_esr_write_handler(struct vlapic *vlapic)
 {
 	struct LAPIC *lapic;
 	
 	lapic = vlapic->apic_page;
 	lapic->esr = vlapic->esr_pending;
 	vlapic->esr_pending = 0;
 }
 
 int
 vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
 {
 	struct LAPIC *lapic;
 	uint32_t *irrptr, *tmrptr, mask;
 	int idx;
 
 	KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));
 
 	lapic = vlapic->apic_page;
 	if (!(lapic->svr & APIC_SVR_ENABLE)) {
 		VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring "
 		    "interrupt %d", vector);
 		return (0);
 	}
 
 	if (vector < 16) {
 		vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR);
 		VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d",
 		    vector);
 		return (1);
 	}
 
 	if (vlapic->ops.set_intr_ready)
 		return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));
 
 	idx = (vector / 32) * 4;
 	mask = 1 << (vector % 32);
 
 	irrptr = &lapic->irr0;
 	atomic_set_int(&irrptr[idx], mask);
 
 	/*
 	 * Verify that the trigger-mode of the interrupt matches with
 	 * the vlapic TMR registers.
 	 */
 	tmrptr = &lapic->tmr0;
 	if ((tmrptr[idx] & mask) != (level ? mask : 0)) {
 		VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but "
 		    "interrupt is %s-triggered", idx / 4, tmrptr[idx],
 		    level ? "level" : "edge");
 	}
 
 	VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
 	return (1);
 }
 
 static __inline uint32_t *
 vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
 {
 	struct LAPIC	*lapic = vlapic->apic_page;
 	int 		 i;
 
 	switch (offset) {
 	case APIC_OFFSET_CMCI_LVT:
 		return (&lapic->lvt_cmci);
 	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
 		i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
 		return ((&lapic->lvt_timer) + i);;
 	default:
 		panic("vlapic_get_lvt: invalid LVT\n");
 	}
 }
 
 static __inline int
 lvt_off_to_idx(uint32_t offset)
 {
 	int index;
 
 	switch (offset) {
 	case APIC_OFFSET_CMCI_LVT:
 		index = APIC_LVT_CMCI;
 		break;
 	case APIC_OFFSET_TIMER_LVT:
 		index = APIC_LVT_TIMER;
 		break;
 	case APIC_OFFSET_THERM_LVT:
 		index = APIC_LVT_THERMAL;
 		break;
 	case APIC_OFFSET_PERF_LVT:
 		index = APIC_LVT_PMC;
 		break;
 	case APIC_OFFSET_LINT0_LVT:
 		index = APIC_LVT_LINT0;
 		break;
 	case APIC_OFFSET_LINT1_LVT:
 		index = APIC_LVT_LINT1;
 		break;
 	case APIC_OFFSET_ERROR_LVT:
 		index = APIC_LVT_ERROR;
 		break;
 	default:
 		index = -1;
 		break;
 	}
 	KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
 	    "invalid lvt index %d for offset %#x", index, offset));
 
 	return (index);
 }
 
 static __inline uint32_t
 vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
 {
 	int idx;
 	uint32_t val;
 
 	idx = lvt_off_to_idx(offset);
 	val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
 	return (val);
 }
 
 void
 vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
 {
 	uint32_t *lvtptr, mask, val;
 	struct LAPIC *lapic;
 	int idx;
 	
 	lapic = vlapic->apic_page;
 	lvtptr = vlapic_get_lvtptr(vlapic, offset);	
 	val = *lvtptr;
 	idx = lvt_off_to_idx(offset);
 
 	if (!(lapic->svr & APIC_SVR_ENABLE))
 		val |= APIC_LVT_M;
 	mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
 	switch (offset) {
 	case APIC_OFFSET_TIMER_LVT:
 		mask |= APIC_LVTT_TM;
 		break;
 	case APIC_OFFSET_ERROR_LVT:
 		break;
 	case APIC_OFFSET_LINT0_LVT:
 	case APIC_OFFSET_LINT1_LVT:
 		mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
 		/* FALLTHROUGH */
 	default:
 		mask |= APIC_LVT_DM;
 		break;
 	}
 	val &= mask;
 	*lvtptr = val;
 	atomic_store_rel_32(&vlapic->lvt_last[idx], val);
 }
 
 static void
 vlapic_mask_lvts(struct vlapic *vlapic)
 {
 	struct LAPIC *lapic = vlapic->apic_page;
 
 	lapic->lvt_cmci |= APIC_LVT_M;
 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);
 
 	lapic->lvt_timer |= APIC_LVT_M;
 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);
 
 	lapic->lvt_thermal |= APIC_LVT_M;
 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);
 
 	lapic->lvt_pcint |= APIC_LVT_M;
 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);
 
 	lapic->lvt_lint0 |= APIC_LVT_M;
 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);
 
 	lapic->lvt_lint1 |= APIC_LVT_M;
 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);
 
 	lapic->lvt_error |= APIC_LVT_M;
 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
 }
 
 /*
  * Deliver the interrupt described by the LVT value 'lvt'.
  *
  * Returns 1 if an interrupt was delivered (fixed, NMI or ExtINT) and 0 if
  * the entry is masked, uses an unsupported delivery mode, or names an
  * illegal fixed vector (in which case an APIC error is raised instead).
  */
 static int
 vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt)
 {
 	uint32_t delmode, vector;
 
 	/* A masked entry never delivers anything. */
 	if ((lvt & APIC_LVT_M) != 0)
 		return (0);
 
 	vector = lvt & APIC_LVT_VECTOR;
 	delmode = lvt & APIC_LVT_DM;
 
 	if (delmode == APIC_LVT_DM_FIXED) {
 		/* Vectors below 16 are rejected for fixed delivery. */
 		if (vector < 16) {
 			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
 			return (0);
 		}
 		if (vlapic_set_intr_ready(vlapic, vector, false))
 			vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true);
 		return (1);
 	}
 
 	if (delmode == APIC_LVT_DM_NMI) {
 		vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
 		return (1);
 	}
 
 	if (delmode == APIC_LVT_DM_EXTINT) {
 		vm_inject_extint(vlapic->vm, vlapic->vcpuid);
 		return (1);
 	}
 
 	/* Other modes ignored */
 	return (0);
 }
 
 #if 1
 /*
  * Debugging aid: print the eight ISR registers followed by every entry of
  * the isrvec stack from index 0 up to and including 'isrvec_stk_top'.
  */
 static void
 dump_isrvec_stk(struct vlapic *vlapic)
 {
 	uint32_t *isr = &vlapic->apic_page->isr0;
 	int n;
 
 	/* Consecutive ISR registers are four 32-bit words apart. */
 	for (n = 0; n < 8; n++)
 		printf("ISR%d 0x%08x\n", n, isr[n * 4]);
 
 	n = 0;
 	while (n <= vlapic->isrvec_stk_top) {
 		printf("isrvec_stk[%d] = %d\n", n, vlapic->isrvec_stk[n]);
 		n++;
 	}
 }
 #endif
 
 /*
  * Recompute the Processor Priority Register from the TPR and the vector on
  * top of the in-service stack, and store it in the APIC page.
  *
  * Algorithm adopted from section "Interrupt, Task and Processor Priority"
  * in Intel Architecture Manual Vol 3a.
  */
 static void
 vlapic_update_ppr(struct vlapic *vlapic)
 {
 	int isrvec, tpr, ppr;
 
 	/*
 	 * Note that the value on the stack at index 0 is always 0.
 	 *
 	 * This is a placeholder for the value of ISRV when none of the
 	 * bits is set in the ISRx registers.
 	 */
 	isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
 	tpr = vlapic->apic_page->tpr;
 
 #if 1
 	{
 		int i, lastprio, curprio, vector, idx;
 		uint32_t *isrptr;
 
 		if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
 			panic("isrvec_stk is corrupted: %d", isrvec);
 
 		/*
 		 * Make sure that the priority of the nested interrupts is
 		 * always increasing.
 		 */
 		lastprio = -1;
 		for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
 			curprio = PRIO(vlapic->isrvec_stk[i]);
 			if (curprio <= lastprio) {
 				dump_isrvec_stk(vlapic);
 				panic("isrvec_stk does not satisfy invariant");
 			}
 			lastprio = curprio;
 		}
 
 		/*
 		 * Make sure that each bit set in the ISRx registers has a
 		 * corresponding entry on the isrvec stack.
 		 */
 		i = 1;
 		isrptr = &vlapic->apic_page->isr0;
 		for (vector = 0; vector < 256; vector++) {
 			idx = (vector / 32) * 4;
 			/*
 			 * Use an unsigned constant: shifting a signed 1 into
 			 * bit 31 is undefined behavior.
 			 */
 			if (isrptr[idx] & (1U << (vector % 32))) {
 				if (i > vlapic->isrvec_stk_top ||
 				    vlapic->isrvec_stk[i] != vector) {
 					dump_isrvec_stk(vlapic);
 					panic("ISR and isrvec_stk out of sync");
 				}
 				i++;
 			}
 		}
 	}
 #endif
 
 	/*
 	 * The PPR is the TPR when its priority class is at least that of the
 	 * highest in-service vector, otherwise the class of that vector.
 	 */
 	if (PRIO(tpr) >= PRIO(isrvec))
 		ppr = tpr;
 	else
 		ppr = isrvec & 0xf0;
 
 	vlapic->apic_page->ppr = ppr;
 	VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
 }
 
 /*
  * Handle an end-of-interrupt: clear the highest-numbered bit set in the ISR
  * registers, pop the isrvec stack, recompute the PPR and, if the vector is
  * marked level-triggered in the TMR, forward the EOI to the virtual ioapic.
  * Does nothing when no in-service bit is found in ISR1..ISR7.
  */
 static void
 vlapic_process_eoi(struct vlapic *vlapic)
 {
 	struct LAPIC	*lapic = vlapic->apic_page;
 	uint32_t	*isrptr, *tmrptr;
 	int		i, idx, bitpos, vector;
 
 	isrptr = &lapic->isr0;
 	tmrptr = &lapic->tmr0;
 
 	/*
 	 * The x86 architecture reserves the first 32 vectors for use
 	 * by the processor.
 	 */
 	for (i = 7; i > 0; i--) {
 		idx = i * 4;
 		/* fls() is 1-based; 0 means no bit is set in this word. */
 		bitpos = fls(isrptr[idx]);
 		if (bitpos-- != 0) {
 			if (vlapic->isrvec_stk_top <= 0) {
 				panic("invalid vlapic isrvec_stk_top %d",
 				      vlapic->isrvec_stk_top);
 			}
 			/*
 			 * 'bitpos' can be 31, so the mask must be built from
 			 * an unsigned constant to avoid undefined behavior.
 			 */
 			isrptr[idx] &= ~(1U << bitpos);
 			VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
 			vlapic->isrvec_stk_top--;
 			vlapic_update_ppr(vlapic);
 			if ((tmrptr[idx] & (1U << bitpos)) != 0) {
 				vector = i * 32 + bitpos;
 				vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
 				    vector);
 			}
 			return;
 		}
 	}
 }
 
 /*
  * Return the bits of the LVT value 'lvt' selected by 'mask'.
  */
 static __inline int
 vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
 {
 	uint32_t field = lvt & mask;
 
 	return (field);
 }
 
 /*
  * Return non-zero if the timer LVT has the periodic-mode bit set.
  */
 static __inline int
 vlapic_periodic_timer(struct vlapic *vlapic)
 {
 
 	return (vlapic_get_lvt_field(
 	    vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT),
 	    APIC_LVTT_TM_PERIODIC));
 }
 
 static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");
 
 /*
  * Record the error bits in 'mask' as pending and deliver the error LVT
  * interrupt.
  *
  * 'esr_firing' is held set while the interrupt is being delivered, so a
  * nested call (vlapic_fire_lvt() calls back into this function when it
  * rejects an illegal vector) only accumulates the pending bits.
  */
 void
 vlapic_set_error(struct vlapic *vlapic, uint32_t mask)
 {
 	uint32_t errlvt;
 
 	vlapic->esr_pending |= mask;
 
 	if (!vlapic->esr_firing) {
 		vlapic->esr_firing = 1;
 
 		/* The error LVT always uses the fixed delivery mode. */
 		errlvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT);
 		errlvt |= APIC_LVT_DM_FIXED;
 		if (vlapic_fire_lvt(vlapic, errlvt))
 			vmm_stat_incr(vlapic->vm, vlapic->vcpuid,
 			    VLAPIC_INTR_ERROR, 1);
 
 		vlapic->esr_firing = 0;
 	}
 }
 
 static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");
 
 /*
  * Deliver the interrupt described by the timer LVT entry and count it in
  * VLAPIC_INTR_TIMER when vlapic_fire_lvt() reports that it was delivered.
  *
  * The caller must hold the vlapic timer lock (asserted below).
  */
 static void
 vlapic_fire_timer(struct vlapic *vlapic)
 {
 	uint32_t lvt;
 
 	KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked"));
 	
 	// The timer LVT always uses the fixed delivery mode.
 	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
 	if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) {
+		VLAPIC_CTR0(vlapic, "vlapic timer fired");
 		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
 	}
 }
 
 static VMM_STAT(VLAPIC_INTR_CMC,
     "corrected machine check interrupts generated by vlapic");
 
 /*
  * Deliver the interrupt described by the CMCI LVT entry and count it in
  * VLAPIC_INTR_CMC when it was actually delivered.
  */
 void
 vlapic_fire_cmci(struct vlapic *vlapic)
 {
 	uint32_t cmci_lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT);
 
 	if (!vlapic_fire_lvt(vlapic, cmci_lvt))
 		return;
 
 	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1);
 }
 
 static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1,
     "lvts triggered");
 
 int
 vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
 {
 	uint32_t lvt;
 
 	if (vlapic_enabled(vlapic) == false) {
 		/*
 		 * When the local APIC is global/hardware disabled,
 		 * LINT[1:0] pins are configured as INTR and NMI pins,
 		 * respectively.
 		*/
 		switch (vector) {
 			case APIC_LVT_LINT0:
 				vm_inject_extint(vlapic->vm, vlapic->vcpuid);
 				break;
 			case APIC_LVT_LINT1:
 				vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
 				break;
 			default:
 				break;
 		}
 		return (0);
 	}
 
 	switch (vector) {
 	case APIC_LVT_LINT0:
 		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT);
 		break;
 	case APIC_LVT_LINT1:
 		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT);
 		break;
 	case APIC_LVT_TIMER:
 		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
 		lvt |= APIC_LVT_DM_FIXED;
 		break;
 	case APIC_LVT_ERROR:
 		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT);
 		lvt |= APIC_LVT_DM_FIXED;
 		break;
 	case APIC_LVT_PMC:
 		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT);
 		break;
 	case APIC_LVT_THERMAL:
 		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT);
 		break;
 	case APIC_LVT_CMCI:
 		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT);
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (vlapic_fire_lvt(vlapic, lvt)) {
 		vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
 		    LVTS_TRIGGERRED, vector, 1);
 	}
 	return (0);
 }
 
 /*
  * Callout handler for the APIC timer ('arg' is the struct vlapic).
  *
  * Fires the timer interrupt and, when the timer LVT is in periodic mode,
  * re-arms the callout for the next period.  The whole handler runs with
  * the vlapic timer lock held.
  */
 static void
 vlapic_callout_handler(void *arg)
 {
 	struct vlapic *vlapic;
 	struct bintime bt, btnow;
 	sbintime_t rem_sbt;
 
 	vlapic = arg;
 
 	VLAPIC_TIMER_LOCK(vlapic);
 	if (callout_pending(&vlapic->callout))	/* callout was reset */
 		goto done;
 
 	if (!callout_active(&vlapic->callout))	/* callout was stopped */
 		goto done;
 
 	callout_deactivate(&vlapic->callout);
 
 	vlapic_fire_timer(vlapic);
 
 	if (vlapic_periodic_timer(vlapic)) {
 		binuptime(&btnow);
 		KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
 		    ("vlapic callout at %#lx.%#lx, expected at %#lx.%#lx",
 		    btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
 		    vlapic->timer_fire_bt.frac));
 
 		/*
 		 * Compute the delta between when the timer was supposed to
 		 * fire and the present time.
 		 */
 		bt = btnow;
 		bintime_sub(&bt, &vlapic->timer_fire_bt);
 
 		rem_sbt = bttosbt(vlapic->timer_period_bt);
 		if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
 			/*
 			 * Adjust the time until the next countdown downward
 			 * to account for the lost time.
 			 */
 			rem_sbt -= bttosbt(bt);
 		} else {
 			/*
 			 * If the delta is greater than the timer period then
 			 * just reset our time base instead of trying to catch
 			 * up.
 			 */
 			vlapic->timer_fire_bt = btnow;
 			VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu "
 			    "usecs, period is %lu usecs - resetting time base",
 			    bttosbt(bt) / SBT_1US,
 			    bttosbt(vlapic->timer_period_bt) / SBT_1US);
 		}
 
 		/* Next expected expiry is one period after the current one. */
 		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
 		callout_reset_sbt(&vlapic->callout, rem_sbt, 0,
 		    vlapic_callout_handler, vlapic, 0);
 	}
 done:
 	VLAPIC_TIMER_UNLOCK(vlapic);
 }
 
 /*
  * Process a write to the timer initial-count register: recompute the timer
  * period as 'timer_freq_bt' multiplied by the new count and either arm the
  * callout for one period from now or, for a count of zero, stop it.
  */
 void
 vlapic_icrtmr_write_handler(struct vlapic *vlapic)
 {
 	uint32_t initial;
 
 	VLAPIC_TIMER_LOCK(vlapic);
 
 	initial = vlapic->apic_page->icr_timer;
 
 	vlapic->timer_period_bt = vlapic->timer_freq_bt;
 	bintime_mul(&vlapic->timer_period_bt, initial);
 
 	if (initial == 0) {
 		callout_stop(&vlapic->callout);
 	} else {
 		/* Remember when the timer is expected to fire. */
 		binuptime(&vlapic->timer_fire_bt);
 		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
 
 		callout_reset_sbt(&vlapic->callout,
 		    bttosbt(vlapic->timer_period_bt), 0,
 		    vlapic_callout_handler, vlapic, 0);
 	}
 
 	VLAPIC_TIMER_UNLOCK(vlapic);
 }
 
 /*
  * This function populates 'dmask' with the set of vcpus that match the
  * addressing specified by the (dest, phys, lowprio) tuple.
  * 
  * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
  * or xAPIC (8-bit) destination field.
  *
  * In logical mode with 'lowprio' set the search stops at the first
  * matching vcpu, so 'dmask' contains at most one vcpu in that case.
  */
 static void
 vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
     bool lowprio, bool x2apic_dest)
 {
 	struct vlapic *vlapic;
 	uint32_t dfr, ldr, ldest, cluster;
 	uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
 	cpuset_t amask;
 	int vcpuid;
 
 	if ((x2apic_dest && dest == 0xffffffff) ||
 	    (!x2apic_dest && dest == 0xff)) {
 		/*
 		 * Broadcast in both logical and physical modes.
 		 */
 		*dmask = vm_active_cpus(vm);
 		return;
 	}
 
 	if (phys) {
 		/*
 		 * Physical mode: destination is APIC ID.
 		 */
 		CPU_ZERO(dmask);
 		vcpuid = vm_apicid2vcpuid(vm, dest);
 		/* An APIC ID that maps outside the vcpu range selects nobody. */
 		if (vcpuid < VM_MAXCPU)
 			CPU_SET(vcpuid, dmask);
 	} else {
 		/*
 		 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
 		 * bitmask. This model is only available in the xAPIC mode.
 		 */
 		mda_flat_ldest = dest & 0xff;
 
 		/*
 		 * In the "Cluster Model" the MDA is used to identify a
 		 * specific cluster and a set of APICs in that cluster.
 		 */
 		if (x2apic_dest) {
 			mda_cluster_id = dest >> 16;
 			mda_cluster_ldest = dest & 0xffff;
 		} else {
 			mda_cluster_id = (dest >> 4) & 0xf;
 			mda_cluster_ldest = dest & 0xf;
 		}
 
 		/*
 		 * Logical mode: match each APIC that has a bit set
 		 * in its LDR that matches a bit in the ldest.
 		 */
 		CPU_ZERO(dmask);
 		amask = vm_active_cpus(vm);
 		/* CPU_FFS() is 1-based; 0 means the set is empty. */
 		while ((vcpuid = CPU_FFS(&amask)) != 0) {
 			vcpuid--;
 			CPU_CLR(vcpuid, &amask);
 
 			vlapic = vm_lapic(vm, vcpuid);
 			dfr = vlapic->apic_page->dfr;
 			ldr = vlapic->apic_page->ldr;
 
 			if ((dfr & APIC_DFR_MODEL_MASK) ==
 			    APIC_DFR_MODEL_FLAT) {
 				ldest = ldr >> 24;
 				mda_ldest = mda_flat_ldest;
 			} else if ((dfr & APIC_DFR_MODEL_MASK) ==
 			    APIC_DFR_MODEL_CLUSTER) {
 				/*
 				 * The LDR layout depends on the mode of the
 				 * candidate vcpu, not on 'x2apic_dest'.
 				 */
 				if (x2apic(vlapic)) {
 					cluster = ldr >> 16;
 					ldest = ldr & 0xffff;
 				} else {
 					cluster = ldr >> 28;
 					ldest = (ldr >> 24) & 0xf;
 				}
 				if (cluster != mda_cluster_id)
 					continue;
 				mda_ldest = mda_cluster_ldest;
 			} else {
 				/*
 				 * Guest has configured a bad logical
 				 * model for this vcpu - skip it.
 				 */
 				VLAPIC_CTR1(vlapic, "vlapic has bad logical "
 				    "model %x - cannot deliver interrupt", dfr);
 				continue;
 			}
 
 			if ((mda_ldest & ldest) != 0) {
 				CPU_SET(vcpuid, dmask);
 				/* Lowest-priority: first match wins. */
 				if (lowprio)
 					break;
 			}
 		}
 	}
 }
 
 /*
  * Statistic array with VM_MAXCPU slots, bumped on the sending vcpu at the
  * index of the destination vcpu by vlapic_icrlo_write_handler() and
  * vlapic_self_ipi_handler().
  */
 static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu");
 
 /*
  * Store 'val' in the Task Priority Register and recompute the PPR.
  */
 static void
 vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
 {
 
 	vlapic->apic_page->tpr = val;
 	vlapic_update_ppr(vlapic);
 }
 
 /*
  * Return the current Task Priority Register value, truncated to 8 bits.
  */
 static uint8_t
 vlapic_get_tpr(struct vlapic *vlapic)
 {
 
 	return (vlapic->apic_page->tpr);
 }
 
 /*
  * Emulate a write of 'val' to %cr8: the low four bits become the upper
  * nibble of the TPR.  A #GP is injected if any other bit of 'val' is set.
  */
 void
 vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
 {
 
 	if ((val & ~0xf) != 0) {
 		vm_inject_gp(vlapic->vm, vlapic->vcpuid);
 		return;
 	}
 
 	vlapic_set_tpr(vlapic, (uint8_t)(val << 4));
 }
 
 /*
  * Emulate a read of %cr8: return the upper nibble of the TPR.
  */
 uint64_t
 vlapic_get_cr8(struct vlapic *vlapic)
 {
 
 	return (vlapic_get_tpr(vlapic) >> 4);
 }
 
 /*
  * Process a write to the low half of the Interrupt Command Register, i.e.
  * a request by the guest to send an IPI.
  *
  * Fixed and NMI IPIs are delivered here.  INIT and STARTUP IPIs are only
  * handled when sent by vcpu 0 to another valid vcpu; a STARTUP IPI accepted
  * by a vcpu waiting for it sets '*retu' and fills in a
  * VM_EXITCODE_SPINUP_AP exit.
  *
  * Returns 0 when the write was handled and 1 when it was not, which will
  * cause a return to userland.
  */
 int
 vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
 {
 	int i;
 	bool phys;
 	cpuset_t dmask;
 	uint64_t icrval;
 	uint32_t dest, vec, mode;
 	struct vlapic *vlapic2;
 	struct vm_exit *vmexit;
 	struct LAPIC *lapic;
 
 	lapic = vlapic->apic_page;
 	/* The delivery-status bit always reads back as idle. */
 	lapic->icr_lo &= ~APIC_DELSTAT_PEND;
 	icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;
 
 	/* x2APIC uses all of icr_hi as destination, xAPIC only bits 31:24. */
 	if (x2apic(vlapic))
 		dest = icrval >> 32;
 	else
 		dest = icrval >> (32 + 24);
 	vec = icrval & APIC_VECTOR_MASK;
 	mode = icrval & APIC_DELMODE_MASK;
 
 	if (mode == APIC_DELMODE_FIXED && vec < 16) {
 		vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
 		VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec);
 		return (0);
 	}
 
 	VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec);
 
 	if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
 		/* Work out the set of destination vcpus. */
 		switch (icrval & APIC_DEST_MASK) {
 		case APIC_DEST_DESTFLD:
 			phys = ((icrval & APIC_DESTMODE_LOG) == 0);
 			vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false,
 			    x2apic(vlapic));
 			break;
 		case APIC_DEST_SELF:
 			CPU_SETOF(vlapic->vcpuid, &dmask);
 			break;
 		case APIC_DEST_ALLISELF:
 			dmask = vm_active_cpus(vlapic->vm);
 			break;
 		case APIC_DEST_ALLESELF:
 			dmask = vm_active_cpus(vlapic->vm);
 			CPU_CLR(vlapic->vcpuid, &dmask);
 			break;
 		default:
 			CPU_ZERO(&dmask);	/* satisfy gcc */
 			break;
 		}
 
 		/* CPU_FFS() is 1-based; 0 means no destination is left. */
 		while ((i = CPU_FFS(&dmask)) != 0) {
 			i--;
 			CPU_CLR(i, &dmask);
 			if (mode == APIC_DELMODE_FIXED) {
 				lapic_intr_edge(vlapic->vm, i, vec);
 				vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
 						    IPIS_SENT, i, 1);
 				VLAPIC_CTR2(vlapic, "vlapic sending ipi %d "
 				    "to vcpuid %d", vec, i);
 			} else {
 				vm_inject_nmi(vlapic->vm, i);
 				VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi "
 				    "to vcpuid %d", i);
 			}
 		}
 
 		return (0);	/* handled completely in the kernel */
 	}
 
 	if (mode == APIC_DELMODE_INIT) {
 		if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
 			return (0);
 
 		/* Only vcpu 0 may INIT another vcpu; 'dest' is its vcpu id. */
 		if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
 			vlapic2 = vm_lapic(vlapic->vm, dest);
 
 			/* move from INIT to waiting-for-SIPI state */
 			if (vlapic2->boot_state == BS_INIT) {
 				vlapic2->boot_state = BS_SIPI;
 			}
 
 			return (0);
 		}
 	}
 
 	if (mode == APIC_DELMODE_STARTUP) {
 		if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
 			vlapic2 = vm_lapic(vlapic->vm, dest);
 
 			/*
 			 * Ignore SIPIs in any state other than wait-for-SIPI
 			 */
 			if (vlapic2->boot_state != BS_SIPI)
 				return (0);
 
 			vlapic2->boot_state = BS_RUNNING;
 
 			/* Report the AP start address in the vm exit. */
 			*retu = true;
 			vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
 			vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
 			vmexit->u.spinup_ap.vcpu = dest;
 			vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
 
 			return (0);
 		}
 	}
 
 	/*
 	 * This will cause a return to userland.
 	 */
 	return (1);
 }
 
 /*
  * Process a write of 'val' to the x2APIC SELF_IPI register: send an
  * edge-triggered interrupt with the vector in the low byte of 'val' to the
  * writing vcpu and account for it in IPIS_SENT.
  */
 void
 vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val)
 {
 	int vector = val & 0xff;
 
 	KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode"));
 
 	lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vector);
 	vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT,
 	    vlapic->vcpuid, 1);
 	VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vector);
 }
 
 /*
  * Check whether an interrupt is pending that can be delivered now.
  *
  * Returns 1 and, if 'vecptr' is not NULL, stores the vector there when the
  * highest vector set in IRR1..IRR7 has a priority class above the PPR;
  * returns 0 otherwise.  The check is delegated to 'ops.pending_intr' when
  * that hook is installed.
  */
 int
 vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
 {
 	struct LAPIC *lapic = vlapic->apic_page;
 	uint32_t *irr, bits;
 	int reg, top, vector;
 
 	if (vlapic->ops.pending_intr)
 		return ((*vlapic->ops.pending_intr)(vlapic, vecptr));
 
 	irr = &lapic->irr0;
 
 	/*
 	 * The x86 architecture reserves the first 32 vectors for use
 	 * by the processor, so IRR0 is never examined.
 	 */
 	for (reg = 7; reg > 0; reg--) {
 		bits = atomic_load_acq_int(&irr[reg * 4]);
 		top = fls(bits);
 		if (top == 0)
 			continue;
 
 		/* Highest pending vector; lower ones cannot beat it. */
 		vector = reg * 32 + (top - 1);
 		if (PRIO(vector) <= PRIO(lapic->ppr))
 			break;
 
 		VLAPIC_CTR1(vlapic, "pending intr %d", vector);
 		if (vecptr != NULL)
 			*vecptr = vector;
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Record that the vcpu has accepted 'vector': move it from the IRR to the
  * ISR, push it on the isrvec stack and recompute the PPR.  The work is
  * delegated to 'ops.intr_accepted' when that hook is installed.
  */
 void
 vlapic_intr_accepted(struct vlapic *vlapic, int vector)
 {
 	struct LAPIC	*lapic = vlapic->apic_page;
 	uint32_t	*irrptr, *isrptr;
 	int		idx, stk_top;
 
 	/*
 	 * A 'return' with an expression is not permitted in a function
 	 * returning void, so call the hook and then return.
 	 */
 	if (vlapic->ops.intr_accepted) {
 		(*vlapic->ops.intr_accepted)(vlapic, vector);
 		return;
 	}
 
 	/*
 	 * clear the ready bit for vector being accepted in irr 
 	 * and set the vector as in service in isr.
 	 *
 	 * The bit masks use an unsigned constant because the bit number can
 	 * be 31, which would overflow a signed 1.
 	 */
 	idx = (vector / 32) * 4;
 
 	irrptr = &lapic->irr0;
 	atomic_clear_int(&irrptr[idx], 1U << (vector % 32));
 	VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
 
 	isrptr = &lapic->isr0;
 	isrptr[idx] |= 1U << (vector % 32);
 	VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
 
 	/*
 	 * Update the PPR
 	 */
 	vlapic->isrvec_stk_top++;
 
 	stk_top = vlapic->isrvec_stk_top;
 	if (stk_top >= ISRVEC_STK_SIZE)
 		panic("isrvec_stk_top overflow %d", stk_top);
 
 	vlapic->isrvec_stk[stk_top] = vector;
 	vlapic_update_ppr(vlapic);
 }
 
 /*
  * Process a write to the Spurious Interrupt Vector Register.  Only a change
  * of the software-enable bit has any effect: disabling stops the timer and
  * masks all LVT entries, enabling restarts a periodic timer.
  */
 void
 vlapic_svr_write_handler(struct vlapic *vlapic)
 {
 	uint32_t cur, prev;
 
 	cur = vlapic->apic_page->svr;
 	prev = vlapic->svr_last;
 	vlapic->svr_last = cur;
 
 	/* Nothing to do unless the enable bit was flipped. */
 	if (((prev ^ cur) & APIC_SVR_ENABLE) == 0)
 		return;
 
 	if ((cur & APIC_SVR_ENABLE) != 0) {
 		/*
 		 * The apic is now enabled so restart the apic timer
 		 * if it is configured in periodic mode.
 		 */
 		VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
 		if (vlapic_periodic_timer(vlapic))
 			vlapic_icrtmr_write_handler(vlapic);
 		return;
 	}
 
 	/*
 	 * The apic is now disabled so stop the apic timer
 	 * and mask all the LVT entries.
 	 */
 	VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
 	VLAPIC_TIMER_LOCK(vlapic);
 	callout_stop(&vlapic->callout);
 	VLAPIC_TIMER_UNLOCK(vlapic);
 	vlapic_mask_lvts(vlapic);
 }
 
 /*
  * Emulate a read of the local APIC register at 'offset' and store the
  * result in '*data'.
  *
  * 'mmio_access' is non-zero for a memory-mapped access and zero for an
  * access through the x2APIC MSR interface.  An access that does not match
  * the current APIC mode, an out-of-range offset and an unimplemented
  * register all read as 0.  'retu' is not used.  Always returns 0.
  */
 int
 vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset,
     uint64_t *data, bool *retu)
 {
 	struct LAPIC	*lapic = vlapic->apic_page;
 	uint32_t	*reg;
 	int		 i;
 
 	/* Ignore MMIO accesses in x2APIC mode */
 	if (x2apic(vlapic) && mmio_access) {
 		VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode",
 		    offset);
 		*data = 0;
 		goto done;
 	}
 
 	if (!x2apic(vlapic) && !mmio_access) {
 		/*
 		 * XXX Generate GP fault for MSR accesses in xAPIC mode
 		 */
 		VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in "
 		    "xAPIC mode", offset);
 		*data = 0;
 		goto done;
 	}
 
 	if (offset > sizeof(*lapic)) {
 		*data = 0;
 		goto done;
 	}
 
 	/* Registers are read as aligned 32-bit quantities. */
 	offset &= ~3;
 	switch(offset)
 	{
 		case APIC_OFFSET_ID:
 			*data = lapic->id;
 			break;
 		case APIC_OFFSET_VER:
 			*data = lapic->version;
 			break;
 		case APIC_OFFSET_TPR:
 			*data = vlapic_get_tpr(vlapic);
 			break;
 		case APIC_OFFSET_APR:
 			*data = lapic->apr;
 			break;
 		case APIC_OFFSET_PPR:
 			*data = lapic->ppr;
 			break;
 		case APIC_OFFSET_EOI:
 			*data = lapic->eoi;
 			break;
 		case APIC_OFFSET_LDR:
 			*data = lapic->ldr;
 			break;
 		case APIC_OFFSET_DFR:
 			*data = lapic->dfr;
 			break;
 		case APIC_OFFSET_SVR:
 			*data = lapic->svr;
 			break;
 		/*
 		 * For the ISR/TMR/IRR banks the byte distance from the first
 		 * register, divided by four, is the index in 32-bit words.
 		 */
 		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
 			i = (offset - APIC_OFFSET_ISR0) >> 2;
 			reg = &lapic->isr0;
 			*data = *(reg + i);
 			break;
 		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
 			i = (offset - APIC_OFFSET_TMR0) >> 2;
 			reg = &lapic->tmr0;
 			*data = *(reg + i);
 			break;
 		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
 			i = (offset - APIC_OFFSET_IRR0) >> 2;
 			reg = &lapic->irr0;
 			*data = atomic_load_acq_int(reg + i);
 			break;
 		case APIC_OFFSET_ESR:
 			*data = lapic->esr;
 			break;
 		case APIC_OFFSET_ICR_LOW:
 			*data = lapic->icr_lo;
 			/* In x2APIC mode the ICR is a single 64-bit register. */
 			if (x2apic(vlapic))
 				*data |= (uint64_t)lapic->icr_hi << 32;
 			break;
 		case APIC_OFFSET_ICR_HI:
 			*data = lapic->icr_hi;
 			break;
 		case APIC_OFFSET_CMCI_LVT:
 		case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
 			*data = vlapic_get_lvt(vlapic, offset);
 #ifdef INVARIANTS
 			reg = vlapic_get_lvtptr(vlapic, offset);
 			KASSERT(*data == *reg, ("inconsistent lvt value at "
 			    "offset %#lx: %#lx/%#x", offset, *data, *reg));
 #endif
 			break;
 		case APIC_OFFSET_TIMER_ICR:
 			*data = lapic->icr_timer;
 			break;
 		case APIC_OFFSET_TIMER_CCR:
 			*data = vlapic_get_ccr(vlapic);
 			break;
 		case APIC_OFFSET_TIMER_DCR:
 			*data = lapic->dcr_timer;
 			break;
 		case APIC_OFFSET_SELF_IPI:
 			/*
 			 * XXX generate a GP fault if vlapic is in x2apic mode
 			 */
 			*data = 0;
 			break;
 		case APIC_OFFSET_RRR:
 		default:
 			*data = 0;
 			break;
 	}
 done:
 	/* 'offset' is 64 bits wide, so it is traced with %#lx. */
 	VLAPIC_CTR2(vlapic, "vlapic read offset %#lx, data %#lx",
 	    offset, *data);
 	return (0);
 }
 
 /*
  * Emulate a write of 'data' to the local APIC register at 'offset'.
  *
  * 'mmio_access' is non-zero for a memory-mapped access and zero for an
  * access through the x2APIC MSR interface; a write that does not match the
  * current APIC mode is ignored, as are writes to out-of-range, read-only
  * and unimplemented registers.
  *
  * Returns the result of vlapic_icrlo_write_handler() for a write to the
  * low half of the ICR (which may also set '*retu') and 0 otherwise.
  */
 int
 vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
     uint64_t data, bool *retu)
 {
 	struct LAPIC	*lapic = vlapic->apic_page;
 	uint32_t	*regptr;
 	int		retval;
 
 	KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE,
 	    ("vlapic_write: invalid offset %#lx", offset));
 
 	VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx",
 	    offset, data);
 
 	if (offset > sizeof(*lapic))
 		return (0);
 
 	/* Ignore MMIO accesses in x2APIC mode */
 	if (x2apic(vlapic) && mmio_access) {
 		VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx "
 		    "in x2APIC mode", data, offset);
 		return (0);
 	}
 
 	/*
 	 * XXX Generate GP fault for MSR accesses in xAPIC mode
 	 */
 	if (!x2apic(vlapic) && !mmio_access) {
 		VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx "
 		    "in xAPIC mode", data, offset);
 		return (0);
 	}
 
 	/*
 	 * For most registers the value is first stored in the APIC page and
 	 * the matching *_write_handler() then acts on the stored value.
 	 */
 	retval = 0;
 	switch(offset)
 	{
 		case APIC_OFFSET_ID:
 			lapic->id = data;
 			vlapic_id_write_handler(vlapic);
 			break;
 		case APIC_OFFSET_TPR:
 			vlapic_set_tpr(vlapic, data & 0xff);
 			break;
 		case APIC_OFFSET_EOI:
 			/* The value written is ignored. */
 			vlapic_process_eoi(vlapic);
 			break;
 		case APIC_OFFSET_LDR:
 			lapic->ldr = data;
 			vlapic_ldr_write_handler(vlapic);
 			break;
 		case APIC_OFFSET_DFR:
 			lapic->dfr = data;
 			vlapic_dfr_write_handler(vlapic);
 			break;
 		case APIC_OFFSET_SVR:
 			lapic->svr = data;
 			vlapic_svr_write_handler(vlapic);
 			break;
 		case APIC_OFFSET_ICR_LOW: 
 			lapic->icr_lo = data;
 			/* In x2APIC mode the ICR is a single 64-bit register. */
 			if (x2apic(vlapic))
 				lapic->icr_hi = data >> 32;
 			retval = vlapic_icrlo_write_handler(vlapic, retu);
 			break;
 		case APIC_OFFSET_ICR_HI:
 			lapic->icr_hi = data;
 			break;
 		case APIC_OFFSET_CMCI_LVT:
 		case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
 			regptr = vlapic_get_lvtptr(vlapic, offset);
 			*regptr = data;
 			vlapic_lvt_write_handler(vlapic, offset);
 			break;
 		case APIC_OFFSET_TIMER_ICR:
 			lapic->icr_timer = data;
 			vlapic_icrtmr_write_handler(vlapic);
 			break;
 
 		case APIC_OFFSET_TIMER_DCR:
 			lapic->dcr_timer = data;
 			vlapic_dcr_write_handler(vlapic);
 			break;
 
 		case APIC_OFFSET_ESR:
 			/* The value written is ignored. */
 			vlapic_esr_write_handler(vlapic);
 			break;
 
 		case APIC_OFFSET_SELF_IPI:
 			/* The register is acted on only in x2APIC mode. */
 			if (x2apic(vlapic))
 				vlapic_self_ipi_handler(vlapic, data);
 			break;
 
 		case APIC_OFFSET_VER:
 		case APIC_OFFSET_APR:
 		case APIC_OFFSET_PPR:
 		case APIC_OFFSET_RRR:
 		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
 		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
 		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
 		case APIC_OFFSET_TIMER_CCR:
 		default:
 			// Read only.
 			break;
 	}
 
 	return (retval);
 }
 
 /*
  * Bring the virtual local APIC to its reset state: zero the APIC page, set
  * the ID, version, DFR and SVR defaults, mask every LVT entry, mark all
  * vectors edge-triggered, reset the timer divide configuration and set the
  * boot state.
  */
 static void
 vlapic_reset(struct vlapic *vlapic)
 {
 	struct LAPIC *page = vlapic->apic_page;
 
 	bzero(page, sizeof(*page));
 
 	page->id = vlapic_get_id(vlapic);
 	page->version = VLAPIC_VERSION;
 	page->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);
 	page->dfr = 0xffffffff;
 	page->svr = APIC_SVR_VECTOR;
 	vlapic_mask_lvts(vlapic);
 	vlapic_reset_tmr(vlapic);
 
 	page->dcr_timer = 0;
 	vlapic_dcr_write_handler(vlapic);
 
 	/* vcpu 0 is the BSP and runs at once; APs start in the INIT state. */
 	vlapic->boot_state = (vlapic->vcpuid == 0) ? BS_RUNNING : BS_INIT;
 
 	vlapic->svr_last = page->svr;
 }
 
 void
 vlapic_init(struct vlapic *vlapic)
 {
 	KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
 	KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU,
 	    ("vlapic_init: vcpuid is not initialized"));
 	KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
 	    "initialized"));
 
 	/*
 	 * If the vlapic is configured in x2apic mode then it will be
 	 * accessed in the critical section via the MSR emulation code.
 	 *
 	 * Therefore the timer mutex must be a spinlock because blockable
 	 * mutexes cannot be acquired in a critical section.
 	 */
 	mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN);
 	callout_init(&vlapic->callout, 1);
 
 	vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
 
 	if (vlapic->vcpuid == 0)
 		vlapic->msr_apicbase |= APICBASE_BSP;
 
 	vlapic_reset(vlapic);
 }
 
 /*
  * Tear down a vlapic: drain the timer callout.
  */
 void
 vlapic_cleanup(struct vlapic *vlapic)
 {
 	callout_drain(&vlapic->callout);
 }
 
 /*
  * Return the current value of the emulated APIC base MSR.
  */
 uint64_t
 vlapic_get_apicbase(struct vlapic *vlapic)
 {
 	uint64_t apicbase = vlapic->msr_apicbase;
 
 	return (apicbase);
 }
 
 /*
  * Emulate a write of 'new' to the APIC base MSR.  Changing the value is
  * not supported: returns 0 if 'new' equals the current value and -1
  * otherwise, leaving the MSR untouched in both cases.
  */
 int
 vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new)
 {
 
 	if (vlapic->msr_apicbase == new)
 		return (0);
 
 	VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx "
 	    "not supported", vlapic->msr_apicbase, new);
 	return (-1);
 }
 
 /*
  * Switch the vlapic of vcpu 'vcpuid' to the x2APIC state 'state': update
  * the x2APIC bit of the APIC base MSR, reinitialize the mode-dependent
  * registers (ID, LDR, DFR) and, for X2APIC_ENABLED, invoke the
  * 'enable_x2apic_mode' hook if one is installed.
  */
 void
 vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
 {
 	struct vlapic *vlapic = vm_lapic(vm, vcpuid);
 	struct LAPIC *page;
 
 	if (state != X2APIC_DISABLED)
 		vlapic->msr_apicbase |= APICBASE_X2APIC;
 	else
 		vlapic->msr_apicbase &= ~APICBASE_X2APIC;
 
 	/*
 	 * Reset the local APIC registers whose values are mode-dependent.
 	 *
 	 * XXX this works because the APIC mode can be changed only at vcpu
 	 * initialization time.
 	 */
 	page = vlapic->apic_page;
 	page->id = vlapic_get_id(vlapic);
 	if (!x2apic(vlapic)) {
 		page->ldr = 0;
 		page->dfr = 0xffffffff;
 	} else {
 		page->ldr = x2apic_ldr(vlapic);
 		page->dfr = 0;
 	}
 
 	if (state == X2APIC_ENABLED && vlapic->ops.enable_x2apic_mode)
 		(*vlapic->ops.enable_x2apic_mode)(vlapic);
 }
 
 /*
  * Deliver an interrupt that originates from the ioapic or from an MSI to
  * every vcpu selected by ('dest', 'phys', 'delmode').  Only the fixed,
  * lowest-priority and ExtINT delivery modes are accepted; anything else is
  * traced and dropped.
  */
 void
 vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
     int delmode, int vec)
 {
 	cpuset_t targets;
 	int cpu;
 
 	switch (delmode) {
 	case IOART_DELFIXED:
 	case IOART_DELLOPRI:
 	case IOART_DELEXINT:
 		break;
 	default:
 		VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode);
 		return;
 	}
 
 	/*
 	 * We don't provide any virtual interrupt redirection hardware so
 	 * all interrupts originating from the ioapic or MSI specify the
 	 * 'dest' in the legacy xAPIC format.
 	 */
 	vlapic_calcdest(vm, &targets, dest, phys, delmode == IOART_DELLOPRI,
 	    false);
 
 	/* CPU_FFS() is 1-based; 0 means every target has been served. */
 	for (;;) {
 		cpu = CPU_FFS(&targets);
 		if (cpu == 0)
 			break;
 		cpu--;
 		CPU_CLR(cpu, &targets);
 
 		if (delmode != IOART_DELEXINT)
 			lapic_set_intr(vm, cpu, vec, level);
 		else
 			vm_inject_extint(vm, cpu);
 	}
 }
 
 /*
  * Post an interrupt to the vcpu currently running on 'hostcpu'.
  *
  * When the backend installs a 'post_intr' hook it is used; this is how
  * features like Posted Interrupts (Intel) or the Doorbell MSR (AMD AVIC)
  * avoid a VM exit.  Without such a hook the fallback is to send the IPI
  * 'ipinum' to 'hostcpu'.
  */
 void
 vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum)
 {
 
 	if (!vlapic->ops.post_intr) {
 		ipi_cpu(hostcpu, ipinum);
 		return;
 	}
 
 	(*vlapic->ops.post_intr)(vlapic, hostcpu);
 }
 
 /*
  * Return true only if the APIC is enabled both in the APIC base MSR and
  * through the software-enable bit of the SVR.
  */
 bool
 vlapic_enabled(struct vlapic *vlapic)
 {
 	struct LAPIC *page = vlapic->apic_page;
 
 	return ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 &&
 	    (page->svr & APIC_SVR_ENABLE) != 0);
 }
 
 /*
  * Mark 'vector' as level-triggered ('level' true) or edge-triggered
  * ('level' false) in the Trigger Mode Register and pass the change on to
  * the 'set_tmr' hook when one is installed.
  */
 static void
 vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level)
 {
 	struct LAPIC *lapic;
 	uint32_t *tmrptr, mask;
 	int idx;
 
 	lapic = vlapic->apic_page;
 	tmrptr = &lapic->tmr0;
 	/* Consecutive TMR registers are four 32-bit words apart. */
 	idx = (vector / 32) * 4;
 	/*
 	 * Build the mask from an unsigned constant: the bit number can be 31
 	 * and shifting a signed 1 that far is undefined behavior.
 	 */
 	mask = 1U << (vector % 32);
 	if (level)
 		tmrptr[idx] |= mask;
 	else
 		tmrptr[idx] &= ~mask;
 
 	if (vlapic->ops.set_tmr != NULL)
 		(*vlapic->ops.set_tmr)(vlapic, vector, level);
 }
 
 /*
  * Mark all 256 vectors as edge-triggered.
  */
 void
 vlapic_reset_tmr(struct vlapic *vlapic)
 {
 	int vec = 0;
 
 	VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered");
 
 	while (vec < 256) {
 		vlapic_set_tmr(vlapic, vec, false);
 		vec++;
 	}
 }
 
 /*
  * Mark 'vector' as level-triggered on this vlapic if its vcpu is one of
  * the destinations selected by ('dest', 'phys', 'delmode').  Delivery modes
  * other than fixed and lowest-priority are traced and ignored.
  */
 void
 vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
     int delmode, int vector)
 {
 	cpuset_t targets;
 
 	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
 
 	/*
 	 * A level trigger is valid only for fixed and lowprio delivery modes.
 	 */
 	if (!(delmode == APIC_DELMODE_FIXED ||
 	    delmode == APIC_DELMODE_LOWPRIO)) {
 		VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for "
 		    "delivery-mode %d", delmode);
 		return;
 	}
 
 	vlapic_calcdest(vlapic->vm, &targets, dest, phys,
 	    delmode == APIC_DELMODE_LOWPRIO, false);
 
 	if (CPU_ISSET(vlapic->vcpuid, &targets)) {
 		VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
 		vlapic_set_tmr(vlapic, vector, true);
 	}
 }
Index: stable/10/sys/amd64/vmm/vmm.c
===================================================================
--- stable/10/sys/amd64/vmm/vmm.c	(revision 276348)
+++ stable/10/sys/amd64/vmm/vmm.c	(revision 276349)
@@ -1,2311 +1,2324 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 
 #include <machine/cpu.h>
 #include <machine/vm.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <x86/psl.h>
 #include <x86/apicreg.h>
 #include <machine/vmparam.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
 
 #include "vmm_ioport.h"
 #include "vmm_ktr.h"
 #include "vmm_host.h"
 #include "vmm_mem.h"
 #include "vmm_util.h"
 #include "vatpic.h"
 #include "vatpit.h"
 #include "vhpet.h"
 #include "vioapic.h"
 #include "vlapic.h"
-#include "vmm_msr.h"
 #include "vmm_ipi.h"
 #include "vmm_stat.h"
 #include "vmm_lapic.h"
 
 #include "io/ppt.h"
 #include "io/iommu.h"
 
 struct vlapic;
 
 /*
  * State kept for each guest vcpu; instances live in the 'vcpu' array of
  * 'struct vm'.
  *
  * Initialization:
  * (a) allocated when vcpu is created
  * (i) initialized when vcpu is created and when it is reinitialized
  * (o) initialized the first time the vcpu is created
  * (x) initialized before use
  */
 struct vcpu {
 	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
 	enum vcpu_state	state;		/* (o) vcpu state */
 	int		hostcpu;	/* (o) vcpu's host cpu */
 	struct vlapic	*vlapic;	/* (i) APIC device model */
 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
 	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
 	int		nmi_pending;	/* (i) NMI pending */
 	int		extint_pending;	/* (i) INTR pending */
 	struct vm_exception exception;	/* (x) exception collateral */
 	int	exception_pending;	/* (i) exception pending */
 	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
 	void		*stats;		/* (a,i) statistics */
-	uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
 };
 
 /*
  * Helpers for the per-vcpu lock. The lock is created as a spin mutex
  * (MTX_SPIN), so it is acquired and released with the *_spin variants.
  */
 #define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
 #define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
 
 /*
  * A contiguous range of guest physical memory tracked in 'struct vm'.
  */
 struct mem_seg {
 	vm_paddr_t	gpa;		/* guest physical base address */
 	size_t		len;		/* length of the range in bytes */
 	boolean_t	wired;		/* set by vm_gpa_wire(), cleared by vm_gpa_unwire() */
 	vm_object_t	object;		/* object returned by vmm_mem_alloc() */
 };
 /* Capacity of the 'mem_segs' array in 'struct vm'. */
 #define	VM_MAX_MEMORY_SEGMENTS	2
 
 /*
  * State kept for each virtual machine.
  *
  * Initialization:
  * (o) initialized the first time the VM is created
  * (i) initialized when VM is created and when it is reinitialized
  * (x) initialized before use
  */
 struct vm {
 	void		*cookie;		/* (i) cpu-specific data */
 	void		*iommu;			/* (x) iommu-specific data */
 	struct vhpet	*vhpet;			/* (i) virtual HPET */
 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
 	struct vatpic	*vatpic;		/* (i) virtual atpic */
 	struct vatpit	*vatpit;		/* (i) virtual atpit */
 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
 	int		suspend;		/* (i) stop VM execution */
 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
 	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
 	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
 	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
 	/* non-NULL while a rendezvous is outstanding */
 	vm_rendezvous_func_t rendezvous_func;
 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
 	int		num_mem_segs;		/* (o) guest memory segments */
 	/* the first 'num_mem_segs' entries are in use */
 	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
 	struct vmspace	*vmspace;		/* (o) guest's address space */
 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
 };
 
 static int vmm_initialized;
 
 /*
  * Backend operations table, selected in vmm_init(). Each wrapper below
  * dispatches through 'ops'; while 'ops' is still NULL the wrapper evaluates
  * to the fallback value on the right of the ':' (0, NULL or ENXIO) instead.
  */
 static struct vmm_ops *ops;
 #define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
 #define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
 #define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
 
 #define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
 #define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
 	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
 #define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
 #define	VMSPACE_ALLOC(min, max) \
 	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
 #define	VMSPACE_FREE(vmspace) \
 	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
 #define	VMGETREG(vmi, vcpu, num, retval)		\
 	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
 #define	VMSETREG(vmi, vcpu, num, val)		\
 	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
 #define	VMGETDESC(vmi, vcpu, num, desc)		\
 	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
 #define	VMSETDESC(vmi, vcpu, num, desc)		\
 	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
 #define	VMGETCAP(vmi, vcpu, num, retval)	\
 	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
 #define	VMSETCAP(vmi, vcpu, num, val)		\
 	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
 #define	VLAPIC_INIT(vmi, vcpu)			\
 	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
 #define	VLAPIC_CLEANUP(vmi, vlapic)		\
 	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
 
 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
 #define	fpu_stop_emulating()	clts()
 
 static MALLOC_DEFINE(M_VM, "vm", "vm");
-CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
 
 /* statistics */
 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 
 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
 
 /*
  * Halt the guest if all vcpus are executing a HLT instruction with
  * interrupts disabled.
  */
 static int halt_detection_enabled = 1;
 TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
     &halt_detection_enabled, 0,
     "Halt VM if all vcpus execute HLT with interrupts disabled");
 
 /*
  * Set in vmm_init() from vmm_ipi_alloc(); falls back to IPI_AST when that
  * returns 0. Exported read-only as hw.vmm.ipinum.
  */
 static int vmm_ipinum;
 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
     "IPI vector used for vcpu notifications");
 
 /*
  * Tear down vcpu 'i'. The vlapic is always released; the statistics buffer
  * and the fpu save area are freed only when 'destroy' is true.
  */
 static void
 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 {
 	struct vcpu *vcpu;
 
 	vcpu = &vm->vcpu[i];
 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
 
 	if (!destroy)
 		return;
 
 	vmm_stat_free(vcpu->stats);
 	fpu_save_area_free(vcpu->guestfpu);
 }
 
 /*
  * Initialize vcpu 'vcpu_id' of 'vm'.
  *
  * When 'create' is true the one-time resources (lock, fpu save area and
  * statistics buffer) are set up first. The remaining state is reset on
  * every call, so the function also serves to reinitialize a vcpu.
  */
 static void
 vcpu_init(struct vm *vm, int vcpu_id, bool create)
 {
 	struct vcpu *vcpu;
 
 	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
 	  
 	vcpu = &vm->vcpu[vcpu_id];
 
 	if (create) {
 		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
 		    "initialized", vcpu_id));
 		vcpu_lock_init(vcpu);
 		vcpu->state = VCPU_IDLE;
 		vcpu->hostcpu = NOCPU;
 		vcpu->guestfpu = fpu_save_area_alloc();
 		vcpu->stats = vmm_stat_alloc();
 	}
 
 	/* State below is reset on both creation and reinitialization. */
 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
 	vcpu->exitintinfo = 0;
 	vcpu->nmi_pending = 0;
 	vcpu->extint_pending = 0;
 	vcpu->exception_pending = 0;
 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
 	fpu_save_area_reset(vcpu->guestfpu);
 	vmm_stat_init(vcpu->stats);
-	guest_msrs_init(vm, vcpu_id);
 }
 
 /*
  * Return a pointer to the exit information of vcpu 'cpuid'.
  * Panics if 'cpuid' is outside [0, VM_MAXCPU).
  */
 struct vm_exit *
 vm_exitinfo(struct vm *vm, int cpuid)
 {
 
 	if (cpuid < 0 || cpuid >= VM_MAXCPU)
 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
 
 	return (&vm->vcpu[cpuid].exitinfo);
 }
 
 /*
  * Resume hook installed in 'vmm_resume_p' by vmm_init(); forwards to the
  * backend's resume operation and discards its result.
  */
 static void
 vmm_resume(void)
 {
 	VMM_RESUME();
 }
 
 /*
  * Module-wide initialization, called from vmm_handler() on MOD_LOAD.
  *
  * Returns the error from vmm_mem_init() if it fails, ENXIO when the cpu is
  * neither Intel nor AMD, and otherwise the result of the backend's init
  * operation.
  */
 static int
 vmm_init(void)
 {
 	int error;
 
 	vmm_host_state_init();
 
 	/* Fall back to IPI_AST when vmm_ipi_alloc() returns 0. */
 	vmm_ipinum = vmm_ipi_alloc();
 	if (vmm_ipinum == 0)
 		vmm_ipinum = IPI_AST;
 
 	error = vmm_mem_init();
 	if (error)
 		return (error);
 	
 	if (vmm_is_intel())
 		ops = &vmm_ops_intel;
 	else if (vmm_is_amd())
 		ops = &vmm_ops_amd;
 	else
 		return (ENXIO);
 
-	vmm_msr_init();
 	vmm_resume_p = vmm_resume;
 
 	return (VMM_INIT(vmm_ipinum));
 }
 
 /*
  * Module event handler.
  *
  * MOD_LOAD:   initializes the device interface, the iommu (only when
  *             passthru devices are available) and the vmm core;
  *             'vmm_initialized' is set only when vmm_init() succeeds.
  * MOD_UNLOAD: proceeds only if vmmdev_cleanup() succeeds, then releases the
  *             resume hook, the iommu, the IPI vector and the backend.
  * Any other event is accepted and ignored.
  *
  * Returns 0 on success or the error of the failing step.
  */
 static int
 vmm_handler(module_t mod, int what, void *arg)
 {
 	int error;
 
 	switch (what) {
 	case MOD_LOAD:
 		vmmdev_init();
 		if (ppt_avail_devices() > 0)
 			iommu_init();
 		error = vmm_init();
 		if (error == 0)
 			vmm_initialized = 1;
 		break;
 	case MOD_UNLOAD:
 		error = vmmdev_cleanup();
 		if (error == 0) {
 			vmm_resume_p = NULL;
 			iommu_cleanup();
 			/* IPI_AST is the fallback vector and was never allocated. */
 			if (vmm_ipinum != IPI_AST)
 				vmm_ipi_free(vmm_ipinum);
 			error = VMM_CLEANUP();
 			/*
 			 * Something bad happened - prevent new
 			 * VMs from being created
 			 */
 			if (error)
 				vmm_initialized = 0;
 		}
 		break;
 	default:
 		error = 0;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t vmm_kmod = {
 	"vmm",
 	vmm_handler,
 	NULL
 };
 
 /*
  * vmm initialization has the following dependencies:
  *
  * - iommu initialization must happen after the pci passthru driver has had
  *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
  *
  * - VT-x initialization requires smp_rendezvous() and therefore must happen
  *   after SMP is fully functional (after SI_SUB_SMP).
  */
 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
 MODULE_VERSION(vmm, 1);
 
 /*
  * Initialize the per-VM state that is reset on both creation and
  * reinitialization: the backend cookie, the virtual devices, the cpu sets
  * and every vcpu. 'create' is passed through to vcpu_init().
  */
 static void
 vm_init(struct vm *vm, bool create)
 {
 	int i;
 
 	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
 	vm->iommu = NULL;
 	vm->vioapic = vioapic_init(vm);
 	vm->vhpet = vhpet_init(vm);
 	vm->vatpic = vatpic_init(vm);
 	vm->vatpit = vatpit_init(vm);
 
 	CPU_ZERO(&vm->active_cpus);
 
 	vm->suspend = 0;
 	CPU_ZERO(&vm->suspended_cpus);
 
 	for (i = 0; i < VM_MAXCPU; i++)
 		vcpu_init(vm, i, create);
 }
 
 /*
  * Create a virtual machine called 'name' and return it through 'retvm'.
  *
  * Returns 0 on success, ENXIO if the module did not initialize, EINVAL if
  * 'name' is NULL or does not fit in VM_MAX_NAMELEN (including the
  * terminator), and ENOMEM if the guest address space cannot be allocated.
  */
 int
 vm_create(const char *name, struct vm **retvm)
 {
 	struct vm *vm;
 	struct vmspace *vmspace;
 
 	/*
 	 * If vmm.ko could not be successfully initialized then don't attempt
 	 * to create the virtual machine.
 	 */
 	if (!vmm_initialized)
 		return (ENXIO);
 
 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
 		return (EINVAL);
 
 	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
 	if (vmspace == NULL)
 		return (ENOMEM);
 
 	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 	/* The length check above guarantees 'name' fits in vm->name. */
 	strcpy(vm->name, name);
 	vm->num_mem_segs = 0;
 	vm->vmspace = vmspace;
 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
 
 	vm_init(vm, true);
 
 	*retvm = vm;
 	return (0);
 }
 
 /*
  * Release the guest memory backing 'seg' (if it has an object) and zero the
  * segment descriptor.
  */
 static void
 vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
 {
 
 	if (seg->object != NULL)
 		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
 
 	bzero(seg, sizeof(*seg));
 }
 
 /*
  * Undo vm_init(): unassign passthru devices, destroy the iommu domain if
  * there is one, and clean up the virtual devices, the vcpus and the backend
  * state. When 'destroy' is true the memory segments and the guest address
  * space are released as well.
  */
 static void
 vm_cleanup(struct vm *vm, bool destroy)
 {
 	int i;
 
 	ppt_unassign_all(vm);
 
 	if (vm->iommu != NULL)
 		iommu_destroy_domain(vm->iommu);
 
 	vatpit_cleanup(vm->vatpit);
 	vhpet_cleanup(vm->vhpet);
 	vatpic_cleanup(vm->vatpic);
 	vioapic_cleanup(vm->vioapic);
 
 	for (i = 0; i < VM_MAXCPU; i++)
 		vcpu_cleanup(vm, i, destroy);
 
 	VMCLEANUP(vm->cookie);
 
 	if (destroy) {
 		for (i = 0; i < vm->num_mem_segs; i++)
 			vm_free_mem_seg(vm, &vm->mem_segs[i]);
 
 		vm->num_mem_segs = 0;
 
 		VMSPACE_FREE(vm->vmspace);
 		vm->vmspace = NULL;
 	}
 }
 
 /*
  * Fully tear down 'vm' and free the structure itself.
  */
 void
 vm_destroy(struct vm *vm)
 {
 	vm_cleanup(vm, true);
 	free(vm, M_VM);
 }
 
 /*
  * Reset 'vm' to its freshly created state.
  *
  * Returns 0 on success, or EBUSY if the set of suspended vcpus does not
  * match the set of active vcpus.
  */
 int
 vm_reinit(struct vm *vm)
 {
 
 	/* A virtual machine can be reset only if all vcpus are suspended. */
 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0)
 		return (EBUSY);
 
 	vm_cleanup(vm, false);
 	vm_init(vm, false);
 
 	return (0);
 }
 
 /*
  * Return the name given to 'vm' at creation; the string is stored inside
  * the vm structure.
  */
 const char *
 vm_name(struct vm *vm)
 {
 	return (vm->name);
 }
 
 /*
  * Map 'len' bytes of host physical address 'hpa' at guest physical address
  * 'gpa'. Returns 0 on success or ENOMEM if the mapping cannot be created.
  */
 int
 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 {
 
 	if (vmm_mmio_alloc(vm->vmspace, gpa, len, hpa) == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /*
  * Remove the mmio mapping of 'len' bytes at guest physical address 'gpa'.
  * Always returns 0.
  */
 int
 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 {
 
 	vmm_mmio_free(vm->vmspace, gpa, len);
 	return (0);
 }
 
 /*
  * Return TRUE if 'gpa' lies inside one of the VM's memory segments or is
  * pci passthru mmio, FALSE otherwise.
  */
 boolean_t
 vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
 {
 	struct mem_seg *seg;
 	int idx;
 
 	for (idx = 0; idx < vm->num_mem_segs; idx++) {
 		seg = &vm->mem_segs[idx];
 		/* 'gpa' is regular memory */
 		if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
 			return (TRUE);
 	}
 
 	/* Otherwise it is only "allocated" if it is pci passthru mmio. */
 	return (ppt_is_mmio(vm, gpa) ? TRUE : FALSE);
 }
 
 /*
  * Allocate guest memory for the page-aligned range ['gpa', 'gpa' + 'len').
  *
  * Returns 0 on success or when the whole range is already allocated,
  * EINVAL for an unaligned/empty range or a range that is only partially
  * allocated, E2BIG when no memory segment slot is left, and ENOMEM when the
  * backing memory cannot be allocated.
  */
 int
 vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
 {
 	struct mem_seg *seg;
 	vm_object_t object;
 	vm_paddr_t g;
 	int allocated, available;
 
 	if ((gpa & PAGE_MASK) != 0 || (len & PAGE_MASK) != 0 || len == 0)
 		return (EINVAL);
 
 	/* Classify every page of the requested range. */
 	allocated = 0;
 	available = 0;
 	for (g = gpa; g < gpa + len; g += PAGE_SIZE) {
 		if (vm_mem_allocated(vm, g))
 			allocated++;
 		else
 			available++;
 	}
 
 	if (allocated != 0) {
 		/*
 		 * A mix of allocated and available pages is an error; a
 		 * fully allocated range leaves nothing more to do.
 		 */
 		return (available != 0 ? EINVAL : 0);
 	}
 
 	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
 		return (E2BIG);
 
 	object = vmm_mem_alloc(vm->vmspace, gpa, len);
 	if (object == NULL)
 		return (ENOMEM);
 
 	seg = &vm->mem_segs[vm->num_mem_segs];
 	seg->gpa = gpa;
 	seg->len = len;
 	seg->object = object;
 	seg->wired = FALSE;
 
 	vm->num_mem_segs++;
 
 	return (0);
 }
 
 static vm_paddr_t
 vm_maxmem(struct vm *vm)
 {
 	int i;
 	vm_paddr_t gpa, maxmem;
 
 	maxmem = 0;
 	for (i = 0; i < vm->num_mem_segs; i++) {
 		gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
 		if (gpa > maxmem)
 			maxmem = gpa;
 	}
 	return (maxmem);
 }
 
 /*
  * Unwire every memory segment that is currently marked wired.
  */
 static void
 vm_gpa_unwire(struct vm *vm)
 {
 	struct mem_seg *seg;
 	int idx, rv;
 
 	for (idx = 0; idx < vm->num_mem_segs; idx++) {
 		seg = &vm->mem_segs[idx];
 		if (seg->wired) {
 			rv = vm_map_unwire(&vm->vmspace->vm_map,
 			    seg->gpa, seg->gpa + seg->len,
 			    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 			KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
 			    "%#lx/%ld could not be unwired: %d",
 			    vm_name(vm), seg->gpa, seg->len, rv));
 
 			seg->wired = FALSE;
 		}
 	}
 }
 
 /*
  * Wire every memory segment that is not wired yet.
  *
  * Returns 0 on success. If any segment cannot be wired, all segments are
  * unwired again and EAGAIN is returned.
  */
 static int
 vm_gpa_wire(struct vm *vm)
 {
 	struct mem_seg *seg;
 	int idx, rv;
 
 	for (idx = 0; idx < vm->num_mem_segs; idx++) {
 		seg = &vm->mem_segs[idx];
 		if (seg->wired)
 			continue;
 
 		/* XXX rlimits? */
 		rv = vm_map_wire(&vm->vmspace->vm_map,
 		    seg->gpa, seg->gpa + seg->len,
 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 		if (rv != KERN_SUCCESS) {
 			/* Undo the wiring before returning an error. */
 			vm_gpa_unwire(vm);
 			return (EAGAIN);
 		}
 
 		seg->wired = TRUE;
 	}
 
 	return (0);
 }
 
 /*
  * Walk every page of every memory segment and move its iommu mapping
  * between the host domain and the VM's domain.
  *
  * map == TRUE:  add gpa->hpa to the VM's domain and remove hpa from the
  *               host domain.
  * map == FALSE: remove gpa from the VM's domain and restore the identity
  *               mapping of hpa in the host domain.
  *
  * All segments must already be wired.
  */
 static void
 vm_iommu_modify(struct vm *vm, boolean_t map)
 {
 	int i, sz;
 	vm_paddr_t gpa, hpa;
 	struct mem_seg *seg;
 	void *vp, *cookie, *host_domain;
 
 	sz = PAGE_SIZE;
 	host_domain = iommu_host_domain();
 
 	for (i = 0; i < vm->num_mem_segs; i++) {
 		seg = &vm->mem_segs[i];
 		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
 		    vm_name(vm), seg->gpa, seg->len));
 
 		gpa = seg->gpa;
 		while (gpa < seg->gpa + seg->len) {
 			/* The hold is only needed to learn the host address. */
 			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
 					 &cookie);
 			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
 			    vm_name(vm), gpa));
 
 			vm_gpa_release(cookie);
 
 			hpa = DMAP_TO_PHYS((uintptr_t)vp);
 			if (map) {
 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
 				iommu_remove_mapping(host_domain, hpa, sz);
 			} else {
 				iommu_remove_mapping(vm->iommu, gpa, sz);
 				iommu_create_mapping(host_domain, hpa, hpa, sz);
 			}
 
 			gpa += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * Invalidate the cached translations associated with the domain
 	 * from which pages were removed.
 	 */
 	if (map)
 		iommu_invalidate_tlb(host_domain);
 	else
 		iommu_invalidate_tlb(vm->iommu);
 }
 
 /* Convenience wrappers for the two directions of vm_iommu_modify(). */
 #define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
 #define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
 
 /*
  * Detach the passthru device bus/slot/func from 'vm'. Once the last device
  * is gone the guest memory is removed from the iommu and unwired.
  *
  * Returns 0 on success or the error from ppt_unassign_device().
  */
 int
 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 	int error;
 
 	error = ppt_unassign_device(vm, bus, slot, func);
 	if (error == 0 && ppt_assigned_devices(vm) == 0) {
 		vm_iommu_unmap(vm);
 		vm_gpa_unwire(vm);
 	}
 
 	return (error);
 }
 
 /*
  * Attach the passthru device bus/slot/func to 'vm'.
  *
  * Returns 0 on success, the error from vm_gpa_wire() if guest memory
  * cannot be wired, or the error from ppt_assign_device().
  */
 int
 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 	int error;
 	vm_paddr_t maxaddr;
 
 	/*
 	 * Virtual machines with pci passthru devices get special treatment:
 	 * - the guest physical memory is wired
 	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
 	 *
 	 * We need to do this before the first pci passthru device is attached.
 	 */
 	if (ppt_assigned_devices(vm) == 0) {
 		KASSERT(vm->iommu == NULL,
 		    ("vm_assign_pptdev: iommu must be NULL"));
 		maxaddr = vm_maxmem(vm);
 		vm->iommu = iommu_create_domain(maxaddr);
 
 		error = vm_gpa_wire(vm);
 		if (error) {
 			/*
 			 * Release the domain created above; otherwise it is
 			 * leaked and a retry would trip the assertion that
 			 * 'vm->iommu' is NULL.
 			 */
 			iommu_destroy_domain(vm->iommu);
 			vm->iommu = NULL;
 			return (error);
 		}
 
 		vm_iommu_map(vm);
 	}
 
 	error = ppt_assign_device(vm, bus, slot, func);
 	return (error);
 }
 
 /*
  * Hold the guest page containing 'gpa' and return a host (direct map)
  * pointer to 'gpa'. The range ['gpa', 'gpa' + 'len') must not cross a page
  * boundary, otherwise the function panics.
  *
  * On success '*cookie' receives the held page, to be passed to
  * vm_gpa_release(). On failure both the return value and '*cookie' are NULL.
  */
 void *
 vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
 	    void **cookie)
 {
 	vm_page_t m;
 	int pageoff;
 
 	pageoff = gpa & PAGE_MASK;
 	if (len > PAGE_SIZE - pageoff)
 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
 
 	if (vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
 	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1) != 1) {
 		*cookie = NULL;
 		return (NULL);
 	}
 
 	*cookie = m;
 	return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
 }
 
 /*
  * Drop the hold on a page obtained through vm_gpa_hold(); 'cookie' is the
  * value that vm_gpa_hold() stored in '*cookie'.
  */
 void
 vm_gpa_release(void *cookie)
 {
 	vm_page_t m = cookie;
 
 	vm_page_lock(m);
 	vm_page_unhold(m);
 	vm_page_unlock(m);
 }
 
 /*
  * Look up the memory segment whose base address equals 'gpabase' and copy
  * its gpa, len and wired attributes into 'seg'.
  *
  * Returns 0 if a segment was found and -1 otherwise.
  */
 int
 vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
 		  struct vm_memory_segment *seg)
 {
 	struct mem_seg *src;
 	int idx;
 
 	for (idx = 0; idx < vm->num_mem_segs; idx++) {
 		src = &vm->mem_segs[idx];
 		if (src->gpa != gpabase)
 			continue;
 
 		seg->gpa = src->gpa;
 		seg->len = src->len;
 		seg->wired = src->wired;
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Find the memory segment containing 'gpa' and return its backing object
  * through '*object' together with the offset of 'gpa' inside the segment
  * through '*offset'. A reference is taken on the returned object.
  *
  * Returns 0 on success or EINVAL if no segment with an object contains
  * 'gpa'.
  */
 int
 vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
 	      vm_offset_t *offset, struct vm_object **object)
 {
 	struct mem_seg *seg;
 	vm_object_t seg_obj;
 	int idx;
 
 	for (idx = 0; idx < vm->num_mem_segs; idx++) {
 		seg = &vm->mem_segs[idx];
 		seg_obj = seg->object;
 		if (seg_obj == NULL)
 			continue;
 
 		if (gpa < seg->gpa || gpa >= seg->gpa + seg->len)
 			continue;
 
 		*offset = gpa - seg->gpa;
 		*object = seg_obj;
 		vm_object_reference(seg_obj);
 		return (0);
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Read guest register 'reg' of vcpu 'vcpu' into '*retval'.
  *
  * Returns EINVAL for an out-of-range vcpu or a register id at or above
  * VM_REG_LAST, otherwise the result of the backend.
  */
 int
 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU || reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	return (VMGETREG(vm->cookie, vcpu, reg, retval));
 }
 
 /*
  * Write 'val' to guest register 'reg' of vcpu 'vcpu'.
  *
  * Returns EINVAL for an out-of-range vcpu or a register id at or above
  * VM_REG_LAST, otherwise the result of the backend.
  */
 int
 vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU || reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	return (VMSETREG(vm->cookie, vcpu, reg, val));
 }
 
 /*
  * Return TRUE if 'reg' names the guest IDTR or GDTR.
  */
 static boolean_t
 is_descriptor_table(int reg)
 {
 
 	if (reg == VM_REG_GUEST_IDTR || reg == VM_REG_GUEST_GDTR)
 		return (TRUE);
 
 	return (FALSE);
 }
 
 /*
  * Return TRUE if 'reg' names one of the guest segment registers
  * (ES, CS, SS, DS, FS, GS, TR or LDTR).
  */
 static boolean_t
 is_segment_register(int reg)
 {
 	boolean_t match;
 
 	switch (reg) {
 	case VM_REG_GUEST_ES:
 	case VM_REG_GUEST_CS:
 	case VM_REG_GUEST_SS:
 	case VM_REG_GUEST_DS:
 	case VM_REG_GUEST_FS:
 	case VM_REG_GUEST_GS:
 	case VM_REG_GUEST_TR:
 	case VM_REG_GUEST_LDTR:
 		match = TRUE;
 		break;
 	default:
 		match = FALSE;
 		break;
 	}
 
 	return (match);
 }
 
 /*
  * Read the descriptor of segment register or descriptor table 'reg' of
  * vcpu 'vcpu' into '*desc'.
  *
  * Returns EINVAL for an out-of-range vcpu or a register that is neither a
  * segment register nor a descriptor table, otherwise the backend's result.
  */
 int
 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 		struct seg_desc *desc)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (!(is_segment_register(reg) || is_descriptor_table(reg)))
 		return (EINVAL);
 
 	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
 }
 
 /*
  * Load '*desc' into segment register or descriptor table 'reg' of vcpu
  * 'vcpu'.
  *
  * Returns EINVAL for an out-of-range vcpu or a register that is neither a
  * segment register nor a descriptor table, otherwise the backend's result.
  */
 int
 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 		struct seg_desc *desc)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (!(is_segment_register(reg) || is_descriptor_table(reg)))
 		return (EINVAL);
 
 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
 }
 
 /*
  * Switch the FPU from the host's state to the guest state saved in
  * 'vcpu->guestfpu', leaving CR0.TS set afterwards.
  */
 static void
 restore_guest_fpustate(struct vcpu *vcpu)
 {
 
 	/* flush host state to the pcb */
 	fpuexit(curthread);
 
 	/* restore guest FPU state */
 	fpu_stop_emulating();
 	fpurestore(vcpu->guestfpu);
 
 	/* restore guest XCR0 if XSAVE is enabled in the host */
 	if (rcr4() & CR4_XSAVE)
 		load_xcr(0, vcpu->guest_xcr0);
 
 	/*
 	 * The FPU is now "dirty" with the guest's state so turn on emulation
 	 * to trap any access to the FPU by the host.
 	 */
 	fpu_start_emulating();
 }
 
 /*
  * Save the guest's FPU state (and XCR0 when XSAVE is enabled) into 'vcpu'
  * and put the host's XCR0 back. Panics if CR0.TS is not set on entry.
  */
 static void
 save_guest_fpustate(struct vcpu *vcpu)
 {
 
 	if ((rcr0() & CR0_TS) == 0)
 		panic("fpu emulation not enabled in host!");
 
 	/* save guest XCR0 and restore host XCR0 */
 	if (rcr4() & CR4_XSAVE) {
 		vcpu->guest_xcr0 = rxcr(0);
 		load_xcr(0, vmm_get_host_xcr0());
 	}
 
 	/* save guest FPU state */
 	fpu_stop_emulating();
 	fpusave(vcpu->guestfpu);
 	fpu_start_emulating();
 }
 
 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
 
 /*
  * Move 'vcpu' to 'newstate'. The vcpu lock must be held by the caller.
  *
  * With 'from_idle' true the call sleeps (dropping the lock while asleep)
  * until the vcpu is idle before attempting the transition; with 'from_idle'
  * false the vcpu must not be idle.
  *
  * Returns 0 on success or EBUSY if the transition is not allowed.
  */
 static int
 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
     bool from_idle)
 {
 	int error;
 
 	vcpu_assert_locked(vcpu);
 
 	/*
 	 * State transitions from the vmmdev_ioctl() must always begin from
 	 * the VCPU_IDLE state. This guarantees that there is only a single
 	 * ioctl() operating on a vcpu at any point.
 	 */
 	if (from_idle) {
 		while (vcpu->state != VCPU_IDLE)
 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
 	} else {
 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
 		    "vcpu idle state"));
 	}
 
 	/* 'hostcpu' is only meaningful while the vcpu is running. */
 	if (vcpu->state == VCPU_RUNNING) {
 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
 	} else {
 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
 		    "vcpu that is not running", vcpu->hostcpu));
 	}
 
 	/*
 	 * The following state transitions are allowed:
 	 * IDLE -> FROZEN -> IDLE
 	 * FROZEN -> RUNNING -> FROZEN
 	 * FROZEN -> SLEEPING -> FROZEN
 	 */
 	switch (vcpu->state) {
 	case VCPU_IDLE:
 	case VCPU_RUNNING:
 	case VCPU_SLEEPING:
 		error = (newstate != VCPU_FROZEN);
 		break;
 	case VCPU_FROZEN:
 		error = (newstate == VCPU_FROZEN);
 		break;
 	default:
 		error = 1;
 		break;
 	}
 
 	if (error)
 		return (EBUSY);
 
 	vcpu->state = newstate;
 	if (newstate == VCPU_RUNNING)
 		vcpu->hostcpu = curcpu;
 	else
 		vcpu->hostcpu = NOCPU;
 
 	/* Wake threads sleeping in the 'from_idle' loop above. */
 	if (newstate == VCPU_IDLE)
 		wakeup(&vcpu->state);
 
 	return (0);
 }
 
 static void
 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
 		panic("Error %d setting state to %d\n", error, newstate);
 }
 
 static void
 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
 		panic("Error %d setting state to %d", error, newstate);
 }
 
 /*
  * Publish 'func' as the VM's rendezvous function; NULL marks the
  * rendezvous as finished. The rendezvous mutex must be held.
  */
 static void
 vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
 {
 
 	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
 
 	/*
 	 * Update 'rendezvous_func' and execute a write memory barrier to
 	 * ensure that it is visible across all host cpus. This is not needed
 	 * for correctness but it does ensure that all the vcpus will notice
 	 * that the rendezvous is requested immediately.
 	 */
 	vm->rendezvous_func = func;
 	wmb();
 }
 
 /*
  * Trace helper for the rendezvous code: emits a per-vcpu trace record when
  * 'vcpuid' is non-negative and a per-VM record otherwise. Arguments are
  * parenthesized so that expression arguments expand safely.
  */
 #define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
 	do {								\
 		if ((vcpuid) >= 0)					\
 			VCPU_CTR0((vm), (vcpuid), fmt);			\
 		else							\
 			VM_CTR0((vm), fmt);				\
 	} while (0)
 
 /*
  * Take part in, or wait for, an outstanding rendezvous.
  *
  * 'vcpuid' is either a valid vcpu index or -1. A caller passing -1 never
  * runs the rendezvous function itself; it only waits for completion.
  * Returns once 'rendezvous_func' has been cleared.
  */
 static void
 vm_handle_rendezvous(struct vm *vm, int vcpuid)
 {
 
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
 	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
 
 	mtx_lock(&vm->rendezvous_mtx);
 	while (vm->rendezvous_func != NULL) {
 		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
 		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
 
 		/* Run the function once if this vcpu was asked to and has not yet. */
 		if (vcpuid != -1 &&
 		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
 		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
 			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
 			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
 			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
 		}
 		/* The last participant clears the function and wakes the waiters. */
 		if (CPU_CMP(&vm->rendezvous_req_cpus,
 		    &vm->rendezvous_done_cpus) == 0) {
 			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
 			vm_set_rendezvous_func(vm, NULL);
 			wakeup(&vm->rendezvous_func);
 			break;
 		}
 		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
 		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
 		    "vmrndv", 0);
 	}
 	mtx_unlock(&vm->rendezvous_mtx);
 }
 
 /*
  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
  *
  * 'intr_disabled' tells whether the guest executed HLT with interrupts
  * disabled; in that case pending interrupts do not end the sleep and the
  * vcpu takes part in halt detection. 'retu' is not used by this function.
  *
  * Always returns 0.
  */
 static int
 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
 {
 	struct vcpu *vcpu;
 	const char *wmesg;
-	int t, vcpu_halted, vm_halted;
+	int error, t, vcpu_halted, vm_halted;
 
 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
 
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu_halted = 0;
 	vm_halted = 0;
 
+	/*
+	 * The typical way to halt a cpu is to execute: "sti; hlt"
+	 *
+	 * STI sets RFLAGS.IF to enable interrupts. However, the processor
+	 * remains in an "interrupt shadow" for an additional instruction
+	 * following the STI. This guarantees that "sti; hlt" sequence is
+	 * atomic and a pending interrupt will be recognized after the HLT.
+	 *
+	 * After the HLT emulation is done the vcpu is no longer in an
+	 * interrupt shadow and a pending interrupt can be injected on
+	 * the next entry into the guest.
+	 */
+	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
+	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
+	    __func__, error));
+
 	vcpu_lock(vcpu);
 	while (1) {
 		/*
 		 * Do a final check for pending NMI or interrupts before
 		 * really putting this thread to sleep. Also check for
 		 * software events that would cause this vcpu to wakeup.
 		 *
 		 * These interrupts/events could have happened after the
 		 * vcpu returned from VMRUN() and before it acquired the
 		 * vcpu lock above.
 		 */
 		if (vm->rendezvous_func != NULL || vm->suspend)
 			break;
 		if (vm_nmi_pending(vm, vcpuid))
 			break;
 		if (!intr_disabled) {
 			if (vm_extint_pending(vm, vcpuid) ||
 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
 				break;
 			}
 		}
 
 		/* Don't go to sleep if the vcpu thread needs to yield */
 		if (vcpu_should_yield(vm, vcpuid))
 			break;
 
 		/*
 		 * Some Linux guests implement "halt" by having all vcpus
 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
 		 * track of the vcpus that have entered this state. When all
 		 * vcpus enter the halted state the virtual machine is halted.
 		 */
 		if (intr_disabled) {
 			wmesg = "vmhalt";
 			VCPU_CTR0(vm, vcpuid, "Halted");
 			if (!vcpu_halted && halt_detection_enabled) {
 				vcpu_halted = 1;
 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
 			}
 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
 				vm_halted = 1;
 				break;
 			}
 		} else {
 			wmesg = "vmidle";
 		}
 
 		t = ticks;
 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
 		/*
 		 * XXX msleep_spin() cannot be interrupted by signals so
 		 * wake up periodically to check pending signals.
 		 */
 		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
 		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
 	}
 
 	/* Leave the halted set again before dropping the lock. */
 	if (vcpu_halted)
 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
 
 	vcpu_unlock(vcpu);
 
 	/* Every active vcpu halted with interrupts off: suspend the VM. */
 	if (vm_halted)
 		vm_suspend(vm, VM_SUSPEND_HALT);
 
 	return (0);
 }
 
 /*
  * Handle a nested page fault recorded in the vcpu's exit information.
  *
  * Read and write faults are first offered to accessed/dirty bit emulation;
  * if that does not resolve the fault, or for execute faults, the fault is
  * passed to vm_fault() on the guest's address space.
  *
  * Returns 0 when the fault was resolved (the faulting instruction is then
  * restarted) or EFAULT when vm_fault() fails. 'retu' is not used.
  */
 static int
 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
 {
 	int rv, ftype;
 	struct vm_map *map;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
 	ftype = vme->u.paging.fault_type;
 	KASSERT(ftype == VM_PROT_READ ||
 	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
 	    ("vm_handle_paging: invalid fault_type %d", ftype));
 
 	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
 		    vme->u.paging.gpa, ftype);
-		if (rv == 0)
+		if (rv == 0) {
+			VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
+			    ftype == VM_PROT_READ ? "accessed" : "dirty",
+			    vme->u.paging.gpa);
 			goto done;
+		}
 	}
 
 	map = &vm->vmspace->vm_map;
 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
 
 	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
 	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
 
 	if (rv != KERN_SUCCESS)
 		return (EFAULT);
 done:
 	/* restart execution at the faulting instruction */
 	vme->inst_length = 0;
 
 	return (0);
 }
 
 /*
  * Fetch, decode and emulate the instruction that caused an
  * instruction-emulation exit.
  *
  * Accesses to the local APIC, ioapic and HPET ranges are emulated here; for
  * any other address '*retu' is set to true so the exit is handed back to
  * userland.
  *
  * Returns 0 on success (including when the guest must be resumed to handle
  * a fault raised while fetching the instruction), EFAULT if the instruction
  * cannot be fetched or decoded, or the result of
  * vmm_emulate_instruction().
  */
 static int
 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vie *vie;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
 	uint64_t gla, gpa;
 	struct vm_guest_paging *paging;
 	mem_region_read_t mread;
 	mem_region_write_t mwrite;
 	enum vm_cpu_mode cpu_mode;
 	int cs_d, error;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
 	gla = vme->u.inst_emul.gla;
 	gpa = vme->u.inst_emul.gpa;
 	cs_d = vme->u.inst_emul.cs_d;
 	vie = &vme->u.inst_emul.vie;
 	paging = &vme->u.inst_emul.paging;
 	cpu_mode = paging->cpu_mode;
 
+	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
+
 	vie_init(vie);
 
 	/* Fetch, decode and emulate the faulting instruction */
 	error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
 	    vme->inst_length, vie);
 	if (error == 1)
 		return (0);		/* Resume guest to handle page fault */
 	else if (error == -1)
 		return (EFAULT);
 	else if (error != 0)
 		panic("%s: vmm_fetch_instruction error %d", __func__, error);
 
 	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
 		return (EFAULT);
 
 	/* return to userland unless this is an in-kernel emulated device */
 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
 		mread = lapic_mmio_read;
 		mwrite = lapic_mmio_write;
 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
 		mread = vioapic_mmio_read;
 		mwrite = vioapic_mmio_write;
 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
 		mread = vhpet_mmio_read;
 		mwrite = vhpet_mmio_write;
 	} else {
 		*retu = true;
 		return (0);
 	}
 
 	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
 	    mread, mwrite, retu);
 
 	return (error);
 }
 
 static int
 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
 {
 	int i, done;
 	struct vcpu *vcpu;
 
 	done = 0;
 	vcpu = &vm->vcpu[vcpuid];
 
 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
 
 	/*
 	 * Wait until all 'active_cpus' have suspended themselves.
 	 *
 	 * Since a VM may be suspended at any time including when one or
 	 * more vcpus are doing a rendezvous we need to call the rendezvous
 	 * handler while we are waiting to prevent a deadlock.
 	 */
 	vcpu_lock(vcpu);
 	while (1) {
 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
 			break;
 		}
 
 		if (vm->rendezvous_func == NULL) {
 			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
 			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
 			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
 			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
 		} else {
 			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
 			vcpu_unlock(vcpu);
 			vm_handle_rendezvous(vm, vcpuid);
 			vcpu_lock(vcpu);
 		}
 	}
 	vcpu_unlock(vcpu);
 
 	/*
 	 * Wakeup the other sleeping vcpus and return to userspace.
 	 */
 	for (i = 0; i < VM_MAXCPU; i++) {
 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
 			vcpu_notify_event(vm, i, false);
 		}
 	}
 
 	*retu = true;
 	return (0);
 }
 
 int
 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 {
 	int i;
 
 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
 		return (EINVAL);
 
 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
 		    vm->suspend, how);
 		return (EALREADY);
 	}
 
 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
 
 	/*
 	 * Notify all active vcpus that they are now suspended.
 	 */
 	for (i = 0; i < VM_MAXCPU; i++) {
 		if (CPU_ISSET(i, &vm->active_cpus))
 			vcpu_notify_event(vm, i, false);
 	}
 
 	return (0);
 }
 
 void
 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vmexit;
 
 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
 
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->rip = rip;
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
 	vmexit->u.suspended.how = vm->suspend;
 }
 
 void
 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vmexit;
 
 	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
 
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->rip = rip;
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
 }
 
 void
 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vmexit;
 
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->rip = rip;
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_BOGUS;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
 }
 
 int
 vm_run(struct vm *vm, struct vm_run *vmrun)
 {
 	int error, vcpuid;
 	struct vcpu *vcpu;
 	struct pcb *pcb;
 	uint64_t tscval, rip;
 	struct vm_exit *vme;
 	bool retu, intr_disabled;
 	pmap_t pmap;
 	void *rptr, *sptr;
 
 	vcpuid = vmrun->cpuid;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 		return (EINVAL);
 
 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
 		return (EINVAL);
 
 	rptr = &vm->rendezvous_func;
 	sptr = &vm->suspend;
 	pmap = vmspace_pmap(vm->vmspace);
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 	rip = vmrun->rip;
 restart:
 	critical_enter();
 
 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 	    ("vm_run: absurd pm_active"));
 
 	tscval = rdtsc();
 
 	pcb = PCPU_GET(curpcb);
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 
-	restore_guest_msrs(vm, vcpuid);	
 	restore_guest_fpustate(vcpu);
 
 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
 	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
 	save_guest_fpustate(vcpu);
-	restore_host_msrs(vm, vcpuid);
 
 	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
 
 	critical_exit();
 
 	if (error == 0) {
 		retu = false;
 		switch (vme->exitcode) {
 		case VM_EXITCODE_SUSPENDED:
 			error = vm_handle_suspend(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_IOAPIC_EOI:
 			vioapic_process_eoi(vm, vcpuid,
 			    vme->u.ioapic_eoi.vector);
 			break;
 		case VM_EXITCODE_RENDEZVOUS:
 			vm_handle_rendezvous(vm, vcpuid);
 			error = 0;
 			break;
 		case VM_EXITCODE_HLT:
 			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
 			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
 			break;
 		case VM_EXITCODE_PAGING:
 			error = vm_handle_paging(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INST_EMUL:
 			error = vm_handle_inst_emul(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INOUT:
 		case VM_EXITCODE_INOUT_STR:
 			error = vm_handle_inout(vm, vcpuid, vme, &retu);
 			break;
+		case VM_EXITCODE_MONITOR:
+		case VM_EXITCODE_MWAIT:
+			vm_inject_ud(vm, vcpuid);
+			break;
 		default:
 			retu = true;	/* handled in userland */
 			break;
 		}
 	}
 
 	if (error == 0 && retu == false) {
 		rip = vme->rip + vme->inst_length;
 		goto restart;
 	}
 
 	/* copy the exit information */
 	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
 	return (error);
 }
 
 int
 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
 {
 	struct vcpu *vcpu;
 	int type, vector;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (info & VM_INTINFO_VALID) {
 		type = info & VM_INTINFO_TYPE;
 		vector = info & 0xff;
 		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
 			return (EINVAL);
 		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
 			return (EINVAL);
 		if (info & VM_INTINFO_RSVD)
 			return (EINVAL);
 	} else {
 		info = 0;
 	}
 	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
 	vcpu->exitintinfo = info;
 	return (0);
 }
 
 enum exc_class {
 	EXC_BENIGN,
 	EXC_CONTRIBUTORY,
 	EXC_PAGEFAULT
 };
 
 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
 
 static enum exc_class
 exception_class(uint64_t info)
 {
 	int type, vector;
 
 	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
 	type = info & VM_INTINFO_TYPE;
 	vector = info & 0xff;
 
 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
 	switch (type) {
 	case VM_INTINFO_HWINTR:
 	case VM_INTINFO_SWINTR:
 	case VM_INTINFO_NMI:
 		return (EXC_BENIGN);
 	default:
 		/*
 		 * Hardware exception.
 		 *
 		 * SVM and VT-x use identical type values to represent NMI,
 		 * hardware interrupt and software interrupt.
 		 *
 		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
 		 * for exceptions except #BP and #OF. #BP and #OF use a type
 		 * value of '5' or '6'. Therefore we don't check for explicit
 		 * values of 'type' to classify 'intinfo' into a hardware
 		 * exception.
 		 */
 		break;
 	}
 
 	switch (vector) {
 	case IDT_PF:
 	case IDT_VE:
 		return (EXC_PAGEFAULT);
 	case IDT_DE:
 	case IDT_TS:
 	case IDT_NP:
 	case IDT_SS:
 	case IDT_GP:
 		return (EXC_CONTRIBUTORY);
 	default:
 		return (EXC_BENIGN);
 	}
 }
 
 static int
 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
     uint64_t *retinfo)
 {
 	enum exc_class exc1, exc2;
 	int type1, vector1;
 
 	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
 	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
 
 	/*
 	 * If an exception occurs while attempting to call the double-fault
 	 * handler the processor enters shutdown mode (aka triple fault).
 	 */
 	type1 = info1 & VM_INTINFO_TYPE;
 	vector1 = info1 & 0xff;
 	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
 		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
 		    info1, info2);
 		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
 		*retinfo = 0;
 		return (0);
 	}
 
 	/*
 	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
 	 */
 	exc1 = exception_class(info1);
 	exc2 = exception_class(info2);
 	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
 	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
 		/* Convert nested fault into a double fault. */
 		*retinfo = IDT_DF;
 		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 		*retinfo |= VM_INTINFO_DEL_ERRCODE;
 	} else {
 		/* Handle exceptions serially */
 		*retinfo = info2;
 	}
 	return (1);
 }
 
 static uint64_t
 vcpu_exception_intinfo(struct vcpu *vcpu)
 {
 	uint64_t info = 0;
 
 	if (vcpu->exception_pending) {
 		info = vcpu->exception.vector & 0xff;
 		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 		if (vcpu->exception.error_code_valid) {
 			info |= VM_INTINFO_DEL_ERRCODE;
 			info |= (uint64_t)vcpu->exception.error_code << 32;
 		}
 	}
 	return (info);
 }
 
 int
 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
 {
 	struct vcpu *vcpu;
 	uint64_t info1, info2;
 	int valid;
 
 	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	info1 = vcpu->exitintinfo;
 	vcpu->exitintinfo = 0;
 
 	info2 = 0;
 	if (vcpu->exception_pending) {
 		info2 = vcpu_exception_intinfo(vcpu);
 		vcpu->exception_pending = 0;
 		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
 		    vcpu->exception.vector, info2);
 	}
 
 	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
 		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
 	} else if (info1 & VM_INTINFO_VALID) {
 		*retinfo = info1;
 		valid = 1;
 	} else if (info2 & VM_INTINFO_VALID) {
 		*retinfo = info2;
 		valid = 1;
 	} else {
 		valid = 0;
 	}
 
 	if (valid) {
 		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
 		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
 	}
 
 	return (valid);
 }
 
 int
 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 	*info1 = vcpu->exitintinfo;
 	*info2 = vcpu_exception_intinfo(vcpu);
 	return (0);
 }
 
 int
 vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (exception->vector < 0 || exception->vector >= 32)
 		return (EINVAL);
 
 	/*
 	 * A double fault exception should never be injected directly into
 	 * the guest. It is a derived exception that results from specific
 	 * combinations of nested faults.
 	 */
 	if (exception->vector == IDT_DF)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->exception_pending) {
 		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
 		    "pending exception %d", exception->vector,
 		    vcpu->exception.vector);
 		return (EBUSY);
 	}
 
 	vcpu->exception_pending = 1;
 	vcpu->exception = *exception;
 	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
 	return (0);
 }
 
 void
 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
     int errcode)
 {
 	struct vm_exception exception;
 	struct vm_exit *vmexit;
 	struct vm *vm;
 	int error;
 
 	vm = vmarg;
 
 	exception.vector = vector;
 	exception.error_code = errcode;
 	exception.error_code_valid = errcode_valid;
 	error = vm_inject_exception(vm, vcpuid, &exception);
 	KASSERT(error == 0, ("vm_inject_exception error %d", error));
 
 	/*
 	 * A fault-like exception allows the instruction to be restarted
 	 * after the exception handler returns.
 	 *
 	 * By setting the inst_length to 0 we ensure that the instruction
 	 * pointer remains at the faulting instruction.
 	 */
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->inst_length = 0;
 }
 
 void
 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
 {
 	struct vm *vm;
 	int error;
 
 	vm = vmarg;
 	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
 	    error_code, cr2);
 
 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
 	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
 
 	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
 }
 
 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
 
 int
 vm_inject_nmi(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu->nmi_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 	return (0);
 }
 
 int
 vm_nmi_pending(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	return (vcpu->nmi_pending);
 }
 
 void
 vm_nmi_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->nmi_pending == 0)
 		panic("vm_nmi_clear: inconsistent nmi_pending state");
 
 	vcpu->nmi_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
 }
 
 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
 
 int
 vm_inject_extint(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu->extint_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 	return (0);
 }
 
 int
 vm_extint_pending(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	return (vcpu->extint_pending);
 }
 
 void
 vm_extint_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->extint_pending == 0)
 		panic("vm_extint_clear: inconsistent extint_pending state");
 
 	vcpu->extint_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
 }
 
 int
 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
 {
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (VMGETCAP(vm->cookie, vcpu, type, retval));
 }
 
 int
 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
 {
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (VMSETCAP(vm->cookie, vcpu, type, val));
-}
-
-uint64_t *
-vm_guest_msrs(struct vm *vm, int cpu)
-{
-	return (vm->vcpu[cpu].guest_msrs);
 }
 
 struct vlapic *
 vm_lapic(struct vm *vm, int cpu)
 {
 	return (vm->vcpu[cpu].vlapic);
 }
 
 struct vioapic *
 vm_ioapic(struct vm *vm)
 {
 
 	return (vm->vioapic);
 }
 
 struct vhpet *
 vm_hpet(struct vm *vm)
 {
 
 	return (vm->vhpet);
 }
 
 boolean_t
 vmm_is_pptdev(int bus, int slot, int func)
 {
 	int found, i, n;
 	int b, s, f;
 	char *val, *cp, *cp2;
 
 	/*
 	 * XXX
 	 * The length of an environment variable is limited to 128 bytes which
 	 * puts an upper limit on the number of passthru devices that may be
 	 * specified using a single environment variable.
 	 *
 	 * Work around this by scanning multiple environment variable
 	 * names instead of a single one - yuck!
 	 */
 	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
 
 	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
 	found = 0;
 	for (i = 0; names[i] != NULL && !found; i++) {
 		cp = val = getenv(names[i]);
 		while (cp != NULL && *cp != '\0') {
 			if ((cp2 = strchr(cp, ' ')) != NULL)
 				*cp2 = '\0';
 
 			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
 			if (n == 3 && bus == b && slot == s && func == f) {
 				found = 1;
 				break;
 			}
 		
 			if (cp2 != NULL)
 				*cp2++ = ' ';
 
 			cp = cp2;
 		}
 		freeenv(val);
 	}
 	return (found);
 }
 
 void *
 vm_iommu_domain(struct vm *vm)
 {
 
 	return (vm->iommu);
 }
 
 int
 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
     bool from_idle)
 {
 	int error;
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
 	vcpu_unlock(vcpu);
 
 	return (error);
 }
 
 enum vcpu_state
 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
 {
 	struct vcpu *vcpu;
 	enum vcpu_state state;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	state = vcpu->state;
 	if (hostcpu != NULL)
 		*hostcpu = vcpu->hostcpu;
 	vcpu_unlock(vcpu);
 
 	return (state);
 }
 
 int
 vm_activate_cpu(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (CPU_ISSET(vcpuid, &vm->active_cpus))
 		return (EBUSY);
 
 	VCPU_CTR0(vm, vcpuid, "activated");
 	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
 	return (0);
 }
 
 cpuset_t
 vm_active_cpus(struct vm *vm)
 {
 
 	return (vm->active_cpus);
 }
 
 cpuset_t
 vm_suspended_cpus(struct vm *vm)
 {
 
 	return (vm->suspended_cpus);
 }
 
 void *
 vcpu_stats(struct vm *vm, int vcpuid)
 {
 
 	return (vm->vcpu[vcpuid].stats);
 }
 
 int
 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
 {
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	*state = vm->vcpu[vcpuid].x2apic_state;
 
 	return (0);
 }
 
 int
 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
 {
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (state >= X2APIC_STATE_LAST)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].x2apic_state = state;
 
 	vlapic_set_x2apic_state(vm, vcpuid, state);
 
 	return (0);
 }
 
 /*
  * This function is called to ensure that a vcpu "sees" a pending event
  * as soon as possible:
  * - If the vcpu thread is sleeping then it is woken up.
  * - If the vcpu is running on a different host_cpu then an IPI will be directed
  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
  */
 void
 vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
 {
 	int hostcpu;
 	struct vcpu *vcpu;
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	hostcpu = vcpu->hostcpu;
 	if (vcpu->state == VCPU_RUNNING) {
 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
 		if (hostcpu != curcpu) {
 			if (lapic_intr) {
 				vlapic_post_intr(vcpu->vlapic, hostcpu,
 				    vmm_ipinum);
 			} else {
 				ipi_cpu(hostcpu, vmm_ipinum);
 			}
 		} else {
 			/*
 			 * If the 'vcpu' is running on 'curcpu' then it must
 			 * be sending a notification to itself (e.g. SELF_IPI).
 			 * The pending event will be picked up when the vcpu
 			 * transitions back to guest context.
 			 */
 		}
 	} else {
 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
 		    "with hostcpu %d", vcpu->state, hostcpu));
 		if (vcpu->state == VCPU_SLEEPING)
 			wakeup_one(vcpu);
 	}
 	vcpu_unlock(vcpu);
 }
 
 struct vmspace *
 vm_get_vmspace(struct vm *vm)
 {
 
 	return (vm->vmspace);
 }
 
 int
 vm_apicid2vcpuid(struct vm *vm, int apicid)
 {
 	/*
 	 * XXX apic id is assumed to be numerically identical to vcpu id
 	 */
 	return (apicid);
 }
 
 void
 vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg)
 {
 	int i;
 
 	/*
 	 * Enforce that this function is called without any locks
 	 */
 	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
 	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
 
 restart:
 	mtx_lock(&vm->rendezvous_mtx);
 	if (vm->rendezvous_func != NULL) {
 		/*
 		 * If a rendezvous is already in progress then we need to
 		 * call the rendezvous handler in case this 'vcpuid' is one
 		 * of the targets of the rendezvous.
 		 */
 		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
 		mtx_unlock(&vm->rendezvous_mtx);
 		vm_handle_rendezvous(vm, vcpuid);
 		goto restart;
 	}
 	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
 	    "rendezvous is still in progress"));
 
 	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
 	vm->rendezvous_req_cpus = dest;
 	CPU_ZERO(&vm->rendezvous_done_cpus);
 	vm->rendezvous_arg = arg;
 	vm_set_rendezvous_func(vm, func);
 	mtx_unlock(&vm->rendezvous_mtx);
 
 	/*
 	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
 	 * vcpus so they handle the rendezvous as soon as possible.
 	 */
 	for (i = 0; i < VM_MAXCPU; i++) {
 		if (CPU_ISSET(i, &dest))
 			vcpu_notify_event(vm, i, false);
 	}
 
 	vm_handle_rendezvous(vm, vcpuid);
 }
 
 struct vatpic *
 vm_atpic(struct vm *vm)
 {
 	return (vm->vatpic);
 }
 
 struct vatpit *
 vm_atpit(struct vm *vm)
 {
 	return (vm->vatpit);
 }
 
 enum vm_reg_name
 vm_segment_name(int seg)
 {
 	static enum vm_reg_name seg_names[] = {
 		VM_REG_GUEST_ES,
 		VM_REG_GUEST_CS,
 		VM_REG_GUEST_SS,
 		VM_REG_GUEST_DS,
 		VM_REG_GUEST_FS,
 		VM_REG_GUEST_GS
 	};
 
 	KASSERT(seg >= 0 && seg < nitems(seg_names),
 	    ("%s: invalid segment encoding %d", __func__, seg));
 	return (seg_names[seg]);
 }
 
 void
 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     int num_copyinfo)
 {
 	int idx;
 
 	for (idx = 0; idx < num_copyinfo; idx++) {
 		if (copyinfo[idx].cookie != NULL)
 			vm_gpa_release(copyinfo[idx].cookie);
 	}
 	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
 }
 
 int
 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
     int num_copyinfo)
 {
 	int error, idx, nused;
 	size_t n, off, remaining;
 	void *hva, *cookie;
 	uint64_t gpa;
 
 	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
 
 	nused = 0;
 	remaining = len;
 	while (remaining > 0) {
 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
 		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
 		if (error)
 			return (error);
 		off = gpa & PAGE_MASK;
 		n = min(remaining, PAGE_SIZE - off);
 		copyinfo[nused].gpa = gpa;
 		copyinfo[nused].len = n;
 		remaining -= n;
 		gla += n;
 		nused++;
 	}
 
 	for (idx = 0; idx < nused; idx++) {
 		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
 		    prot, &cookie);
 		if (hva == NULL)
 			break;
 		copyinfo[idx].hva = hva;
 		copyinfo[idx].cookie = cookie;
 	}
 
 	if (idx != nused) {
 		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
 		return (-1);
 	} else {
 		return (0);
 	}
 }
 
 void
 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
     size_t len)
 {
 	char *dst;
 	int idx;
 	
 	dst = kaddr;
 	idx = 0;
 	while (len > 0) {
 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
 		len -= copyinfo[idx].len;
 		dst += copyinfo[idx].len;
 		idx++;
 	}
 }
 
 void
 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
     struct vm_copyinfo *copyinfo, size_t len)
 {
 	const char *src;
 	int idx;
 
 	src = kaddr;
 	idx = 0;
 	while (len > 0) {
 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
 		len -= copyinfo[idx].len;
 		src += copyinfo[idx].len;
 		idx++;
 	}
 }
 
 /*
  * Return the amount of in-use and wired memory for the VM. Since
  * these are global stats, only return the values with for vCPU 0
  */
 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
 VMM_STAT_DECLARE(VMM_MEM_WIRED);
 
 static void
 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 
 	if (vcpu == 0) {
 		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
 	       	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
 	}	
 }
 
 static void
 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 
 	if (vcpu == 0) {
 		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
 	      	    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
 	}	
 }
 
 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
Index: stable/10/sys/amd64/vmm/vmm_instruction_emul.c
===================================================================
--- stable/10/sys/amd64/vmm/vmm_instruction_emul.c	(revision 276348)
+++ stable/10/sys/amd64/vmm/vmm_instruction_emul.c	(revision 276349)
@@ -1,1845 +1,1926 @@
 /*-
  * Copyright (c) 2012 Sandvine, Inc.
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef _KERNEL
 #include <sys/param.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/vmparam.h>
 #include <machine/vmm.h>
 #else	/* !_KERNEL */
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/_iovec.h>
 
 #include <machine/vmm.h>
 
 #include <assert.h>
 #include <vmmapi.h>
 #define	KASSERT(exp,msg)	assert((exp))
 #endif	/* _KERNEL */
 
 #include <machine/vmm_instruction_emul.h>
 #include <x86/psl.h>
 #include <x86/specialreg.h>
 
 /* struct vie_op.op_type */
 enum {
 	VIE_OP_TYPE_NONE = 0,
 	VIE_OP_TYPE_MOV,
 	VIE_OP_TYPE_MOVSX,
 	VIE_OP_TYPE_MOVZX,
 	VIE_OP_TYPE_AND,
 	VIE_OP_TYPE_OR,
 	VIE_OP_TYPE_SUB,
 	VIE_OP_TYPE_TWO_BYTE,
 	VIE_OP_TYPE_PUSH,
 	VIE_OP_TYPE_CMP,
+	VIE_OP_TYPE_POP,
 	VIE_OP_TYPE_LAST
 };
 
 /* struct vie_op.op_flags */
 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
 #define	VIE_OP_F_NO_MODRM	(1 << 3)
 
 static const struct vie_op two_byte_opcodes[256] = {
 	[0xB6] = {
 		.op_byte = 0xB6,
 		.op_type = VIE_OP_TYPE_MOVZX,
 	},
 	[0xB7] = {
 		.op_byte = 0xB7,
 		.op_type = VIE_OP_TYPE_MOVZX,
 	},
 	[0xBE] = {
 		.op_byte = 0xBE,
 		.op_type = VIE_OP_TYPE_MOVSX,
 	},
 };
 
 static const struct vie_op one_byte_opcodes[256] = {
 	[0x0F] = {
 		.op_byte = 0x0F,
 		.op_type = VIE_OP_TYPE_TWO_BYTE
 	},
 	[0x2B] = {
 		.op_byte = 0x2B,
 		.op_type = VIE_OP_TYPE_SUB,
 	},
 	[0x3B] = {
 		.op_byte = 0x3B,
 		.op_type = VIE_OP_TYPE_CMP,
 	},
 	[0x88] = {
 		.op_byte = 0x88,
 		.op_type = VIE_OP_TYPE_MOV,
 	},
 	[0x89] = {
 		.op_byte = 0x89,
 		.op_type = VIE_OP_TYPE_MOV,
 	},
 	[0x8A] = {
 		.op_byte = 0x8A,
 		.op_type = VIE_OP_TYPE_MOV,
 	},
 	[0x8B] = {
 		.op_byte = 0x8B,
 		.op_type = VIE_OP_TYPE_MOV,
 	},
 	[0xA1] = {
 		.op_byte = 0xA1,
 		.op_type = VIE_OP_TYPE_MOV,
 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
 	},
 	[0xA3] = {
 		.op_byte = 0xA3,
 		.op_type = VIE_OP_TYPE_MOV,
 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
 	},
 	[0xC6] = {
 		/* XXX Group 11 extended opcode - not just MOV */
 		.op_byte = 0xC6,
 		.op_type = VIE_OP_TYPE_MOV,
 		.op_flags = VIE_OP_F_IMM8,
 	},
 	[0xC7] = {
 		.op_byte = 0xC7,
 		.op_type = VIE_OP_TYPE_MOV,
 		.op_flags = VIE_OP_F_IMM,
 	},
 	[0x23] = {
 		.op_byte = 0x23,
 		.op_type = VIE_OP_TYPE_AND,
 	},
 	[0x81] = {
 		/* XXX Group 1 extended opcode - not just AND */
 		.op_byte = 0x81,
 		.op_type = VIE_OP_TYPE_AND,
 		.op_flags = VIE_OP_F_IMM,
 	},
 	[0x83] = {
 		/* XXX Group 1 extended opcode - not just OR */
 		.op_byte = 0x83,
 		.op_type = VIE_OP_TYPE_OR,
 		.op_flags = VIE_OP_F_IMM8,
 	},
+	[0x8F] = {
+		/* XXX Group 1A extended opcode - not just POP */
+		.op_byte = 0x8F,
+		.op_type = VIE_OP_TYPE_POP,
+	},
 	[0xFF] = {
 		/* XXX Group 5 extended opcode - not just PUSH */
 		.op_byte = 0xFF,
 		.op_type = VIE_OP_TYPE_PUSH,
 	}
 };
 
 /* struct vie.mod */
 #define	VIE_MOD_INDIRECT		0
 #define	VIE_MOD_INDIRECT_DISP8		1
 #define	VIE_MOD_INDIRECT_DISP32		2
 #define	VIE_MOD_DIRECT			3
 
 /* struct vie.rm */
 #define	VIE_RM_SIB			4
 #define	VIE_RM_DISP32			5
 
 #define	GB				(1024 * 1024 * 1024)
 
 static enum vm_reg_name gpr_map[16] = {
 	VM_REG_GUEST_RAX,
 	VM_REG_GUEST_RCX,
 	VM_REG_GUEST_RDX,
 	VM_REG_GUEST_RBX,
 	VM_REG_GUEST_RSP,
 	VM_REG_GUEST_RBP,
 	VM_REG_GUEST_RSI,
 	VM_REG_GUEST_RDI,
 	VM_REG_GUEST_R8,
 	VM_REG_GUEST_R9,
 	VM_REG_GUEST_R10,
 	VM_REG_GUEST_R11,
 	VM_REG_GUEST_R12,
 	VM_REG_GUEST_R13,
 	VM_REG_GUEST_R14,
 	VM_REG_GUEST_R15
 };
 
 static uint64_t size2mask[] = {
 	[1] = 0xff,
 	[2] = 0xffff,
 	[4] = 0xffffffff,
 	[8] = 0xffffffffffffffff,
 };
 
 static int
 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
 {
 	int error;
 
 	error = vm_get_register(vm, vcpuid, reg, rval);
 
 	return (error);
 }
 
 static void
 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
 {
 	*lhbr = 0;
 	*reg = gpr_map[vie->reg];
 
 	/*
 	 * 64-bit mode imposes limitations on accessing legacy high byte
 	 * registers (lhbr).
 	 *
 	 * The legacy high-byte registers cannot be addressed if the REX
 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
 	 *
 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
 	 * %ah, %ch, %dh and %bh respectively.
 	 */
 	if (!vie->rex_present) {
 		if (vie->reg & 0x4) {
 			*lhbr = 1;
 			*reg = gpr_map[vie->reg & 0x3];
 		}
 	}
 }
 
 static int
 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
 {
 	uint64_t val;
 	int error, lhbr;
 	enum vm_reg_name reg;
 
 	vie_calc_bytereg(vie, &reg, &lhbr);
 	error = vm_get_register(vm, vcpuid, reg, &val);
 
 	/*
 	 * To obtain the value of a legacy high byte register shift the
 	 * base register right by 8 bits (%ah = %rax >> 8).
 	 */
 	if (lhbr)
 		*rval = val >> 8;
 	else
 		*rval = val;
 	return (error);
 }
 
 static int
 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
 {
 	uint64_t origval, val, mask;
 	int error, lhbr;
 	enum vm_reg_name reg;
 
 	vie_calc_bytereg(vie, &reg, &lhbr);
 	error = vm_get_register(vm, vcpuid, reg, &origval);
 	if (error == 0) {
 		val = byte;
 		mask = 0xff;
 		if (lhbr) {
 			/*
 			 * Shift left by 8 to store 'byte' in a legacy high
 			 * byte register.
 			 */
 			val <<= 8;
 			mask <<= 8;
 		}
 		val |= origval & ~mask;
 		error = vm_set_register(vm, vcpuid, reg, val);
 	}
 	return (error);
 }
 
 int
 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
 		    uint64_t val, int size)
 {
 	int error;
 	uint64_t origval;
 
 	switch (size) {
 	case 1:
 	case 2:
 		error = vie_read_register(vm, vcpuid, reg, &origval);
 		if (error)
 			return (error);
 		val &= size2mask[size];
 		val |= origval & ~size2mask[size];
 		break;
 	case 4:
 		val &= 0xffffffffUL;
 		break;
 	case 8:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = vm_set_register(vm, vcpuid, reg, val);
 	return (error);
 }
 
 /* Mask of the status flags in %rflags: CF, PF, AF, ZF, SF and OF. */
+#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
+
 /*
  * Return the status flags that would result from doing (x - y).
  *
  * Each width-specific helper executes a SUB of 'y' from 'x' at that operand
  * width and then captures the flags register with pushfq/popq. The value
  * returned is the whole captured register, so callers mask out the bits they
  * are interested in.
  *
  * GETCC(sz) defines getcc<sz>(); the trailing 'struct __hack' declaration
  * exists so that each GETCC(...) use can be terminated with a semicolon.
  */
-static u_long
-getcc16(uint16_t x, uint16_t y)
-{
-	u_long rflags;
+#define	GETCC(sz)							\
+static u_long								\
+getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
+{									\
+	u_long rflags;							\
+									\
+	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
+	    "=r" (rflags), "+r" (x) : "m" (y));				\
+	return (rflags);						\
+} struct __hack
 
-	__asm __volatile("sub %1,%2; pushfq; popq %0" :
-	    "=r" (rflags) : "m" (y), "r" (x));
-	return (rflags);
-}
+GETCC(8);
+GETCC(16);
+GETCC(32);
+GETCC(64);
 
 /*
  * Select the helper matching 'opsize' (operand size in bytes) and return the
  * flags it produces for (x - y). 'x' and 'y' are converted to the helper's
  * parameter width by the call.
  */
 static u_long
-getcc32(uint32_t x, uint32_t y)
-{
-	u_long rflags;
-
-	__asm __volatile("sub %1,%2; pushfq; popq %0" :
-	    "=r" (rflags) : "m" (y), "r" (x));
-	return (rflags);
-}
-
-static u_long
-getcc64(uint64_t x, uint64_t y)
-{
-	u_long rflags;
-
-	__asm __volatile("sub %1,%2; pushfq; popq %0" :
-	    "=r" (rflags) : "m" (y), "r" (x));
-	return (rflags);
-}
-
-static u_long
 getcc(int opsize, uint64_t x, uint64_t y)
 {
-	KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
+	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
 	    ("getcc: invalid operand size %d", opsize));
 
-	if (opsize == 2)
+	if (opsize == 1)
+		return (getcc8(x, y));
+	else if (opsize == 2)
 		return (getcc16(x, y));
 	else if (opsize == 4)
 		return (getcc32(x, y));
 	else
 		return (getcc64(x, y));
 }
 
 /*
  * Emulate the MOV forms that transfer data between a register (or an
  * immediate) and the memory operand at guest physical address 'gpa'.
  *
  * Opcodes handled: 88, 89, 8A, 8B, A1, A3, C6 and C7. Memory is only touched
  * through 'memread' and 'memwrite', which receive 'arg' unchanged.
  *
  * Returns 0 on success, EINVAL for any other opcode, or the error from the
  * register or memory accessor that failed.
  */
 static int
 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	enum vm_reg_name reg;
 	uint64_t data;
 	uint8_t byte;
 	int error, size;
 
 	size = vie->opsize;
 
 	switch (vie->op.op_byte) {
 	case 0x88:
 		/*
 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
 		 * 88/r:	mov r/m8, r8
 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
 		 */
 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
 		if (error != 0)
 			return (error);
 		/* byte operation: the access size is always 1 */
 		return (memwrite(vm, vcpuid, gpa, byte, 1, arg));
 
 	case 0x89:
 		/*
 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
 		 * 89/r:	mov r/m16, r16
 		 * 89/r:	mov r/m32, r32
 		 * REX.W + 89/r	mov r/m64, r64
 		 */
 		reg = gpr_map[vie->reg];
 		error = vie_read_register(vm, vcpuid, reg, &data);
 		if (error != 0)
 			return (error);
 		data &= size2mask[size];
 		return (memwrite(vm, vcpuid, gpa, data, size, arg));
 
 	case 0x8A:
 		/*
 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
 		 * 8A/r:	mov r8, r/m8
 		 * REX + 8A/r:	mov r8, r/m8
 		 */
 		error = memread(vm, vcpuid, gpa, &data, 1, arg);
 		if (error != 0)
 			return (error);
 		return (vie_write_bytereg(vm, vcpuid, vie, data));
 
 	case 0x8B:
 		/*
 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
 		 * 8B/r:	mov r16, r/m16
 		 * 8B/r:	mov r32, r/m32
 		 * REX.W 8B/r:	mov r64, r/m64
 		 */
 		error = memread(vm, vcpuid, gpa, &data, size, arg);
 		if (error != 0)
 			return (error);
 		reg = gpr_map[vie->reg];
 		return (vie_update_register(vm, vcpuid, reg, data, size));
 
 	case 0xA1:
 		/*
 		 * MOV from seg:moffset to AX/EAX/RAX
 		 * A1:		mov AX, moffs16
 		 * A1:		mov EAX, moffs32
 		 * REX.W + A1:	mov RAX, moffs64
 		 */
 		error = memread(vm, vcpuid, gpa, &data, size, arg);
 		if (error != 0)
 			return (error);
 		return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RAX,
 		    data, size));
 
 	case 0xA3:
 		/*
 		 * MOV from AX/EAX/RAX to seg:moffset
 		 * A3:		mov moffs16, AX
 		 * A3:		mov moffs32, EAX
 		 * REX.W + A3:	mov moffs64, RAX
 		 */
 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &data);
 		if (error != 0)
 			return (error);
 		data &= size2mask[size];
 		return (memwrite(vm, vcpuid, gpa, data, size, arg));
 
 	case 0xC6:
 		/*
 		 * MOV from imm8 to mem (ModRM:r/m)
 		 * C6/0		mov r/m8, imm8
 		 * REX + C6/0	mov r/m8, imm8
 		 */
 		/* byte operation: the access size is always 1 */
 		return (memwrite(vm, vcpuid, gpa, vie->immediate, 1, arg));
 
 	case 0xC7:
 		/*
 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
 		 * C7/0		mov r/m16, imm16
 		 * C7/0		mov r/m32, imm32
 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
 		 */
 		data = vie->immediate & size2mask[size];
 		return (memwrite(vm, vcpuid, gpa, data, size, arg));
 
 	default:
 		break;
 	}
 
 	/* Opcode not handled by this routine. */
 	return (EINVAL);
 }
 
 /*
  * Emulate MOVZX (0F B6, 0F B7) and MOVSX (0F BE) from the memory operand at
  * 'gpa' into the register named by ModRM:reg.
  *
  *   0F B6/r	movzx r16/r32/r64, r/m8
  *   0F B7/r	movzx r32/r64, r/m16
  *   0F BE/r	movsx r16/r32/r64, r/m8
  *
  * The source is read at its own width (1 or 2 bytes), extended, and then
  * written at the instruction's operand size. 'memwrite' is not used.
  *
  * Returns 0 on success, EINVAL for any other opcode byte, or the error from
  * the memory/register accessor that failed.
  */
 static int
 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	     mem_region_read_t memread, mem_region_write_t memwrite,
 	     void *arg)
 {
 	uint64_t val;
 	int error, sign_extend, srcsize;
 
 	/* Classify the opcode: source width and kind of extension. */
 	switch (vie->op.op_byte) {
 	case 0xB6:
 		srcsize = 1;
 		sign_extend = 0;
 		break;
 	case 0xB7:
 		srcsize = 2;
 		sign_extend = 0;
 		break;
 	case 0xBE:
 		srcsize = 1;
 		sign_extend = 1;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/* get the source operand from memory */
 	error = memread(vm, vcpuid, gpa, &val, srcsize, arg);
 	if (error)
 		return (error);
 
 	/* extend it to 64 bits */
 	if (sign_extend)
 		val = (int8_t)val;
 	else if (srcsize == 1)
 		val = (uint8_t)val;
 	else
 		val = (uint16_t)val;
 
 	/* write the result at the instruction's operand size */
 	return (vie_update_register(vm, vcpuid, gpr_map[vie->reg], val,
 	    vie->opsize));
 }
 
 /*
  * Emulate AND (opcode 0x23) and the AND/OR forms of opcode 0x81 against the
  * memory operand at 'gpa'.
  *
  * For 0x23 the result goes to the register named by ModRM:reg; for 0x81 it
  * is written back to memory through 'memwrite'. Once the result has been
  * stored, the guest RFLAGS is rewritten with all status bits cleared except
  * PF, ZF and SF, which are taken from getcc() applied to the result.
  *
  * Returns 0 on success, EINVAL for an unsupported opcode or ModRM:reg value,
  * or the error from the register/memory accessor that failed.
  */
 static int
 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	int error, size;
 	enum vm_reg_name reg;
-	uint64_t val1, val2;
+	uint64_t result, rflags, rflags2, val1, val2;
 
 	size = vie->opsize;
 	error = EINVAL;
 
 	switch (vie->op.op_byte) {
 	case 0x23:
 		/*
 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
 		 * result in reg.
 		 *
 		 * 23/r		and r16, r/m16
 		 * 23/r		and r32, r/m32
 		 * REX.W + 23/r	and r64, r/m64
 		 */
 
 		/* get the first operand */
 		reg = gpr_map[vie->reg];
 		error = vie_read_register(vm, vcpuid, reg, &val1);
 		if (error)
 			break;
 
 		/* get the second operand */
 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
 		if (error)
 			break;
 
 		/* perform the operation and write the result */
-		val1 &= val2;
-		error = vie_update_register(vm, vcpuid, reg, val1, size);
+		result = val1 & val2;
+		error = vie_update_register(vm, vcpuid, reg, result, size);
 		break;
 	case 0x81:
 		/*
 		 * AND/OR mem (ModRM:r/m) with immediate and store the
 		 * result in mem.
 		 *
 		 * AND: i = 4
 		 * OR:  i = 1
 		 * 81 /i		op r/m16, imm16
 		 * 81 /i		op r/m32, imm32
 		 * REX.W + 81 /i	op r/m64, imm32 sign-extended to 64
 		 *
 		 */
 
 		/* get the first operand */
                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
                 if (error)
 			break;
 
                 /*
                  * perform the operation with the pre-fetched immediate
                  * operand and write the result
                  */
 		switch (vie->reg & 7) {
 		case 0x4:
 			/* modrm:reg == b100, AND */
-			val1 &= vie->immediate;
+			result = val1 & vie->immediate;
 			break;
 		case 0x1:
 			/* modrm:reg == b001, OR */
-			val1 |= vie->immediate;
+			result = val1 | vie->immediate;
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error)
 			break;
 
-		error = memwrite(vm, vcpuid, gpa, val1, size, arg);
+		error = memwrite(vm, vcpuid, gpa, result, size, arg);
 		break;
 	default:
 		break;
 	}
+	if (error)
+		return (error);
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	if (error)
+		return (error);
+
+	/*
+	 * OF and CF are cleared; the SF, ZF and PF flags are set according
+	 * to the result; AF is undefined.
+	 *
+	 * The updated status flags are obtained by subtracting 0 from 'result'.
+	 */
+	rflags2 = getcc(size, result, 0);
+	rflags &= ~RFLAGS_STATUS_BITS;
+	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
 	return (error);
 }
 
 /*
  * Emulate the OR form of opcode 0x83 (ModRM:reg = b001) against the memory
  * operand at 'gpa': read the operand, OR it with the pre-fetched immediate
  * and write the result back through 'memwrite'.
  *
  * Once the result has been stored, the guest RFLAGS is rewritten with all
  * status bits cleared except PF, ZF and SF, which are taken from getcc()
  * applied to the result.
  *
  * Returns 0 on success, EINVAL for any other opcode or ModRM:reg value, or
  * the error from the memory/register accessor that failed.
  */
 static int
 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	int error, size;
-	uint64_t val1;
+	uint64_t val1, result, rflags, rflags2;
 
 	size = vie->opsize;
 	error = EINVAL;
 
 	switch (vie->op.op_byte) {
 	case 0x83:
 		/*
 		 * OR mem (ModRM:r/m) with immediate and store the
 		 * result in mem.
 		 *
 		 * 83 /1		OR r/m16, imm8 sign-extended to 16
 		 * 83 /1		OR r/m32, imm8 sign-extended to 32
 		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
 		 *
 		 * Currently, only the OR operation of the 0x83 opcode
 		 * is implemented (ModRM:reg = b001).
 		 */
 		if ((vie->reg & 7) != 1)
 			break;
 
 		/* get the first operand */
                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
                 if (error)
 			break;
 
                 /*
 		 * perform the operation with the pre-fetched immediate
 		 * operand and write the result
 		 */
-                val1 |= vie->immediate;
-                error = memwrite(vm, vcpuid, gpa, val1, size, arg);
+                result = val1 | vie->immediate;
+                error = memwrite(vm, vcpuid, gpa, result, size, arg);
 		break;
 	default:
 		break;
 	}
+	if (error)
+		return (error);
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	if (error)
+		return (error);
+
+	/*
+	 * OF and CF are cleared; the SF, ZF and PF flags are set according
+	 * to the result; AF is undefined.
+	 *
+	 * The updated status flags are obtained by subtracting 0 from 'result'.
+	 */
+	rflags2 = getcc(size, result, 0);
+	rflags &= ~RFLAGS_STATUS_BITS;
+	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
 	return (error);
 }
 
-#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
-
 /*
  * Emulate CMP reg, r/m (opcode 0x3B) with the r/m operand at 'gpa'.
  *
  *   3B/r		CMP r16, r/m16
  *   3B/r		CMP r32, r/m32
  *   REX.W + 3B/r	CMP r64, r/m64
  *
  * The register operand (ModRM:reg) is compared against the memory operand by
  * computing the flags of (reg - mem); only the status bits of the guest
  * RFLAGS are replaced and no operand is modified. 'memwrite' is not used.
  *
  * Returns 0 on success, EINVAL for any other opcode, or the error from the
  * register/memory accessor that failed.
  */
 static int
 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	uint64_t flags, newcc, regval, memval;
 	int error, size;
 
 	if (vie->op.op_byte != 0x3B)
 		return (EINVAL);
 
 	size = vie->opsize;
 
 	/* First operand: the register named by ModRM:reg. */
 	error = vie_read_register(vm, vcpuid, gpr_map[vie->reg], &regval);
 	if (error)
 		return (error);
 
 	/* Second operand: the memory location being emulated. */
 	error = memread(vm, vcpuid, gpa, &memval, size, arg);
 	if (error)
 		return (error);
 
 	/* Flags of (first - second), then merge them into the guest RFLAGS. */
 	newcc = getcc(size, regval, memval);
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &flags);
 	if (error)
 		return (error);
 	flags = (flags & ~RFLAGS_STATUS_BITS) | (newcc & RFLAGS_STATUS_BITS);
 
 	return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, flags, 8));
 }
 
 /*
  * Emulate SUB reg, r/m (opcode 0x2B) with the r/m operand at 'gpa'.
  *
  *   2B/r            SUB r16, r/m16
  *   2B/r            SUB r32, r/m32
  *   REX.W + 2B/r    SUB r64, r/m64
  *
  * The difference (reg - mem) is stored in the register named by ModRM:reg
  * and the status bits of the guest RFLAGS are replaced with those of the
  * subtraction. 'memwrite' is not used.
  *
  * Returns 0 on success, EINVAL for any other opcode, or the error from the
  * register/memory accessor that failed.
  */
 static int
 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	enum vm_reg_name reg;
 	uint64_t flags, minuend, newcc, subtrahend;
 	int error, size;
 
 	if (vie->op.op_byte != 0x2B)
 		return (EINVAL);
 
 	size = vie->opsize;
 
 	/* get the first operand */
 	reg = gpr_map[vie->reg];
 	error = vie_read_register(vm, vcpuid, reg, &minuend);
 	if (error)
 		return (error);
 
 	/* get the second operand */
 	error = memread(vm, vcpuid, gpa, &subtrahend, size, arg);
 	if (error)
 		return (error);
 
 	/* perform the operation and write the result */
 	error = vie_update_register(vm, vcpuid, reg, minuend - subtrahend,
 	    size);
 	if (error)
 		return (error);
 
 	/* refresh the status flags from the same subtraction */
 	newcc = getcc(size, minuend, subtrahend);
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &flags);
 	if (error)
 		return (error);
 	flags = (flags & ~RFLAGS_STATUS_BITS) | (newcc & RFLAGS_STATUS_BITS);
 
 	return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, flags, 8));
 }
 
 /*
  * Common implementation of PUSH r/m and POP r/m where the r/m operand is the
  * emulated memory location at 'mmio_gpa'. The direction is taken from
  * vie->op.op_type.
  *
  * PUSH: read the operand through 'memread' and copy it to the guest stack.
  * POP:  copy the value from the guest stack and store it through 'memwrite'.
  * In both cases %rsp is written back (at the stack address size) only when
  * the transfer succeeded.
  *
  * Returns 0 when the instruction completed or when an exception (#SS, #AC,
  * or a fault raised while setting up the stack copy) was injected for the
  * guest to handle; EFAULT when vm_copy_setup() returns -1; otherwise the
  * error from 'memread'/'memwrite'.
  *
  * NOTE(review): in the CPU_MODE_REAL branch 'ss_desc' is never filled in,
  * yet it is passed to vie_calculate_gla(), which reads desc->access and
  * desc->limit for every mode other than 64-bit. decode_modrm() rejects real
  * mode, so this path is presumably unreachable today -- confirm before
  * enabling real-mode emulation.
  */
 static int
-emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *arg)
 {
 #ifdef _KERNEL
 	struct vm_copyinfo copyinfo[2];
 #else
 	struct iovec copyinfo[2];
 #endif
 	struct seg_desc ss_desc;
 	uint64_t cr0, rflags, rsp, stack_gla, val;
-	int error, size, stackaddrsize;
+	int error, size, stackaddrsize, pushop;
 
-	/*
-	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
-	 *
-	 * PUSH is part of the group 5 extended opcodes and is identified
-	 * by ModRM:reg = b110.
-	 */
-	if ((vie->reg & 7) != 6)
-		return (EINVAL);
-
+	val = 0;
 	size = vie->opsize;
+	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
+
 	/*
 	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
 	 */
 	if (paging->cpu_mode == CPU_MODE_REAL) {
 		stackaddrsize = 2;
 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
 		/*
 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
 		 * - Stack pointer size is always 64-bits.
 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
 		 * - 16-bit PUSH/POP is supported by using the operand size
 		 *   override prefix (66H).
 		 */
 		stackaddrsize = 8;
 		size = vie->opsize_override ? 2 : 8;
 	} else {
 		/*
 		 * In protected or compatibility mode the 'B' flag in the
 		 * stack-segment descriptor determines the size of the
 		 * stack pointer.
 		 */
 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
 		    __func__, error));
 		if (SEG_DESC_DEF32(ss_desc.access))
 			stackaddrsize = 4;
 		else
 			stackaddrsize = 2;
 	}
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
+	if (pushop) {
+		rsp -= size;
+	}
 
-	rsp -= size;
 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
-	    rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
+	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
+	    &stack_gla)) {
 		vm_inject_ss(vm, vcpuid, 0);
 		return (0);
 	}
 
 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
 		vm_inject_ss(vm, vcpuid, 0);
 		return (0);
 	}
 
 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
 		vm_inject_ac(vm, vcpuid, 0);
 		return (0);
 	}
 
-	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
-	    copyinfo, nitems(copyinfo));
+	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
+	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo));
 	if (error == -1) {
 		/*
 		 * XXX cannot return a negative error value here because it
 		 * ends up being the return value of the VM_RUN() ioctl and
 		 * is interpreted as a pseudo-error (for e.g. ERESTART).
 		 */
 		return (EFAULT);
 	} else if (error == 1) {
 		/* Resume guest execution to handle page fault */
 		return (0);
 	}
 
-	error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+	if (pushop) {
+		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+		if (error == 0)
+			vm_copyout(vm, vcpuid, &val, copyinfo, size);
+	} else {
+		vm_copyin(vm, vcpuid, copyinfo, &val, size);
+		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
+		rsp += size;
+	}
+#ifdef _KERNEL
+	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+#endif
+
 	if (error == 0) {
-		vm_copyout(vm, vcpuid, &val, copyinfo, size);
 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
 		    stackaddrsize);
 		KASSERT(error == 0, ("error %d updating rsp", error));
 	}
-#ifdef _KERNEL
-	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
-#endif
 	return (error);
 }
 
+static int
+emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+    struct vm_guest_paging *paging, mem_region_read_t memread,
+    mem_region_write_t memwrite, void *arg)
+{
+	int error;
+
+	/*
+	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+	 *
+	 * PUSH is part of the group 5 extended opcodes and is identified
+	 * by ModRM:reg = b110.
+	 *
+	 * Any other ModRM:reg value is rejected with EINVAL. Otherwise the
+	 * instruction is handed to emulate_stack_op() with the arguments
+	 * unchanged, and its return value is passed back to the caller.
+	 */
+	if ((vie->reg & 7) != 6)
+		return (EINVAL);
+
+	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
+	    memwrite, arg);
+	return (error);
+}
+
+static int
+emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+    struct vm_guest_paging *paging, mem_region_read_t memread,
+    mem_region_write_t memwrite, void *arg)
+{
+	int error;
+
+	/*
+	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+	 *
+	 * POP is part of the group 1A extended opcodes and is identified
+	 * by ModRM:reg = b000.
+	 *
+	 * Any other ModRM:reg value is rejected with EINVAL. Otherwise the
+	 * instruction is handed to emulate_stack_op() with the arguments
+	 * unchanged, and its return value is passed back to the caller.
+	 */
+	if ((vie->reg & 7) != 0)
+		return (EINVAL);
+
+	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
+	    memwrite, arg);
+	return (error);
+}
+
 /*
  * Emulate a previously decoded instruction whose memory operand is at guest
  * physical address 'gpa'.
  *
  * The instruction is dispatched on vie->op.op_type to the matching emulate_*
  * routine. 'memread', 'memwrite' and 'memarg' are forwarded to that routine;
  * 'paging' is only forwarded to the PUSH/POP handlers.
  *
  * Returns EINVAL if 'vie' has not been decoded or the op type has no
  * handler; otherwise the handler's return value.
  */
 int
 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *memarg)
 {
 	int error;
 
 	if (!vie->decoded)
 		return (EINVAL);
 
 	switch (vie->op.op_type) {
+	case VIE_OP_TYPE_POP:
+		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
+		    memwrite, memarg);
+		break;
 	case VIE_OP_TYPE_PUSH:
 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
 		    memwrite, memarg);
 		break;
 	case VIE_OP_TYPE_CMP:
 		error = emulate_cmp(vm, vcpuid, gpa, vie,
 				    memread, memwrite, memarg);
 		break;
 	case VIE_OP_TYPE_MOV:
 		error = emulate_mov(vm, vcpuid, gpa, vie,
 				    memread, memwrite, memarg);
 		break;
 	case VIE_OP_TYPE_MOVSX:
 	case VIE_OP_TYPE_MOVZX:
 		/* sign- and zero-extending moves share one handler */
 		error = emulate_movx(vm, vcpuid, gpa, vie,
 				     memread, memwrite, memarg);
 		break;
 	case VIE_OP_TYPE_AND:
 		error = emulate_and(vm, vcpuid, gpa, vie,
 				    memread, memwrite, memarg);
 		break;
 	case VIE_OP_TYPE_OR:
 		error = emulate_or(vm, vcpuid, gpa, vie,
 				    memread, memwrite, memarg);
 		break;
 	case VIE_OP_TYPE_SUB:
 		error = emulate_sub(vm, vcpuid, gpa, vie,
 				    memread, memwrite, memarg);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Decide whether an access of 'size' bytes at linear address 'gla' fails
  * alignment checking.
  *
  * The check only applies when cpl is 3 and both CR0.AM ('cr0') and
  * RFLAGS.AC ('rf') are set. Returns 1 if the check applies and 'gla' is not
  * a multiple of 'size', 0 otherwise.
  */
 int
 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
 {
 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
 	    ("%s: invalid size %d", __func__, size));
 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
 
 	if (cpl == 3 && (cr0 & CR0_AM) != 0 && (rf & PSL_AC) != 0)
 		return ((gla & (size - 1)) != 0);
 
 	return (0);
 }
 
 /*
  * Check whether 'gla' is a canonical linear address.
  *
  * Only 64-bit mode is checked; every other mode returns 0. In 64-bit mode
  * the result is non-zero when the most significant 16 bits of 'gla' are not
  * all copies of bit 47.
  */
 int
 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
 {
 	uint64_t expected, upper16;
 
 	if (cpu_mode != CPU_MODE_64BIT)
 		return (0);
 
 	/* Bits 63:48 must replicate bit 47: all ones or all zeroes. */
 	upper16 = ~((1UL << 48) - 1);
 	expected = (gla & (1UL << 47)) ? upper16 : 0;
 
 	return ((gla & upper16) != expected);
 }
 
 /*
  * Look up the size2mask[] entry for an operand or address size of 1, 2, 4
  * or 8 bytes. Any other size trips the KASSERT.
  */
 uint64_t
 vie_size2mask(int size)
 {
 	uint64_t mask;
 
 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
 	    ("vie_size2mask: invalid size %d", size));
 
 	mask = size2mask[size];
 	return (mask);
 }
 
 /*
  * Compute the guest linear address of a 'length'-byte access at 'offset'
  * within segment 'seg'.
  *
  * cpu_mode:	current guest CPU mode.
  * seg:	segment register being used (must be one of %es..%gs).
  * desc:	descriptor for 'seg'. It is not dereferenced in 64-bit mode
  *		unless 'seg' is %fs or %gs.
  * offset:	effective address before truncation to 'addrsize'.
  * length:	access size in bytes (1, 2, 4 or 8).
  * addrsize:	address size in bytes (4 or 8 in 64-bit mode, else 2 or 4).
  * prot:	PROT_READ and/or PROT_WRITE, describing the access.
  * gla:	receives the linear address on success.
  *
  * Outside 64-bit mode the access is validated against the segment: the
  * descriptor must be usable, the access type must be permitted by the
  * descriptor type, and every byte of the access must fall within the
  * segment limit.
  *
  * Returns 0 on success and -1 if any of these checks fail ('*gla' is then
  * left unmodified).
  */
 int
 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
     int prot, uint64_t *gla)
 {
 	uint64_t firstoff, low_limit, high_limit, segbase;
 	int glasize, type;
 
 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
 	    ("%s: invalid segment %d", __func__, seg));
 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
 	    ("%s: invalid operand size %d", __func__, length));
 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
 	    ("%s: invalid prot %#x", __func__, prot));
 
 	/* 'offset' is advanced by the limit check below; keep the start. */
 	firstoff = offset;
 	if (cpu_mode == CPU_MODE_64BIT) {
 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
 		glasize = 8;
 	} else {
 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
 		glasize = 4;
 		/*
 		 * If the segment selector is loaded with a NULL selector
 		 * then the descriptor is unusable and attempting to use
 		 * it results in a #GP(0).
 		 */
 		if (SEG_DESC_UNUSABLE(desc->access))
 			return (-1);
 
 		/* 
 		 * The processor generates a #NP exception when a segment
 		 * register is loaded with a selector that points to a
 		 * descriptor that is not present. If this was the case then
 		 * it would have been checked before the VM-exit.
 		 */
 		KASSERT(SEG_DESC_PRESENT(desc->access),
 		    ("segment %d not present: %#x", seg, desc->access));
 
 		/*
 		 * The descriptor type must indicate a code/data segment.
 		 */
 		type = SEG_DESC_TYPE(desc->access);
 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
 		    "descriptor type %#x", seg, type));
 
 		if (prot & PROT_READ) {
 			/* #GP on a read access to an exec-only code segment */
 			if ((type & 0xA) == 0x8)
 				return (-1);
 		}
 
 		if (prot & PROT_WRITE) {
 			/*
 			 * #GP on a write access to a code segment or a
 			 * read-only data segment.
 			 */
 			if (type & 0x8)			/* code segment */
 				return (-1);
 
 			if ((type & 0xA) == 0)		/* read-only data seg */
 				return (-1);
 		}
 
 		/*
 		 * 'desc->limit' is fully expanded taking granularity into
 		 * account.
 		 */
 		if ((type & 0xC) == 0x4) {
 			/* expand-down data segment */
 			low_limit = desc->limit + 1;
 			high_limit = SEG_DESC_DEF32(desc->access) ?
 			    0xffffffff : 0xffff;
 		} else {
 			/* code segment or expand-up data segment */
 			low_limit = 0;
 			high_limit = desc->limit;
 		}
 
 		/*
 		 * Check each byte of the access separately. The offset is
 		 * truncated to the address size before every comparison, so
 		 * an access that wraps around is checked at the wrapped
 		 * offsets.
 		 */
 		while (length > 0) {
 			offset &= vie_size2mask(addrsize);
 			if (offset < low_limit || offset > high_limit)
 				return (-1);
 			offset++;
 			length--;
 		}
 	}
 
 	/*
 	 * In 64-bit mode all segments except %fs and %gs have a segment
 	 * base address of 0.
 	 */
 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
 	    seg != VM_REG_GUEST_GS) {
 		segbase = 0;
 	} else {
 		segbase = desc->base;
 	}
 
 	/*
 	 * Truncate 'firstoff' to the effective address size before adding
 	 * it to the segment base.
 	 */
 	firstoff &= vie_size2mask(addrsize);
 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
 	return (0);
 }
 
 #ifdef _KERNEL
 /*
  * Reset 'vie' to its initial state: every field is zeroed, after which the
  * base and index register selectors are set to VM_REG_LAST.
  */
 void
 vie_init(struct vie *vie)
 {
 
 	bzero(vie, sizeof(*vie));
 
 	vie->index_register = VM_REG_LAST;
 	vie->base_register = VM_REG_LAST;
 }
 
 /*
  * Build a page fault error code from the circumstances of the failed
  * access:
  *
  *   PGEX_P	the page table entry 'pte' has PG_V set
  *   PGEX_W	'prot' includes VM_PROT_WRITE
  *   PGEX_U	'usermode' is non-zero
  *   PGEX_RSV	'rsvd' is non-zero
  *   PGEX_I	'prot' includes VM_PROT_EXECUTE
  */
 static int
 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
 {
 	int code;
 
 	code = 0;
 	code |= (pte & PG_V) ? PGEX_P : 0;
 	code |= (prot & VM_PROT_WRITE) ? PGEX_W : 0;
 	code |= usermode ? PGEX_U : 0;
 	code |= rsvd ? PGEX_RSV : 0;
 	code |= (prot & VM_PROT_EXECUTE) ? PGEX_I : 0;
 
 	return (code);
 }
 
 /*
  * Release the hold recorded in '*cookie', if there is one, and clear the
  * cookie so that a second call does nothing.
  */
 static void
 ptp_release(void **cookie)
 {
 	if (*cookie == NULL)
 		return;
 
 	vm_gpa_release(*cookie);
 	*cookie = NULL;
 }
 
 /*
  * Drop whatever hold '*cookie' currently records, then hold 'len' bytes of
  * guest physical memory at 'ptpphys' with VM_PROT_RW access. The new hold is
  * recorded in '*cookie'.
  *
  * Returns whatever vm_gpa_hold() returns; callers treat NULL as failure.
  */
 static void *
 ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
 {
 
 	ptp_release(cookie);
 	return (vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie));
 }
 
 /*
  * Translate guest linear address 'gla' to a guest physical address by
  * walking the guest's page tables, starting from paging->cr3.
  *
  * The walk depends on paging->paging_mode:
  *   PAGING_MODE_FLAT	'*gpa' is simply 'gla'.
  *   PAGING_MODE_32	two levels of 32-bit entries, 10 index bits each.
  *   PAGING_MODE_PAE	a 4-entry top-level table indexed by gla bits 31:30,
  *			followed by two levels of 64-bit entries.
  *   otherwise		four levels of 64-bit entries, 9 index bits each.
  *
  * At every level the entry must be valid, user-accessible when cpl is 3,
  * and writable when 'prot' includes VM_PROT_WRITE; otherwise a page fault
  * is injected into the vcpu. The accessed bit (and, for writes, the dirty
  * bit of the final entry) is set with an atomic compare-and-set; if that
  * loses a race with a concurrent update the whole walk is restarted.
  *
  * Returns:
  *    0	success, '*gpa' holds the translation
  *    1	an exception (#GP or #PF) was injected into the guest
  *   -1	a page table page could not be held
  */
 int
 vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, int prot, uint64_t *gpa)
 {
 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
 	u_int retries;
 	uint64_t *ptpbase, ptpphys, pte, pgsize;
 	uint32_t *ptpbase32, pte32;
 	void *cookie;
 
 	usermode = (paging->cpl == 3 ? 1 : 0);
 	writable = prot & VM_PROT_WRITE;
 	cookie = NULL;
 	retval = 0;
 	retries = 0;
 restart:
 	ptpphys = paging->cr3;		/* root of the page tables */
 	ptp_release(&cookie);
 	/* Yield on every pass after the first one. */
 	if (retries++ > 0)
 		maybe_yield();
 
 	if (vie_canonical_check(paging->cpu_mode, gla)) {
 		/*
 		 * XXX assuming a non-stack reference otherwise a stack fault
 		 * should be generated.
 		 */
 		vm_inject_gp(vm, vcpuid);
 		goto fault;
 	}
 
 	if (paging->paging_mode == PAGING_MODE_FLAT) {
 		*gpa = gla;
 		goto done;
 	}
 
 	if (paging->paging_mode == PAGING_MODE_32) {
 		nlevels = 2;
 		while (--nlevels >= 0) {
 			/* Zero out the lower 12 bits. */
 			ptpphys &= ~0xfff;
 
 			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
 
 			if (ptpbase32 == NULL)
 				goto error;
 
 			ptpshift = PAGE_SHIFT + nlevels * 10;
 			ptpindex = (gla >> ptpshift) & 0x3FF;
 			pgsize = 1UL << ptpshift;
 
 			pte32 = ptpbase32[ptpindex];
 
 			if ((pte32 & PG_V) == 0 ||
 			    (usermode && (pte32 & PG_U) == 0) ||
 			    (writable && (pte32 & PG_RW) == 0)) {
 				pfcode = pf_error_code(usermode, prot, 0,
 				    pte32);
 				vm_inject_pf(vm, vcpuid, pfcode, gla);
 				goto fault;
 			}
 
 			/*
 			 * Emulate the x86 MMU's management of the accessed
 			 * and dirty flags. While the accessed flag is set
 			 * at every level of the page table, the dirty flag
 			 * is only set at the last level providing the guest
 			 * physical address.
 			 */
 			if ((pte32 & PG_A) == 0) {
 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
 				    pte32, pte32 | PG_A) == 0) {
 					goto restart;
 				}
 			}
 
 			/* XXX must be ignored if CR4.PSE=0 */
 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
 				break;
 
 			ptpphys = pte32;
 		}
 
 		/* Set the dirty bit in the page table entry if necessary */
 		if (writable && (pte32 & PG_M) == 0) {
 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
 			    pte32, pte32 | PG_M) == 0) {
 				goto restart;
 			}
 		}
 
 		/* Zero out the lower 'ptpshift' bits */
 		pte32 >>= ptpshift; pte32 <<= ptpshift;
 		*gpa = pte32 | (gla & (pgsize - 1));
 		goto done;
 	}
 
 	if (paging->paging_mode == PAGING_MODE_PAE) {
 		/* Zero out the lower 5 bits and the upper 32 bits */
 		ptpphys &= 0xffffffe0UL;
 
 		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
 		if (ptpbase == NULL)
 			goto error;
 
 		ptpindex = (gla >> 30) & 0x3;
 
 		pte = ptpbase[ptpindex];
 
 		if ((pte & PG_V) == 0) {
 			pfcode = pf_error_code(usermode, prot, 0, pte);
 			vm_inject_pf(vm, vcpuid, pfcode, gla);
 			goto fault;
 		}
 
 		ptpphys = pte;
 
 		/* The remaining two levels use the common loop below. */
 		nlevels = 2;
 	} else
 		nlevels = 4;
 	while (--nlevels >= 0) {
 		/* Zero out the lower 12 bits and the upper 12 bits */
 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
 
 		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
 		if (ptpbase == NULL)
 			goto error;
 
 		ptpshift = PAGE_SHIFT + nlevels * 9;
 		ptpindex = (gla >> ptpshift) & 0x1FF;
 		pgsize = 1UL << ptpshift;
 
 		pte = ptpbase[ptpindex];
 
 		if ((pte & PG_V) == 0 ||
 		    (usermode && (pte & PG_U) == 0) ||
 		    (writable && (pte & PG_RW) == 0)) {
 			pfcode = pf_error_code(usermode, prot, 0, pte);
 			vm_inject_pf(vm, vcpuid, pfcode, gla);
 			goto fault;
 		}
 
 		/* Set the accessed bit in the page table entry */
 		if ((pte & PG_A) == 0) {
 			if (atomic_cmpset_64(&ptpbase[ptpindex],
 			    pte, pte | PG_A) == 0) {
 				goto restart;
 			}
 		}
 
 		if (nlevels > 0 && (pte & PG_PS) != 0) {
 			/*
 			 * A large-page entry ends the walk. Page sizes above
 			 * 1GB are reported as a reserved-bit page fault.
 			 */
 			if (pgsize > 1 * GB) {
 				pfcode = pf_error_code(usermode, prot, 1, pte);
 				vm_inject_pf(vm, vcpuid, pfcode, gla);
 				goto fault;
 			}
 			break;
 		}
 
 		ptpphys = pte;
 	}
 
 	/* Set the dirty bit in the page table entry if necessary */
 	if (writable && (pte & PG_M) == 0) {
 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
 			goto restart;
 	}
 
 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
 	*gpa = pte | (gla & (pgsize - 1));
 done:
 	/* Common exit: drop the hold on the last page table page. */
 	ptp_release(&cookie);
 	return (retval);
 error:
 	retval = -1;
 	goto done;
 fault:
 	retval = 1;
 	goto done;
 }
 
 int
 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t rip, int inst_length, struct vie *vie)
 {
 	struct vm_copyinfo copyinfo[2];
 	int error, prot;
 
 	if (inst_length > VIE_INST_SIZE)
 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
 
 	prot = PROT_READ | PROT_EXEC;
 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
 	    copyinfo, nitems(copyinfo));
 	if (error == 0) {
 		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 		vie->num_valid = inst_length;
 	}
 	return (error);
 }
 
 /*
  * Fetch the next undecoded instruction byte into '*x' without consuming it.
  *
  * Returns 0 on success, or -1 (leaving '*x' untouched) when every valid
  * byte has already been processed.
  */
 static int
 vie_peek(struct vie *vie, uint8_t *x)
 {
 
 	if (vie->num_processed >= vie->num_valid)
 		return (-1);
 
 	*x = vie->inst[vie->num_processed];
 	return (0);
 }
 
 /*
  * Consume one instruction byte by moving the decode cursor forward.
  */
 static void
 vie_advance(struct vie *vie)
 {
 
 	vie->num_processed += 1;
 }
 
 /*
  * Decode the instruction prefixes and derive the effective operand and
  * address sizes.
  *
  * Recognised prefixes are the operand-size override (66H), the address-size
  * override (67H) and, in 64-bit mode only, a single REX prefix directly
  * after them. 'cs_d' selects 32-bit defaults outside 64-bit mode.
  *
  * Returns 0 on success, or -1 if the instruction bytes run out before a
  * non-prefix byte is found.
  */
 static int
 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
 {
 	uint8_t byte;
 	int dflt, other;
 
 	/* Consume any run of 66H / 67H prefixes. */
 	for (;;) {
 		if (vie_peek(vie, &byte) != 0)
 			return (-1);
 
 		if (byte != 0x66 && byte != 0x67)
 			break;
 
 		if (byte == 0x66)
 			vie->opsize_override = 1;
 		else
 			vie->addrsize_override = 1;
 
 		vie_advance(vie);
 	}
 
 	/*
 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
 	 * - Only one REX prefix is allowed per instruction.
 	 * - The REX prefix must immediately precede the opcode byte or the
 	 *   escape opcode byte.
 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
 	 *   the mandatory prefix must come before the REX prefix.
 	 *
 	 * REX prefixes are the bytes 0x40 through 0x4F; the low nibble holds
 	 * the W, R, X and B bits.
 	 */
 	if (cpu_mode == CPU_MODE_64BIT && (byte & 0xF0) == 0x40) {
 		vie->rex_present = 1;
 		vie->rex_w = (byte >> 3) & 0x1;
 		vie->rex_r = (byte >> 2) & 0x1;
 		vie->rex_x = (byte >> 1) & 0x1;
 		vie->rex_b = byte & 0x1;
 		vie_advance(vie);
 	}
 
 	/*
 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
 	 */
 	if (cpu_mode == CPU_MODE_64BIT) {
 		/*
 		 * Default address size is 64-bits and default operand size
 		 * is 32-bits. REX.W takes precedence over the 66H prefix.
 		 */
 		vie->addrsize = vie->addrsize_override ? 4 : 8;
 		vie->opsize = vie->rex_w ? 8 : (vie->opsize_override ? 2 : 4);
 	} else {
 		/*
 		 * Default address and operand sizes are 32-bits when 'cs_d'
 		 * is set and 16-bits otherwise; an override prefix selects
 		 * the other of the two sizes.
 		 */
 		dflt = cs_d ? 4 : 2;
 		other = cs_d ? 2 : 4;
 		vie->addrsize = vie->addrsize_override ? other : dflt;
 		vie->opsize = vie->opsize_override ? other : dflt;
 	}
 	return (0);
 }
 
 /*
  * Look up the second opcode byte in 'two_byte_opcodes' and store the entry
  * in 'vie->op'.
  *
  * Returns 0 and consumes the byte when the entry is usable.  Returns -1 when
  * no byte is available or the entry has type VIE_OP_TYPE_NONE.
  */
 static int
 decode_two_byte_opcode(struct vie *vie)
 {
 	uint8_t opbyte;
 
 	if (vie_peek(vie, &opbyte) != 0)
 		return (-1);
 
 	vie->op = two_byte_opcodes[opbyte];
 
 	if (vie->op.op_type != VIE_OP_TYPE_NONE) {
 		vie_advance(vie);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Look up the opcode byte in 'one_byte_opcodes' and store the entry in
  * 'vie->op'.  An entry of type VIE_OP_TYPE_TWO_BYTE hands the following
  * byte to decode_two_byte_opcode().
  *
  * Returns 0 on success, or -1 when no byte is available or the opcode has
  * no usable table entry.
  */
 static int
 decode_opcode(struct vie *vie)
 {
 	uint8_t opbyte;
 
 	if (vie_peek(vie, &opbyte) != 0)
 		return (-1);
 
 	vie->op = one_byte_opcodes[opbyte];
 
 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
 		return (-1);
 
 	vie_advance(vie);
 
 	return (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE ?
 	    decode_two_byte_opcode(vie) : 0);
 }
 
 /*
  * Decode the ModR/M byte: split it into mod/reg/rm, fold in the REX.R and
  * REX.B bits, and, when no SIB byte follows, pick the base register and the
  * displacement size.
  *
  * Returns 0 on success (including opcodes flagged VIE_OP_F_NO_MODRM, for
  * which nothing is consumed).  Returns -1 in real mode, when the byte is
  * missing, or when the encoding is register-direct.
  */
 static int
 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
 {
 	uint8_t modrm;
 
 	/* Decoding is refused in real mode. */
 	if (cpu_mode == CPU_MODE_REAL)
 		return (-1);
 
 	if ((vie->op.op_flags & VIE_OP_F_NO_MODRM) != 0)
 		return (0);
 
 	if (vie_peek(vie, &modrm) != 0)
 		return (-1);
 
 	vie->mod = (modrm >> 6) & 0x3;
 	vie->reg = (modrm >> 3) & 0x7;
 	vie->rm = (modrm >> 0) & 0x7;
 
 	/*
 	 * A direct addressing mode makes no sense in the context of an EPT
 	 * fault. There has to be a memory access involved to cause the
 	 * EPT fault.
 	 */
 	if (vie->mod == VIE_MOD_DIRECT)
 		return (-1);
 
 	/*
 	 * Table 2-5: Special Cases of REX Encodings
 	 *
 	 * mod=0, r/m=5 is used in the compatibility mode to indicate a
 	 * disp32 without a base register, and mod!=3, r/m=4 indicates that
 	 * a SIB byte is present.  The 'b' bit in the REX prefix is don't
 	 * care in both cases, so it is applied to 'rm' only otherwise.
 	 * (mod is known not to be VIE_MOD_DIRECT at this point.)
 	 */
 	if (vie->rm != VIE_RM_SIB &&
 	    !(vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32))
 		vie->rm |= (vie->rex_b << 3);
 
 	vie->reg |= (vie->rex_r << 3);
 
 	/* With a SIB byte the base and displacement come from decode_sib(). */
 	if (vie->rm != VIE_RM_SIB) {
 		vie->base_register = gpr_map[vie->rm];
 
 		if (vie->mod == VIE_MOD_INDIRECT_DISP8) {
 			vie->disp_bytes = 1;
 		} else if (vie->mod == VIE_MOD_INDIRECT_DISP32) {
 			vie->disp_bytes = 4;
 		} else if (vie->mod == VIE_MOD_INDIRECT &&
 		    vie->rm == VIE_RM_DISP32) {
 			vie->disp_bytes = 4;
 			/*
 			 * Table 2-7. RIP-Relative Addressing
 			 *
 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
 			 * whereas in compatibility mode it just implies disp32.
 			 */
 			if (cpu_mode == CPU_MODE_64BIT)
 				vie->base_register = VM_REG_GUEST_RIP;
 			else
 				vie->base_register = VM_REG_LAST;
 		}
 	}
 
 	vie_advance(vie);
 
 	return (0);
 }
 
 /*
  * Decode the SIB byte, if the ModR/M byte announced one: extract scale,
  * index and base, apply REX.X / REX.B, and record the base register, index
  * register, scale factor and displacement size.
  *
  * Returns 0 on success or when there is no SIB byte to decode, and -1 when
  * the SIB byte is expected but no instruction byte is left.
  */
 static int
 decode_sib(struct vie *vie)
 {
 	uint8_t sib;
 
 	/* Nothing to do unless a SIB byte is present. */
 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
 		return (0);
 
 	if (vie_peek(vie, &sib) != 0)
 		return (-1);
 
 	/* Split the byte into its fields, then apply the REX modifiers. */
 	vie->ss = (sib >> 6) & 0x3;
 	vie->index = (sib >> 3) & 0x7;
 	vie->base = (sib >> 0) & 0x7;
 
 	vie->index |= vie->rex_x << 3;
 	vie->base |= vie->rex_b << 3;
 
 	if (vie->mod == VIE_MOD_INDIRECT_DISP8)
 		vie->disp_bytes = 1;
 	else if (vie->mod == VIE_MOD_INDIRECT_DISP32)
 		vie->disp_bytes = 4;
 
 	if (vie->mod == VIE_MOD_INDIRECT &&
 	    (vie->base == 5 || vie->base == 13)) {
 		/*
 		 * With mod = 0 and base = %rbp or %r13 the base register is
 		 * unused and a 32-bit displacement follows instead.
 		 *
 		 * Documented in:
 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
 		 * Table 2-5: Special Cases of REX Encodings
 		 */
 		vie->disp_bytes = 4;
 	} else {
 		vie->base_register = gpr_map[vie->base];
 	}
 
 	/*
 	 * Every 'index' encoding names a register except %rsp (4), which
 	 * means "no index".
 	 *
 	 * Documented in:
 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
 	 * Table 2-5: Special Cases of REX Encodings
 	 */
 	if (vie->index != 4)
 		vie->index_register = gpr_map[vie->index];
 
 	/* A scale factor is only meaningful alongside an index register. */
 	if (vie->index_register < VM_REG_LAST)
 		vie->scale = 1 << vie->ss;
 
 	vie_advance(vie);
 
 	return (0);
 }
 
 /*
  * Read the 'vie->disp_bytes' displacement bytes that follow ModR/M / SIB and
  * store the sign-extended value in 'vie->displacement'.
  *
  * Returns 0 on success or when there is no displacement, and -1 when the
  * instruction bytes run out.  Panics if 'disp_bytes' is neither 1 nor 4.
  */
 static int
 decode_displacement(struct vie *vie)
 {
 	int nbytes, pos;
 	uint8_t byte;
 
 	union {
 		char	buf[4];
 		int8_t	signed8;
 		int32_t	signed32;
 	} u;
 
 	nbytes = vie->disp_bytes;
 	if (nbytes == 0)
 		return (0);
 
 	if (nbytes != 1 && nbytes != 4)
 		panic("decode_displacement: invalid disp_bytes %d", nbytes);
 
 	/* Gather the raw bytes in instruction order. */
 	for (pos = 0; pos < nbytes; pos++) {
 		if (vie_peek(vie, &byte) != 0)
 			return (-1);
 
 		u.buf[pos] = byte;
 		vie_advance(vie);
 	}
 
 	/* Reading through the signed member sign-extends the value. */
 	vie->displacement = (nbytes == 1) ? u.signed8 : u.signed32;
 
 	return (0);
 }
 
 /*
  * Work out the size of the immediate operand from the opcode flags, read
  * that many bytes and store the sign-extended value in 'vie->immediate'.
  *
  * Returns 0 on success or when the instruction has no immediate, and -1
  * when the instruction bytes run out.
  */
 static int
 decode_immediate(struct vie *vie)
 {
 	int nbytes, pos;
 	uint8_t byte;
 	union {
 		char	buf[4];
 		int8_t	signed8;
 		int16_t	signed16;
 		int32_t	signed32;
 	} u;
 
 	if (vie->op.op_flags & VIE_OP_F_IMM) {
 		/*
 		 * Section 2.2.1.5 "Immediates", Intel SDM:
 		 * In 64-bit mode the typical size of immediate operands
 		 * remains 32-bits. When the operand size is 64-bits, the
 		 * processor sign-extends all immediates to 64-bits prior
 		 * to their use.
 		 */
 		vie->imm_bytes = (vie->opsize == 4 || vie->opsize == 8) ? 4 : 2;
 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
 		vie->imm_bytes = 1;
 	}
 
 	nbytes = vie->imm_bytes;
 	if (nbytes == 0)
 		return (0);
 
 	KASSERT(nbytes == 1 || nbytes == 2 || nbytes == 4,
 	    ("%s: invalid number of immediate bytes: %d", __func__, nbytes));
 
 	/* Gather the raw bytes in instruction order. */
 	for (pos = 0; pos < nbytes; pos++) {
 		if (vie_peek(vie, &byte) != 0)
 			return (-1);
 
 		u.buf[pos] = byte;
 		vie_advance(vie);
 	}
 
 	/* Reading through the signed member sign-extends the value. */
 	switch (nbytes) {
 	case 1:
 		vie->immediate = u.signed8;
 		break;
 	case 2:
 		vie->immediate = u.signed16;
 		break;
 	default:
 		vie->immediate = u.signed32;
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * For opcodes flagged VIE_OP_F_MOFFSET, read the direct memory offset that
  * follows the opcode and store it, zero-extended, in 'vie->displacement'.
  *
  * Returns 0 on success or when the opcode takes no memory offset, and -1
  * when the instruction bytes run out.
  */
 static int
 decode_moffset(struct vie *vie)
 {
 	int nbytes, pos;
 	uint8_t byte;
 	union {
 		char	buf[8];
 		uint64_t u64;
 	} u;
 
 	if (!(vie->op.op_flags & VIE_OP_F_MOFFSET))
 		return (0);
 
 	/*
 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
 	 * The memory offset size follows the address-size of the instruction.
 	 */
 	nbytes = vie->addrsize;
 	KASSERT(nbytes == 2 || nbytes == 4 || nbytes == 8,
 	    ("invalid moffset bytes: %d", nbytes));
 
 	/* Clear all eight bytes first so a short offset is zero-extended. */
 	u.u64 = 0;
 	for (pos = 0; pos < nbytes; pos++) {
 		if (vie_peek(vie, &byte) != 0)
 			return (-1);
 
 		u.buf[pos] = byte;
 		vie_advance(vie);
 	}
 	vie->displacement = u.u64;
 
 	return (0);
 }
 
 /*
  * Check that decoding consumed exactly the bytes held in the instruction
  * buffer.  Returns 0 when 'num_processed' equals 'num_valid', -1 otherwise.
  */
 static int
 verify_inst_length(struct vie *vie)
 {
 
 	return (vie->num_processed == vie->num_valid ? 0 : -1);
 }
 
 /*
  * Cross-check the decoded addressing form against the 'guest linear address'
  * supplied as collateral of the nested page table fault.
  *
  * The address is recomputed as base + scale * index + displacement, masked
  * to the instruction's address size, and compared with 'gla'.  Passing
  * VIE_INVALID_GLA skips the check.
  *
  * Returns 0 when the addresses agree or the check is skipped, and -1 on a
  * mismatch or when a guest register cannot be read.
  */
 static int
 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
 {
 	int rc;
 	uint64_t base, idx, gla2;
 
 	/* Skip 'gla' verification */
 	if (gla == VIE_INVALID_GLA)
 		return (0);
 
 	base = 0;
 	idx = 0;
 
 	if (vie->base_register != VM_REG_LAST) {
 		rc = vm_get_register(vm, cpuid, vie->base_register, &base);
 		if (rc != 0) {
 			printf("verify_gla: error %d getting base reg %d\n",
 				rc, vie->base_register);
 			return (-1);
 		}
 
 		/*
 		 * RIP-relative addressing is relative to the instruction
 		 * that follows this one.
 		 */
 		if (vie->base_register == VM_REG_GUEST_RIP)
 			base += vie->num_valid;
 	}
 
 	if (vie->index_register != VM_REG_LAST) {
 		rc = vm_get_register(vm, cpuid, vie->index_register, &idx);
 		if (rc != 0) {
 			printf("verify_gla: error %d getting index reg %d\n",
 				rc, vie->index_register);
 			return (-1);
 		}
 	}
 
 	/* XXX assuming that the base address of the segment is 0 */
 	gla2 = base + vie->scale * idx + vie->displacement;
 	gla2 &= size2mask[vie->addrsize];
 	if (gla == gla2)
 		return (0);
 
 	printf("verify_gla mismatch: "
 	       "base(0x%0lx), scale(%d), index(0x%0lx), "
 	       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
 	       base, vie->scale, idx, vie->displacement, gla, gla2);
 	return (-1);
 }
 
 /*
  * Decode the instruction bytes held in 'vie'.
  *
  * The stages run in order: prefixes, opcode, ModR/M, SIB, displacement,
  * immediate and memory offset.  Afterwards the consumed length and the
  * guest linear address 'gla' are verified.  The first stage to fail stops
  * the decode.
  *
  * Returns 0 and sets 'vie->decoded' on success; returns -1 on any failure.
  */
 int
 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
 {
 
 	/* Short-circuit evaluation stops at the first failing stage. */
 	if (decode_prefixes(vie, cpu_mode, cs_d) ||
 	    decode_opcode(vie) ||
 	    decode_modrm(vie, cpu_mode) ||
 	    decode_sib(vie) ||
 	    decode_displacement(vie) ||
 	    decode_immediate(vie) ||
 	    decode_moffset(vie) ||
 	    verify_inst_length(vie) ||
 	    verify_gla(vm, cpuid, gla, vie))
 		return (-1);
 
 	vie->decoded = 1;	/* success */
 
 	return (0);
 }
 #endif	/* _KERNEL */
Index: stable/10/sys/amd64/vmm/x86.c
===================================================================
--- stable/10/sys/amd64/vmm/x86.c	(revision 276348)
+++ stable/10/sys/amd64/vmm/x86.c	(revision 276349)
@@ -1,368 +1,421 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
 #include <sys/cpuset.h>
+#include <sys/sysctl.h>
 
 #include <machine/clock.h>
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/segments.h>
 #include <machine/specialreg.h>
 
 #include <machine/vmm.h>
 
 #include "vmm_host.h"
 #include "x86.h"
 
+SYSCTL_DECL(_hw_vmm);
+static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
+
 #define	CPUID_VM_HIGH		0x40000000
 
 static const char bhyve_id[12] = "bhyve bhyve ";
 
 static uint64_t bhyve_xcpuids;
 
+/*
+ * Guest CPU topology knobs, published under the hw.vmm.topology sysctl node
+ * and declared CTLFLAG_RDTUN.
+ *
+ * threads_per_core and cores_per_package both default to 1, so the default
+ * CPU topology is a single thread per package.  x86_emulate_cpuid() uses
+ * them when building CPUID leaves 1, 4 and 0xB.
+ *
+ * cpuid_leaf_b defaults to 1; when it is zero every sub-leaf of CPUID leaf
+ * 0xB is reported with a zero width, zero logical processor count, level
+ * type 0 and x2APIC id 0.
+ */
+static u_int threads_per_core = 1;
+SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
+    &threads_per_core, 0, NULL);
+
+static u_int cores_per_package = 1;
+SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
+    &cores_per_package, 0, NULL);
+
+static int cpuid_leaf_b = 1;
+SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
+    &cpuid_leaf_b, 0, NULL);
+
+/*
+ * Round up to the next power of two, if necessary, and then take log2.
+ * Returns -1 if argument is zero.
+ *
+ * The rounding is done by shifting 'x' left one bit when powerof2(x) is
+ * false, so that fls() of the result, minus one, is the rounded-up log2
+ * (e.g. 3 -> 2, 4 -> 2, 5 -> 3).
+ *
+ * NOTE(review): assumes powerof2() yields exactly 1 for powers of two and
+ * for zero, and 0 otherwise -- confirm against <sys/param.h>.
+ * NOTE(review): a non-power-of-two 'x' with its top bit set loses that bit
+ * in the shift; presumably callers only pass small CPU counts -- confirm.
+ */
+static __inline int
+log2(u_int x)
+{
+
+	return (fls(x << (1 - powerof2(x))) - 1);
+}
+
 int
 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
 {
 	const struct xsave_limits *limits;
 	uint64_t cr4;
-	int error, enable_invpcid;
-	unsigned int 	func, regs[4];
+	int error, enable_invpcid, level, width, x2apic_id;
+	unsigned int func, regs[4], logical_cpus;
 	enum x2apic_state x2apic_state;
 
 	/*
 	 * Requests for invalid CPUID levels should map to the highest
 	 * available level instead.
 	 */
 	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
 		if (*eax > cpu_exthigh)
 			*eax = cpu_exthigh;
 	} else if (*eax >= 0x40000000) {
 		if (*eax > CPUID_VM_HIGH)
 			*eax = CPUID_VM_HIGH;
 	} else if (*eax > cpu_high) {
 		*eax = cpu_high;
 	}
 
 	func = *eax;
 
 	/*
 	 * By default the CPU topology advertised to the guest is flat:
 	 * every vcpu is its own package with no multi-core or SMT.  The
 	 * hw.vmm.topology tunables above can request a richer layout.
 	 */
 	switch (func) {
 		/*
 		 * Pass these through to the guest
 		 */
 		case CPUID_0000_0000:
 		case CPUID_0000_0002:
 		case CPUID_0000_0003:
 		case CPUID_8000_0000:
 		case CPUID_8000_0002:
 		case CPUID_8000_0003:
 		case CPUID_8000_0004:
 		case CPUID_8000_0006:
 		case CPUID_8000_0008:
 			cpuid_count(*eax, *ecx, regs);
 			break;
 
 		case CPUID_8000_0001:
 			/*
 			 * Hide rdtscp/ia32_tsc_aux until we know how
 			 * to deal with them.
 			 */
 			cpuid_count(*eax, *ecx, regs);
 			regs[3] &= ~AMDID_RDTSCP;
 			break;
 
 		case CPUID_8000_0007:
 			cpuid_count(*eax, *ecx, regs);
 			/*
 			 * If the host TSCs are not synchronized across
 			 * physical cpus then we cannot advertise an
 			 * invariant tsc to a vcpu.
 			 *
 			 * XXX This still falls short because the vcpu
 			 * can observe the TSC moving backwards as it
 			 * migrates across physical cpus. But at least
 			 * it should discourage the guest from using the
 			 * TSC to keep track of time.
 			 */
 			if (!smp_tsc)
 				regs[3] &= ~AMDPM_TSC_INVARIANT;
 			break;
 
 		case CPUID_0000_0001:
 			do_cpuid(1, regs);
 
 			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
 			if (error) {
 				panic("x86_emulate_cpuid: error %d "
 				      "fetching x2apic state", error);
 			}
 
 			/*
 			 * Override the APIC ID only in ebx
 			 */
 			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
 			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
 
 			/*
 			 * Don't expose VMX, SpeedStep or TM2 capability.
 			 * Advertise x2APIC capability and Hypervisor guest.
 			 */
 			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
 
 			regs[2] |= CPUID2_HV;
 
 			if (x2apic_state != X2APIC_DISABLED)
 				regs[2] |= CPUID2_X2APIC;
 			else
 				regs[2] &= ~CPUID2_X2APIC;
 
 			/*
 			 * Only advertise CPUID2_XSAVE in the guest if
 			 * the host is using XSAVE.
 			 */
 			if (!(regs[2] & CPUID2_OSXSAVE))
 				regs[2] &= ~CPUID2_XSAVE;
 
 			/*
 			 * If CPUID2_XSAVE is being advertised and the
 			 * guest has set CR4_XSAVE, set
 			 * CPUID2_OSXSAVE.
 			 */
 			regs[2] &= ~CPUID2_OSXSAVE;
 			if (regs[2] & CPUID2_XSAVE) {
 				error = vm_get_register(vm, vcpu_id,
 				    VM_REG_GUEST_CR4, &cr4);
 				if (error)
 					panic("x86_emulate_cpuid: error %d "
 					      "fetching %%cr4", error);
 				if (cr4 & CR4_XSAVE)
 					regs[2] |= CPUID2_OSXSAVE;
 			}
 
 			/*
 			 * Hide monitor/mwait until we know how to deal with
 			 * these instructions.
 			 */
 			regs[2] &= ~CPUID2_MON;
 
                         /*
 			 * Hide the performance and debug features.
 			 */
 			regs[2] &= ~CPUID2_PDCM;
 
 			/*
 			 * No TSC deadline support in the APIC yet
 			 */
 			regs[2] &= ~CPUID2_TSCDLT;
 
 			/*
 			 * Hide thermal monitoring
 			 */
 			regs[3] &= ~(CPUID_ACPI | CPUID_TM);
 			
 			/*
 			 * Machine check handling is done in the host.
 			 * Hide MTRR capability.
 			 */
 			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
 
                         /*
                         * Hide the debug store capability.
                         */
 			regs[3] &= ~CPUID_DS;
 
-			/*
-			 * Disable multi-core.
-			 */
+			logical_cpus = threads_per_core * cores_per_package;
 			regs[1] &= ~CPUID_HTT_CORES;
-			regs[3] &= ~CPUID_HTT;
+			regs[1] |= (logical_cpus & 0xff) << 16;
+			regs[3] |= CPUID_HTT;
 			break;
 
 		case CPUID_0000_0004:
-			do_cpuid(4, regs);
+			cpuid_count(*eax, *ecx, regs);
 
-			/*
-			 * Do not expose topology.
-			 *
-			 * The maximum number of processor cores in
-			 * this physical processor package and the
-			 * maximum number of threads sharing this
-			 * cache are encoded with "plus 1" encoding.
-			 * Adding one to the value in this register
-			 * field to obtains the actual value.
-			 *
-			 * Therefore 0 for both indicates 1 core per
-			 * package and no cache sharing.
-			 */
-			regs[0] &= 0xffff8000;
+			if (regs[0] || regs[1] || regs[2] || regs[3]) {
+				regs[0] &= 0x3ff;
+				regs[0] |= (cores_per_package - 1) << 26;
+				/*
+				 * Cache topology:
+				 * - L1 and L2 are shared only by the logical
+				 *   processors in a single core.
+				 * - L3 and above are shared by all logical
+				 *   processors in the package.
+				 */
+				logical_cpus = threads_per_core;
+				level = (regs[0] >> 5) & 0x7;
+				if (level >= 3)
+					logical_cpus *= cores_per_package;
+				regs[0] |= (logical_cpus - 1) << 14;
+			}
 			break;
 
 		case CPUID_0000_0007:
 			regs[0] = 0;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 
 			/* leaf 0 */
 			if (*ecx == 0) {
 				cpuid_count(*eax, *ecx, regs);
 
 				/* Only leaf 0 is supported */
 				regs[0] = 0;
 
 				/*
 				 * Expose known-safe features.
 				 */
 				regs[1] &= (CPUID_STDEXT_FSGSBASE |
 				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
 				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
 				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
 				    CPUID_STDEXT_AVX512F |
 				    CPUID_STDEXT_AVX512PF |
 				    CPUID_STDEXT_AVX512ER |
 				    CPUID_STDEXT_AVX512CD);
 				regs[2] = 0;
 				regs[3] = 0;
 
 				/* Advertise INVPCID if it is enabled. */
 				error = vm_get_capability(vm, vcpu_id,
 				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
 				if (error == 0 && enable_invpcid)
 					regs[1] |= CPUID_STDEXT_INVPCID;
 			}
 			break;
 
 		case CPUID_0000_0006:
 		case CPUID_0000_000A:
 			/*
 			 * Handle the access, but report 0 for
 			 * all options
 			 */
 			regs[0] = 0;
 			regs[1] = 0;
 			regs[2] = 0;
 			regs[3] = 0;
 			break;
 
 		case CPUID_0000_000B:
 			/*
 			 * Processor topology enumeration
 			 */
-			regs[0] = 0;
-			regs[1] = 0;
-			regs[2] = *ecx & 0xff;
-			regs[3] = vcpu_id;
+			if (*ecx == 0) {
+				logical_cpus = threads_per_core;
+				width = log2(logical_cpus);
+				level = CPUID_TYPE_SMT;
+				x2apic_id = vcpu_id;
+			}
+
+			if (*ecx == 1) {
+				logical_cpus = threads_per_core *
+				    cores_per_package;
+				width = log2(logical_cpus);
+				level = CPUID_TYPE_CORE;
+				x2apic_id = vcpu_id;
+			}
+
+			if (!cpuid_leaf_b || *ecx >= 2) {
+				width = 0;
+				logical_cpus = 0;
+				level = 0;
+				x2apic_id = 0;
+			}
+
+			regs[0] = width & 0x1f;
+			regs[1] = logical_cpus & 0xffff;
+			regs[2] = (level << 8) | (*ecx & 0xff);
+			regs[3] = x2apic_id;
 			break;
 
 		case CPUID_0000_000D:
 			limits = vmm_get_xsave_limits();
 			if (!limits->xsave_enabled) {
 				regs[0] = 0;
 				regs[1] = 0;
 				regs[2] = 0;
 				regs[3] = 0;
 				break;
 			}
 
 			cpuid_count(*eax, *ecx, regs);
 			switch (*ecx) {
 			case 0:
 				/*
 				 * Only permit the guest to use bits
 				 * that are active in the host in
 				 * %xcr0.  Also, claim that the
 				 * maximum save area size is
 				 * equivalent to the host's current
 				 * save area size.  Since this runs
 				 * "inside" of vmrun(), it runs with
 				 * the guest's xcr0, so the current
 				 * save area size is correct as-is.
 				 */
 				regs[0] &= limits->xcr0_allowed;
 				regs[2] = limits->xsave_max_size;
 				regs[3] &= (limits->xcr0_allowed >> 32);
 				break;
 			case 1:
 				/* Only permit XSAVEOPT. */
 				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
 				regs[1] = 0;
 				regs[2] = 0;
 				regs[3] = 0;
 				break;
 			default:
 				/*
 				 * If the leaf is for a permitted feature,
 				 * pass through as-is, otherwise return
 				 * all zeroes.
 				 */
 				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
 					regs[0] = 0;
 					regs[1] = 0;
 					regs[2] = 0;
 					regs[3] = 0;
 				}
 				break;
 			}
 			break;
 
 		case 0x40000000:
 			regs[0] = CPUID_VM_HIGH;
 			bcopy(bhyve_id, &regs[1], 4);
 			bcopy(bhyve_id + 4, &regs[2], 4);
 			bcopy(bhyve_id + 8, &regs[3], 4);
 			break;
 
 		default:
 			/*
 			 * The leaf value has already been clamped so
 			 * simply pass this through, keeping count of
 			 * how many unhandled leaf values have been seen.
 			 */
 			atomic_add_long(&bhyve_xcpuids, 1);
 			cpuid_count(*eax, *ecx, regs);
 			break;
 	}
 
 	*eax = regs[0];
 	*ebx = regs[1];
 	*ecx = regs[2];
 	*edx = regs[3];
 
 	return (1);
 }
Index: stable/10/sys/modules/vmm/Makefile
===================================================================
--- stable/10/sys/modules/vmm/Makefile	(revision 276348)
+++ stable/10/sys/modules/vmm/Makefile	(revision 276349)
@@ -1,66 +1,65 @@
 # $FreeBSD$
 
 KMOD=	vmm
 
 SRCS=	opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h
 
 CFLAGS+= -DVMM_KEEP_STATS -DSMP
 CFLAGS+= -I${.CURDIR}/../../amd64/vmm
 CFLAGS+= -I${.CURDIR}/../../amd64/vmm/io
 CFLAGS+= -I${.CURDIR}/../../amd64/vmm/intel
 
 # generic vmm support
 .PATH: ${.CURDIR}/../../amd64/vmm
 SRCS+=	vmm.c		\
 	vmm_dev.c	\
 	vmm_host.c	\
 	vmm_instruction_emul.c	\
 	vmm_ioport.c	\
 	vmm_ipi.c	\
 	vmm_lapic.c	\
 	vmm_mem.c	\
-	vmm_msr.c	\
 	vmm_stat.c	\
 	vmm_util.c	\
 	x86.c		\
 	vmm_support.S
 
 .PATH: ${.CURDIR}/../../amd64/vmm/io
 SRCS+=	iommu.c		\
 	ppt.c           \
 	vatpic.c	\
 	vatpit.c	\
 	vhpet.c		\
 	vioapic.c	\
 	vlapic.c
 
 # intel-specific files
 .PATH: ${.CURDIR}/../../amd64/vmm/intel
 SRCS+=	ept.c		\
 	vmcs.c		\
 	vmx_msr.c	\
 	vmx.c		\
 	vtd.c
 
 # amd-specific files
 .PATH: ${.CURDIR}/../../amd64/vmm/amd
 SRCS+=	amdv.c
 
 OBJS=	vmx_support.o
 
 CLEANFILES=	vmx_assym.s vmx_genassym.o
 
 vmx_assym.s:    vmx_genassym.o
 .if exists(@)
 vmx_assym.s:    @/kern/genassym.sh
 .endif
 	sh @/kern/genassym.sh vmx_genassym.o > ${.TARGET}
 
 vmx_support.o:	vmx_support.S vmx_assym.s
 	${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
 	    ${.IMPSRC} -o ${.TARGET}
 
 vmx_genassym.o: vmx_genassym.c @ machine x86
 	${CC} -c ${CFLAGS:N-fno-common} ${.IMPSRC}
 
 .include <bsd.kmod.mk>
Index: stable/10/sys/x86/include/specialreg.h
===================================================================
--- stable/10/sys/x86/include/specialreg.h	(revision 276348)
+++ stable/10/sys/x86/include/specialreg.h	(revision 276349)
@@ -1,819 +1,827 @@
 /*-
  * Copyright (c) 1991 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)specialreg.h	7.1 (Berkeley) 5/9/91
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_SPECIALREG_H_
 #define	_MACHINE_SPECIALREG_H_
 
 /*
  * Bits in 386 special registers:
  */
 #define	CR0_PE	0x00000001	/* Protected mode Enable */
 #define	CR0_MP	0x00000002	/* "Math" (fpu) Present */
 #define	CR0_EM	0x00000004	/* EMulate FPU instructions. (trap ESC only) */
 #define	CR0_TS	0x00000008	/* Task Switched (if MP, trap ESC and WAIT) */
 #define	CR0_PG	0x80000000	/* PaGing enable */
 
 /*
  * Bits in 486 special registers:
  */
 #define	CR0_NE	0x00000020	/* Numeric Error enable (EX16 vs IRQ13) */
 #define	CR0_WP	0x00010000	/* Write Protect (honor page protect in
 							   all modes) */
 #define	CR0_AM	0x00040000	/* Alignment Mask (set to enable AC flag) */
 #define	CR0_NW  0x20000000	/* Not Write-through */
 #define	CR0_CD  0x40000000	/* Cache Disable */
 
 #define	CR3_PCID_SAVE 0x8000000000000000
 
 /*
  * Bits in PPro special registers
  */
 #define	CR4_VME	0x00000001	/* Virtual 8086 mode extensions */
 #define	CR4_PVI	0x00000002	/* Protected-mode virtual interrupts */
 #define	CR4_TSD	0x00000004	/* Time stamp disable */
 #define	CR4_DE	0x00000008	/* Debugging extensions */
 #define	CR4_PSE	0x00000010	/* Page size extensions */
 #define	CR4_PAE	0x00000020	/* Physical address extension */
 #define	CR4_MCE	0x00000040	/* Machine check enable */
 #define	CR4_PGE	0x00000080	/* Page global enable */
 #define	CR4_PCE	0x00000100	/* Performance monitoring counter enable */
 #define	CR4_FXSR 0x00000200	/* Fast FPU save/restore used by OS */
 #define	CR4_XMM	0x00000400	/* enable SIMD/MMX2 to use except 16 */
 #define	CR4_VMXE 0x00002000	/* enable VMX operation (Intel-specific) */
 #define	CR4_FSGSBASE 0x00010000	/* Enable FS/GS BASE accessing instructions */
 #define	CR4_PCIDE 0x00020000	/* Enable Context ID */
 #define	CR4_XSAVE 0x00040000	/* XSETBV/XGETBV */
 #define	CR4_SMEP 0x00100000	/* Supervisor-Mode Execution Prevention */
 
 /*
  * Bits in AMD64 special registers.  EFER is 64 bits wide.
  */
 #define	EFER_SCE 0x000000001	/* System Call Extensions (R/W) */
 #define	EFER_LME 0x000000100	/* Long mode enable (R/W) */
 #define	EFER_LMA 0x000000400	/* Long mode active (R) */
 #define	EFER_NXE 0x000000800	/* PTE No-Execute bit enable (R/W) */
 
 /*
  * Intel Extended Features registers
  */
 #define	XCR0	0		/* XFEATURE_ENABLED_MASK register */
 
 #define	XFEATURE_ENABLED_X87		0x00000001
 #define	XFEATURE_ENABLED_SSE		0x00000002
 #define	XFEATURE_ENABLED_YMM_HI128	0x00000004
 #define	XFEATURE_ENABLED_AVX		XFEATURE_ENABLED_YMM_HI128
 #define	XFEATURE_ENABLED_BNDREGS	0x00000008
 #define	XFEATURE_ENABLED_BNDCSR		0x00000010
 #define	XFEATURE_ENABLED_OPMASK		0x00000020
 #define	XFEATURE_ENABLED_ZMM_HI256	0x00000040
 #define	XFEATURE_ENABLED_HI16_ZMM	0x00000080
 
 #define	XFEATURE_AVX					\
     (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX)
 #define	XFEATURE_AVX512						\
     (XFEATURE_ENABLED_OPMASK | XFEATURE_ENABLED_ZMM_HI256 |	\
     XFEATURE_ENABLED_HI16_ZMM)
 #define	XFEATURE_MPX					\
     (XFEATURE_ENABLED_BNDREGS | XFEATURE_ENABLED_BNDCSR)
 
 /*
  * CPUID instruction features register
  */
 #define	CPUID_FPU	0x00000001
 #define	CPUID_VME	0x00000002
 #define	CPUID_DE	0x00000004
 #define	CPUID_PSE	0x00000008
 #define	CPUID_TSC	0x00000010
 #define	CPUID_MSR	0x00000020
 #define	CPUID_PAE	0x00000040
 #define	CPUID_MCE	0x00000080
 #define	CPUID_CX8	0x00000100
 #define	CPUID_APIC	0x00000200
 #define	CPUID_B10	0x00000400
 #define	CPUID_SEP	0x00000800
 #define	CPUID_MTRR	0x00001000
 #define	CPUID_PGE	0x00002000
 #define	CPUID_MCA	0x00004000
 #define	CPUID_CMOV	0x00008000
 #define	CPUID_PAT	0x00010000
 #define	CPUID_PSE36	0x00020000
 #define	CPUID_PSN	0x00040000
 #define	CPUID_CLFSH	0x00080000
 #define	CPUID_B20	0x00100000
 #define	CPUID_DS	0x00200000
 #define	CPUID_ACPI	0x00400000
 #define	CPUID_MMX	0x00800000
 #define	CPUID_FXSR	0x01000000
 #define	CPUID_SSE	0x02000000
 #define	CPUID_XMM	0x02000000
 #define	CPUID_SSE2	0x04000000
 #define	CPUID_SS	0x08000000
 #define	CPUID_HTT	0x10000000
 #define	CPUID_TM	0x20000000
 #define	CPUID_IA64	0x40000000
 #define	CPUID_PBE	0x80000000
 
 #define	CPUID2_SSE3	0x00000001
 #define	CPUID2_PCLMULQDQ 0x00000002
 #define	CPUID2_DTES64	0x00000004
 #define	CPUID2_MON	0x00000008
 #define	CPUID2_DS_CPL	0x00000010
 #define	CPUID2_VMX	0x00000020
 #define	CPUID2_SMX	0x00000040
 #define	CPUID2_EST	0x00000080
 #define	CPUID2_TM2	0x00000100
 #define	CPUID2_SSSE3	0x00000200
 #define	CPUID2_CNXTID	0x00000400
 #define	CPUID2_FMA	0x00001000
 #define	CPUID2_CX16	0x00002000
 #define	CPUID2_XTPR	0x00004000
 #define	CPUID2_PDCM	0x00008000
 #define	CPUID2_PCID	0x00020000
 #define	CPUID2_DCA	0x00040000
 #define	CPUID2_SSE41	0x00080000
 #define	CPUID2_SSE42	0x00100000
 #define	CPUID2_X2APIC	0x00200000
 #define	CPUID2_MOVBE	0x00400000
 #define	CPUID2_POPCNT	0x00800000
 #define	CPUID2_TSCDLT	0x01000000
 #define	CPUID2_AESNI	0x02000000
 #define	CPUID2_XSAVE	0x04000000
 #define	CPUID2_OSXSAVE	0x08000000
 #define	CPUID2_AVX	0x10000000
 #define	CPUID2_F16C	0x20000000
 #define	CPUID2_RDRAND	0x40000000
 #define	CPUID2_HV	0x80000000
 
 /*
  * Important bits in the Thermal and Power Management flags
  * CPUID.6 EAX and ECX.
  */
 #define	CPUTPM1_SENSOR	0x00000001
 #define	CPUTPM1_TURBO	0x00000002
 #define	CPUTPM1_ARAT	0x00000004
 #define	CPUTPM2_EFFREQ	0x00000001
 
 /*
  * Important bits in the AMD extended cpuid flags
  */
 #define	AMDID_SYSCALL	0x00000800
 #define	AMDID_MP	0x00080000
 #define	AMDID_NX	0x00100000
 #define	AMDID_EXT_MMX	0x00400000
 #define	AMDID_FFXSR	0x01000000
 #define	AMDID_PAGE1GB	0x04000000
 #define	AMDID_RDTSCP	0x08000000
 #define	AMDID_LM	0x20000000
 #define	AMDID_EXT_3DNOW	0x40000000
 #define	AMDID_3DNOW	0x80000000
 
 #define	AMDID2_LAHF	0x00000001
 #define	AMDID2_CMP	0x00000002
 #define	AMDID2_SVM	0x00000004
 #define	AMDID2_EXT_APIC	0x00000008
 #define	AMDID2_CR8	0x00000010
 #define	AMDID2_ABM	0x00000020
 #define	AMDID2_SSE4A	0x00000040
 #define	AMDID2_MAS	0x00000080
 #define	AMDID2_PREFETCH	0x00000100
 #define	AMDID2_OSVW	0x00000200
 #define	AMDID2_IBS	0x00000400
 #define	AMDID2_XOP	0x00000800
 #define	AMDID2_SKINIT	0x00001000
 #define	AMDID2_WDT	0x00002000
 #define	AMDID2_LWP	0x00008000
 #define	AMDID2_FMA4	0x00010000
 #define	AMDID2_TCE	0x00020000
 #define	AMDID2_NODE_ID	0x00080000
 #define	AMDID2_TBM	0x00200000
 #define	AMDID2_TOPOLOGY	0x00400000
 #define	AMDID2_PCXC	0x00800000
 #define	AMDID2_PNXC	0x01000000
 #define	AMDID2_DBE	0x04000000
 #define	AMDID2_PTSC	0x08000000
 #define	AMDID2_PTSCEL2I	0x10000000
 
 /*
  * CPUID instruction 1 eax info
  */
 #define	CPUID_STEPPING		0x0000000f
 #define	CPUID_MODEL		0x000000f0
 #define	CPUID_FAMILY		0x00000f00
 #define	CPUID_EXT_MODEL		0x000f0000
 #define	CPUID_EXT_FAMILY	0x0ff00000
 #ifdef __i386__
 #define	CPUID_TO_MODEL(id) \
     ((((id) & CPUID_MODEL) >> 4) | \
     ((((id) & CPUID_FAMILY) >= 0x600) ? \
     (((id) & CPUID_EXT_MODEL) >> 12) : 0))
 #define	CPUID_TO_FAMILY(id) \
     ((((id) & CPUID_FAMILY) >> 8) + \
     ((((id) & CPUID_FAMILY) == 0xf00) ? \
     (((id) & CPUID_EXT_FAMILY) >> 20) : 0))
 #else
 #define	CPUID_TO_MODEL(id) \
     ((((id) & CPUID_MODEL) >> 4) | \
     (((id) & CPUID_EXT_MODEL) >> 12))
 #define	CPUID_TO_FAMILY(id) \
     ((((id) & CPUID_FAMILY) >> 8) + \
     (((id) & CPUID_EXT_FAMILY) >> 20))
 #endif
 
 /*
  * CPUID instruction 1 ebx info
  */
 #define	CPUID_BRAND_INDEX	0x000000ff
 #define	CPUID_CLFUSH_SIZE	0x0000ff00
 #define	CPUID_HTT_CORES		0x00ff0000
 #define	CPUID_LOCAL_APIC_ID	0xff000000
 
 /*
  * CPUID instruction 5 info
  */
 #define	CPUID5_MON_MIN_SIZE	0x0000ffff	/* eax */
 #define	CPUID5_MON_MAX_SIZE	0x0000ffff	/* ebx */
 #define	CPUID5_MON_MWAIT_EXT	0x00000001	/* ecx */
 #define	CPUID5_MWAIT_INTRBREAK	0x00000002	/* ecx */
 
 /*
  * MWAIT cpu power states.  Lower 4 bits are sub-states.
  */
 #define	MWAIT_C0	0xf0
 #define	MWAIT_C1	0x00
 #define	MWAIT_C2	0x10
 #define	MWAIT_C3	0x20
 #define	MWAIT_C4	0x30
 
 /*
  * MWAIT extensions.
  */
 /* Interrupt breaks MWAIT even when masked. */
 #define	MWAIT_INTRBREAK		0x00000001
 
 /*
  * CPUID instruction 6 ecx info
  */
 #define	CPUID_PERF_STAT		0x00000001
 #define	CPUID_PERF_BIAS		0x00000008
 
 /* 
  * CPUID instruction 0xb ebx info.
  */
 #define	CPUID_TYPE_INVAL	0
 #define	CPUID_TYPE_SMT		1
 #define	CPUID_TYPE_CORE		2
 
 /*
  * CPUID instruction 0xd Processor Extended State Enumeration Sub-leaf 1
  */
 #define	CPUID_EXTSTATE_XSAVEOPT	0x00000001
 #define	CPUID_EXTSTATE_XSAVEC	0x00000002
 #define	CPUID_EXTSTATE_XINUSE	0x00000004
 #define	CPUID_EXTSTATE_XSAVES	0x00000008
 
 /*
  * AMD extended function 8000_0007h edx info
  */
 #define	AMDPM_TS		0x00000001
 #define	AMDPM_FID		0x00000002
 #define	AMDPM_VID		0x00000004
 #define	AMDPM_TTP		0x00000008
 #define	AMDPM_TM		0x00000010
 #define	AMDPM_STC		0x00000020
 #define	AMDPM_100MHZ_STEPS	0x00000040
 #define	AMDPM_HW_PSTATE		0x00000080
 #define	AMDPM_TSC_INVARIANT	0x00000100
 #define	AMDPM_CPB		0x00000200
 
 /*
  * AMD extended function 8000_0008h ecx info
  */
 #define	AMDID_CMP_CORES		0x000000ff
 #define	AMDID_COREID_SIZE	0x0000f000
 #define	AMDID_COREID_SIZE_SHIFT	12
 
 /*
  * CPUID instruction 7 Structured Extended Features, leaf 0 ebx info
  */
 #define	CPUID_STDEXT_FSGSBASE	0x00000001
 #define	CPUID_STDEXT_TSC_ADJUST	0x00000002
 #define	CPUID_STDEXT_BMI1	0x00000008
 #define	CPUID_STDEXT_HLE	0x00000010
 #define	CPUID_STDEXT_AVX2	0x00000020
 #define	CPUID_STDEXT_SMEP	0x00000080
 #define	CPUID_STDEXT_BMI2	0x00000100
 #define	CPUID_STDEXT_ERMS	0x00000200
 #define	CPUID_STDEXT_INVPCID	0x00000400
 #define	CPUID_STDEXT_RTM	0x00000800
 #define	CPUID_STDEXT_MPX	0x00004000
 #define	CPUID_STDEXT_AVX512F	0x00010000
 #define	CPUID_STDEXT_RDSEED	0x00040000
 #define	CPUID_STDEXT_ADX	0x00080000
 #define	CPUID_STDEXT_SMAP	0x00100000
 #define	CPUID_STDEXT_CLFLUSHOPT	0x00800000
 #define	CPUID_STDEXT_PROCTRACE	0x02000000
 #define	CPUID_STDEXT_AVX512PF	0x04000000
 #define	CPUID_STDEXT_AVX512ER	0x08000000
 #define	CPUID_STDEXT_AVX512CD	0x10000000
 #define	CPUID_STDEXT_SHA	0x20000000
 
 /*
  * CPUID manufacturers identifiers
  */
 #define	AMD_VENDOR_ID		"AuthenticAMD"
 #define	CENTAUR_VENDOR_ID	"CentaurHauls"
 #define	CYRIX_VENDOR_ID		"CyrixInstead"
 #define	INTEL_VENDOR_ID		"GenuineIntel"
 #define	NEXGEN_VENDOR_ID	"NexGenDriven"
 #define	NSC_VENDOR_ID		"Geode by NSC"
 #define	RISE_VENDOR_ID		"RiseRiseRise"
 #define	SIS_VENDOR_ID		"SiS SiS SiS "
 #define	TRANSMETA_VENDOR_ID	"GenuineTMx86"
 #define	UMC_VENDOR_ID		"UMC UMC UMC "
 
 /*
  * Model-specific registers for the i386 family
  */
 #define	MSR_P5_MC_ADDR		0x000
 #define	MSR_P5_MC_TYPE		0x001
 #define	MSR_TSC			0x010
 #define	MSR_P5_CESR		0x011
 #define	MSR_P5_CTR0		0x012
 #define	MSR_P5_CTR1		0x013
 #define	MSR_IA32_PLATFORM_ID	0x017
 #define	MSR_APICBASE		0x01b
 #define	MSR_EBL_CR_POWERON	0x02a
 #define	MSR_TEST_CTL		0x033
 #define	MSR_IA32_FEATURE_CONTROL 0x03a
 #define	MSR_BIOS_UPDT_TRIG	0x079
 #define	MSR_BBL_CR_D0		0x088
 #define	MSR_BBL_CR_D1		0x089
 #define	MSR_BBL_CR_D2		0x08a
 #define	MSR_BIOS_SIGN		0x08b
 #define	MSR_PERFCTR0		0x0c1
 #define	MSR_PERFCTR1		0x0c2
+#define	MSR_PLATFORM_INFO	0x0ce
 #define	MSR_MPERF		0x0e7
 #define	MSR_APERF		0x0e8
 #define	MSR_IA32_EXT_CONFIG	0x0ee	/* Undocumented. Core Solo/Duo only */
 #define	MSR_MTRRcap		0x0fe
 #define	MSR_BBL_CR_ADDR		0x116
 #define	MSR_BBL_CR_DECC		0x118
 #define	MSR_BBL_CR_CTL		0x119
 #define	MSR_BBL_CR_TRIG		0x11a
 #define	MSR_BBL_CR_BUSY		0x11b
 #define	MSR_BBL_CR_CTL3		0x11e
 #define	MSR_SYSENTER_CS_MSR	0x174
 #define	MSR_SYSENTER_ESP_MSR	0x175
 #define	MSR_SYSENTER_EIP_MSR	0x176
 #define	MSR_MCG_CAP		0x179
 #define	MSR_MCG_STATUS		0x17a
 #define	MSR_MCG_CTL		0x17b
 #define	MSR_EVNTSEL0		0x186
 #define	MSR_EVNTSEL1		0x187
 #define	MSR_THERM_CONTROL	0x19a
 #define	MSR_THERM_INTERRUPT	0x19b
 #define	MSR_THERM_STATUS	0x19c
 #define	MSR_IA32_MISC_ENABLE	0x1a0
 #define	MSR_IA32_TEMPERATURE_TARGET	0x1a2
+#define	MSR_TURBO_RATIO_LIMIT	0x1ad
+#define	MSR_TURBO_RATIO_LIMIT1	0x1ae
 #define	MSR_DEBUGCTLMSR		0x1d9
 #define	MSR_LASTBRANCHFROMIP	0x1db
 #define	MSR_LASTBRANCHTOIP	0x1dc
 #define	MSR_LASTINTFROMIP	0x1dd
 #define	MSR_LASTINTTOIP		0x1de
 #define	MSR_ROB_CR_BKUPTMPDR6	0x1e0
 #define	MSR_MTRRVarBase		0x200
 #define	MSR_MTRR64kBase		0x250
 #define	MSR_MTRR16kBase		0x258
 #define	MSR_MTRR4kBase		0x268
 #define	MSR_PAT			0x277
 #define	MSR_MC0_CTL2		0x280
 #define	MSR_MTRRdefType		0x2ff
 #define	MSR_MC0_CTL		0x400
 #define	MSR_MC0_STATUS		0x401
 #define	MSR_MC0_ADDR		0x402
 #define	MSR_MC0_MISC		0x403
 #define	MSR_MC1_CTL		0x404
 #define	MSR_MC1_STATUS		0x405
 #define	MSR_MC1_ADDR		0x406
 #define	MSR_MC1_MISC		0x407
 #define	MSR_MC2_CTL		0x408
 #define	MSR_MC2_STATUS		0x409
 #define	MSR_MC2_ADDR		0x40a
 #define	MSR_MC2_MISC		0x40b
 #define	MSR_MC3_CTL		0x40c
 #define	MSR_MC3_STATUS		0x40d
 #define	MSR_MC3_ADDR		0x40e
 #define	MSR_MC3_MISC		0x40f
 #define	MSR_MC4_CTL		0x410
 #define	MSR_MC4_STATUS		0x411
 #define	MSR_MC4_ADDR		0x412
 #define	MSR_MC4_MISC		0x413
+#define	MSR_RAPL_POWER_UNIT	0x606
+#define	MSR_PKG_ENERGY_STATUS	0x611
+#define	MSR_DRAM_ENERGY_STATUS	0x619
+#define	MSR_PP0_ENERGY_STATUS	0x639
+#define	MSR_PP1_ENERGY_STATUS	0x641
 
 /*
  * VMX MSRs
  */
 #define	MSR_VMX_BASIC		0x480
 #define	MSR_VMX_PINBASED_CTLS	0x481
 #define	MSR_VMX_PROCBASED_CTLS	0x482
 #define	MSR_VMX_EXIT_CTLS	0x483
 #define	MSR_VMX_ENTRY_CTLS	0x484
 #define	MSR_VMX_CR0_FIXED0	0x486
 #define	MSR_VMX_CR0_FIXED1	0x487
 #define	MSR_VMX_CR4_FIXED0	0x488
 #define	MSR_VMX_CR4_FIXED1	0x489
 #define	MSR_VMX_PROCBASED_CTLS2	0x48b
 #define	MSR_VMX_EPT_VPID_CAP	0x48c
 #define	MSR_VMX_TRUE_PINBASED_CTLS	0x48d
 #define	MSR_VMX_TRUE_PROCBASED_CTLS	0x48e
 #define	MSR_VMX_TRUE_EXIT_CTLS	0x48f
 #define	MSR_VMX_TRUE_ENTRY_CTLS	0x490
 
 /*
  * X2APIC MSRs
  */
 #define	MSR_APIC_ID		0x802
 #define	MSR_APIC_VERSION	0x803
 #define	MSR_APIC_TPR		0x808
 #define	MSR_APIC_EOI		0x80b
 #define	MSR_APIC_LDR		0x80d
 #define	MSR_APIC_SVR		0x80f
 #define	MSR_APIC_ISR0		0x810
 #define	MSR_APIC_ISR1		0x811
 #define	MSR_APIC_ISR2		0x812
 #define	MSR_APIC_ISR3		0x813
 #define	MSR_APIC_ISR4		0x814
 #define	MSR_APIC_ISR5		0x815
 #define	MSR_APIC_ISR6		0x816
 #define	MSR_APIC_ISR7		0x817
 #define	MSR_APIC_TMR0		0x818
 #define	MSR_APIC_IRR0		0x820
 #define	MSR_APIC_ESR		0x828
 #define	MSR_APIC_LVT_CMCI	0x82F
 #define	MSR_APIC_ICR		0x830
 #define	MSR_APIC_LVT_TIMER	0x832
 #define	MSR_APIC_LVT_THERMAL	0x833
 #define	MSR_APIC_LVT_PCINT	0x834
 #define	MSR_APIC_LVT_LINT0	0x835
 #define	MSR_APIC_LVT_LINT1	0x836
 #define	MSR_APIC_LVT_ERROR	0x837
 #define	MSR_APIC_ICR_TIMER	0x838
 #define	MSR_APIC_CCR_TIMER	0x839
 #define	MSR_APIC_DCR_TIMER	0x83e
 #define	MSR_APIC_SELF_IPI	0x83f
 
 #define	MSR_IA32_XSS		0xda0
 
 /*
  * Constants related to MSR's.
  */
 #define	APICBASE_RESERVED	0x000002ff
 #define	APICBASE_BSP		0x00000100
 #define	APICBASE_X2APIC		0x00000400
 #define	APICBASE_ENABLED	0x00000800
 #define	APICBASE_ADDRESS	0xfffff000
 
 /* MSR_IA32_FEATURE_CONTROL related */
 #define	IA32_FEATURE_CONTROL_LOCK	0x01	/* lock bit */
 #define	IA32_FEATURE_CONTROL_SMX_EN	0x02	/* enable VMX inside SMX */
 #define	IA32_FEATURE_CONTROL_VMX_EN	0x04	/* enable VMX outside SMX */
 
 /*
  * PAT modes.
  */
 #define	PAT_UNCACHEABLE		0x00
 #define	PAT_WRITE_COMBINING	0x01
 #define	PAT_WRITE_THROUGH	0x04
 #define	PAT_WRITE_PROTECTED	0x05
 #define	PAT_WRITE_BACK		0x06
 #define	PAT_UNCACHED		0x07
 #define	PAT_VALUE(i, m)		((long long)(m) << (8 * (i)))
 #define	PAT_MASK(i)		PAT_VALUE(i, 0xff)
 
 /*
  * Constants related to MTRRs
  */
 #define	MTRR_UNCACHEABLE	0x00
 #define	MTRR_WRITE_COMBINING	0x01
 #define	MTRR_WRITE_THROUGH	0x04
 #define	MTRR_WRITE_PROTECTED	0x05
 #define	MTRR_WRITE_BACK		0x06
 #define	MTRR_N64K		8	/* numbers of fixed-size entries */
 #define	MTRR_N16K		16
 #define	MTRR_N4K		64
 #define	MTRR_CAP_WC		0x0000000000000400
 #define	MTRR_CAP_FIXED		0x0000000000000100
 #define	MTRR_CAP_VCNT		0x00000000000000ff
 #define	MTRR_DEF_ENABLE		0x0000000000000800
 #define	MTRR_DEF_FIXED_ENABLE	0x0000000000000400
 #define	MTRR_DEF_TYPE		0x00000000000000ff
 #define	MTRR_PHYSBASE_PHYSBASE	0x000ffffffffff000
 #define	MTRR_PHYSBASE_TYPE	0x00000000000000ff
 #define	MTRR_PHYSMASK_PHYSMASK	0x000ffffffffff000
 #define	MTRR_PHYSMASK_VALID	0x0000000000000800
 
 /*
  * Cyrix configuration registers, accessible as IO ports.
  */
 #define	CCR0			0xc0	/* Configuration control register 0 */
 #define	CCR0_NC0		0x01	/* First 64K of each 1M memory region is
 								   non-cacheable */
 #define	CCR0_NC1		0x02	/* 640K-1M region is non-cacheable */
 #define	CCR0_A20M		0x04	/* Enables A20M# input pin */
 #define	CCR0_KEN		0x08	/* Enables KEN# input pin */
 #define	CCR0_FLUSH		0x10	/* Enables FLUSH# input pin */
 #define	CCR0_BARB		0x20	/* Flushes internal cache when entering hold
 								   state */
 #define	CCR0_CO			0x40	/* Cache org: 1=direct mapped, 0=2x set
 								   assoc */
 #define	CCR0_SUSPEND	0x80	/* Enables SUSP# and SUSPA# pins */
 
 #define	CCR1			0xc1	/* Configuration control register 1 */
 #define	CCR1_RPL		0x01	/* Enables RPLSET and RPLVAL# pins */
 #define	CCR1_SMI		0x02	/* Enables SMM pins */
 #define	CCR1_SMAC		0x04	/* System management memory access */
 #define	CCR1_MMAC		0x08	/* Main memory access */
 #define	CCR1_NO_LOCK	0x10	/* Negate LOCK# */
 #define	CCR1_SM3		0x80	/* SMM address space address region 3 */
 
 #define	CCR2			0xc2
 #define	CCR2_WB			0x02	/* Enables WB cache interface pins */
 #define	CCR2_SADS		0x02	/* Slow ADS */
 #define	CCR2_LOCK_NW	0x04	/* LOCK NW Bit */
 #define	CCR2_SUSP_HLT	0x08	/* Suspend on HALT */
 #define	CCR2_WT1		0x10	/* WT region 1 */
 #define	CCR2_WPR1		0x10	/* Write-protect region 1 */
 #define	CCR2_BARB		0x20	/* Flushes write-back cache when entering
 								   hold state. */
 #define	CCR2_BWRT		0x40	/* Enables burst write cycles */
 #define	CCR2_USE_SUSP	0x80	/* Enables suspend pins */
 
 #define	CCR3			0xc3
 #define	CCR3_SMILOCK	0x01	/* SMM register lock */
 #define	CCR3_NMI		0x02	/* Enables NMI during SMM */
 #define	CCR3_LINBRST	0x04	/* Linear address burst cycles */
 #define	CCR3_SMMMODE	0x08	/* SMM Mode */
 #define	CCR3_MAPEN0		0x10	/* Enables Map0 */
 #define	CCR3_MAPEN1		0x20	/* Enables Map1 */
 #define	CCR3_MAPEN2		0x40	/* Enables Map2 */
 #define	CCR3_MAPEN3		0x80	/* Enables Map3 */
 
 #define	CCR4			0xe8
 #define	CCR4_IOMASK		0x07
 #define	CCR4_MEM		0x08	/* Enables memory bypassing */
 #define	CCR4_DTE		0x10	/* Enables directory table entry cache */
 #define	CCR4_FASTFPE	0x20	/* Fast FPU exception */
 #define	CCR4_CPUID		0x80	/* Enables CPUID instruction */
 
 #define	CCR5			0xe9
 #define	CCR5_WT_ALLOC	0x01	/* Write-through allocate */
 #define	CCR5_SLOP		0x02	/* LOOP instruction slowed down */
 #define	CCR5_LBR1		0x10	/* Local bus region 1 */
 #define	CCR5_ARREN		0x20	/* Enables ARR region */
 
 #define	CCR6			0xea
 
 #define	CCR7			0xeb
 
 /* Performance Control Register (5x86 only). */
 #define	PCR0			0x20
 #define	PCR0_RSTK		0x01	/* Enables return stack */
 #define	PCR0_BTB		0x02	/* Enables branch target buffer */
 #define	PCR0_LOOP		0x04	/* Enables loop */
 #define	PCR0_AIS		0x08	/* Enables all instructions stalled to
 								   serialize pipe. */
 #define	PCR0_MLR		0x10	/* Enables reordering of misaligned loads */
 #define	PCR0_BTBRT		0x40	/* Enables BTB test register. */
 #define	PCR0_LSSER		0x80	/* Disable reorder */
 
 /* Device Identification Registers */
 #define	DIR0			0xfe
 #define	DIR1			0xff
 
 /*
  * Machine Check register constants.
  */
 #define	MCG_CAP_COUNT		0x000000ff
 #define	MCG_CAP_CTL_P		0x00000100
 #define	MCG_CAP_EXT_P		0x00000200
 #define	MCG_CAP_CMCI_P		0x00000400
 #define	MCG_CAP_TES_P		0x00000800
 #define	MCG_CAP_EXT_CNT		0x00ff0000
 #define	MCG_CAP_SER_P		0x01000000
 #define	MCG_STATUS_RIPV		0x00000001
 #define	MCG_STATUS_EIPV		0x00000002
 #define	MCG_STATUS_MCIP		0x00000004
 #define	MCG_CTL_ENABLE		0xffffffffffffffff
 #define	MCG_CTL_DISABLE		0x0000000000000000
 #define	MSR_MC_CTL(x)		(MSR_MC0_CTL + (x) * 4)
 #define	MSR_MC_STATUS(x)	(MSR_MC0_STATUS + (x) * 4)
 #define	MSR_MC_ADDR(x)		(MSR_MC0_ADDR + (x) * 4)
 #define	MSR_MC_MISC(x)		(MSR_MC0_MISC + (x) * 4)
 #define	MSR_MC_CTL2(x)		(MSR_MC0_CTL2 + (x))	/* If MCG_CAP_CMCI_P */
 #define	MC_STATUS_MCA_ERROR	0x000000000000ffff
 #define	MC_STATUS_MODEL_ERROR	0x00000000ffff0000
 #define	MC_STATUS_OTHER_INFO	0x01ffffff00000000
 #define	MC_STATUS_COR_COUNT	0x001fffc000000000	/* If MCG_CAP_CMCI_P */
 #define	MC_STATUS_TES_STATUS	0x0060000000000000	/* If MCG_CAP_TES_P */
 #define	MC_STATUS_AR		0x0080000000000000	/* If MCG_CAP_TES_P */
 #define	MC_STATUS_S		0x0100000000000000	/* If MCG_CAP_TES_P */
 #define	MC_STATUS_PCC		0x0200000000000000
 #define	MC_STATUS_ADDRV		0x0400000000000000
 #define	MC_STATUS_MISCV		0x0800000000000000
 #define	MC_STATUS_EN		0x1000000000000000
 #define	MC_STATUS_UC		0x2000000000000000
 #define	MC_STATUS_OVER		0x4000000000000000
 #define	MC_STATUS_VAL		0x8000000000000000
 #define	MC_MISC_RA_LSB		0x000000000000003f	/* If MCG_CAP_SER_P */
 #define	MC_MISC_ADDRESS_MODE	0x00000000000001c0	/* If MCG_CAP_SER_P */
 #define	MC_CTL2_THRESHOLD	0x0000000000007fff
 #define	MC_CTL2_CMCI_EN		0x0000000040000000
 
 /*
  * The following four 3-byte registers control the non-cacheable regions.
  * These registers must be written as three separate bytes.
  *
  * NCRx+0: A31-A24 of starting address
  * NCRx+1: A23-A16 of starting address
  * NCRx+2: A15-A12 of starting address | NCR_SIZE_xx.
  *
  * The non-cacheable region's starting address must be aligned to the
  * size indicated by the NCR_SIZE_xx field.
  */
 #define	NCR1	0xc4
 #define	NCR2	0xc7
 #define	NCR3	0xca
 #define	NCR4	0xcd
 
 #define	NCR_SIZE_0K	0
 #define	NCR_SIZE_4K	1
 #define	NCR_SIZE_8K	2
 #define	NCR_SIZE_16K	3
 #define	NCR_SIZE_32K	4
 #define	NCR_SIZE_64K	5
 #define	NCR_SIZE_128K	6
 #define	NCR_SIZE_256K	7
 #define	NCR_SIZE_512K	8
 #define	NCR_SIZE_1M	9
 #define	NCR_SIZE_2M	10
 #define	NCR_SIZE_4M	11
 #define	NCR_SIZE_8M	12
 #define	NCR_SIZE_16M	13
 #define	NCR_SIZE_32M	14
 #define	NCR_SIZE_4G	15
 
 /*
  * The address region registers are used to specify the location and
  * size for the eight address regions.
  *
  * ARRx + 0: A31-A24 of start address
  * ARRx + 1: A23-A16 of start address
  * ARRx + 2: A15-A12 of start address | ARR_SIZE_xx
  */
 #define	ARR0	0xc4
 #define	ARR1	0xc7
 #define	ARR2	0xca
 #define	ARR3	0xcd
 #define	ARR4	0xd0
 #define	ARR5	0xd3
 #define	ARR6	0xd6
 #define	ARR7	0xd9
 
 #define	ARR_SIZE_0K		0
 #define	ARR_SIZE_4K		1
 #define	ARR_SIZE_8K		2
 #define	ARR_SIZE_16K	3
 #define	ARR_SIZE_32K	4
 #define	ARR_SIZE_64K	5
 #define	ARR_SIZE_128K	6
 #define	ARR_SIZE_256K	7
 #define	ARR_SIZE_512K	8
 #define	ARR_SIZE_1M		9
 #define	ARR_SIZE_2M		10
 #define	ARR_SIZE_4M		11
 #define	ARR_SIZE_8M		12
 #define	ARR_SIZE_16M	13
 #define	ARR_SIZE_32M	14
 #define	ARR_SIZE_4G		15
 
 /*
  * The region control registers specify the attributes associated with
  * the ARRx address regions.
  */
 #define	RCR0	0xdc
 #define	RCR1	0xdd
 #define	RCR2	0xde
 #define	RCR3	0xdf
 #define	RCR4	0xe0
 #define	RCR5	0xe1
 #define	RCR6	0xe2
 #define	RCR7	0xe3
 
 #define	RCR_RCD	0x01	/* Disables caching for ARRx (x = 0-6). */
 #define	RCR_RCE	0x01	/* Enables caching for ARR7. */
 #define	RCR_WWO	0x02	/* Weak write ordering. */
 #define	RCR_WL	0x04	/* Weak locking. */
 #define	RCR_WG	0x08	/* Write gathering. */
 #define	RCR_WT	0x10	/* Write-through. */
 #define	RCR_NLB	0x20	/* LBA# pin is not asserted. */
 
 /* AMD Write Allocate Top-Of-Memory and Control Register */
 #define	AMD_WT_ALLOC_TME	0x40000	/* top-of-memory enable */
 #define	AMD_WT_ALLOC_PRE	0x20000	/* programmable range enable */
 #define	AMD_WT_ALLOC_FRE	0x10000	/* fixed (A0000-FFFFF) range enable */
 
 /* AMD64 MSR's */
 #define	MSR_EFER	0xc0000080	/* extended features */
 #define	MSR_STAR	0xc0000081	/* legacy mode SYSCALL target/cs/ss */
 #define	MSR_LSTAR	0xc0000082	/* long mode SYSCALL target rip */
 #define	MSR_CSTAR	0xc0000083	/* compat mode SYSCALL target rip */
 #define	MSR_SF_MASK	0xc0000084	/* syscall flags mask */
 #define	MSR_FSBASE	0xc0000100	/* base address of the %fs "segment" */
 #define	MSR_GSBASE	0xc0000101	/* base address of the %gs "segment" */
 #define	MSR_KGSBASE	0xc0000102	/* base address of the kernel %gs */
 #define	MSR_PERFEVSEL0	0xc0010000
 #define	MSR_PERFEVSEL1	0xc0010001
 #define	MSR_PERFEVSEL2	0xc0010002
 #define	MSR_PERFEVSEL3	0xc0010003
 #undef MSR_PERFCTR0
 #undef MSR_PERFCTR1
 #define	MSR_PERFCTR0	0xc0010004
 #define	MSR_PERFCTR1	0xc0010005
 #define	MSR_PERFCTR2	0xc0010006
 #define	MSR_PERFCTR3	0xc0010007
 #define	MSR_SYSCFG	0xc0010010
 #define	MSR_HWCR	0xc0010015
 #define	MSR_IORRBASE0	0xc0010016
 #define	MSR_IORRMASK0	0xc0010017
 #define	MSR_IORRBASE1	0xc0010018
 #define	MSR_IORRMASK1	0xc0010019
 #define	MSR_TOP_MEM	0xc001001a	/* boundary for ram below 4G */
 #define	MSR_TOP_MEM2	0xc001001d	/* boundary for ram above 4G */
 #define	MSR_K8_UCODE_UPDATE	0xc0010020	/* update microcode */
 #define	MSR_MC0_CTL_MASK	0xc0010044
 
 /* VIA ACE crypto featureset: for via_feature_rng */
 #define	VIA_HAS_RNG		1	/* cpu has RNG */
 
 /* VIA ACE crypto featureset: for via_feature_xcrypt */
 #define	VIA_HAS_AES		1	/* cpu has AES */
 #define	VIA_HAS_SHA		2	/* cpu has SHA1 & SHA256 */
 #define	VIA_HAS_MM		4	/* cpu has RSA instructions */
 #define	VIA_HAS_AESCTR		8	/* cpu has AES-CTR instructions */
 
 /* Centaur Extended Feature flags */
 #define	VIA_CPUID_HAS_RNG	0x000004
 #define	VIA_CPUID_DO_RNG	0x000008
 #define	VIA_CPUID_HAS_ACE	0x000040
 #define	VIA_CPUID_DO_ACE	0x000080
 #define	VIA_CPUID_HAS_ACE2	0x000100
 #define	VIA_CPUID_DO_ACE2	0x000200
 #define	VIA_CPUID_HAS_PHE	0x000400
 #define	VIA_CPUID_DO_PHE	0x000800
 #define	VIA_CPUID_HAS_PMM	0x001000
 #define	VIA_CPUID_DO_PMM	0x002000
 
 /* VIA ACE xcrypt-* instruction context control options */
 #define	VIA_CRYPT_CWLO_ROUND_M		0x0000000f
 #define	VIA_CRYPT_CWLO_ALG_M		0x00000070
 #define	VIA_CRYPT_CWLO_ALG_AES		0x00000000
 #define	VIA_CRYPT_CWLO_KEYGEN_M		0x00000080
 #define	VIA_CRYPT_CWLO_KEYGEN_HW	0x00000000
 #define	VIA_CRYPT_CWLO_KEYGEN_SW	0x00000080
 #define	VIA_CRYPT_CWLO_NORMAL		0x00000000
 #define	VIA_CRYPT_CWLO_INTERMEDIATE	0x00000100
 #define	VIA_CRYPT_CWLO_ENCRYPT		0x00000000
 #define	VIA_CRYPT_CWLO_DECRYPT		0x00000200
 #define	VIA_CRYPT_CWLO_KEY128		0x0000000a	/* 128bit, 10 rds */
 #define	VIA_CRYPT_CWLO_KEY192		0x0000040c	/* 192bit, 12 rds */
 #define	VIA_CRYPT_CWLO_KEY256		0x0000080e	/* 256bit, 14 rds */
 
 #endif /* !_MACHINE_SPECIALREG_H_ */
Index: stable/10/usr.sbin/bhyve/acpi.c
===================================================================
--- stable/10/usr.sbin/bhyve/acpi.c	(revision 276348)
+++ stable/10/usr.sbin/bhyve/acpi.c	(revision 276349)
@@ -1,1009 +1,1009 @@
 /*-
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * bhyve ACPI table generator.
  *
  * Create the minimal set of ACPI tables required to boot FreeBSD (and
  * hopefully other o/s's) by writing out ASL template files for each of
  * the tables and then compiling them to AML with the Intel iasl compiler.
  * The AML files are then read into guest memory.
  *
  *  The tables are placed in the guest's ROM area just below 1MB physical,
  * above the MPTable.
  *
  *  Layout
  *  ------
  *   RSDP  ->   0xf2400    (36 bytes fixed)
  *     RSDT  ->   0xf2440    (36 bytes + 4*7 table addrs, 4 used)
  *     XSDT  ->   0xf2480    (36 bytes + 8*7 table addrs, 4 used)
  *       MADT  ->   0xf2500  (depends on #CPUs)
  *       FADT  ->   0xf2600  (268 bytes)
  *       HPET  ->   0xf2740  (56 bytes)
  *       MCFG  ->   0xf2780  (60 bytes)
  *         FACS  ->   0xf27C0 (64 bytes)
  *         DSDT  ->   0xf2800 (variable - can go up to 0x100000)
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/errno.h>
 #include <sys/stat.h>
 
 #include <paths.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include <machine/vmm.h>
 #include <vmmapi.h>
 
 #include "bhyverun.h"
 #include "acpi.h"
 #include "pci_emul.h"
 
 /*
  * Define the base address of the ACPI tables, and the offsets to
  * the individual tables
  */
 #define BHYVE_ACPI_BASE		0xf2400
 #define RSDT_OFFSET		0x040
 #define XSDT_OFFSET		0x080
 #define MADT_OFFSET		0x100
 #define FADT_OFFSET		0x200
 #define	HPET_OFFSET		0x340
 #define	MCFG_OFFSET		0x380
 #define FACS_OFFSET		0x3C0
 #define DSDT_OFFSET		0x400
 
 #define	BHYVE_ASL_TEMPLATE	"bhyve.XXXXXXX"
 #define BHYVE_ASL_SUFFIX	".aml"
 #define BHYVE_ASL_COMPILER	"/usr/sbin/iasl"
 
 static int basl_keep_temps;
 static int basl_verbose_iasl;
 /* CPU count; basl_fwrite_madt() emits one Local APIC entry per CPU. */
 static int basl_ncpu;
 /* Base address of the ACPI tables; the *_OFFSET values are added to it. */
 static uint32_t basl_acpi_base = BHYVE_ACPI_BASE;
 static uint32_t hpet_capabilities;
 
 /*
  * Contains the full pathname of the template to be passed
  * to mkstemp/mkstemps(3)
  */
 static char basl_template[MAXPATHLEN];
 static char basl_stemplate[MAXPATHLEN];
 
 /*
  * State for dsdt_line(), dsdt_indent(), and dsdt_unindent().
  */
 static FILE *dsdt_fp;
 static int dsdt_indent_level;
 static int dsdt_error;
 
 /*
  * A file descriptor, stdio stream and path name (up to MAXPATHLEN bytes)
  * kept together for one file.
  * NOTE(review): presumably one of the temporary template/output files; the
  * users of this struct are outside this chunk -- confirm.
  */
 struct basl_fio {
 	int	fd;
 	FILE	*fp;
 	char	f_name[MAXPATHLEN];
 };
 
 /*
  * Error-checked stdio helpers.
  *
  * Both macros expect the enclosing function to declare 'int err' and to
  * provide an 'err_exit' label; the result of the stdio call is stored in
  * 'err' and control jumps to 'err_exit' when the call fails (a negative
  * return from fprintf(), a non-zero return from fflush()).
  *
  * Each macro is wrapped in do { } while (0) so that it expands to exactly
  * one statement and is therefore safe as the body of an unbraced if/else.
  * Invoke it with a trailing semicolon.
  */
 #define EFPRINTF(...) do { \
 	err = fprintf(__VA_ARGS__); \
 	if (err < 0) \
 		goto err_exit; \
 } while (0)
 
 #define EFFLUSH(x) do { \
 	err = fflush(x); \
 	if (err != 0) \
 		goto err_exit; \
 } while (0)
 
 /*
  * Write the RSDP template text to 'fp'.
  *
  * The RSDT and XSDT addresses are basl_acpi_base plus the respective
  * table offset.  Returns 0 once every line has been written and the
  * stream has been flushed; if any fprintf() or fflush() call fails,
  * control jumps to err_exit and the current errno value is returned.
  */
 static int
 basl_fwrite_rsdp(FILE *fp)
 {
 	int err;
 
 	err = 0;
 
 	EFPRINTF(fp, "/*\n");
 	EFPRINTF(fp, " * bhyve RSDP template\n");
 	EFPRINTF(fp, " */\n");
 	EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n");
 	EFPRINTF(fp, "[0001]\t\tChecksum : 43\n");
 	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
 	EFPRINTF(fp, "[0001]\t\tRevision : 02\n");
 	EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n",
 	    basl_acpi_base + RSDT_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tLength : 00000024\n");
 	/* The 64-bit XSDT address is a zero upper half plus a 32-bit value. */
 	EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n",
 	    basl_acpi_base + XSDT_OFFSET);
 	EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n");
 	EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
 
 	EFFLUSH(fp);
 
 	return (0);
 
 err_exit:
 	return (errno);
 }
 
 /*
  * Write the RSDT template text to 'fp'.
  *
  * The table carries four 32-bit entries pointing at the MADT, FADT, HPET
  * and MCFG, each computed as basl_acpi_base plus the table's offset.
  * Returns 0 on success; if any fprintf() or fflush() call fails, control
  * jumps to err_exit and the current errno value is returned.
  */
 static int
 basl_fwrite_rsdt(FILE *fp)
 {
 	int err;
 
 	err = 0;
 
 	EFPRINTF(fp, "/*\n");
 	EFPRINTF(fp, " * bhyve RSDT template\n");
 	EFPRINTF(fp, " */\n");
 	EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n");
 	EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
 	EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
 	EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
 	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
 	EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT  \"\n");
 	EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
 	/* iasl will fill in the compiler ID/revision fields */
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
 	EFPRINTF(fp, "\n");
 
 	/* Add in pointers to the MADT, FADT, HPET and MCFG */
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n",
 	    basl_acpi_base + MADT_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n",
 	    basl_acpi_base + FADT_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n",
 	    basl_acpi_base + HPET_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n",
 	    basl_acpi_base + MCFG_OFFSET);
 
 	EFFLUSH(fp);
 
 	return (0);
 
 err_exit:
 	return (errno);
 }
 
 /*
  * Write the XSDT template text to 'fp'.
  *
  * The table carries four entries pointing at the MADT, FADT, HPET and
  * MCFG.  Each entry is printed as eight zero digits followed by the
  * 32-bit value of basl_acpi_base plus the table's offset.
  * Returns 0 on success; if any fprintf() or fflush() call fails, control
  * jumps to err_exit and the current errno value is returned.
  */
 static int
 basl_fwrite_xsdt(FILE *fp)
 {
 	int err;
 
 	err = 0;
 
 	EFPRINTF(fp, "/*\n");
 	EFPRINTF(fp, " * bhyve XSDT template\n");
 	EFPRINTF(fp, " */\n");
 	EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n");
 	EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
 	EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
 	EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
 	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
 	EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT  \"\n");
 	EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
 	/* iasl will fill in the compiler ID/revision fields */
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
 	EFPRINTF(fp, "\n");
 
 	/*
 	 * Add in pointers to the MADT, FADT, HPET and MCFG.
 	 * NOTE(review): these entries are tagged [0004] although each value
 	 * is 16 hex digits (the RSDP's XSDT Address field uses [0008] for
 	 * the same shape) -- confirm that iasl ignores the bracketed width.
 	 */
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n",
 	    basl_acpi_base + MADT_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n",
 	    basl_acpi_base + FADT_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n",
 	    basl_acpi_base + HPET_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n",
 	    basl_acpi_base + MCFG_OFFSET);
 
 	EFFLUSH(fp);
 
 	return (0);
 
 err_exit:
 	return (errno);
 }
 
 /*
  * Write the MADT (signature "APIC") template text to 'fp'.
  *
  * After the common header the table contains, in order:
  *  - one type 00 (Processor Local APIC) entry per CPU, basl_ncpu in
  *    total, with the processor ID and APIC ID both set to the CPU index;
  *  - a single type 01 (I/O APIC) entry with ID 0;
  *  - two type 02 entries, one for legacy IRQ0 and one for SCI_INT;
  *  - one type 04 entry describing the Local APIC NMI on LINT 1.
  *
  * Returns 0 on success; if any fprintf() or fflush() call fails, control
  * jumps to err_exit and the current errno value is returned.
  */
 static int
 basl_fwrite_madt(FILE *fp)
 {
 	int err;
 	int i;
 
 	err = 0;
 
 	EFPRINTF(fp, "/*\n");
 	EFPRINTF(fp, " * bhyve MADT template\n");
 	EFPRINTF(fp, " */\n");
 	EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n");
 	EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
 	EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
 	EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
 	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
 	EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT  \"\n");
 	EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
 
 	/* iasl will fill in the compiler ID/revision fields */
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n");
 	EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
 	EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n");
 	EFPRINTF(fp, "\n");
 
 	/* Add a Processor Local APIC entry for each CPU */
 	for (i = 0; i < basl_ncpu; i++) {
 		EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n");
 		EFPRINTF(fp, "[0001]\t\tLength : 08\n");
 		/* iasl expects hex values for the proc and apic id's */
 		EFPRINTF(fp, "[0001]\t\tProcessor ID : %02x\n", i);
 		EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02x\n", i);
 		EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
 		EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n");
 		EFPRINTF(fp, "\n");
 	}
 
 	/* Always a single IOAPIC entry, with ID 0 */
 	EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n");
 	EFPRINTF(fp, "[0001]\t\tLength : 0C\n");
 	/* iasl expects a hex value for the i/o apic id */
 	EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02x\n", 0);
 	EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
 	EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n");
 	EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n");
 	EFPRINTF(fp, "\n");
 
 	/* Legacy IRQ0 is connected to pin 2 of the IOAPIC */
 	EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n");
 	EFPRINTF(fp, "[0001]\t\tLength : 0A\n");
 	EFPRINTF(fp, "[0001]\t\tBus : 00\n");
 	EFPRINTF(fp, "[0001]\t\tSource : 00\n");
 	EFPRINTF(fp, "[0004]\t\tInterrupt : 00000002\n");
 	EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n");
 	EFPRINTF(fp, "\t\t\tPolarity : 1\n");
 	EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n");
 	EFPRINTF(fp, "\n");
 
 	/*
 	 * Second type 02 entry, for the SCI: the source and the interrupt
 	 * it maps to are both SCI_INT.
 	 */
 	EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n");
 	EFPRINTF(fp, "[0001]\t\tLength : 0A\n");
 	EFPRINTF(fp, "[0001]\t\tBus : 00\n");
 	EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT);
 	EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT);
 	EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n");
 	EFPRINTF(fp, "\t\t\tPolarity : 3\n");
 	EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n");
 	EFPRINTF(fp, "\n");
 
 	/* Local APIC NMI is connected to LINT 1 on all CPUs */
 	EFPRINTF(fp, "[0001]\t\tSubtable Type : 04\n");
 	EFPRINTF(fp, "[0001]\t\tLength : 06\n");
 	EFPRINTF(fp, "[0001]\t\tProcessorId : FF\n");
 	EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n");
 	EFPRINTF(fp, "\t\t\tPolarity : 1\n");
 	EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n");
 	EFPRINTF(fp, "[0001]\t\tInterrupt : 01\n");
 	EFPRINTF(fp, "\n");
 
 	EFFLUSH(fp);
 
 	return (0);
 
 err_exit:
 	return (errno);
 }
 
 static int
 basl_fwrite_fadt(FILE *fp)
 {
 	int err;
 
 	err = 0;
 
 	EFPRINTF(fp, "/*\n");
 	EFPRINTF(fp, " * bhyve FADT template\n");
 	EFPRINTF(fp, " */\n");
 	EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n");
 	EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n");
 	EFPRINTF(fp, "[0001]\t\tRevision : 05\n");
 	EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
 	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
 	EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP  \"\n");
 	EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
 	/* iasl will fill in the compiler ID/revision fields */
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n",
 	    basl_acpi_base + FACS_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n",
 	    basl_acpi_base + DSDT_OFFSET);
 	EFPRINTF(fp, "[0001]\t\tModel : 01\n");
 	EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n");
 	EFPRINTF(fp, "[0002]\t\tSCI Interrupt : %04X\n",
 	    SCI_INT);
 	EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n",
 	    SMI_CMD);
 	EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n",
 	    BHYVE_ACPI_ENABLE);
 	EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n",
 	    BHYVE_ACPI_DISABLE);
 	EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n");
 	EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n");
 	EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n",
 	    PM1A_EVT_ADDR);
 	EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n",
 	    PM1A_CNT_ADDR);
 	EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n",
 	    IO_PMTMR);
 	EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n");
 	EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n");
 	EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n");
 	EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n");
 	EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n");
 	EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n");
 	EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n");
 	EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n");
 	EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n");
 	EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n");
 	EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n");
 	EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n");
 	EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n");
 	EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n");
 	EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n");
 	EFPRINTF(fp, "[0001]\t\tRTC Century Index : 00\n");
 	EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n");
 	EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n");
 	EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n");
 	EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n");
 	EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n");
 	EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n");
 	EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n");
 	EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
 	EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
 	EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n");
 	EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n");
 	EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n");
 	EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n");
 	EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n");
 	EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n");
 	EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n");
 	EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n");
 	EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n");
 	EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n");
 	EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 1\n");
 	EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n");
 	EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n");
 	EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n");
 	EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n");
 	EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n");
 	EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n");
 	EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n");
 	EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n");
 	EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n");
 	EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n");
 	EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp,
 	    "[0012]\t\tReset Register : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000CF9\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp, "[0001]\t\tValue to cause reset : 06\n");
 	EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
 	EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n",
 	    basl_acpi_base + FACS_OFFSET);
 	EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n",
 	    basl_acpi_base + DSDT_OFFSET);
 	EFPRINTF(fp,
 	    "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 20\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
 	    PM1A_EVT_ADDR);
 	EFPRINTF(fp, "\n");
 	
 	EFPRINTF(fp,
 	    "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp,
 	    "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp,
 	    "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 10\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
 	    PM1A_CNT_ADDR);
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp,
 	    "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp,
 	    "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp,
 	    "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp,
 	    "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
 	EFPRINTF(fp, "\n");
 
 	/* Valid for bhyve */
 	EFPRINTF(fp,
 	    "[0012]\t\tPM Timer Block : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
-	EFPRINTF(fp, "[0001]\t\tBit Width : 32\n");
+	EFPRINTF(fp, "[0001]\t\tBit Width : 20\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp,
 	    "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
 	    IO_PMTMR);
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
-	EFPRINTF(fp, "[0001]\t\tBit Width : 80\n");
+	EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp,
 	    "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp,
 	   "[0012]\t\tSleep Control Register : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp,
 	    "[0012]\t\tSleep Status Register : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
 
 	EFFLUSH(fp);
 
 	return (0);
 
 err_exit:
 	return (errno);
 }
 
 /*
  * Write the ASL source text for the bhyve HPET table to 'fp'.
  *
  * The Timer Block ID is taken from 'hpet_capabilities' (declared outside
  * this function) and the timer block is described as SystemMemory at
  * address 0xFED00000.
  *
  * Returns 0 once all text has been written and flushed; the err_exit path
  * returns errno.
  * NOTE(review): EFPRINTF/EFFLUSH are defined outside this view; they
  * presumably record their status in 'err' and jump to err_exit on failure.
  */
 static int
 basl_fwrite_hpet(FILE *fp)
 {
 	int err;
 
 	err = 0;
 
 	EFPRINTF(fp, "/*\n");
 	EFPRINTF(fp, " * bhyve HPET template\n");
 	EFPRINTF(fp, " */\n");
 	EFPRINTF(fp, "[0004]\t\tSignature : \"HPET\"\n");
 	EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
 	EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
 	EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
 	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
 	EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVHPET  \"\n");
 	EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
 
 	/* iasl will fill in the compiler ID/revision fields */
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp, "[0004]\t\tTimer Block ID : %08X\n", hpet_capabilities);
 	EFPRINTF(fp,
 	    "[0012]\t\tTimer Block Register : [Generic Address Structure]\n");
 	EFPRINTF(fp, "[0001]\t\tSpace ID : 00 [SystemMemory]\n");
 	EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp,
 		 "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 00000000FED00000\n");
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp, "[0001]\t\tHPET Number : 00\n");
 	EFPRINTF(fp, "[0002]\t\tMinimum Clock Ticks : 0000\n");
 	EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
 	EFPRINTF(fp, "\t\t\t4K Page Protect : 1\n");
 	EFPRINTF(fp, "\t\t\t64K Page Protect : 0\n");
 	EFPRINTF(fp, "\n");
 
 	EFFLUSH(fp);
 
 	return (0);
 
 err_exit:
 	return (errno);
 }
 
 /*
  * Write the ASL source text for the bhyve MCFG table to 'fp'.
  *
  * The base address printed is whatever pci_ecfg_base() returns; the entry
  * covers segment group 0, buses 0x00 through 0xFF.
  *
  * Returns 0 once all text has been written and flushed; the err_exit path
  * returns errno.
  * NOTE(review): EFPRINTF/EFFLUSH are defined outside this view; they
  * presumably record their status in 'err' and jump to err_exit on failure.
  */
 static int
 basl_fwrite_mcfg(FILE *fp)
 {
 	int err = 0;
 
 	EFPRINTF(fp, "/*\n");
 	EFPRINTF(fp, " * bhyve MCFG template\n");
 	EFPRINTF(fp, " */\n");
 	EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n");
 	EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
 	EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
 	EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
 	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
 	EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG  \"\n");
 	EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
 
 	/* iasl will fill in the compiler ID/revision fields */
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
 	EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
 	EFPRINTF(fp, "[0008]\t\tReserved : 0\n");
 	EFPRINTF(fp, "\n");
 
 	/* Single configuration-space allocation entry. */
 	EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base());
 	EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n");
 	EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n");
 	EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n");
 	EFPRINTF(fp, "[0004]\t\tReserved : 0\n");
 	EFFLUSH(fp);
 	return (0);
 err_exit:
 	return (errno);
 }
 
 /*
  * Write the ASL source text for the bhyve FACS table to 'fp'.
  *
  * Every field is a fixed literal: a 0x40-byte, version 2 table with all
  * waking vectors, the global lock and the flag words set to zero.
  *
  * Returns 0 once all text has been written and flushed; the err_exit path
  * returns errno.
  * NOTE(review): EFPRINTF/EFFLUSH are defined outside this view; they
  * presumably record their status in 'err' and jump to err_exit on failure.
  */
 static int
 basl_fwrite_facs(FILE *fp)
 {
 	int err;
 
 	err = 0;
 
 	EFPRINTF(fp, "/*\n");
 	EFPRINTF(fp, " * bhyve FACS template\n");
 	EFPRINTF(fp, " */\n");
 	EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n");
 	EFPRINTF(fp, "[0004]\t\tLength : 00000040\n");
 	EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
 	EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n");
 	EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n");
 	EFPRINTF(fp,
 	    "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n");
 	EFPRINTF(fp, "[0001]\t\tVersion : 02\n");
 	EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
 	EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n");
 	EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n");
 
 	EFFLUSH(fp);
 
 	return (0);
 	
 err_exit:
 	return (errno);
 }
 
 /*
  * Helper routines for writing to the DSDT from other modules.
  */
 /*
  * Append one line of DSDT source to 'dsdt_fp'.
  *
  * 'fmt' and the following arguments are formatted printf-style, preceded
  * by two spaces per current indent level, and followed by a newline.  An
  * empty 'fmt' emits just the newline.
  *
  * Nothing is written once 'dsdt_error' is non-zero.  On a write failure
  * errno is latched into 'dsdt_error'; the function itself returns nothing.
  */
 void
 dsdt_line(const char *fmt, ...)
 {
 	va_list ap;
 	int err = 0;
 	int rc;
 
 	if (dsdt_error != 0)
 		return;
 
 	if (strcmp(fmt, "") != 0) {
 		/*
 		 * EFPRINTF is defined outside this view and may expand to
 		 * more than one statement, so it must be braced when used
 		 * as the body of an 'if'.
 		 */
 		if (dsdt_indent_level != 0) {
 			EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' ');
 		}
 		va_start(ap, fmt);
 		rc = vfprintf(dsdt_fp, fmt, ap);
 		/* Every va_start() needs a va_end(), also on failure. */
 		va_end(ap);
 		if (rc < 0)
 			goto err_exit;
 	}
 	EFPRINTF(dsdt_fp, "\n");
 	return;
 
 err_exit:
 	dsdt_error = errno;
 }
 
 /*
  * Raise the DSDT indent level used by dsdt_line() by 'levels'.
  * The resulting level is asserted to be non-negative.
  */
 void
 dsdt_indent(int levels)
 {
 
 	dsdt_indent_level = dsdt_indent_level + levels;
 	assert(!(dsdt_indent_level < 0));
 }
 
 /*
  * Lower the DSDT indent level used by dsdt_line() by 'levels'.
  * Asserts beforehand that the current level is at least 'levels'.
  */
 void
 dsdt_unindent(int levels)
 {
 
 	assert(!(dsdt_indent_level < levels));
 	dsdt_indent_level = dsdt_indent_level - levels;
 }
 
 /*
  * Emit an ASL "IO (Decode16, ...)" resource descriptor through dsdt_line().
  *
  * 'iobase' is written as both the range minimum and the range maximum,
  * the alignment is always 0x01, and 'length' is written as the length.
  */
 void
 dsdt_fixed_ioport(uint16_t iobase, uint16_t length)
 {
 
 	dsdt_line("IO (Decode16,");
 	dsdt_line("  0x%04X,             // Range Minimum", iobase);
 	dsdt_line("  0x%04X,             // Range Maximum", iobase);
 	dsdt_line("  0x01,               // Alignment");
 	dsdt_line("  0x%02X,               // Length", length);
 	dsdt_line("  )");
 }
 
 /*
  * Emit an ASL "IRQNoFlags ()" resource descriptor listing the single
  * interrupt 'irq' through dsdt_line().
  */
 void
 dsdt_fixed_irq(uint8_t irq)
 {
 
 	dsdt_line("IRQNoFlags ()");
 	dsdt_line("  {%d}", irq);
 }
 
 /*
  * Emit an ASL "Memory32Fixed (ReadWrite, ...)" resource descriptor for the
  * region starting at 'base' and spanning 'length' through dsdt_line().
  */
 void
 dsdt_fixed_mem32(uint32_t base, uint32_t length)
 {
 
 	dsdt_line("Memory32Fixed (ReadWrite,");
 	dsdt_line("  0x%08X,         // Address Base", base);
 	dsdt_line("  0x%08X,         // Address Length", length);
 	dsdt_line("  )");
 }
 
 /*
  * Write the ASL source for the bhyve DSDT to 'fp'.
  *
  * Points the dsdt_line() helpers at 'fp', resets their error and indent
  * state, then emits the definition block: the _S5 package, whatever
  * pci_write_dsdt() contributes, and an HPET device under _SB.PC00 with a
  * 0x400-byte fixed memory resource at 0xFED00000.
  *
  * Returns the latched 'dsdt_error' if any dsdt_line() call failed, 0 on
  * success; the err_exit path returns errno.
  * NOTE(review): EFFLUSH is defined outside this view; it presumably
  * records its status in 'err' and jumps to err_exit on failure.
  */
 static int
 basl_fwrite_dsdt(FILE *fp)
 {
 	int err;
 
 	err = 0;
 	/* Reset the shared state consumed by dsdt_line() and friends. */
 	dsdt_fp = fp;
 	dsdt_error = 0;
 	dsdt_indent_level = 0;
 
 	dsdt_line("/*");
 	dsdt_line(" * bhyve DSDT template");
 	dsdt_line(" */");
 	dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2,"
 		 "\"BHYVE \", \"BVDSDT  \", 0x00000001)");
 	dsdt_line("{");
 	dsdt_line("  Name (_S5, Package ()");
 	dsdt_line("  {");
 	dsdt_line("      0x05,");
 	dsdt_line("      Zero,");
 	dsdt_line("  })");
 
 	pci_write_dsdt();
 
 	dsdt_line("");
 	dsdt_line("  Scope (_SB.PC00)");
 	dsdt_line("  {");
 	dsdt_line("    Device (HPET)");
 	dsdt_line("    {");
 	dsdt_line("      Name (_HID, EISAID(\"PNP0103\"))");
 	dsdt_line("      Name (_UID, 0)");
 	dsdt_line("      Name (_CRS, ResourceTemplate ()");
 	dsdt_line("      {");
 	/* Indent the generated resource descriptor inside the template. */
 	dsdt_indent(4);
 	dsdt_fixed_mem32(0xFED00000, 0x400);
 	dsdt_unindent(4);
 	dsdt_line("      })");
 	dsdt_line("    }");
 	dsdt_line("  }");
 	dsdt_line("}");
 
 	/* dsdt_line() latches the first failure instead of returning it. */
 	if (dsdt_error != 0)
 		return (dsdt_error);
 
 	EFFLUSH(fp);
 
 	return (0);
 
 err_exit:
 	return (errno);
 }
 
 /*
  * Create a temporary file from one of the name templates and wrap it in a
  * stdio stream.
  *
  * When 'suffix' is non-zero the name is built from 'basl_stemplate' with
  * mkstemps() (keeping the BHYVE_ASL_SUFFIX tail); otherwise it is built
  * from 'basl_template' with mkstemp().  On success bf->f_name, bf->fd and
  * bf->fp are all valid.
  *
  * Returns 0 on success and 1 on failure.  If the stream cannot be created
  * the temporary file is removed and its descriptor closed before
  * returning, so a failed call leaves nothing for the caller to release.
  */
 static int
 basl_open(struct basl_fio *bf, int suffix)
 {
 
 	/* snprintf() always NUL-terminates, unlike strncpy(). */
 	if (suffix) {
 		snprintf(bf->f_name, MAXPATHLEN, "%s", basl_stemplate);
 		bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX));
 	} else {
 		snprintf(bf->f_name, MAXPATHLEN, "%s", basl_template);
 		bf->fd = mkstemp(bf->f_name);
 	}
 
 	/* mkstemp()/mkstemps() return -1 on failure; 0 is a valid fd. */
 	if (bf->fd < 0)
 		return (1);
 
 	bf->fp = fdopen(bf->fd, "w+");
 	if (bf->fp == NULL) {
 		/* Report the failure so callers never use a NULL stream. */
 		unlink(bf->f_name);
 		close(bf->fd);
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Release a temporary file opened by basl_open(): remove it from the
  * filesystem unless 'basl_keep_temps' is set, then close its stream.
  */
 static void
 basl_close(struct basl_fio *bf)
 {
 
 	if (basl_keep_temps == 0) {
 		unlink(bf->f_name);
 	}
 	fclose(bf->fp);
 }
 
 /*
  * Open the pair of temporary files used for one table: 'in' without a
  * suffix and 'out' with one.  If opening 'out' fails, 'in' is closed
  * again.  Returns 0 on success or the non-zero basl_open() status.
  */
 static int
 basl_start(struct basl_fio *in, struct basl_fio *out)
 {
 	int status;
 
 	status = basl_open(in, 0);
 	if (status)
 		return (status);
 
 	status = basl_open(out, 1);
 	if (status)
 		basl_close(in);
 
 	return (status);
 }
 
 /*
  * Close both temporary files of a pair opened by basl_start(),
  * 'in' first and then 'out'.
  */
 static void
 basl_end(struct basl_fio *in, struct basl_fio *out)
 {
 
 	basl_close(in);
 	basl_close(out);
 }
 
 /*
  * Read the contents of the file behind 'fd' into guest memory at
  * 'basl_acpi_base + off'.
  *
  * The file size comes from fstat() and the destination from
  * paddr_guest2host().  Returns 0 on success, errno if fstat() or read()
  * fails, or EFAULT if the guest range cannot be mapped.
  */
 static int
 basl_load(struct vmctx *ctx, int fd, uint64_t off)
 {
 	struct stat st;
 	void *dst;
 
 	if (fstat(fd, &st) < 0)
 		return (errno);
 
 	dst = paddr_guest2host(ctx, basl_acpi_base + off, st.st_size);
 	if (dst == NULL)
 		return (EFAULT);
 
 	return (read(fd, dst, st.st_size) < 0 ? errno : 0);
 }
 
 static int
 basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *), uint64_t offset)
 {
 	struct basl_fio io[2];
 	static char iaslbuf[3*MAXPATHLEN + 10];
 	char *fmt;
 	int err;
 
 	err = basl_start(&io[0], &io[1]);
 	if (!err) {
 		err = (*fwrite_section)(io[0].fp);
 
 		if (!err) {
 			/*
 			 * iasl sends the results of the compilation to
 			 * stdout. Shut this down by using the shell to
 			 * redirect stdout to /dev/null, unless the user
 			 * has requested verbose output for debugging
 			 * purposes
 			 */
 			fmt = basl_verbose_iasl ?
 				"%s -p %s %s" :
 				"/bin/sh -c \"%s -p %s %s\" 1> /dev/null";
 				
 			snprintf(iaslbuf, sizeof(iaslbuf),
 				 fmt,
 				 BHYVE_ASL_COMPILER,
 				 io[1].f_name, io[0].f_name);
 			err = system(iaslbuf);
 
 			if (!err) {
 				/*
 				 * Copy the aml output file into guest
 				 * memory at the specified location
 				 */
 				err = basl_load(ctx, io[1].fd, offset);
 			}
 		}
 		basl_end(&io[0], &io[1]);
 	}
 
 	return (err);
 }
 
 /*
  * Build the two temporary-file name templates used by basl_open():
  * 'basl_template' (<tmpdir>/BHYVE_ASL_TEMPLATE) and 'basl_stemplate'
  * (the same name followed by BHYVE_ASL_SUFFIX).
  *
  * Returns 0 on success or E2BIG if either name would not fit in
  * MAXPATHLEN.
  */
 static int
 basl_make_templates(void)
 {
 	const char *tmpdir;
 	int err;
 	int len;
 
 	err = 0;
 
 	/*
 	 * Pick the temporary directory: BHYVE_TMPDIR if set and non-empty,
 	 * otherwise TMPDIR if set and non-empty, otherwise _PATH_TMP.
 	 * Each fallback is consulted only when the previous choice is
 	 * unusable, so a valid BHYVE_TMPDIR is never overridden.
 	 */
 	tmpdir = getenv("BHYVE_TMPDIR");
 	if (tmpdir == NULL || *tmpdir == '\0')
 		tmpdir = getenv("TMPDIR");
 	if (tmpdir == NULL || *tmpdir == '\0')
 		tmpdir = _PATH_TMP;
 
 	len = strlen(tmpdir);
 
 	if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) {
 		strcpy(basl_template, tmpdir);
 		/* Drop trailing slashes so exactly one separates the parts. */
 		while (len > 0 && basl_template[len - 1] == '/')
 			len--;
 		basl_template[len] = '/';
 		strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE);
 	} else
 		err = E2BIG;
 
 	if (!err) {
 		/*
 		 * len has been initialized (and maybe adjusted) above
 		 */
 		if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 +
 		     sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) {
 			strcpy(basl_stemplate, tmpdir);
 			basl_stemplate[len] = '/';
 			strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE);
 			len = strlen(basl_stemplate);
 			strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX);
 		} else
 			err = E2BIG;
 	}
 
 	return (err);
 }
 
 /*
  * Table generators walked by acpi_build(): 'wsect' writes the ASL source
  * of one table and 'offset' is handed to basl_compile() as the table's
  * offset.  The list ends at the entry whose 'wsect' is NULL.
  */
 static struct {
 	int	(*wsect)(FILE *fp);
 	uint64_t  offset;
 } basl_ftables[] =
 {
 	{ .wsect = basl_fwrite_rsdp, .offset = 0 },
 	{ .wsect = basl_fwrite_rsdt, .offset = RSDT_OFFSET },
 	{ .wsect = basl_fwrite_xsdt, .offset = XSDT_OFFSET },
 	{ .wsect = basl_fwrite_madt, .offset = MADT_OFFSET },
 	{ .wsect = basl_fwrite_fadt, .offset = FADT_OFFSET },
 	{ .wsect = basl_fwrite_hpet, .offset = HPET_OFFSET },
 	{ .wsect = basl_fwrite_mcfg, .offset = MCFG_OFFSET },
 	{ .wsect = basl_fwrite_facs, .offset = FACS_OFFSET },
 	{ .wsect = basl_fwrite_dsdt, .offset = DSDT_OFFSET },
 	{ .wsect = NULL, .offset = 0 }
 };
 
 /*
  * Generate all ACPI tables listed in 'basl_ftables' for a guest with
  * 'ncpu' cpus and copy them into guest memory.
  *
  * Returns 0 on success or the first non-zero status from
  * vm_get_hpet_capabilities(), basl_make_templates() or basl_compile().
  */
 int
 acpi_build(struct vmctx *ctx, int ncpu)
 {
 	int err;
 	int idx;
 
 	basl_ncpu = ncpu;
 
 	err = vm_get_hpet_capabilities(ctx, &hpet_capabilities);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Debugging aid: let iasl's compiler output go to stdout instead
 	 * of /dev/null.
 	 */
 	if (getenv("BHYVE_ACPI_VERBOSE_IASL") != NULL)
 		basl_verbose_iasl = 1;
 
 	/*
 	 * Debugging aid: keep the generated ASL files rather than
 	 * deleting them after use.
 	 */
 	if (getenv("BHYVE_ACPI_KEEPTMPS") != NULL)
 		basl_keep_temps = 1;
 
 	err = basl_make_templates();
 
 	/*
 	 * Compile each ASL file in turn and copy the result into guest
 	 * memory, stopping at the first failure.
 	 */
 	for (idx = 0; err == 0 && basl_ftables[idx].wsect != NULL; idx++) {
 		err = basl_compile(ctx, basl_ftables[idx].wsect,
 		    basl_ftables[idx].offset);
 	}
 
 	return (err);
 }
Index: stable/10/usr.sbin/bhyve/bhyverun.c
===================================================================
--- stable/10/usr.sbin/bhyve/bhyverun.c	(revision 276348)
+++ stable/10/usr.sbin/bhyve/bhyverun.c	(revision 276349)
@@ -1,862 +1,868 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/time.h>
 
 #include <machine/atomic.h>
 #include <machine/segments.h>
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <err.h>
 #include <libgen.h>
 #include <unistd.h>
 #include <assert.h>
 #include <errno.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include <sysexits.h>
 
 #include <machine/vmm.h>
 #include <vmmapi.h>
 
 #include "bhyverun.h"
 #include "acpi.h"
 #include "inout.h"
 #include "dbgport.h"
 #include "ioapic.h"
 #include "mem.h"
 #include "mevent.h"
 #include "mptbl.h"
 #include "pci_emul.h"
 #include "pci_irq.h"
 #include "pci_lpc.h"
 #include "smbiostbl.h"
 #include "xmsr.h"
 #include "spinup_ap.h"
 #include "rtc.h"
 
 #define GUEST_NIO_PORT		0x488	/* guest upcalls via i/o port */
 
 #define MB		(1024UL * 1024)
 #define GB		(1024UL * MB)
 
 typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
 extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
 
 char *vmname;
 
 int guest_ncpus;
 char *guest_uuid_str;
 
 static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
 static int virtio_msix = 1;
 static int x2apic_mode = 0;	/* default is xAPIC */
 
 static int strictio;
 static int strictmsr = 1;
 
 static int acpi;
 
 static char *progname;
 static const int BSP = 0;
 
 static cpuset_t cpumask;
 
 static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
 
 static struct vm_exit vmexit[VM_MAXCPU];
 
 /*
  * Process-wide event counters.  The vmexit_* handlers in this file bump
  * vmexit_bogus, vmexit_hlt, vmexit_pause, vmexit_mtrap and
  * vmexit_inst_emul; the remaining fields are not updated in this part of
  * the file.
  */
 struct bhyvestats {
         uint64_t        vmexit_bogus;
         uint64_t        vmexit_bogus_switch;
         uint64_t        vmexit_hlt;
         uint64_t        vmexit_pause;
         uint64_t        vmexit_mtrap;
         uint64_t        vmexit_inst_emul;
         uint64_t        cpu_switch_rotate;
         uint64_t        cpu_switch_direct;
 } stats;
 
 /*
  * Per-vcpu thread bookkeeping, indexed by vcpu id.  fbsdrun_addcpu()
  * fills in an entry and passes it to the new thread as its argument.
  */
 struct mt_vmm_info {
 	/* Thread handle written by pthread_create(). */
 	pthread_t	mt_thr;
 	/* VM context handed to vm_loop(). */
 	struct vmctx	*mt_ctx;
 	/* Id of the vcpu this thread runs. */
 	int		mt_vcpu;	
 } mt_vmm_info[VM_MAXCPU];
 
 static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
 
 /*
  * Print the command-line synopsis and option summary to stderr, then
  * terminate the process with exit status 'code'.  Does not return.
  */
 static void
 usage(int code)
 {
 
         fprintf(stderr,
                 "Usage: %s [-abehwxACHPWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n"
 		"       %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
 		"       -a: local apic is in xAPIC mode (deprecated)\n"
 		"       -A: create ACPI tables\n"
 		"       -c: # cpus (default 1)\n"
 		"       -C: include guest memory in core file\n"
 		"       -e: exit on unhandled I/O access\n"
 		"       -g: gdb port\n"
 		"       -h: help\n"
 		"       -H: vmexit from the guest on hlt\n"
 		"       -l: LPC device configuration\n"
 		"       -m: memory size in MB\n"
 		"       -p: pin 'vcpu' to 'hostcpu'\n"
 		"       -P: vmexit from the guest on pause\n"
 		"       -s: <slot,driver,configinfo> PCI slot config\n"
 		"       -U: uuid\n"
 		"       -w: ignore unimplemented MSRs\n"
 		"       -W: force virtio to use single-vector MSI\n"
 		"       -x: local apic is in x2APIC mode\n"
 		"       -Y: disable MPtable generation\n",
 		/* "%*s" pads the second line by the program name's width. */
 		progname, (int)strlen(progname), "");
 
 	exit(code);
 }
 
 static int
 pincpu_parse(const char *opt)
 {
 	int vcpu, pcpu;
 
 	if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
 		fprintf(stderr, "invalid format: %s\n", opt);
 		return (-1);
 	}
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU) {
 		fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
 		    vcpu, VM_MAXCPU - 1);
 		return (-1);
 	}
 
 	if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
 		fprintf(stderr, "hostcpu '%d' outside valid range from "
 		    "0 to %d\n", pcpu, CPU_SETSIZE - 1);
 		return (-1);
 	}
 
 	if (vcpumap[vcpu] == NULL) {
 		if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) {
 			perror("malloc");
 			return (-1);
 		}
 		CPU_ZERO(vcpumap[vcpu]);
 	}
 	CPU_SET(pcpu, vcpumap[vcpu]);
 	return (0);
 }
 
 /*
  * Inject exception 'vector' into 'vcpu' of the VM whose context is passed
  * as 'arg'.  'errcode' is delivered with it only when 'errcode_valid' is
  * non-zero.  The injection is asserted to succeed.
  */
 void
 vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
     int errcode)
 {
 	struct vmctx *ctx = arg;
 	int error;
 
 	error = errcode_valid ?
 	    vm_inject_exception2(ctx, vcpu, vector, errcode) :
 	    vm_inject_exception(ctx, vcpu, vector);
 	assert(error == 0);
 
 	/*
 	 * A zero instruction length makes the faulting instruction restart
 	 * once the fault handler returns.
 	 */
 	vmexit[vcpu].inst_length = 0;
 }
 
 /*
  * Translate the guest physical range ['gaddr', 'gaddr' + 'len') into a
  * host pointer by delegating to vm_map_gpa(); its result is returned
  * unchanged.
  */
 void *
 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
 {
 	void *hostaddr;
 
 	hostaddr = vm_map_gpa(ctx, gaddr, len);
 	return (hostaddr);
 }
 
 /* Return the current value of the 'guest_vmexit_on_pause' setting. */
 int
 fbsdrun_vmexit_on_pause(void)
 {
 
 	return (guest_vmexit_on_pause);
 }
 
 /* Return the current value of the 'guest_vmexit_on_hlt' setting. */
 int
 fbsdrun_vmexit_on_hlt(void)
 {
 
 	return (guest_vmexit_on_hlt);
 }
 
 /* Return the current value of the 'virtio_msix' setting (1 by default). */
 int
 fbsdrun_virtio_msix(void)
 {
 
 	return (virtio_msix);
 }
 
 /*
  * Thread entry point for a vcpu.  'param' is the vcpu's mt_vmm_info
  * entry: the thread is named "vcpu N" and then enters vm_loop() at the
  * rip recorded in the vcpu's vmexit slot.
  */
 static void *
 fbsdrun_start_thread(void *param)
 {
 	struct mt_vmm_info *info = param;
 	char thread_name[MAXCOMLEN + 1];
 	int cpu;
 
 	cpu = info->mt_vcpu;
 
 	snprintf(thread_name, sizeof(thread_name), "vcpu %d", cpu);
 	pthread_set_name_np(info->mt_thr, thread_name);
 
 	vm_loop(info->mt_ctx, cpu, vmexit[cpu].rip);
 
 	/* not reached */
 	exit(1);
 	return (NULL);
 }
 
 void
 fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
 {
 	int error;
 
 	assert(fromcpu == BSP);
 
 	/*
 	 * The 'newcpu' must be activated in the context of 'fromcpu'. If
 	 * vm_activate_cpu() is delayed until newcpu's pthread starts running
 	 * then vmm.ko is out-of-sync with bhyve and this can create a race
 	 * with vm_suspend().
 	 */
 	error = vm_activate_cpu(ctx, newcpu);
 	assert(error == 0);
 
 	CPU_SET_ATOMIC(newcpu, &cpumask);
 
 	/*
 	 * Set up the vmexit struct to allow execution to start
 	 * at the given RIP
 	 */
 	vmexit[newcpu].rip = rip;
 	vmexit[newcpu].inst_length = 0;
 
 	mt_vmm_info[newcpu].mt_ctx = ctx;
 	mt_vmm_info[newcpu].mt_vcpu = newcpu;
 
 	error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
 	    fbsdrun_start_thread, &mt_vmm_info[newcpu]);
 	assert(error == 0);
 }
 
 /*
  * Remove 'vcpu' from the set of running cpus.  Exits the process with
  * status 1 if the vcpu is not in the set.  Returns the result of
  * CPU_EMPTY() on the set after the removal.
  */
 static int
 fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
 {
 	int was_known;
 
 	was_known = CPU_ISSET(vcpu, &cpumask);
 	if (!was_known) {
 		fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
 		exit(1);
 	}
 
 	CPU_CLR_ATOMIC(vcpu, &cpumask);
 
 	return (CPU_EMPTY(&cpumask));
 }
 
 /*
  * Handle a guest write to the host notification port (see vmexit_inout()).
  * No action is currently taken; the arguments are unused and the guest
  * simply continues.
  */
 static int
 vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
 		     uint32_t eax)
 {
 #if BHYVE_DEBUG
 	/*
 	 * put guest-driven debug here
 	 */
 #endif
         return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handle an I/O port access VM exit.
  *
  * Writes to GUEST_NIO_PORT go to vmexit_handle_notify().  Everything
  * else is passed to emulate_inout(); for a successful non-string "in",
  * the value is copied back into the guest's RAX.
  *
  * Returns VMEXIT_CONTINUE on success, or VMEXIT_ABORT after reporting
  * the unhandled access.
  */
 static int
 vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 {
 	int error, vcpu;
 	int is_in, is_string, port, width;
 	char suffix;
 
 	vcpu = *pvcpu;
 
 	port = vme->u.inout.port;
 	width = vme->u.inout.bytes;
 	is_string = vme->u.inout.string;
 	is_in = vme->u.inout.in;
 
 	/* Extra-special case of host notifications */
 	if (!is_in && port == GUEST_NIO_PORT)
 		return (vmexit_handle_notify(ctx, vme, pvcpu,
 		    vme->u.inout.eax));
 
 	error = emulate_inout(ctx, vcpu, vme, strictio);
 	if (error == 0 && is_in && !is_string) {
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
 		    vme->u.inout.eax);
 		assert(error == 0);
 	}
 
 	if (error == 0)
 		return (VMEXIT_CONTINUE);
 
 	/* Size suffix for the diagnostic: byte, word, otherwise long. */
 	switch (width) {
 	case 1:
 		suffix = 'b';
 		break;
 	case 2:
 		suffix = 'w';
 		break;
 	default:
 		suffix = 'l';
 		break;
 	}
 	fprintf(stderr, "Unhandled %s%c 0x%04x\n", is_in ? "in" : "out",
 	    suffix, port);
 	return (VMEXIT_ABORT);
 }
 
 static int
 vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 {
 	uint64_t val;
 	uint32_t eax, edx;
 	int error;
 
 	val = 0;
 	error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
 	if (error != 0) {
 		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
 		    vme->u.msr.code, *pvcpu);
 		if (strictmsr) {
 			vm_inject_gp(ctx, *pvcpu);
 			return (VMEXIT_RESTART);
 		}
 	}
 
 	eax = val;
 	error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
 	assert(error == 0);
 
 	edx = val >> 32;
 	error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
 	assert(error == 0);
 
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handle a WRMSR VM exit.
  *
  * If emulate_wrmsr() fails the access is logged; with 'strictmsr' set a
  * #GP is injected and VMEXIT_RESTART returned.  In every other case the
  * guest continues.
  */
 static int
 vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 {
 
 	if (emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,
 	    vme->u.msr.wval) == 0)
 		return (VMEXIT_CONTINUE);
 
 	fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
 	    vme->u.msr.code, vme->u.msr.wval, *pvcpu);
 	if (!strictmsr)
 		return (VMEXIT_CONTINUE);
 
 	vm_inject_gp(ctx, *pvcpu);
 	return (VMEXIT_RESTART);
 }
 
 /*
  * Handle a request to start an application processor: pass the target
  * vcpu and start rip from the exit record to spinup_ap().  Its return
  * value is not used; the requesting vcpu always continues.
  */
 static int
 vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 {
 
 	(void)spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu,
 	    vme->u.spinup_ap.rip);
 
 	return (VMEXIT_CONTINUE);
 }
 
 #define	DEBUG_EPT_MISCONFIG
 #ifdef DEBUG_EPT_MISCONFIG
 #define	EXIT_REASON_EPT_MISCONFIG	49
 #define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400
 #define	VMCS_IDENT(x)			((x) | 0x80000000)
 
 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
 static int ept_misconfig_ptenum;
 #endif
 
 /*
  * Handle a VM_EXITCODE_VMX exit: dump the contents of the exit record to
  * stderr and abort the VM.
  *
  * When DEBUG_EPT_MISCONFIG is defined and the exit reason is an EPT
  * misconfiguration, the guest physical address and the page table
  * entries reported by vm_get_gpa_pmap() are printed as well.
  *
  * Always returns VMEXIT_ABORT.
  */
 static int
 vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	fprintf(stderr, "vm exit[%d]\n", *pvcpu);
 	fprintf(stderr, "\treason\t\tVMX\n");
 	fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
 	fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
 	fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
 	fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
 	fprintf(stderr, "\tqualification\t0x%016lx\n",
 	    vmexit->u.vmx.exit_qualification);
 	fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
 	fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
 #ifdef DEBUG_EPT_MISCONFIG
 	if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
 		/* Return values of both lookups are not checked. */
 		vm_get_register(ctx, *pvcpu,
 		    VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
 		    &ept_misconfig_gpa);
 		vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
 		    &ept_misconfig_ptenum);
 		fprintf(stderr, "\tEPT misconfiguration:\n");
 		fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
 		fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
 		    ept_misconfig_ptenum, ept_misconfig_pte[0],
 		    ept_misconfig_pte[1], ept_misconfig_pte[2],
 		    ept_misconfig_pte[3]);
 	}
 #endif	/* DEBUG_EPT_MISCONFIG */
 	return (VMEXIT_ABORT);
 }
 
 /*
  * Handle a VM_EXITCODE_BOGUS exit: count it and resume the guest at the
  * same rip (VMEXIT_RESTART).
  */
 static int
 vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	stats.vmexit_bogus++;
 
 	return (VMEXIT_RESTART);
 }
 
 /*
  * Handle a HLT exit: count it and resume the guest after the
  * instruction (VMEXIT_CONTINUE).
  */
 static int
 vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	stats.vmexit_hlt++;
 
 	/*
 	 * Just continue execution with the next instruction. We use
 	 * the HLT VM exit as a way to be friendly with the host
 	 * scheduler.
 	 */
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handle a PAUSE exit: count it and resume the guest after the
  * instruction (VMEXIT_CONTINUE).
  */
 static int
 vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	stats.vmexit_pause++;
 
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handle a VM_EXITCODE_MTRAP exit: count it and resume the guest at the
  * same rip (VMEXIT_RESTART).
  */
 static int
 vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	stats.vmexit_mtrap++;
 
 	return (VMEXIT_RESTART);
 }
 
 static int
 vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 	int err;
 	stats.vmexit_inst_emul++;
 
 	err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
 	    &vmexit->u.inst_emul.vie, &vmexit->u.inst_emul.paging);
 
 	if (err) {
 		if (err == EINVAL) {
 			fprintf(stderr,
 			    "Failed to emulate instruction at 0x%lx\n", 
 			    vmexit->rip);
 		} else if (err == ESRCH) {
 			fprintf(stderr, "Unhandled memory access to 0x%lx\n",
 			    vmexit->u.inst_emul.gpa);
 		}
 
 		return (VMEXIT_ABORT);
 	}
 
 	return (VMEXIT_CONTINUE);
 }
 
 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
 
 /*
  * Handle a VM_EXITCODE_SUSPENDED exit.
  *
  * The calling vcpu removes itself from 'cpumask'.  A vcpu other than the
  * BSP then signals 'resetcpu_cond' and ends its thread.  The BSP waits on
  * that condition until 'cpumask' is empty and then terminates the
  * process, with an exit status that encodes the suspend reason:
  * 0 reset, 1 poweroff, 2 halt, 3 triple fault, 100 for anything else.
  *
  * Does not return to the caller on any path.
  */
 static int
 vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 	enum vm_suspend_how how;
 
 	how = vmexit->u.suspended.how;
 
 	fbsdrun_deletecpu(ctx, *pvcpu);
 
 	if (*pvcpu != BSP) {
 		/* Wake the BSP so it can re-check whether all cpus are gone. */
 		pthread_mutex_lock(&resetcpu_mtx);
 		pthread_cond_signal(&resetcpu_cond);
 		pthread_mutex_unlock(&resetcpu_mtx);
 		pthread_exit(NULL);
 	}
 
 	/* BSP: wait until every other vcpu has removed itself. */
 	pthread_mutex_lock(&resetcpu_mtx);
 	while (!CPU_EMPTY(&cpumask)) {
 		pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
 	}
 	pthread_mutex_unlock(&resetcpu_mtx);
 
 	switch (how) {
 	case VM_SUSPEND_RESET:
 		exit(0);
 	case VM_SUSPEND_POWEROFF:
 		exit(1);
 	case VM_SUSPEND_HALT:
 		exit(2);
 	case VM_SUSPEND_TRIPLEFAULT:
 		exit(3);
 	default:
 		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
 		exit(100);
 	}
 	return (0);	/* NOTREACHED */
 }
 
 /*
  * Dispatch table used by vm_loop(), indexed by VM exit code.  Exit codes
  * without an entry stay NULL, which vm_loop() treats as unexpected.
  * fbsdrun_set_capabilities() installs further handlers at run time.
  */
 static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
 	[VM_EXITCODE_INOUT]  = vmexit_inout,
 	[VM_EXITCODE_INOUT_STR]  = vmexit_inout,
 	[VM_EXITCODE_VMX]    = vmexit_vmx,
 	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
 	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
 	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
 	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
 	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
 	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
 	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
 	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
 };
 
 /*
  * Run loop for one vcpu thread.
  *
  * ctx  - VM context handed to every libvmmapi call.
  * vcpu - index of the vcpu this thread drives.
  * rip  - guest instruction pointer to start from.
  *
  * Pins the thread if vcpumap[] holds an affinity set for this vcpu, then
  * repeatedly enters the guest with vm_run() and dispatches each exit through
  * handler[].  The handler's return value selects the next rip:
  *   VMEXIT_CONTINUE - resume after the exiting instruction,
  *   VMEXIT_RESTART  - re-execute the exiting instruction,
  *   VMEXIT_ABORT    - abort(),
  *   anything else   - exit(1).
  * Returns (after logging) only when vm_run() itself fails.
  */
 static void
 vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
 {
 	int error, rc;
 	enum vm_exitcode exitcode;
 	cpuset_t active_cpus;

 	if (vcpumap[vcpu] != NULL) {
 		error = pthread_setaffinity_np(pthread_self(),
 		    sizeof(cpuset_t), vcpumap[vcpu]);
 		assert(error == 0);
 	}

 	/*
 	 * active_cpus is only meaningful when the query succeeded, so check
 	 * the result before inspecting the set.
 	 */
 	error = vm_active_cpus(ctx, &active_cpus);
 	assert(error == 0);
 	assert(CPU_ISSET(vcpu, &active_cpus));

 	while (1) {
 		error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
 		if (error != 0)
 			break;

 		exitcode = vmexit[vcpu].exitcode;
 		if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
 			fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
 			    exitcode);
 			exit(1);
 		}

 		rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);

 		switch (rc) {
 		case VMEXIT_CONTINUE:
 			rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
 			break;
 		case VMEXIT_RESTART:
 			rip = vmexit[vcpu].rip;
 			break;
 		case VMEXIT_ABORT:
 			abort();
 		default:
 			exit(1);
 		}
 	}
 	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
 }
 
 /*
  * Report how many vcpus the guest may use: VM_MAXCPU when querying the
  * UNRESTRICTED_GUEST capability on the BSP succeeds, otherwise 1.
  */
 static int
 num_vcpus_allowed(struct vmctx *ctx)
 {
 	int capval, rc;

 	rc = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &capval);

 	/*
 	 * Spinning up more than one processor is only permitted when the
 	 * UNRESTRICTED_GUEST capability can be queried.
 	 */
 	return (rc == 0 ? VM_MAXCPU : 1);
 }
 
 /*
  * Apply the run-time selected capabilities to one vcpu.
  *
  * When HLT or PAUSE exiting has been requested the matching capability must
  * be queryable, otherwise the process exits with status 1.  The matching
  * exit handler is installed when called for the BSP.  The x2apic state is
  * then set from x2apic_mode (failure is fatal) and INVPCID is enabled
  * without checking the result.
  */
 void
 fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
 {
 	int rc, capval;

 	if (fbsdrun_vmexit_on_hlt()) {
 		rc = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &capval);
 		if (rc < 0) {
 			fprintf(stderr, "VM exit on HLT not supported\n");
 			exit(1);
 		}
 		vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
 		if (cpu == BSP)
 			handler[VM_EXITCODE_HLT] = vmexit_hlt;
 	}

 	if (fbsdrun_vmexit_on_pause()) {
 		/* This mode cannot work without PAUSE exiting. */
 		rc = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &capval);
 		if (rc < 0) {
 			fprintf(stderr,
 			    "SMP mux requested, no pause support\n");
 			exit(1);
 		}
 		vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
 		if (cpu == BSP)
 			handler[VM_EXITCODE_PAUSE] = vmexit_pause;
 	}

 	rc = vm_set_x2apic_state(ctx, cpu,
 	    x2apic_mode ? X2APIC_ENABLED : X2APIC_DISABLED);
 	if (rc != 0) {
 		fprintf(stderr, "Unable to set x2apic state (%d)\n", rc);
 		exit(1);
 	}

 	vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
 }
 
 /*
  * Program entry point: parse the command line, open the named VM, set up
  * guest memory and the emulated devices, build the firmware tables and
  * start the boot processor before entering the event dispatch loop.
  *
  * Exactly one non-option argument (the VM name) is required.  Every setup
  * failure terminates the process; the function does not return normally.
  */
 int
 main(int argc, char *argv[])
 {
 	int c, error, gdb_port, err, bvmcons;
 	int dump_guest_memory, max_vcpus, mptgen;
 	struct vmctx *ctx;
 	uint64_t rip;
 	size_t memsize;

 	bvmcons = 0;
 	dump_guest_memory = 0;
 	progname = basename(argv[0]);
 	gdb_port = 0;
 	guest_ncpus = 1;
 	memsize = 256 * MB;
 	mptgen = 1;

 	while ((c = getopt(argc, argv, "abehwxACHIPWYp:g:c:s:m:l:U:")) != -1) {
 		switch (c) {
 		case 'a':
 			x2apic_mode = 0;
 			break;
 		case 'A':
 			acpi = 1;
 			break;
 		case 'b':
 			bvmcons = 1;
 			break;
 		case 'p':
 			if (pincpu_parse(optarg) != 0) {
 				errx(EX_USAGE, "invalid vcpu pinning "
 				    "configuration '%s'", optarg);
 			}
 			break;
 		case 'c':
 			guest_ncpus = atoi(optarg);
 			break;
 		case 'C':
 			dump_guest_memory = 1;
 			break;
 		case 'g':
 			gdb_port = atoi(optarg);
 			break;
 		case 'l':
 			if (lpc_device_parse(optarg) != 0) {
 				errx(EX_USAGE, "invalid lpc device "
 				    "configuration '%s'", optarg);
 			}
 			break;
 		case 's':
 			if (pci_parse_slot(optarg) != 0)
 				exit(1);
 			else
 				break;
 		case 'm':
 			error = vm_parse_memsize(optarg, &memsize);
 			if (error)
 				errx(EX_USAGE, "invalid memsize '%s'", optarg);
 			break;
 		case 'H':
 			guest_vmexit_on_hlt = 1;
 			break;
 		case 'I':
 			/*
 			 * The "-I" option was used to add an ioapic to the
 			 * virtual machine.
 			 *
 			 * An ioapic is now provided unconditionally for each
 			 * virtual machine and this option is now deprecated.
 			 */
 			break;
 		case 'P':
 			guest_vmexit_on_pause = 1;
 			break;
 		case 'e':
 			strictio = 1;
 			break;
 		case 'U':
 			guest_uuid_str = optarg;
 			break;
 		case 'w':
 			strictmsr = 0;
 			break;
 		case 'W':
 			virtio_msix = 0;
 			break;
 		case 'x':
 			x2apic_mode = 1;
 			break;
 		case 'Y':
 			mptgen = 0;
 			break;
 		case 'h':
 			usage(0);
 			/* FALLTHROUGH (usage() presumably does not return) */
 		default:
 			usage(1);
 		}
 	}
 	argc -= optind;
 	argv += optind;

 	if (argc != 1)
 		usage(1);

 	vmname = argv[0];

 	ctx = vm_open(vmname);
 	if (ctx == NULL) {
 		perror("vm_open");
 		exit(1);
 	}

 	max_vcpus = num_vcpus_allowed(ctx);
 	if (guest_ncpus > max_vcpus) {
 		fprintf(stderr, "%d vCPUs requested but only %d available\n",
 			guest_ncpus, max_vcpus);
 		exit(1);
 	}

 	fbsdrun_set_capabilities(ctx, BSP);

 	if (dump_guest_memory)
 		vm_set_memflags(ctx, VM_MEM_F_INCORE);
 	err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
 	if (err) {
 		fprintf(stderr, "Unable to setup memory (%d)\n", err);
 		exit(1);
 	}

+	error = init_msr();
+	if (error) {
+		fprintf(stderr, "init_msr error %d\n", error);
+		exit(1);
+	}
+
 	init_mem();
 	init_inout();
 	pci_irq_init(ctx);
 	ioapic_init(ctx);

 	rtc_init(ctx);
 	sci_init(ctx);

 	/*
 	 * Exit if a device emulation finds an error in its initialization
 	 */
 	if (init_pci(ctx) != 0)
 		exit(1);

 	if (gdb_port != 0)
 		init_dbgport(gdb_port);

 	if (bvmcons)
 		init_bvmcons();

 	error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
 	assert(error == 0);

 	/*
 	 * build the guest tables, MP etc.
 	 */
 	if (mptgen) {
 		error = mptable_build(ctx, guest_ncpus);
 		if (error)
 			exit(1);
 	}

 	error = smbios_build(ctx);
 	assert(error == 0);

 	if (acpi) {
 		error = acpi_build(ctx, guest_ncpus);
 		assert(error == 0);
 	}

 	/*
 	 * Change the proc title to include the VM name.
 	 */
 	setproctitle("%s", vmname);

 	/*
 	 * Add CPU 0
 	 */
 	fbsdrun_addcpu(ctx, BSP, BSP, rip);

 	/*
 	 * Head off to the main event dispatch loop
 	 */
 	mevent_dispatch();

 	exit(1);
 }
Index: stable/10/usr.sbin/bhyve/block_if.c
===================================================================
--- stable/10/usr.sbin/bhyve/block_if.c	(revision 276348)
+++ stable/10/usr.sbin/bhyve/block_if.c	(revision 276349)
@@ -1,474 +1,490 @@
 /*-
  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/errno.h>
 #include <sys/stat.h>
 #include <sys/ioctl.h>
 #include <sys/disk.h>
 
 #include <assert.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include <unistd.h>
 
 #include "bhyverun.h"
 #include "block_if.h"
 
 #define BLOCKIF_SIG	0xb109b109
 
 #define BLOCKIF_MAXREQ	32
 
 /* Operation requested of the block i/o thread for a queued element. */
 enum blockop {
 	BOP_READ,
 	BOP_WRITE,
-	BOP_FLUSH,
-	BOP_CANCEL
+	BOP_FLUSH
 };
 
 /* Which queue a blockif_elem currently sits on: free or in use. */
 enum blockstat {
 	BST_FREE,
 	BST_INUSE
 };
 
 /* One slot of the per-context request queue. */
 struct blockif_elem {
 	TAILQ_ENTRY(blockif_elem) be_link;	/* free/inuse queue linkage */
 	struct blockif_req  *be_req;	/* caller's request; NULL when free */
 	enum blockop	     be_op;	/* operation to perform */
 	enum blockstat	     be_status;	/* BST_FREE or BST_INUSE */
 };
 
 /* State of one open block backend, created by blockif_open(). */
 struct blockif_ctxt {
 	int			bc_magic;	/* BLOCKIF_SIG while valid */
 	int			bc_fd;		/* backing file descriptor */
 	int			bc_rdonly;	/* non-zero: writes fail with EROFS */
 	off_t			bc_size;	/* backing store size in bytes */
 	int			bc_sectsz;	/* sector size in bytes */
 	pthread_t		bc_btid;	/* block i/o thread */
         pthread_mutex_t		bc_mtx;		/* protects the queues below */
         pthread_cond_t		bc_cond;	/* signalled when work is queued */
 	int			bc_closing;	/* set by blockif_close() */

 	/* Request elements and free/inuse queues */
 	TAILQ_HEAD(, blockif_elem) bc_freeq;       
 	TAILQ_HEAD(, blockif_elem) bc_inuseq;       
 	u_int			bc_req_count;	/* elements on bc_inuseq */
 	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
 };
 
 /*
  * Take the first free element, bind it to 'breq'/'op' and append it to the
  * in-use queue.  Asserts that a free element is available.  Always
  * returns 0.
  */
 static int
 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
 		enum blockop op)
 {
 	struct blockif_elem *slot;

 	assert(bc->bc_req_count < BLOCKIF_MAXREQ);

 	slot = TAILQ_FIRST(&bc->bc_freeq);
 	assert(slot != NULL);
 	assert(slot->be_status == BST_FREE);
 	TAILQ_REMOVE(&bc->bc_freeq, slot, be_link);

 	slot->be_req = breq;
 	slot->be_op = op;
 	slot->be_status = BST_INUSE;
 	TAILQ_INSERT_TAIL(&bc->bc_inuseq, slot, be_link);
 	bc->bc_req_count++;

 	return (0);
 }
 
 /*
  * Pop the oldest in-use element.  A copy of it is stored in '*el' before
  * the element itself is reset and returned to the free queue.
  *
  * Returns 0 on success, or ENOENT when no request is queued.
  */
 static int
 blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem *el)
 {
 	struct blockif_elem *head;

 	if (bc->bc_req_count == 0)
 		return (ENOENT);

 	head = TAILQ_FIRST(&bc->bc_inuseq);
 	assert(head != NULL);
 	assert(head->be_status == BST_INUSE);

 	/* Snapshot for the caller before the slot is recycled. */
 	*el = *head;

 	TAILQ_REMOVE(&bc->bc_inuseq, head, be_link);
 	head->be_req = NULL;
 	head->be_status = BST_FREE;
 	TAILQ_INSERT_TAIL(&bc->bc_freeq, head, be_link);
 	bc->bc_req_count--;

 	return (0);
 }
 
 /*
  * Carry out one dequeued request against the backing descriptor and report
  * the outcome through the request's completion callback.
  *
  * The callback receives 0 on success or an errno value: the errno of a
  * failed preadv()/pwritev(), EROFS for a write to a read-only context, or
  * EINVAL for an unrecognised operation.
  */
 static void
 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
 {
 	struct blockif_req *br;
 	int err;

 	br = be->be_req;
 	err = 0;

 	switch (be->be_op) {
 	case BOP_READ:
 		/* NOTE(review): only a negative return is an error; short reads go unnoticed. */
 		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
 			   br->br_offset) < 0)
 			err = errno;
 		break;
 	case BOP_WRITE:
 		if (bc->bc_rdonly)
 			err = EROFS;
 		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
 			     br->br_offset) < 0)
 			err = errno;
 		break;
 	case BOP_FLUSH:
 		/* NOTE(review): no i/o is issued here; a flush completes with success. */
 		break;
-	case BOP_CANCEL:
-		err = EINTR;
-		break;
 	default:
 		err = EINVAL;
 		break;
 	}

 	(*br->br_callback)(br, err);
 }
 
 /*
  * Body of the per-context block i/o thread.
  *
  * arg - the struct blockif_ctxt this thread serves.
  *
  * Drains the in-use queue, running each request with the mutex dropped, and
  * sleeps on bc_cond when the queue is empty.  The closing flag is examined
  * with the mutex held and before every sleep, so a close request raised
  * while the thread was busy processing i/o is noticed instead of the thread
  * going back to sleep.  Requests already queued are completed before the
  * thread exits.
  */
 static void *
 blockif_thr(void *arg)
 {
 	struct blockif_ctxt *bc;
 	struct blockif_elem req;

 	bc = arg;

 	pthread_mutex_lock(&bc->bc_mtx);
 	for (;;) {
 		while (!blockif_dequeue(bc, &req)) {
 			pthread_mutex_unlock(&bc->bc_mtx);
 			blockif_proc(bc, &req);
 			pthread_mutex_lock(&bc->bc_mtx);
 		}

 		/*
 		 * Check ctxt status here to see if exit requested
 		 */
 		if (bc->bc_closing)
 			break;

 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
 	}
 	pthread_mutex_unlock(&bc->bc_mtx);

 	pthread_exit(NULL);

 	/* Not reached */
 	return (NULL);
 }
 
 /*
  * Open a block backend described by an option string and start its i/o
  * thread.
  *
  * optstr - comma-separated string whose first element is the pathname;
  *          the recognised options are "nocache" (O_DIRECT), "sync"
  *          (O_SYNC) and "ro" (read-only).
  * ident  - suffix used when naming the i/o thread ("blk-<ident>").
  *
  * When a read/write open fails the open is retried read-only.  For
  * character devices the media and sector sizes come from the
  * DIOCGMEDIASIZE/DIOCGSECTORSIZE ioctls; otherwise st_size and DEV_BSIZE
  * are used.
  *
  * Returns the new context, or NULL on failure.  The private copy of the
  * option string and the descriptor are released on every failure path.
  */
 struct blockif_ctxt *
 blockif_open(const char *optstr, const char *ident)
 {
 	char tname[MAXCOMLEN + 1];
 	char *nopt, *xopts;
 	struct blockif_ctxt *bc;
 	struct stat sbuf;
 	off_t size;
 	int extra, fd, i, sectsz;
 	int nocache, sync, ro;

 	fd = -1;
 	nocache = 0;
 	sync = 0;
 	ro = 0;

 	/*
 	 * The first element in the optstring is always a pathname.
 	 * Optional elements follow
 	 */
 	nopt = strdup(optstr);
 	if (nopt == NULL) {
 		perror("Could not copy block device options");
 		return (NULL);
 	}
 	for (xopts = strtok(nopt, ",");
 	     xopts != NULL;
 	     xopts = strtok(NULL, ",")) {
 		if (!strcmp(xopts, "nocache"))
 			nocache = 1;
 		else if (!strcmp(xopts, "sync"))
 			sync = 1;
 		else if (!strcmp(xopts, "ro"))
 			ro = 1;
 	}

 	extra = 0;
 	if (nocache)
 		extra |= O_DIRECT;
 	if (sync)
 		extra |= O_SYNC;

 	/* strtok() terminated the pathname in place, so nopt is the path. */
 	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
 	if (fd < 0 && !ro) {
 		/* Attempt a r/w fail with a r/o open */
 		fd = open(nopt, O_RDONLY | extra);
 		ro = 1;
 	}

 	if (fd < 0) {
 		perror("Could not open backing file");
 		goto err;
 	}

 	if (fstat(fd, &sbuf) < 0) {
 		perror("Could not stat backing file");
 		goto err;
 	}

 	/*
 	 * Deal with raw devices
 	 */
 	size = sbuf.st_size;
 	sectsz = DEV_BSIZE;
 	if (S_ISCHR(sbuf.st_mode)) {
 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
 			perror("Could not fetch dev blk/sector size");
 			goto err;
 		}
 		assert(size != 0);
 		assert(sectsz != 0);
 	}

 	bc = calloc(1, sizeof(struct blockif_ctxt));
 	if (bc == NULL)
 		goto err;

 	bc->bc_magic = BLOCKIF_SIG;
 	bc->bc_fd = fd;
+	bc->bc_rdonly = ro;
 	bc->bc_size = size;
 	bc->bc_sectsz = sectsz;
 	pthread_mutex_init(&bc->bc_mtx, NULL);
 	pthread_cond_init(&bc->bc_cond, NULL);
 	TAILQ_INIT(&bc->bc_freeq);
 	TAILQ_INIT(&bc->bc_inuseq);
 	bc->bc_req_count = 0;
 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
 		bc->bc_reqs[i].be_status = BST_FREE;
 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
 	}

 	pthread_create(&bc->bc_btid, NULL, blockif_thr, bc);

 	snprintf(tname, sizeof(tname), "blk-%s", ident);
 	pthread_set_name_np(bc->bc_btid, tname);

 	/* The option string copy is not needed past this point. */
 	free(nopt);
 	return (bc);

 err:
 	if (fd >= 0)
 		close(fd);
 	free(nopt);
 	return (NULL);
 }
 
 /*
  * Queue 'breq' with operation 'op' and wake the block i/o thread.
  *
  * Returns 0 on success, or E2BIG when BLOCKIF_MAXREQ requests are already
  * queued; callers may not exceed the advertised queue limit.
  */
 static int
 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
 		enum blockop op)
 {
 	int rc;

 	pthread_mutex_lock(&bc->bc_mtx);
 	if (bc->bc_req_count >= BLOCKIF_MAXREQ) {
 		/* Queue limit reached: refuse the request. */
 		rc = E2BIG;
 	} else {
 		/* Enqueue and tell the i/o thread there is work. */
 		blockif_enqueue(bc, breq, op);
 		pthread_cond_signal(&bc->bc_cond);
 		rc = 0;
 	}
 	pthread_mutex_unlock(&bc->bc_mtx);

 	return (rc);
 }
 
 /* Queue a read request; returns the result of blockif_request(). */
 int
 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	int rc;

 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rc = blockif_request(bc, breq, BOP_READ);
 	return (rc);
 }
 
 /* Queue a write request; returns the result of blockif_request(). */
 int
 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	int rc;

 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rc = blockif_request(bc, breq, BOP_WRITE);
 	return (rc);
 }
 
 /* Queue a flush request; returns the result of blockif_request(). */
 int
 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	int rc;

 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rc = blockif_request(bc, breq, BOP_FLUSH);
 	return (rc);
 }
 
 /*
  * Withdraw a request that is still waiting on the in-use queue.
  *
  * Returns 0 when the request was found and its element returned to the
  * free queue, or EINVAL when 'breq' is not on the in-use queue (for
  * example because it has already been dequeued for processing).
  */
 int
 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
+	struct blockif_elem *be;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
-	return (blockif_request(bc, breq, BOP_CANCEL));
+
+	pthread_mutex_lock(&bc->bc_mtx);
+	/* Look for the element that carries this request. */
+	TAILQ_FOREACH(be, &bc->bc_inuseq, be_link) {
+		if (be->be_req == breq)
+			break;
+	}
+	if (be == NULL) {
+		pthread_mutex_unlock(&bc->bc_mtx);
+		return (EINVAL);
+	}
+
+	/* Recycle the element; the request's callback is not invoked. */
+	TAILQ_REMOVE(&bc->bc_inuseq, be, be_link);
+	be->be_status = BST_FREE;
+	be->be_req = NULL;
+	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
+	bc->bc_req_count--;
+	pthread_mutex_unlock(&bc->bc_mtx);
+
+	return (0);
 }
 
 /*
  * Stop the block i/o thread and release the context.
  *
  * The closing flag is raised and the condition signalled while holding
  * bc_mtx, so the request cannot slip in between the i/o thread deciding to
  * sleep and actually blocking in pthread_cond_wait().
  *
  * Always returns 0.  'bc' must not be used afterwards.
  */
 int
 blockif_close(struct blockif_ctxt *bc)
 {
 	void *jval;

 	assert(bc->bc_magic == BLOCKIF_SIG);

 	/*
 	 * Stop the block i/o thread
 	 */
 	pthread_mutex_lock(&bc->bc_mtx);
 	bc->bc_closing = 1;
 	pthread_cond_signal(&bc->bc_cond);
 	pthread_mutex_unlock(&bc->bc_mtx);
 	pthread_join(bc->bc_btid, &jval);

 	/* XXX Cancel queued i/o's ??? */

 	/*
 	 * Release resources
 	 */
 	bc->bc_magic = 0;
 	close(bc->bc_fd);
 	free(bc);

 	return (0);
 }
 
 /*
  * Compute a virtual cylinder/head/sector geometry for the backing store,
  * following the algorithm outlined in the VHD specification.
  *
  * c, h, s - receive cylinders, heads and sectors-per-track respectively.
  */
 void
 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
 {
 	off_t nsect;		/* total sectors of the block dev */
 	off_t cylheads;		/* cylinders times heads */
 	uint16_t spt;		/* sectors per track */
 	uint8_t nheads;

 	assert(bc->bc_magic == BLOCKIF_SIG);

 	nsect = bc->bc_size / bc->bc_sectsz;

 	/* Clamp the size to the largest possible with CHS */
 	if (nsect > 65535UL*16*255)
 		nsect = 65535UL*16*255;

 	if (nsect < 65536UL*16*63) {
 		/* Start with 17 sectors/track and grow until it fits. */
 		spt = 17;
 		cylheads = nsect / spt;
 		nheads = (cylheads + 1023) / 1024;

 		if (nheads < 4)
 			nheads = 4;

 		if (cylheads >= (nheads * 1024) || nheads > 16) {
 			spt = 31;
 			nheads = 16;
 			cylheads = nsect / spt;
 		}
 		if (cylheads >= (nheads * 1024)) {
 			spt = 63;
 			nheads = 16;
 			cylheads = nsect / spt;
 		}
 	} else {
 		spt = 255;
 		nheads = 16;
 		cylheads = nsect / spt;
 	}

 	*c = cylheads / nheads;
 	*h = nheads;
 	*s = spt;
 }
 
 /*
  * Accessors
  */
 /* Size of the backing store in bytes, as recorded at open time. */
 off_t
 blockif_size(struct blockif_ctxt *bc)
 {
 	off_t bytes;

 	assert(bc->bc_magic == BLOCKIF_SIG);
 	bytes = bc->bc_size;
 	return (bytes);
 }
 
 /* Sector size of the backing store, as recorded at open time. */
 int
 blockif_sectsz(struct blockif_ctxt *bc)
 {
 	int sectsz;

 	assert(bc->bc_magic == BLOCKIF_SIG);
 	sectsz = bc->bc_sectsz;
 	return (sectsz);
 }
 
 /* Maximum number of requests that may be queued at once. */
 int
 blockif_queuesz(struct blockif_ctxt *bc)
 {
 	const int limit = BLOCKIF_MAXREQ;

 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (limit);
 }
 
 /* Non-zero when the backend was opened read-only. */
 int
 blockif_is_ro(struct blockif_ctxt *bc)
 {
 	int rdonly;

 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rdonly = bc->bc_rdonly;
 	return (rdonly);
 }
Index: stable/10/usr.sbin/bhyve/pci_ahci.c
===================================================================
--- stable/10/usr.sbin/bhyve/pci_ahci.c	(revision 276348)
+++ stable/10/usr.sbin/bhyve/pci_ahci.c	(revision 276349)
@@ -1,1897 +1,2005 @@
 /*-
  * Copyright (c) 2013  Zhixiang Yu <zcore@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include <sys/ioctl.h>
 #include <sys/disk.h>
 #include <sys/ata.h>
 #include <sys/endian.h>
 
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <strings.h>
 #include <unistd.h>
 #include <assert.h>
 #include <pthread.h>
+#include <pthread_np.h>
 #include <inttypes.h>
 
 #include "bhyverun.h"
 #include "pci_emul.h"
 #include "ahci.h"
 #include "block_if.h"
 
 #define	MAX_PORTS	6	/* Intel ICH8 AHCI supports 6 ports */
 
 #define	PxSIG_ATA	0x00000101 /* ATA drive */
 #define	PxSIG_ATAPI	0xeb140101 /* ATAPI drive */
 
 /*
  * FIS type codes.  ahci_write_fis() accepts only REGD2H, SETDEVBITS and
  * PIOSETUP; any other value is reported as unsupported.
  */
 enum sata_fis_type {
 	FIS_TYPE_REGH2D		= 0x27,	/* Register FIS - host to device */
 	FIS_TYPE_REGD2H		= 0x34,	/* Register FIS - device to host */
 	FIS_TYPE_DMAACT		= 0x39,	/* DMA activate FIS - device to host */
 	FIS_TYPE_DMASETUP	= 0x41,	/* DMA setup FIS - bidirectional */
 	FIS_TYPE_DATA		= 0x46,	/* Data FIS - bidirectional */
 	FIS_TYPE_BIST		= 0x58,	/* BIST activate FIS - bidirectional */
 	FIS_TYPE_PIOSETUP	= 0x5F,	/* PIO setup FIS - device to host */
 	FIS_TYPE_SETDEVBITS	= 0xA1,	/* Set dev bits FIS - device to host */
 };
 
 /*
  * SCSI opcodes
  */
 #define	TEST_UNIT_READY		0x00
 #define	REQUEST_SENSE		0x03
 #define	INQUIRY			0x12
 #define	START_STOP_UNIT		0x1B
 #define	PREVENT_ALLOW		0x1E
 #define	READ_CAPACITY		0x25
 #define	READ_10			0x28
 #define	POSITION_TO_ELEMENT	0x2B
 #define	READ_TOC		0x43
 #define	GET_EVENT_STATUS_NOTIFICATION 0x4A
 #define	MODE_SENSE_10		0x5A
 #define	READ_12			0xA8
 #define	READ_CD			0xBE
 
 /*
  * SCSI mode page codes
  */
 #define	MODEPAGE_RW_ERROR_RECOVERY	0x01
 #define	MODEPAGE_CD_CAPABILITIES	0x2A
 
 /*
  * ATA commands
  */
 #define	ATA_SF_ENAB_SATA_SF		0x10
 #define		ATA_SATA_SF_AN		0x05
 #define	ATA_SF_DIS_SATA_SF		0x90
 
 /*
  * Debug printf
  */
 #ifdef AHCI_DEBUG
 static FILE *dbg;
 #define DPRINTF(format, arg...)	do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0)
 #else
 #define DPRINTF(format, arg...)
 #endif
 #define WPRINTF(format, arg...) printf(format, ##arg)
 
 /* Per-request bookkeeping wrapped around a blockif request. */
 struct ahci_ioreq {
 	struct blockif_req io_req;	/* request handed to blockif */
 	struct ahci_port *io_pr;
-	STAILQ_ENTRY(ahci_ioreq) io_list;
+	STAILQ_ENTRY(ahci_ioreq) io_flist;	/* linkage on the port's iofhd list */
+	TAILQ_ENTRY(ahci_ioreq) io_blist;	/* linkage on the port's iobhd list */
 	uint8_t *cfis;			/* command FIS of the originating command */
 	uint32_t len;			/* total transfer length in bytes */
 	uint32_t done;			/* bytes covered by the iovecs built so far */
 	int slot;			/* command slot number */
 	int prdtl;			/* PRDT entries left over after this request */
 };
 
 /* State of one emulated AHCI port. */
 struct ahci_port {
 	struct blockif_ctxt *bctx;	/* backing store; NULL means no device */
 	struct pci_ahci_softc *pr_sc;	/* owning controller */
 	uint8_t *cmd_lst;		/* command list, indexed by slot */
 	uint8_t *rfis;			/* received-FIS area written by ahci_write_fis() */
 	int atapi;			/* non-zero for an ATAPI device */
 	int reset;
 	int mult_sectors;
 	uint8_t xfermode;
 	uint8_t sense_key;
 	uint8_t asc;
 	uint32_t pending;		/* bitmask of slots with a request in flight */

 	/* NOTE(review): the fields below appear to mirror the port registers — confirm against the register handlers. */
 	uint32_t clb;
 	uint32_t clbu;
 	uint32_t fb;
 	uint32_t fbu;
 	uint32_t is;
 	uint32_t ie;
 	uint32_t cmd;
 	uint32_t unused0;
 	uint32_t tfd;
 	uint32_t sig;
 	uint32_t ssts;
 	uint32_t sctl;
 	uint32_t serr;
 	uint32_t sact;
 	uint32_t ci;
 	uint32_t sntf;
 	uint32_t fbs;

 	/*
 	 * i/o request info
 	 */
 	struct ahci_ioreq *ioreq;
 	int ioqsz;
 	STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd;	/* free requests */
+	TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd;	/* busy (issued) requests */
 };
 
 /*
  * One command-list entry; callers locate it at
  * cmd_lst + slot * AHCI_CL_SIZE.
  */
 struct ahci_cmd_hdr {
 	uint16_t flags;
 	uint16_t prdtl;		/* number of PRDT entries for the command */
 	uint32_t prdbc;		/* bytes transferred; set by write_prdt() */
 	uint64_t ctba;
 	uint32_t reserved[4];
 };
 
 /* One PRDT entry, describing a single guest data buffer. */
 struct ahci_prdt_entry {
 	uint64_t dba;		/* guest address passed to paddr_guest2host() */
 	uint32_t reserved;
 #define	DBCMASK		0x3fffff
 	uint32_t dbc;		/* (dbc & DBCMASK) + 1 is the buffer size in bytes */
 };
 
 /* Controller-wide state of one emulated AHCI HBA. */
 struct pci_ahci_softc {
 	struct pci_devinst *asc_pi;	/* PCI device instance */
 	pthread_mutex_t	mtx;		/* ahci_port_stop() asserts this is held */
 	int ports;			/* number of entries used in port[] */
 	uint32_t cap;
 	uint32_t ghc;
 	uint32_t is;			/* one bit per port with an enabled interrupt pending */
 	uint32_t pi;
 	uint32_t vs;
 	uint32_t ccc_ctl;
 	uint32_t ccc_pts;
 	uint32_t em_loc;
 	uint32_t em_ctl;
 	uint32_t cap2;
 	uint32_t bohc;
 	uint32_t lintr;			/* non-zero while the legacy interrupt is asserted */
 	struct ahci_port port[MAX_PORTS];
 };
 #define	ahci_ctx(sc)	((sc)->asc_pi->pi_vmctx)
 
 /*
  * Convert a logical block address into minute/second/frame form.
  * buf[0..2] receive minutes, seconds and frames; 150 is added to the lba
  * first and there are 75 frames per second.
  */
 static inline void lba_to_msf(uint8_t *buf, int lba)
 {
 	int frames, seconds;

 	frames = lba + 150;
 	seconds = frames / 75;

 	buf[0] = seconds / 60;
 	buf[1] = seconds % 60;
 	buf[2] = frames % 75;
 }
 
 /*
  * Recompute the controller interrupt status from the ports and raise or
  * lower the HBA interrupt accordingly.
  *
  * A port contributes its bit to sc->is when it has an enabled interrupt
  * pending.  With interrupts globally enabled and sc->is non-zero, an MSI is
  * sent on every call, or the pin-based interrupt is asserted if it was not
  * already.  Otherwise an asserted pin-based interrupt is deasserted.
  */
 static void
 ahci_generate_intr(struct pci_ahci_softc *sc)
 {
 	struct pci_devinst *pi;
 	struct ahci_port *port;
 	int n;

 	pi = sc->asc_pi;

 	for (n = 0; n < sc->ports; n++) {
 		port = &sc->port[n];
 		if ((port->is & port->ie) != 0)
 			sc->is |= (1 << n);
 	}

 	DPRINTF("%s %x\n", __func__, sc->is);

 	if (sc->is != 0 && (sc->ghc & AHCI_GHC_IE) != 0) {
 		if (pci_msi_enabled(pi)) {
 			/* MSI: generate an interrupt on every edge. */
 			pci_generate_msi(pi, 0);
 		} else if (!sc->lintr) {
 			/* Pin-based: assert only if not already in progress. */
 			sc->lintr = 1;
 			pci_lintr_assert(pi);
 		}
 		return;
 	}

 	if (sc->lintr) {
 		/* Nothing pending: drop the pin-based signal. */
 		pci_lintr_deassert(pi);
 		sc->lintr = 0;
 	}
 }
 
 /*
  * Copy a FIS into the port's received-FIS area and raise the matching
  * port interrupt, if any.
  *
  * Nothing happens when the port has no received-FIS area or FIS receive is
  * not enabled in p->cmd.  Offsets and lengths by type:
  *   REGD2H      0x40, 20 bytes, AHCI_P_IX_DHR
  *   SETDEVBITS  0x58,  8 bytes, AHCI_P_IX_SDB
  *   PIOSETUP    0x20, 20 bytes, no interrupt
  * Other types are logged and ignored.
  */
 static void
 ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis)
 {
 	int rfis_off, fis_len, intr_bit;

 	if (p->rfis == NULL || (p->cmd & AHCI_P_CMD_FRE) == 0)
 		return;

 	if (ft == FIS_TYPE_REGD2H) {
 		rfis_off = 0x40;
 		fis_len = 20;
 		intr_bit = AHCI_P_IX_DHR;
 	} else if (ft == FIS_TYPE_SETDEVBITS) {
 		rfis_off = 0x58;
 		fis_len = 8;
 		intr_bit = AHCI_P_IX_SDB;
 	} else if (ft == FIS_TYPE_PIOSETUP) {
 		rfis_off = 0x20;
 		fis_len = 20;
 		intr_bit = 0;
 	} else {
 		WPRINTF("unsupported fis type %d\n", ft);
 		return;
 	}

 	memcpy(p->rfis + rfis_off, fis, fis_len);
 	if (intr_bit != 0) {
 		p->is |= intr_bit;
 		ahci_generate_intr(p->pr_sc);
 	}
 }
 
 /* Post a PIO setup FIS whose only non-zero byte is the FIS type. */
 static void
 ahci_write_fis_piosetup(struct ahci_port *p)
 {
 	uint8_t pio_fis[20] = { [0] = FIS_TYPE_PIOSETUP };

 	ahci_write_fis(p, FIS_TYPE_PIOSETUP, pio_fis);
 }
 
 /*
  * Build and post a Set Device Bits FIS for 'slot'.
  *
  * tfd - task file data: status in bits 7:0, error in bits 15:8.
  *
  * FIS layout: byte 0 is the FIS type, byte 2 the status (masked with 0x77),
  * byte 3 the error, and bytes 4-7 a 32-bit mask with the slot's bit set.
  * The mask is stored with memcpy() because fis[] is a byte array; a direct
  * store through a uint32_t pointer is not guaranteed to be aligned.  An
  * unsigned shift is used so slot 31 is well defined.
  *
  * Sets AHCI_P_IX_TFE when the status carries ATA_S_ERROR and always
  * updates p->tfd.
  */
 static void
 ahci_write_fis_sdb(struct ahci_port *p, int slot, uint32_t tfd)
 {
 	uint8_t fis[8];
 	uint8_t error;
 	uint32_t slotmask;

 	error = (tfd >> 8) & 0xff;
 	memset(fis, 0, sizeof(fis));
 	fis[0] = FIS_TYPE_SETDEVBITS;
 	fis[2] = tfd & 0x77;
 	fis[3] = error;
 	slotmask = (uint32_t)1 << slot;
 	memcpy(fis + 4, &slotmask, sizeof(slotmask));
 	if (fis[2] & ATA_S_ERROR)
 		p->is |= AHCI_P_IX_TFE;
 	p->tfd = tfd;
 	ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis);
 }
 
 /*
  * Build and post a device-to-host register FIS completing 'slot'.
  *
  * Bytes 4-13 are echoed from the command FIS.  When the status in tfd
  * carries ATA_S_ERROR the port's TFE interrupt bit is set; otherwise the
  * slot's bit is cleared from p->ci.  p->tfd is always updated.
  */
 static void
 ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
 {
 	uint8_t fis[20] = { 0 };

 	fis[0] = FIS_TYPE_REGD2H;
 	fis[1] = (1 << 6);
 	fis[2] = tfd & 0xff;
 	fis[3] = (tfd >> 8) & 0xff;
 	/* Echo command FIS bytes 4..13 unchanged. */
 	memcpy(&fis[4], &cfis[4], 10);

 	if ((fis[2] & ATA_S_ERROR) == 0)
 		p->ci &= ~(1 << slot);
 	else
 		p->is |= AHCI_P_IX_TFE;

 	p->tfd = tfd;
 	ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
 }
 
 /*
  * Post the device-to-host register FIS sent after a port reset.  Bytes 3, 4
  * and 12 are set to 1; an ATAPI device additionally reports 0x14/0xeb in
  * bytes 5 and 6.
  */
 static void
 ahci_write_reset_fis_d2h(struct ahci_port *p)
 {
 	uint8_t reset_fis[20] = { 0 };

 	reset_fis[0] = FIS_TYPE_REGD2H;
 	reset_fis[3] = 1;
 	reset_fis[4] = 1;
 	reset_fis[12] = 1;
 	if (p->atapi) {
 		reset_fis[5] = 0x14;
 		reset_fis[6] = 0xeb;
 	}
 	ahci_write_fis(p, FIS_TYPE_REGD2H, reset_fis);
 }
 
 static void
+ahci_check_stopped(struct ahci_port *p)
+{
+	/*
+	 * The running bit (and current-slot field) may only be cleared once
+	 * command list processing has been switched off and no command is
+	 * still in flight.
+	 */
+	if ((p->cmd & AHCI_P_CMD_ST) != 0 || p->pending != 0)
+		return;
+
+	p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK);
+}
+
+/*
+ * Cancel every busy request that blockif has not started yet and return it
+ * to the free list, then clear the running bit if nothing is left in
+ * flight.  Requests that blockif_cancel() refuses stay on the busy list.
+ *
+ * The caller must hold the controller mutex.
+ */
+static void
+ahci_port_stop(struct ahci_port *p)
+{
+	struct ahci_ioreq *aior, *aior_next;
+	uint8_t *cfis;
+	int slot;
+	int ncq;
+	int error;
+
+	assert(pthread_mutex_isowned_np(&p->pr_sc->mtx));
+
+	/*
+	 * Entries are unlinked inside the loop, so the successor has to be
+	 * fetched before the body runs.
+	 */
+	TAILQ_FOREACH_SAFE(aior, &p->iobhd, io_blist, aior_next) {
+		/*
+		 * Try to cancel the outstanding blockif request.
+		 */
+		error = blockif_cancel(p->bctx, &aior->io_req);
+		if (error != 0)
+			continue;
+
+		slot = aior->slot;
+		cfis = aior->cfis;
+
+		/* Decide per request: must not carry over between entries. */
+		ncq = (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
+		    cfis[2] == ATA_READ_FPDMA_QUEUED);
+
+		if (ncq)
+			p->sact &= ~(1 << slot);
+		else
+			p->ci &= ~(1 << slot);
+
+		/*
+		 * This command is now done.
+		 */
+		p->pending &= ~(1 << slot);
+
+		/*
+		 * Delete the blockif request from the busy list
+		 */
+		TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+		/*
+		 * Move the blockif request back to the free list
+		 */
+		STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
+	}
+
+	ahci_check_stopped(p);
+}
+
+static void
 ahci_port_reset(struct ahci_port *pr)
 {
 	/*
 	 * Return the port to its post-reset state.  Without a backing store
 	 * the port reports "no device"; otherwise it reports an online
 	 * device with the ATA or ATAPI signature and posts the reset FIS.
 	 */
 	pr->sctl = 0;
 	pr->serr = 0;
 	pr->sact = 0;
 	pr->xfermode = ATA_UDMA6;
 	pr->mult_sectors = 128;

 	if (pr->bctx == NULL) {
 		pr->ssts = ATA_SS_DET_NO_DEVICE;
 		pr->sig = 0xFFFFFFFF;
 		pr->tfd = 0x7F;
 		return;
 	}

 	pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_SPD_GEN2 |
 		ATA_SS_IPM_ACTIVE;
 	pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA;
 	if (pr->atapi) {
 		pr->sig = PxSIG_ATAPI;
 	} else {
 		pr->sig = PxSIG_ATA;
 		pr->tfd |= ATA_S_READY;
 	}
 	ahci_write_reset_fis_d2h(pr);
 }
 
 static void
 ahci_reset(struct pci_ahci_softc *sc)
 {
 	int i;
 
 	sc->ghc = AHCI_GHC_AE;
 	sc->is = 0;
 
 	if (sc->lintr) {
 		pci_lintr_deassert(sc->asc_pi);
 		sc->lintr = 0;
 	}
 
 	for (i = 0; i < sc->ports; i++) {
 		sc->port[i].ie = 0;
 		sc->port[i].is = 0;
 		ahci_port_reset(&sc->port[i]);
 	}
 }
 
 /*
  * Copy 'src' into a 'len'-byte field with each pair of bytes swapped
  * (index i is written to i ^ 1), padding with spaces once the source
  * string is exhausted.  No terminating NUL is written.
  */
 static void
 ata_string(uint8_t *dest, const char *src, int len)
 {
 	int pos;

 	for (pos = 0; pos < len; pos++)
 		dest[pos ^ 1] = (*src != '\0') ? *src++ : ' ';
 }
 
 /*
  * Copy 'src' into a 'len'-byte field in natural byte order, padding with
  * spaces once the source string is exhausted.  No terminating NUL is
  * written.
  */
 static void
 atapi_string(uint8_t *dest, const char *src, int len)
 {
 	uint8_t *out, *end;

 	end = dest + len;
 	for (out = dest; out < end; out++)
 		*out = (*src != '\0') ? *src++ : ' ';
 }
 
 /*
  * Issue (part of) a DMA read or write command to the block backend.
  *
  * p    - port the command was issued on.
  * slot - command slot number.
  * cfis - command FIS; the PRDT is taken to start at cfis + 0x80.
  * done - bytes of the command already transferred by earlier requests.
  * seek - number of PRDT entries to skip.
  *
  * The command byte cfis[2] selects write vs. read and how the LBA and
  * sector count are decoded (NCQ, 48-bit, or 28-bit form).  At most
  * BLOCKIF_IOV_MAX PRDT entries are turned into iovecs per request; the
  * number of entries left over is recorded in aior->prdtl.
  */
 static void
 ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done,
     int seek)
 {
 	struct ahci_ioreq *aior;
 	struct blockif_req *breq;
 	struct pci_ahci_softc *sc;
 	struct ahci_prdt_entry *prdt;
 	struct ahci_cmd_hdr *hdr;
 	uint64_t lba;
 	uint32_t len;
 	int i, err, iovcnt, ncq, readop;

 	sc = p->pr_sc;
 	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	ncq = 0;
 	readop = 1;

 	prdt += seek;
 	if (cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 ||
 			cfis[2] == ATA_WRITE_FPDMA_QUEUED)
 		readop = 0;

 	if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
 			cfis[2] == ATA_READ_FPDMA_QUEUED) {
 		/* NCQ: 48-bit LBA, sector count in cfis[11]:cfis[3]; 0 means 65536. */
 		lba = ((uint64_t)cfis[10] << 40) |
 			((uint64_t)cfis[9] << 32) |
 			((uint64_t)cfis[8] << 24) |
 			((uint64_t)cfis[6] << 16) |
 			((uint64_t)cfis[5] << 8) |
 			cfis[4];
 		len = cfis[11] << 8 | cfis[3];
 		if (!len)
 			len = 65536;
 		ncq = 1;
 	} else if (cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) {
 		/* 48-bit: sector count in cfis[13]:cfis[12]; 0 means 65536. */
 		lba = ((uint64_t)cfis[10] << 40) |
 			((uint64_t)cfis[9] << 32) |
 			((uint64_t)cfis[8] << 24) |
 			((uint64_t)cfis[6] << 16) |
 			((uint64_t)cfis[5] << 8) |
 			cfis[4];
 		len = cfis[13] << 8 | cfis[12];
 		if (!len)
 			len = 65536;
 	} else {
 		/* 28-bit: sector count in cfis[12]; 0 means 256. */
 		lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) |
 			(cfis[5] << 8) | cfis[4];
 		len = cfis[12];
 		if (!len)
 			len = 256;
 	}
 	/* Convert sector units to byte units. */
 	lba *= blockif_sectsz(p->bctx);
 	len *= blockif_sectsz(p->bctx);

 	/*
 	 * Pull request off free list
 	 */
 	aior = STAILQ_FIRST(&p->iofhd);
 	assert(aior != NULL);
-	STAILQ_REMOVE_HEAD(&p->iofhd, io_list);
+	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
 	aior->cfis = cfis;
 	aior->slot = slot;
 	aior->len = len;
 	aior->done = done;
 	breq = &aior->io_req;
 	breq->br_offset = lba + done;
 	iovcnt = hdr->prdtl - seek;
 	if (iovcnt > BLOCKIF_IOV_MAX) {
 		aior->prdtl = iovcnt - BLOCKIF_IOV_MAX;
 		iovcnt = BLOCKIF_IOV_MAX;
-		/*
-		 * Mark this command in-flight.
-		 */
-		p->pending |= 1 << slot;
 	} else
 		aior->prdtl = 0;
 	breq->br_iovcnt = iovcnt;
 
 	/*
+	 * Mark this command in-flight.
+	 */
+	p->pending |= 1 << slot;
+
+	/*
+	 * Stuff request onto busy list
+	 */
+	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+	/*
 	 * Build up the iovec based on the prdt
 	 */
 	for (i = 0; i < iovcnt; i++) {
 		uint32_t dbcsz;

 		dbcsz = (prdt->dbc & DBCMASK) + 1;
 		breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc),
 		    prdt->dba, dbcsz);
 		breq->br_iov[i].iov_len = dbcsz;
 		aior->done += dbcsz;
 		prdt++;
 	}
 	if (readop)
 		err = blockif_read(p->bctx, breq);
 	else
 		err = blockif_write(p->bctx, breq);
 	assert(err == 0);

 	/* For NCQ commands the slot's issue bit is cleared once queued. */
 	if (ncq)
 		p->ci &= ~(1 << slot);
 }
 
 /*
  * Issue a flush for the command in 'slot' to the block backend.  The
  * request carries no data (len, done and prdtl are all zero).
  */
 static void
 ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	struct ahci_ioreq *aior;
 	struct blockif_req *breq;
 	int err;

 	/*
 	 * Pull request off free list
 	 */
 	aior = STAILQ_FIRST(&p->iofhd);
 	assert(aior != NULL);
-	STAILQ_REMOVE_HEAD(&p->iofhd, io_list);
+	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
 	aior->cfis = cfis;
 	aior->slot = slot;
 	aior->len = 0;
 	aior->done = 0;
 	aior->prdtl = 0;
 	breq = &aior->io_req;
 
+	/*
+	 * Mark this command in-flight.
+	 */
+	p->pending |= 1 << slot;
+
+	/*
+	 * Stuff request onto busy list
+	 */
+	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
 	err = blockif_flush(p->bctx, breq);
 	assert(err == 0);
 }
 
 /*
  * Scatter 'size' bytes from 'buf' into the guest buffers described by the
  * PRDT of the command in 'slot' (taken to start at cfis + 0x80).
  *
  * Copying stops when the data or the PRDT entries run out; the number of
  * bytes actually copied is stored in the command header's prdbc field.
  */
 static inline void
 write_prdt(struct ahci_port *p, int slot, uint8_t *cfis,
 		void *buf, int size)
 {
 	struct ahci_cmd_hdr *hdr;
 	struct ahci_prdt_entry *prdt;
 	void *src;
 	int n, remaining;

 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
 	src = buf;
 	remaining = size;

 	for (n = 0; n < hdr->prdtl && remaining; n++, prdt++) {
 		uint8_t *dst;
 		uint32_t dbcsz;
 		int chunk;

 		dbcsz = (prdt->dbc & DBCMASK) + 1;
 		dst = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz);
 		chunk = remaining < dbcsz ? remaining : dbcsz;
 		memcpy(dst, src, chunk);
 		remaining -= chunk;
 		src += chunk;
 	}
 	hdr->prdbc = size - remaining;
 }
 
 static void
 handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	struct ahci_cmd_hdr *hdr;
 
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	if (p->atapi || hdr->prdtl == 0) {
 		p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
 		p->is |= AHCI_P_IX_TFE;
 	} else {
 		uint16_t buf[256];
 		uint64_t sectors;
 		uint16_t cyl;
 		uint8_t sech, heads;
 
 		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
 		blockif_chs(p->bctx, &cyl, &heads, &sech);
 		memset(buf, 0, sizeof(buf));
 		buf[0] = 0x0040;
 		buf[1] = cyl;
 		buf[3] = heads;
 		buf[6] = sech;
 		/* TODO emulate different serial? */
 		ata_string((uint8_t *)(buf+10), "123456", 20);
 		ata_string((uint8_t *)(buf+23), "001", 8);
 		ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40);
 		buf[47] = (0x8000 | 128);
 		buf[48] = 0x1;
 		buf[49] = (1 << 8 | 1 << 9 | 1 << 11);
 		buf[50] = (1 << 14);
 		buf[53] = (1 << 1 | 1 << 2);
 		if (p->mult_sectors)
 			buf[59] = (0x100 | p->mult_sectors);
 		buf[60] = sectors;
 		buf[61] = (sectors >> 16);
 		buf[63] = 0x7;
 		if (p->xfermode & ATA_WDMA0)
 			buf[63] |= (1 << ((p->xfermode & 7) + 8));
 		buf[64] = 0x3;
 		buf[65] = 100;
 		buf[66] = 100;
 		buf[67] = 100;
 		buf[68] = 100;
 		buf[75] = 31;
 		buf[76] = (1 << 8 | 1 << 2);
 		buf[80] = 0x1f0;
 		buf[81] = 0x28;
 		buf[82] = (1 << 5 | 1 << 14);
 		buf[83] = (1 << 10 | 1 << 12 | 1 << 13 | 1 << 14);
 		buf[84] = (1 << 14);
 		buf[85] = (1 << 5 | 1 << 14);
 		buf[86] = (1 << 10 | 1 << 12 | 1 << 13);
 		buf[87] = (1 << 14);
 		buf[88] = 0x7f;
 		if (p->xfermode & ATA_UDMA0)
 			buf[88] |= (1 << ((p->xfermode & 7) + 8));
 		buf[93] = (1 | 1 <<14);
 		buf[100] = sectors;
 		buf[101] = (sectors >> 16);
 		buf[102] = (sectors >> 32);
 		buf[103] = (sectors >> 48);
 		ahci_write_fis_piosetup(p);
 		write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
 		p->tfd = ATA_S_DSC | ATA_S_READY;
 		p->is |= AHCI_P_IX_DP;
 		p->ci &= ~(1 << slot);
 	}
 	ahci_generate_intr(p->pr_sc);
 }
 
 static void
 handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	if (!p->atapi) {
 		p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
 		p->is |= AHCI_P_IX_TFE;
 	} else {
 		uint16_t buf[256];
 
 		memset(buf, 0, sizeof(buf));
 		buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5);
 		/* TODO emulate different serial? */
 		ata_string((uint8_t *)(buf+10), "123456", 20);
 		ata_string((uint8_t *)(buf+23), "001", 8);
 		ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40);
 		buf[49] = (1 << 9 | 1 << 8);
 		buf[50] = (1 << 14 | 1);
 		buf[53] = (1 << 2 | 1 << 1);
 		buf[62] = 0x3f;
 		buf[63] = 7;
 		buf[64] = 3;
 		buf[65] = 100;
 		buf[66] = 100;
 		buf[67] = 100;
 		buf[68] = 100;
 		buf[76] = (1 << 2 | 1 << 1);
 		buf[78] = (1 << 5);
 		buf[80] = (0x1f << 4);
 		buf[82] = (1 << 4);
 		buf[83] = (1 << 14);
 		buf[84] = (1 << 14);
 		buf[85] = (1 << 4);
 		buf[87] = (1 << 14);
 		buf[88] = (1 << 14 | 0x7f);
 		ahci_write_fis_piosetup(p);
 		write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
 		p->tfd = ATA_S_DSC | ATA_S_READY;
 		p->is |= AHCI_P_IX_DHR;
 		p->ci &= ~(1 << slot);
 	}
 	ahci_generate_intr(p->pr_sc);
 }
 
 static void
 atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t buf[36];
 	uint8_t *acmd;
 	int len;
 
 	acmd = cfis + 0x40;
 
 	buf[0] = 0x05;
 	buf[1] = 0x80;
 	buf[2] = 0x00;
 	buf[3] = 0x21;
 	buf[4] = 31;
 	buf[5] = 0;
 	buf[6] = 0;
 	buf[7] = 0;
 	atapi_string(buf + 8, "BHYVE", 8);
 	atapi_string(buf + 16, "BHYVE DVD-ROM", 16);
 	atapi_string(buf + 32, "001", 4);
 
 	len = sizeof(buf);
 	if (len > acmd[4])
 		len = acmd[4];
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	write_prdt(p, slot, cfis, buf, len);
 	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 }
 
 static void
 atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t buf[8];
 	uint64_t sectors;
 
 	sectors = blockif_size(p->bctx) / 2048;
 	be32enc(buf, sectors - 1);
 	be32enc(buf + 4, 2048);
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	write_prdt(p, slot, cfis, buf, sizeof(buf));
 	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 }
 
 static void
 atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t *acmd;
 	uint8_t format;
 	int len;
 
 	acmd = cfis + 0x40;
 
 	len = be16dec(acmd + 7);
 	format = acmd[9] >> 6;
 	switch (format) {
 	case 0:
 	{
 		int msf, size;
 		uint64_t sectors;
 		uint8_t start_track, buf[20], *bp;
 
 		msf = (acmd[1] >> 1) & 1;
 		start_track = acmd[6];
 		if (start_track > 1 && start_track != 0xaa) {
 			uint32_t tfd;
 			p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 			p->asc = 0x24;
 			tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 			cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 			ahci_write_fis_d2h(p, slot, cfis, tfd);
 			return;
 		}
 		bp = buf + 2;
 		*bp++ = 1;
 		*bp++ = 1;
 		if (start_track <= 1) {
 			*bp++ = 0;
 			*bp++ = 0x14;
 			*bp++ = 1;
 			*bp++ = 0;
 			if (msf) {
 				*bp++ = 0;
 				lba_to_msf(bp, 0);
 				bp += 3;
 			} else {
 				*bp++ = 0;
 				*bp++ = 0;
 				*bp++ = 0;
 				*bp++ = 0;
 			}
 		}
 		*bp++ = 0;
 		*bp++ = 0x14;
 		*bp++ = 0xaa;
 		*bp++ = 0;
 		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
 		sectors >>= 2;
 		if (msf) {
 			*bp++ = 0;
 			lba_to_msf(bp, sectors);
 			bp += 3;
 		} else {
 			be32enc(bp, sectors);
 			bp += 4;
 		}
 		size = bp - buf;
 		be16enc(buf, size - 2);
 		if (len > size)
 			len = size;
 		write_prdt(p, slot, cfis, buf, len);
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	}
 	case 1:
 	{
 		uint8_t buf[12];
 
 		memset(buf, 0, sizeof(buf));
 		buf[1] = 0xa;
 		buf[2] = 0x1;
 		buf[3] = 0x1;
 		if (len > sizeof(buf))
 			len = sizeof(buf);
 		write_prdt(p, slot, cfis, buf, len);
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	}
 	case 2:
 	{
 		int msf, size;
 		uint64_t sectors;
 		uint8_t start_track, *bp, buf[50];
 
 		msf = (acmd[1] >> 1) & 1;
 		start_track = acmd[6];
 		bp = buf + 2;
 		*bp++ = 1;
 		*bp++ = 1;
 
 		*bp++ = 1;
 		*bp++ = 0x14;
 		*bp++ = 0;
 		*bp++ = 0xa0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 1;
 		*bp++ = 0;
 		*bp++ = 0;
 
 		*bp++ = 1;
 		*bp++ = 0x14;
 		*bp++ = 0;
 		*bp++ = 0xa1;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 1;
 		*bp++ = 0;
 		*bp++ = 0;
 
 		*bp++ = 1;
 		*bp++ = 0x14;
 		*bp++ = 0;
 		*bp++ = 0xa2;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
 		sectors >>= 2;
 		if (msf) {
 			*bp++ = 0;
 			lba_to_msf(bp, sectors);
 			bp += 3;
 		} else {
 			be32enc(bp, sectors);
 			bp += 4;
 		}
 
 		*bp++ = 1;
 		*bp++ = 0x14;
 		*bp++ = 0;
 		*bp++ = 1;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		if (msf) {
 			*bp++ = 0;
 			lba_to_msf(bp, 0);
 			bp += 3;
 		} else {
 			*bp++ = 0;
 			*bp++ = 0;
 			*bp++ = 0;
 			*bp++ = 0;
 		}
 
 		size = bp - buf;
 		be16enc(buf, size - 2);
 		if (len > size)
 			len = size;
 		write_prdt(p, slot, cfis, buf, len);
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	}
 	default:
 	{
 		uint32_t tfd;
 
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x24;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, tfd);
 		break;
 	}
 	}
 }
 
 static void
 atapi_read(struct ahci_port *p, int slot, uint8_t *cfis,
 		uint32_t done, int seek)
 {
 	struct ahci_ioreq *aior;
 	struct ahci_cmd_hdr *hdr;
 	struct ahci_prdt_entry *prdt;
 	struct blockif_req *breq;
 	struct pci_ahci_softc *sc;
 	uint8_t *acmd;
 	uint64_t lba;
 	uint32_t len;
 	int i, err, iovcnt;
 
 	sc = p->pr_sc;
 	acmd = cfis + 0x40;
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
 
 	prdt += seek;
 	lba = be32dec(acmd + 2);
 	if (acmd[0] == READ_10)
 		len = be16dec(acmd + 7);
 	else
 		len = be32dec(acmd + 6);
 	if (len == 0) {
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 	}
 	lba *= 2048;
 	len *= 2048;
 
 	/*
 	 * Pull request off free list
 	 */
 	aior = STAILQ_FIRST(&p->iofhd);
 	assert(aior != NULL);
-	STAILQ_REMOVE_HEAD(&p->iofhd, io_list);
+	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
 	aior->cfis = cfis;
 	aior->slot = slot;
 	aior->len = len;
 	aior->done = done;
 	breq = &aior->io_req;
 	breq->br_offset = lba + done;
 	iovcnt = hdr->prdtl - seek;
 	if (iovcnt > BLOCKIF_IOV_MAX) {
 		aior->prdtl = iovcnt - BLOCKIF_IOV_MAX;
 		iovcnt = BLOCKIF_IOV_MAX;
 	} else
 		aior->prdtl = 0;
 	breq->br_iovcnt = iovcnt;
 
 	/*
+	 * Mark this command in-flight.
+	 */
+	p->pending |= 1 << slot;
+
+	/*
+	 * Stuff request onto busy list
+	 */
+	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+	/*
 	 * Build up the iovec based on the prdt
 	 */
 	for (i = 0; i < iovcnt; i++) {
 		uint32_t dbcsz;
 
 		dbcsz = (prdt->dbc & DBCMASK) + 1;
 		breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc),
 		    prdt->dba, dbcsz);
 		breq->br_iov[i].iov_len = dbcsz;
 		aior->done += dbcsz;
 		prdt++;
 	}
 	err = blockif_read(p->bctx, breq);
 	assert(err == 0);
 }
 
 static void
 atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t buf[64];
 	uint8_t *acmd;
 	int len;
 
 	acmd = cfis + 0x40;
 	len = acmd[4];
 	if (len > sizeof(buf))
 		len = sizeof(buf);
 	memset(buf, 0, len);
 	buf[0] = 0x70 | (1 << 7);
 	buf[2] = p->sense_key;
 	buf[7] = 10;
 	buf[12] = p->asc;
 	write_prdt(p, slot, cfis, buf, len);
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 }
 
 static void
 atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t *acmd = cfis + 0x40;
 	uint32_t tfd;
 
 	switch (acmd[4] & 3) {
 	case 0:
 	case 1:
 	case 3:
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		tfd = ATA_S_READY | ATA_S_DSC;
 		break;
 	case 2:
 		/* TODO eject media */
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x53;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 		break;
 	}
 	ahci_write_fis_d2h(p, slot, cfis, tfd);
 }
 
 static void
 atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t *acmd;
 	uint32_t tfd;
 	uint8_t pc, code;
 	int len;
 
 	acmd = cfis + 0x40;
 	len = be16dec(acmd + 7);
 	pc = acmd[2] >> 6;
 	code = acmd[2] & 0x3f;
 
 	switch (pc) {
 	case 0:
 		switch (code) {
 		case MODEPAGE_RW_ERROR_RECOVERY:
 		{
 			uint8_t buf[16];
 
 			if (len > sizeof(buf))
 				len = sizeof(buf);
 
 			memset(buf, 0, sizeof(buf));
 			be16enc(buf, 16 - 2);
 			buf[2] = 0x70;
 			buf[8] = 0x01;
 			buf[9] = 16 - 10;
 			buf[11] = 0x05;
 			write_prdt(p, slot, cfis, buf, len);
 			tfd = ATA_S_READY | ATA_S_DSC;
 			break;
 		}
 		case MODEPAGE_CD_CAPABILITIES:
 		{
 			uint8_t buf[30];
 
 			if (len > sizeof(buf))
 				len = sizeof(buf);
 
 			memset(buf, 0, sizeof(buf));
 			be16enc(buf, 30 - 2);
 			buf[2] = 0x70;
 			buf[8] = 0x2A;
 			buf[9] = 30 - 10;
 			buf[10] = 0x08;
 			buf[12] = 0x71;
 			be16enc(&buf[18], 2);
 			be16enc(&buf[20], 512);
 			write_prdt(p, slot, cfis, buf, len);
 			tfd = ATA_S_READY | ATA_S_DSC;
 			break;
 		}
 		default:
 			goto error;
 			break;
 		}
 		break;
 	case 3:
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x39;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 		break;
 error:
 	case 1:
 	case 2:
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x24;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 		break;
 	}
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	ahci_write_fis_d2h(p, slot, cfis, tfd);
 }
 
 static void
 atapi_get_event_status_notification(struct ahci_port *p, int slot,
     uint8_t *cfis)
 {
 	uint8_t *acmd;
 	uint32_t tfd;
 
 	acmd = cfis + 0x40;
 
 	/* we don't support asynchronous operation */
 	if (!(acmd[1] & 1)) {
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x24;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 	} else {
 		uint8_t buf[8];
 		int len;
 
 		len = be16dec(acmd + 7);
 		if (len > sizeof(buf))
 			len = sizeof(buf);
 
 		memset(buf, 0, sizeof(buf));
 		be16enc(buf, 8 - 2);
 		buf[2] = 0x04;
 		buf[3] = 0x10;
 		buf[5] = 0x02;
 		write_prdt(p, slot, cfis, buf, len);
 		tfd = ATA_S_READY | ATA_S_DSC;
 	}
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	ahci_write_fis_d2h(p, slot, cfis, tfd);
 }
 
 static void
 handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t *acmd;
 
 	acmd = cfis + 0x40;
 
 #ifdef AHCI_DEBUG
 	{
 		int i;
 		DPRINTF("ACMD:");
 		for (i = 0; i < 16; i++)
 			DPRINTF("%02x ", acmd[i]);
 		DPRINTF("\n");
 	}
 #endif
 
 	switch (acmd[0]) {
 	case TEST_UNIT_READY:
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	case INQUIRY:
 		atapi_inquiry(p, slot, cfis);
 		break;
 	case READ_CAPACITY:
 		atapi_read_capacity(p, slot, cfis);
 		break;
 	case PREVENT_ALLOW:
 		/* TODO */
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	case READ_TOC:
 		atapi_read_toc(p, slot, cfis);
 		break;
 	case READ_10:
 	case READ_12:
 		atapi_read(p, slot, cfis, 0, 0);
 		break;
 	case REQUEST_SENSE:
 		atapi_request_sense(p, slot, cfis);
 		break;
 	case START_STOP_UNIT:
 		atapi_start_stop_unit(p, slot, cfis);
 		break;
 	case MODE_SENSE_10:
 		atapi_mode_sense(p, slot, cfis);
 		break;
 	case GET_EVENT_STATUS_NOTIFICATION:
 		atapi_get_event_status_notification(p, slot, cfis);
 		break;
 	default:
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x20;
 		ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) |
 				ATA_S_READY | ATA_S_ERROR);
 		break;
 	}
 }
 
 static void
 ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 
 	switch (cfis[2]) {
 	case ATA_ATA_IDENTIFY:
 		handle_identify(p, slot, cfis);
 		break;
 	case ATA_SETFEATURES:
 	{
 		switch (cfis[3]) {
 		case ATA_SF_ENAB_SATA_SF:
 			switch (cfis[12]) {
 			case ATA_SATA_SF_AN:
 				p->tfd = ATA_S_DSC | ATA_S_READY;
 				break;
 			default:
 				p->tfd = ATA_S_ERROR | ATA_S_READY;
 				p->tfd |= (ATA_ERROR_ABORT << 8);
 				break;
 			}
 			break;
 		case ATA_SF_ENAB_WCACHE:
 		case ATA_SF_DIS_WCACHE:
 		case ATA_SF_ENAB_RCACHE:
 		case ATA_SF_DIS_RCACHE:
 			p->tfd = ATA_S_DSC | ATA_S_READY;
 			break;
 		case ATA_SF_SETXFER:
 		{
 			switch (cfis[12] & 0xf8) {
 			case ATA_PIO:
 			case ATA_PIO0:
 				break;
 			case ATA_WDMA0:
 			case ATA_UDMA0:
 				p->xfermode = (cfis[12] & 0x7);
 				break;
 			}
 			p->tfd = ATA_S_DSC | ATA_S_READY;
 			break;
 		}
 		default:
 			p->tfd = ATA_S_ERROR | ATA_S_READY;
 			p->tfd |= (ATA_ERROR_ABORT << 8);
 			break;
 		}
 		ahci_write_fis_d2h(p, slot, cfis, p->tfd);
 		break;
 	}
 	case ATA_SET_MULTI:
 		if (cfis[12] != 0 &&
 			(cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) {
 			p->tfd = ATA_S_ERROR | ATA_S_READY;
 			p->tfd |= (ATA_ERROR_ABORT << 8);
 		} else {
 			p->mult_sectors = cfis[12];
 			p->tfd = ATA_S_DSC | ATA_S_READY;
 		}
 		p->is |= AHCI_P_IX_DP;
 		p->ci &= ~(1 << slot);
 		ahci_generate_intr(p->pr_sc);
 		break;
 	case ATA_READ_DMA:
 	case ATA_WRITE_DMA:
 	case ATA_READ_DMA48:
 	case ATA_WRITE_DMA48:
 	case ATA_READ_FPDMA_QUEUED:
 	case ATA_WRITE_FPDMA_QUEUED:
 		ahci_handle_dma(p, slot, cfis, 0, 0);
 		break;
 	case ATA_FLUSHCACHE:
 	case ATA_FLUSHCACHE48:
 		ahci_handle_flush(p, slot, cfis);
 		break;
 	case ATA_STANDBY_CMD:
 		break;
 	case ATA_NOP:
 	case ATA_STANDBY_IMMEDIATE:
 	case ATA_IDLE_IMMEDIATE:
 	case ATA_SLEEP:
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	case ATA_ATAPI_IDENTIFY:
 		handle_atapi_identify(p, slot, cfis);
 		break;
 	case ATA_PACKET_CMD:
 		if (!p->atapi) {
 			p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
 			p->is |= AHCI_P_IX_TFE;
 			ahci_generate_intr(p->pr_sc);
 		} else
 			handle_packet_cmd(p, slot, cfis);
 		break;
 	default:
 		WPRINTF("Unsupported cmd:%02x\n", cfis[2]);
 		p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
 		p->is |= AHCI_P_IX_TFE;
 		ahci_generate_intr(p->pr_sc);
 		break;
 	}
 }
 
 static void
 ahci_handle_slot(struct ahci_port *p, int slot)
 {
 	struct ahci_cmd_hdr *hdr;
 	struct ahci_prdt_entry *prdt;
 	struct pci_ahci_softc *sc;
 	uint8_t *cfis;
 	int cfl;
 
 	sc = p->pr_sc;
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	cfl = (hdr->flags & 0x1f) * 4;
 	cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba,
 			0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry));
 	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
 
 #ifdef AHCI_DEBUG
 	DPRINTF("\ncfis:");
 	for (i = 0; i < cfl; i++) {
 		if (i % 10 == 0)
 			DPRINTF("\n");
 		DPRINTF("%02x ", cfis[i]);
 	}
 	DPRINTF("\n");
 
 	for (i = 0; i < hdr->prdtl; i++) {
 		DPRINTF("%d@%08"PRIx64"\n", prdt->dbc & 0x3fffff, prdt->dba);
 		prdt++;
 	}
 #endif
 
 	if (cfis[0] != FIS_TYPE_REGH2D) {
 		WPRINTF("Not a H2D FIS:%02x\n", cfis[0]);
 		return;
 	}
 
 	if (cfis[1] & 0x80) {
 		ahci_handle_cmd(p, slot, cfis);
 	} else {
 		if (cfis[15] & (1 << 2))
 			p->reset = 1;
 		else if (p->reset) {
 			p->reset = 0;
 			ahci_port_reset(p);
 		}
 		p->ci &= ~(1 << slot);
 	}
 }
 
 static void
 ahci_handle_port(struct ahci_port *p)
 {
 	int i;
 
 	if (!(p->cmd & AHCI_P_CMD_ST))
 		return;
 
 	/*
 	 * Search for any new commands to issue ignoring those that
 	 * are already in-flight.
 	 */
 	for (i = 0; (i < 32) && p->ci; i++) {
 		if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) {
 			p->cmd &= ~AHCI_P_CMD_CCS_MASK;
 			p->cmd |= i << AHCI_P_CMD_CCS_SHIFT;
 			ahci_handle_slot(p, i);
 		}
 	}
 }
 
 /*
  * blockif callback routine - this runs in the context of the blockif
  * i/o thread, so the mutex needs to be acquired.
  */
 static void
 ata_ioreq_cb(struct blockif_req *br, int err)
 {
 	struct ahci_cmd_hdr *hdr;
 	struct ahci_ioreq *aior;
 	struct ahci_port *p;
 	struct pci_ahci_softc *sc;
 	uint32_t tfd;
 	uint8_t *cfis;
 	int pending, slot, ncq;
 
 	DPRINTF("%s %d\n", __func__, err);
 
 	ncq = 0;
 	aior = br->br_param;
 	p = aior->io_pr;
 	cfis = aior->cfis;
 	slot = aior->slot;
 	pending = aior->prdtl;
 	sc = p->pr_sc;
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 
 	if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
 			cfis[2] == ATA_READ_FPDMA_QUEUED)
 		ncq = 1;
 
 	pthread_mutex_lock(&sc->mtx);
 
 	/*
+	 * Delete the blockif request from the busy list
+	 */
+	TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+	/*
 	 * Move the blockif request back to the free list
 	 */
-	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_list);
+	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
 
 	if (pending && !err) {
 		ahci_handle_dma(p, slot, cfis, aior->done,
 		    hdr->prdtl - pending);
 		goto out;
 	}
 
 	if (!err && aior->done == aior->len) {
 		tfd = ATA_S_READY | ATA_S_DSC;
 		if (ncq)
 			hdr->prdbc = 0;
 		else
 			hdr->prdbc = aior->len;
 	} else {
 		tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
 		hdr->prdbc = 0;
 		if (ncq)
 			p->serr |= (1 << slot);
 	}
 
-	/*
-	 * This command is now complete.
-	 */
-	p->pending &= ~(1 << slot);
-
 	if (ncq) {
 		p->sact &= ~(1 << slot);
 		ahci_write_fis_sdb(p, slot, tfd);
 	} else
 		ahci_write_fis_d2h(p, slot, cfis, tfd);
 
+	/*
+	 * This command is now complete.
+	 */
+	p->pending &= ~(1 << slot);
+
+	ahci_check_stopped(p);
 out:
 	pthread_mutex_unlock(&sc->mtx);
 	DPRINTF("%s exit\n", __func__);
 }
 
 static void
 atapi_ioreq_cb(struct blockif_req *br, int err)
 {
 	struct ahci_cmd_hdr *hdr;
 	struct ahci_ioreq *aior;
 	struct ahci_port *p;
 	struct pci_ahci_softc *sc;
 	uint8_t *cfis;
 	uint32_t tfd;
 	int pending, slot;
 
 	DPRINTF("%s %d\n", __func__, err);
 
 	aior = br->br_param;
 	p = aior->io_pr;
 	cfis = aior->cfis;
 	slot = aior->slot;
 	pending = aior->prdtl;
 	sc = p->pr_sc;
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE);
 
 	pthread_mutex_lock(&sc->mtx);
 
 	/*
+	 * Delete the blockif request from the busy list
+	 */
+	TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+	/*
 	 * Move the blockif request back to the free list
 	 */
-	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_list);
+	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
 
 	if (pending && !err) {
 		atapi_read(p, slot, cfis, aior->done, hdr->prdtl - pending);
 		goto out;
 	}
 
 	if (!err && aior->done == aior->len) {
 		tfd = ATA_S_READY | ATA_S_DSC;
 		hdr->prdbc = aior->len;
 	} else {
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x21;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 		hdr->prdbc = 0;
 	}
 
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	ahci_write_fis_d2h(p, slot, cfis, tfd);
 
+	/*
+	 * This command is now complete.
+	 */
+	p->pending &= ~(1 << slot);
+
+	ahci_check_stopped(p);
 out:
 	pthread_mutex_unlock(&sc->mtx);
 	DPRINTF("%s exit\n", __func__);
 }
 
 static void
 pci_ahci_ioreq_init(struct ahci_port *pr)
 {
 	struct ahci_ioreq *vr;
 	int i;
 
 	pr->ioqsz = blockif_queuesz(pr->bctx);
 	pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq));
 	STAILQ_INIT(&pr->iofhd);
 
 	/*
 	 * Add all i/o request entries to the free queue
 	 */
 	for (i = 0; i < pr->ioqsz; i++) {
 		vr = &pr->ioreq[i];
 		vr->io_pr = pr;
 		if (!pr->atapi)
 			vr->io_req.br_callback = ata_ioreq_cb;
 		else
 			vr->io_req.br_callback = atapi_ioreq_cb;
 		vr->io_req.br_param = vr;
-		STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_list);
+		STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist);
 	}
+
+	TAILQ_INIT(&pr->iobhd);
 }
 
 static void
 pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
 {
 	int port = (offset - AHCI_OFFSET) / AHCI_STEP;
 	offset = (offset - AHCI_OFFSET) % AHCI_STEP;
 	struct ahci_port *p = &sc->port[port];
 
 	DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
 		port, offset, value);
 
 	switch (offset) {
 	case AHCI_P_CLB:
 		p->clb = value;
 		break;
 	case AHCI_P_CLBU:
 		p->clbu = value;
 		break;
 	case AHCI_P_FB:
 		p->fb = value;
 		break;
 	case AHCI_P_FBU:
 		p->fbu = value;
 		break;
 	case AHCI_P_IS:
 		p->is &= ~value;
 		break;
 	case AHCI_P_IE:
 		p->ie = value & 0xFDC000FF;
 		ahci_generate_intr(sc);
 		break;
 	case AHCI_P_CMD:
 	{
 		p->cmd = value;
 		
 		if (!(value & AHCI_P_CMD_ST)) {
-			p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK);
-			p->ci = 0;
-			p->sact = 0;
+			ahci_port_stop(p);
 		} else {
 			uint64_t clb;
 
 			p->cmd |= AHCI_P_CMD_CR;
 			clb = (uint64_t)p->clbu << 32 | p->clb;
 			p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb,
 					AHCI_CL_SIZE * AHCI_MAX_SLOTS);
 		}
 
 		if (value & AHCI_P_CMD_FRE) {
 			uint64_t fb;
 
 			p->cmd |= AHCI_P_CMD_FR;
 			fb = (uint64_t)p->fbu << 32 | p->fb;
 			/* we don't support FBSCP, so rfis size is 256Bytes */
 			p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256);
 		} else {
 			p->cmd &= ~AHCI_P_CMD_FR;
 		}
 
 		if (value & AHCI_P_CMD_CLO) {
 			p->tfd = 0;
 			p->cmd &= ~AHCI_P_CMD_CLO;
 		}
 
 		ahci_handle_port(p);
 		break;
 	}
 	case AHCI_P_TFD:
 	case AHCI_P_SIG:
 	case AHCI_P_SSTS:
 		WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset);
 		break;
 	case AHCI_P_SCTL:
 		if (!(p->cmd & AHCI_P_CMD_ST)) {
 			if (value & ATA_SC_DET_RESET)
 				ahci_port_reset(p);
 			p->sctl = value;
 		}
 		break;
 	case AHCI_P_SERR:
 		p->serr &= ~value;
 		break;
 	case AHCI_P_SACT:
 		p->sact |= value;
 		break;
 	case AHCI_P_CI:
 		p->ci |= value;
 		ahci_handle_port(p);
 		break;
 	case AHCI_P_SNTF:
 	case AHCI_P_FBS:
 	default:
 		break;
 	}
 }
 
 static void
 pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
 {
 	DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
 		offset, value);
 
 	switch (offset) {
 	case AHCI_CAP:
 	case AHCI_PI:
 	case AHCI_VS:
 	case AHCI_CAP2:
 		DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset);
 		break;
 	case AHCI_GHC:
 		if (value & AHCI_GHC_HR)
 			ahci_reset(sc);
 		else if (value & AHCI_GHC_IE) {
 			sc->ghc |= AHCI_GHC_IE;
 			ahci_generate_intr(sc);
 		}
 		break;
 	case AHCI_IS:
 		sc->is &= ~value;
 		ahci_generate_intr(sc);
 		break;
 	default:
 		break;
 	}
 }
 
 static void
 pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 		int baridx, uint64_t offset, int size, uint64_t value)
 {
 	struct pci_ahci_softc *sc = pi->pi_arg;
 
 	assert(baridx == 5);
 	assert(size == 4);
 
 	pthread_mutex_lock(&sc->mtx);
 
 	if (offset < AHCI_OFFSET)
 		pci_ahci_host_write(sc, offset, value);
 	else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP)
 		pci_ahci_port_write(sc, offset, value);
 	else
 		WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n", offset);
 
 	pthread_mutex_unlock(&sc->mtx);
 }
 
 static uint64_t
 pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset)
 {
 	uint32_t value;
 
 	switch (offset) {
 	case AHCI_CAP:
 	case AHCI_GHC:
 	case AHCI_IS:
 	case AHCI_PI:
 	case AHCI_VS:
 	case AHCI_CCCC:
 	case AHCI_CCCP:
 	case AHCI_EM_LOC:
 	case AHCI_EM_CTL:
 	case AHCI_CAP2:
 	{
 		uint32_t *p = &sc->cap;
 		p += (offset - AHCI_CAP) / sizeof(uint32_t);
 		value = *p;
 		break;
 	}
 	default:
 		value = 0;
 		break;
 	}
 	DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n",
 		offset, value);
 
 	return (value);
 }
 
 static uint64_t
 pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset)
 {
 	uint32_t value;
 	int port = (offset - AHCI_OFFSET) / AHCI_STEP;
 	offset = (offset - AHCI_OFFSET) % AHCI_STEP;
 
 	switch (offset) {
 	case AHCI_P_CLB:
 	case AHCI_P_CLBU:
 	case AHCI_P_FB:
 	case AHCI_P_FBU:
 	case AHCI_P_IS:
 	case AHCI_P_IE:
 	case AHCI_P_CMD:
 	case AHCI_P_TFD:
 	case AHCI_P_SIG:
 	case AHCI_P_SSTS:
 	case AHCI_P_SCTL:
 	case AHCI_P_SERR:
 	case AHCI_P_SACT:
 	case AHCI_P_CI:
 	case AHCI_P_SNTF:
 	case AHCI_P_FBS:
 	{
 		uint32_t *p= &sc->port[port].clb;
 		p += (offset - AHCI_P_CLB) / sizeof(uint32_t);
 		value = *p;
 		break;
 	}
 	default:
 		value = 0;
 		break;
 	}
 
 	DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n",
 		port, offset, value);
 
 	return value;
 }
 
 static uint64_t
 pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
     uint64_t offset, int size)
 {
 	struct pci_ahci_softc *sc = pi->pi_arg;
 	uint32_t value;
 
 	assert(baridx == 5);
 	assert(size == 4);
 
 	pthread_mutex_lock(&sc->mtx);
 
 	if (offset < AHCI_OFFSET)
 		value = pci_ahci_host_read(sc, offset);
 	else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP)
 		value = pci_ahci_port_read(sc, offset);
 	else {
 		value = 0;
 		WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", offset);
 	}
 
 	pthread_mutex_unlock(&sc->mtx);
 
 	return (value);
 }
 
 static int
 pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
 {
 	char bident[sizeof("XX:X:X")];
 	struct blockif_ctxt *bctxt;
 	struct pci_ahci_softc *sc;
 	int ret, slots;
 
 	ret = 0;
 
 	if (opts == NULL) {
 		fprintf(stderr, "pci_ahci: backing device required\n");
 		return (1);
 	}
 
 #ifdef AHCI_DEBUG
 	dbg = fopen("/tmp/log", "w+");
 #endif
 
 	sc = calloc(1, sizeof(struct pci_ahci_softc));
 	pi->pi_arg = sc;
 	sc->asc_pi = pi;
 	sc->ports = MAX_PORTS;
 
 	/*
 	 * Only use port 0 for a backing device. All other ports will be
 	 * marked as unused
 	 */
 	sc->port[0].atapi = atapi;
 
 	/*
 	 * Attempt to open the backing image. Use the PCI
 	 * slot/func for the identifier string.
 	 */
 	snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
 	bctxt = blockif_open(opts, bident);
 	if (bctxt == NULL) {       	
 		ret = 1;
 		goto open_fail;
 	}	
 	sc->port[0].bctx = bctxt;
 	sc->port[0].pr_sc = sc;
 
 	/*
 	 * Allocate blockif request structures and add them
 	 * to the free list
 	 */
 	pci_ahci_ioreq_init(&sc->port[0]);
 
 	pthread_mutex_init(&sc->mtx, NULL);
 
 	/* Intel ICH8 AHCI */
 	slots = sc->port[0].ioqsz;
 	if (slots > 32)
 		slots = 32;
 	--slots;
 	sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF |
 	    AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP |
 	    AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)|
 	    AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC |
 	    (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1);
 
 	/* Only port 0 implemented */
 	sc->pi = 1;
 	sc->vs = 0x10300;
 	sc->cap2 = AHCI_CAP2_APST;
 	ahci_reset(sc);
 
 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA);
 	pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0);
 	pci_emul_add_msicap(pi, 1);
 	pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32,
 	    AHCI_OFFSET + sc->ports * AHCI_STEP);
 
 	pci_lintr_request(pi);
 
 open_fail:
 	if (ret) {
 		blockif_close(sc->port[0].bctx);
 		free(sc);
 	}
 
 	return (ret);
 }
 
 static int
 pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
 
 	return (pci_ahci_init(ctx, pi, opts, 0));
 }
 
 static int
 pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
 
 	return (pci_ahci_init(ctx, pi, opts, 1));
 }
 
 /*
  * Use separate emulation names to distinguish drive and atapi devices
  */
 struct pci_devemu pci_de_ahci_hd = {
 	.pe_emu =	"ahci-hd",
 	.pe_init =	pci_ahci_hd_init,
 	.pe_barwrite =	pci_ahci_write,
 	.pe_barread =	pci_ahci_read
 };
 PCI_EMUL_SET(pci_de_ahci_hd);
 
 struct pci_devemu pci_de_ahci_cd = {
 	.pe_emu =	"ahci-cd",
 	.pe_init =	pci_ahci_atapi_init,
 	.pe_barwrite =	pci_ahci_write,
 	.pe_barread =	pci_ahci_read
 };
 PCI_EMUL_SET(pci_de_ahci_cd);
Index: stable/10/usr.sbin/bhyve/pci_virtio_block.c
===================================================================
--- stable/10/usr.sbin/bhyve/pci_virtio_block.c	(revision 276348)
+++ stable/10/usr.sbin/bhyve/pci_virtio_block.c	(revision 276349)
@@ -1,377 +1,383 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include <sys/ioctl.h>
 #include <sys/disk.h>
 
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <strings.h>
 #include <unistd.h>
 #include <assert.h>
 #include <pthread.h>
 #include <md5.h>
 
 #include "bhyverun.h"
 #include "pci_emul.h"
 #include "virtio.h"
 
 /* Size assigned to the device's single virtqueue (vq_qsize). */
 #define VTBLK_RINGSZ	64
 
 /*
  * Maximum number of data segments per request; advertised to the guest
  * through vbc_seg_max and used to size the iovec array in pci_vtblk_proc().
  */
 #define VTBLK_MAXSEGS	32
 
 /* Values written to the one-byte status descriptor of a request. */
 #define VTBLK_S_OK	0
 #define VTBLK_S_IOERR	1
 #define	VTBLK_S_UNSUPP	2
 
 /* Size of the buffer holding the device identifier string. */
 #define	VTBLK_BLK_ID_BYTES	20
 
 /* Capability bits */
 #define	VTBLK_F_SEG_MAX		(1 << 2)	/* Maximum request segments */
 #define	VTBLK_F_BLK_SIZE       	(1 << 6)	/* cfg block size valid */
 
 /*
  * Host capabilities
  */
 #define VTBLK_S_HOSTCAPS      \
   ( VTBLK_F_SEG_MAX  |						    \
     VTBLK_F_BLK_SIZE |						    \
     VIRTIO_RING_F_INDIRECT_DESC )	/* indirect descriptors */
 
 /*
  * Config space "registers"
  */
 /*
  * Device-specific configuration area.  pci_vtblk_cfgread() copies bytes
  * straight out of this structure, so it is declared __packed.
  */
 struct vtblk_config {
 	uint64_t	vbc_capacity;	/* device size in DEV_BSIZE units */
 	uint32_t	vbc_size_max;	/* set to 0 (not negotiated) */
 	uint32_t	vbc_seg_max;	/* set to VTBLK_MAXSEGS */
 	uint16_t	vbc_geom_c;	/* geometry fields are set to 0 */
 	uint8_t		vbc_geom_h;
 	uint8_t		vbc_geom_s;
 	uint32_t	vbc_blk_size;	/* sector size of the backing store */
 	uint32_t	vbc_sectors_max; /* set to 0 */
 } __packed;
 
 /*
  * Fixed-size block header
  */
 /*
  * Request header found in the first descriptor of every chain.  The
  * VBH_OP_* values are the request types dispatched on in pci_vtblk_proc()
  * after VBH_FLAG_BARRIER has been masked off vbh_type.
  */
 struct virtio_blk_hdr {
 #define	VBH_OP_READ		0
 #define	VBH_OP_WRITE		1
+#define	VBH_OP_FLUSH		4
+#define	VBH_OP_FLUSH_OUT	5
 #define	VBH_OP_IDENT		8		
 #define	VBH_FLAG_BARRIER	0x80000000	/* OR'ed into vbh_type */
 	uint32_t       	vbh_type;	/* VBH_OP_* plus optional flag */
 	uint32_t	vbh_ioprio;	/* not consulted by this device model */
 	uint64_t	vbh_sector;	/* start of I/O in DEV_BSIZE units */
 } __packed;
 
 /*
  * Debug printf
  *
  * Both macros take a parenthesised printf argument list, for example
  * DPRINTF(("fmt %d\n", v)).  DPRINTF only prints when pci_vtblk_debug is
  * non-zero; it is wrapped in do/while(0) so that it behaves as a single
  * statement and cannot capture the 'else' of an enclosing 'if'.
  */
 static int pci_vtblk_debug;
 #define DPRINTF(params) do { if (pci_vtblk_debug) printf params; } while (0)
 #define WPRINTF(params) printf params
 
 /*
  * Per-device softc
  */
 struct pci_vtblk_softc {
 	struct virtio_softc vbsc_vs;	/* generic virtio state, must be linked up */
 	pthread_mutex_t vsc_mtx;	/* installed as vbsc_vs.vs_mtx */
 	struct vqueue_info vbsc_vq;	/* the device's only virtqueue */
 	int		vbsc_fd;	/* fd of the backing file/device */
 	struct vtblk_config vbsc_cfg;	
 	char vbsc_ident[VTBLK_BLK_ID_BYTES]; /* returned for VBH_OP_IDENT */
 };
 
 /* Callbacks referenced by the virtio constants table below. */
 static void pci_vtblk_reset(void *);
 static void pci_vtblk_notify(void *, struct vqueue_info *);
 static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
 static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
 
 /*
  * Per-device-type constants handed to vi_softc_linkup().  The initializer
  * is positional, so the order of the entries must follow the field order
  * of struct virtio_consts.
  */
 static struct virtio_consts vtblk_vi_consts = {
 	"vtblk",		/* our name */
 	1,			/* we support 1 virtqueue */
 	sizeof(struct vtblk_config), /* config reg size */
 	pci_vtblk_reset,	/* reset */
 	pci_vtblk_notify,	/* device-wide qnotify */
 	pci_vtblk_cfgread,	/* read PCI config */
 	pci_vtblk_cfgwrite,	/* write PCI config */
 	NULL,			/* apply negotiated features */
 	VTBLK_S_HOSTCAPS,	/* our capabilities */
 };
 
 /*
  * Reset callback: 'vsc' is the block device softc; the generic virtio
  * state embedded in it is reset via vi_reset_dev().
  */
 static void
 pci_vtblk_reset(void *vsc)
 {
 	struct pci_vtblk_softc *softc;
 
 	softc = vsc;
 	DPRINTF(("vtblk: device reset requested !\n"));
 	vi_reset_dev(&softc->vbsc_vs);
 }
 
 /*
  * Process one request chain from virtqueue 'vq'.
  *
  * The chain is fetched with vq_getchain(), the header and status
  * descriptors are validated with assert(), the request is executed
  * synchronously against sc->vbsc_fd, a status byte is stored in the last
  * descriptor and the chain is released with vq_relchain().
  */
 static void
 pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
 {
 	struct virtio_blk_hdr *vbh;
 	uint8_t *status;
 	int i, n;
 	int err;
 	int iolen;
 	int writeop, type;
 	off_t offset;
 	struct iovec iov[VTBLK_MAXSEGS + 2];
 	uint16_t flags[VTBLK_MAXSEGS + 2];
 
 	n = vq_getchain(vq, iov, VTBLK_MAXSEGS + 2, flags);
 
 	/*
 	 * The first descriptor will be the read-only fixed header,
 	 * and the last is for status (hence +2 above and below).
 	 * The remaining iov's are the actual data I/O vectors.
 	 *
 	 * XXX - note - this fails on crash dump, which does a
 	 * VIRTIO_BLK_T_FLUSH with a zero transfer length
 	 */
 	assert(n >= 2 && n <= VTBLK_MAXSEGS + 2);
 
 	assert((flags[0] & VRING_DESC_F_WRITE) == 0);
 	assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
 	vbh = iov[0].iov_base;
 
 	/* Peel off the trailing status descriptor; n now excludes it. */
 	status = iov[--n].iov_base;
 	assert(iov[n].iov_len == 1);
 	assert(flags[n] & VRING_DESC_F_WRITE);
 
 	/*
 	 * XXX
 	 * The guest should not be setting the BARRIER flag because
 	 * we don't advertise the capability.
 	 */
 	type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
 	writeop = (type == VBH_OP_WRITE);
 
 	/* Convert the sector number into a byte offset. */
 	offset = vbh->vbh_sector * DEV_BSIZE;
 
 	iolen = 0;
 	for (i = 1; i < n; i++) {
 		/*
 		 * - write op implies read-only descriptor,
 		 * - read/ident op implies write-only descriptor,
 		 * therefore test the inverse of the descriptor bit
 		 * to the op.
 		 */
 		assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
 		iolen += iov[i].iov_len;
 	}
 
 	/* After the loop i == n, so 'i - 1' is the number of data segments. */
 	DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", 
 		 writeop ? "write" : "read/ident", iolen, i - 1, offset));
 
 	switch (type) {
 	case VBH_OP_WRITE:
 		err = pwritev(sc->vbsc_fd, iov + 1, i - 1, offset);
 		break;
 	case VBH_OP_READ:
 		err = preadv(sc->vbsc_fd, iov + 1, i - 1, offset);
 		break;
 	case VBH_OP_IDENT:
 		/* Assume a single buffer */
 		/*
 		 * NOTE(review): nothing above guarantees that a data
 		 * descriptor exists; with a header+status-only chain iov[1]
 		 * is the status descriptor -- confirm this is acceptable.
 		 */
 		strlcpy(iov[1].iov_base, sc->vbsc_ident,
 		    MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
 		err = 0;
+		break;
+	case VBH_OP_FLUSH:
+	case VBH_OP_FLUSH_OUT:
+		err = fsync(sc->vbsc_fd);
 		break;
 	default:
 		/* Unknown request type: reported as VTBLK_S_UNSUPP below. */
 		err = -ENOSYS;
 		break;
 	}
 
 	/* convert errno into a virtio block error return */
 	if (err < 0) {
 		if (err == -ENOSYS)
 			*status = VTBLK_S_UNSUPP;
 		else
 			*status = VTBLK_S_IOERR;
 	} else
 		*status = VTBLK_S_OK;
 
 	/*
 	 * Return the descriptor back to the host.
 	 * We wrote 1 byte (our status) to host.
 	 */
 	vq_relchain(vq, 1);
 }
 
 /*
  * Queue-notify callback: drain every pending request chain on 'vq',
  * handling each one with pci_vtblk_proc(), then finish the batch.
  */
 static void
 pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtblk_softc *softc;
 
 	softc = vsc;
 	vq_startchains(vq);
 	for (;;) {
 		if (!vq_has_descs(vq))
 			break;
 		pci_vtblk_proc(softc, vq);
 	}
 	/* Generate interrupt if appropriate. */
 	vq_endchains(vq, 1);
 }
 
 /*
  * Initialize a virtio block device instance.
  *
  * 'opts' names the backing file or character device, which must already
  * exist and is opened read/write.  On success the softc is linked to 'pi'
  * and 0 is returned.  On any failure a message is printed where one is
  * available, every resource acquired so far (file descriptor, softc) is
  * released, and 1 is returned.
  */
 static int
 pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
 	struct stat sbuf;
 	MD5_CTX mdctx;
 	u_char digest[16];
 	struct pci_vtblk_softc *sc;
 	off_t size;
 	int fd;
 	int sectsz;
 
 	if (opts == NULL) {
 		printf("virtio-block: backing device required\n");
 		return (1);
 	}
 
 	/*
 	 * The supplied backing file has to exist
 	 */
 	fd = open(opts, O_RDWR);
 	if (fd < 0) {
 		perror("Could not open backing file");
 		return (1);
 	}
 
 	if (fstat(fd, &sbuf) < 0) {
 		perror("Could not stat backing file");
 		close(fd);
 		return (1);
 	}
 
 	/*
 	 * Deal with raw devices
 	 */
 	size = sbuf.st_size;
 	sectsz = DEV_BSIZE;
 	if (S_ISCHR(sbuf.st_mode)) {
 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
 			perror("Could not fetch dev blk/sector size");
 			close(fd);
 			return (1);
 		}
 		assert(size != 0);
 		assert(sectsz != 0);
 	}
 
 	sc = calloc(1, sizeof(struct pci_vtblk_softc));
 	if (sc == NULL) {
 		/* Do not dereference a failed allocation. */
 		perror("Could not allocate virtio-block softc");
 		close(fd);
 		return (1);
 	}
 
 	/* record fd of storage device/file */
 	sc->vbsc_fd = fd;
 
 	pthread_mutex_init(&sc->vsc_mtx, NULL);
 
 	/* init virtio softc and virtqueues */
 	vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
 	sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
 
 	sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
 	/* sc->vbsc_vq.vq_notify = we have no per-queue notify */
 
 	/*
 	 * Create an identifier for the backing file. Use parts of the
 	 * md5 sum of the filename.
 	 *
 	 * The formatted string is 20 characters long, which together with
 	 * the terminating NUL does not fit in vbsc_ident; snprintf() bounds
 	 * the write to the buffer and keeps it NUL-terminated.
 	 */
 	MD5Init(&mdctx);
 	MD5Update(&mdctx, opts, strlen(opts));
 	MD5Final(digest, &mdctx);
 	snprintf(sc->vbsc_ident, sizeof(sc->vbsc_ident),
 	    "BHYVE-%02X%02X-%02X%02X-%02X%02X",
 	    digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
 
 	/* setup virtio block config space */
 	sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */
 	sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
 	sc->vbsc_cfg.vbc_blk_size = sectsz;
 	sc->vbsc_cfg.vbc_size_max = 0;	/* not negotiated */
 	sc->vbsc_cfg.vbc_geom_c = 0;	/* no geometry */
 	sc->vbsc_cfg.vbc_geom_h = 0;
 	sc->vbsc_cfg.vbc_geom_s = 0;
 	sc->vbsc_cfg.vbc_sectors_max = 0;
 
 	/*
 	 * Should we move some of this into virtio.c?  Could
 	 * have the device, class, and subdev_0 as fields in
 	 * the virtio constants structure.
 	 */
 	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
 
 	pci_lintr_request(pi);
 
 	if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) {
 		/* Release what this function acquired before bailing out. */
 		close(fd);
 		free(sc);
 		return (1);
 	}
 	vi_set_io_bar(&sc->vbsc_vs, 0);
 	return (0);
 }
 
 /*
  * Config-space write callback.  The device-specific configuration is
  * read-only, so the attempt is only logged (when debugging is enabled)
  * and 1 is returned.
  */
 static int
 pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value)
 {
 	const int rejected = 1;
 
 	DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
 	return (rejected);
 }
 
 /*
  * Config-space read callback: copy 'size' bytes starting at byte 'offset'
  * of the device configuration structure into '*retval'.  Always returns 0.
  */
 static int
 pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
 {
 	struct pci_vtblk_softc *softc = vsc;
 	uint8_t *cfgbase;
 
 	/* our caller has already verified offset and size */
 	cfgbase = (uint8_t *)&softc->vbsc_cfg;
 	memcpy(retval, cfgbase + offset, size);
 	return (0);
 }
 
 /*
  * Emulation descriptor registered under the name "virtio-blk".  BAR
  * accesses are routed to the generic virtio handlers vi_pci_write() and
  * vi_pci_read().
  */
 struct pci_devemu pci_de_vblk = {
 	.pe_emu =	"virtio-blk",
 	.pe_init =	pci_vtblk_init,
 	.pe_barwrite =	vi_pci_write,
 	.pe_barread =	vi_pci_read
 };
 PCI_EMUL_SET(pci_de_vblk);
Index: stable/10/usr.sbin/bhyve/task_switch.c
===================================================================
--- stable/10/usr.sbin/bhyve/task_switch.c	(revision 276348)
+++ stable/10/usr.sbin/bhyve/task_switch.c	(revision 276349)
@@ -1,932 +1,946 @@
 /*-
  * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/_iovec.h>
 #include <sys/mman.h>
 
 #include <x86/psl.h>
 #include <x86/segments.h>
 #include <x86/specialreg.h>
 #include <machine/vmm.h>
 #include <machine/vmm_instruction_emul.h>
 
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>
 #include <errno.h>
 
 #include <vmmapi.h>
 
 #include "bhyverun.h"
 
 /*
  * Using 'struct i386tss' is tempting but causes myriad sign extension
  * issues because all of its fields are defined as signed integers.
  */
 struct tss32 {
 	uint16_t	tss_link;	/* set to the old task's selector when nesting */
 	uint16_t	rsvd1;
 	uint32_t	tss_esp0;
 	uint16_t	tss_ss0;
 	uint16_t	rsvd2;
 	uint32_t	tss_esp1;
 	uint16_t	tss_ss1;
 	uint16_t	rsvd3;
 	uint32_t	tss_esp2;
 	uint16_t	tss_ss2;
 	uint16_t	rsvd4;
 	uint32_t	tss_cr3;	/* loaded into %cr3 when paging is enabled */
 	uint32_t	tss_eip;
 	uint32_t	tss_eflags;
 	uint32_t	tss_eax;
 	uint32_t	tss_ecx;
 	uint32_t	tss_edx;
 	uint32_t	tss_ebx;
 	uint32_t	tss_esp;
 	uint32_t	tss_ebp;
 	uint32_t	tss_esi;
 	uint32_t	tss_edi;
 	uint16_t	tss_es;
 	uint16_t	rsvd5;
 	uint16_t	tss_cs;
 	uint16_t	rsvd6;
 	uint16_t	tss_ss;
 	uint16_t	rsvd7;
 	uint16_t	tss_ds;
 	uint16_t	rsvd8;
 	uint16_t	tss_fs;
 	uint16_t	rsvd9;
 	uint16_t	tss_gs;
 	uint16_t	rsvd10;
 	uint16_t	tss_ldt;	/* loaded into LDTR on task restore */
 	uint16_t	rsvd11;
 	uint16_t	tss_trap;
 	uint16_t	tss_iomap;
 };
 /* The layout must be exactly 104 bytes, the minimum size of a 32-bit TSS. */
 CTASSERT(sizeof(struct tss32) == 104);
 
 /*
  * SEL_START: selector with its low three bits cleared (byte offset of the
  *            descriptor within its table).
  * SEL_LIMIT: selector with its low three bits set (offset of the last byte
  *            of that descriptor).
  * TSS_BUSY:  non-zero when bit 1 of a TSS descriptor type is set.
  */
 #define	SEL_START(sel)	((sel) & ~0x7)
 #define	SEL_LIMIT(sel)	((sel) | 0x7)
 #define	TSS_BUSY(type)	(((type) & 0x2) == 0x2)
 
 /*
  * Fetch guest register 'reg' of 'vcpu'.  A failure of vm_get_register()
  * is treated as a programming error and trips an assertion.
  */
 static uint64_t
 GETREG(struct vmctx *ctx, int vcpu, int reg)
 {
 	uint64_t regval;
 	int error;
 
 	error = vm_get_register(ctx, vcpu, reg, &regval);
 	assert(error == 0);
 
 	return (regval);
 }
 
 /*
  * Store 'val' into guest register 'reg' of 'vcpu'.  A failure of
  * vm_set_register() is treated as a programming error and trips an
  * assertion.
  */
 static void
 SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
 {
 	int error = vm_set_register(ctx, vcpu, reg, val);
 
 	assert(error == 0);
 }
 
 static struct seg_desc
 usd_to_seg_desc(struct user_segment_descriptor *usd)
 {
 	struct seg_desc seg_desc;
 
 	seg_desc.base = (u_int)USD_GETBASE(usd);
 	if (usd->sd_gran)
 		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
 	else
 		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
 	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
 	seg_desc.access |= usd->sd_xx << 12;
 	seg_desc.access |= usd->sd_def32 << 14;
 	seg_desc.access |= usd->sd_gran << 15;
 
 	return (seg_desc);
 }
 
 /*
  * Inject exception 'vector' whose error code is derived from the segment
  * selector 'sel' (see section 6.13, "Error Code", Intel SDM volume 3).
  *
  * Error code layout:
  *   bit 0 (EXT)     - set when the exception occurred while delivering an
  *                     external event such as an interrupt; taken from 'ext'.
  *   bit 1 (IDT)     - selector refers to a gate descriptor in the IDT.
  *   bit 2 (GDT/LDT) - the selector's Table Indicator (TI), kept as-is.
  */
 static void
 sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
 {
 	uint16_t errcode;
 
 	/*
 	 * Drop the two low bits of the selector.  Clearing bit 1 is safe
 	 * because none of the selectors encountered during task switch
 	 * emulation refer to a task gate in the IDT; bit 0 is rebuilt from
 	 * 'ext' below.
 	 */
 	errcode = sel & ~0x3;
 	if (ext != 0)
 		errcode |= 0x1;
 
 	vm_inject_fault(ctx, vcpu, vector, 1, errcode);
 }
 
 /*
  * Check that the descriptor referenced by selector 'sel' lies within the
  * limit of the table it points into (LDT or GDT, chosen by ISLDT()).
  * An LDT that is unusable or not present fails the check.
  *
  * Returns 0 if the selector is within the limits and -1 otherwise.
  */
 static int
 desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
 {
 	uint64_t base;
 	uint32_t limit, access;
 	int error, reg;
 
 	reg = VM_REG_GUEST_GDTR;
 	if (ISLDT(sel))
 		reg = VM_REG_GUEST_LDTR;
 
 	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
 	assert(error == 0);
 
 	if (reg == VM_REG_GUEST_LDTR &&
 	    (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)))
 		return (-1);
 
 	return ((limit < SEL_LIMIT(sel)) ? -1 : 0);
 }
 
 /*
  * Transfer the segment descriptor 'desc' from (doread == true) or to
  * (doread == false) the GDT/LDT slot referenced by the selector 'sel'.
  * The caller must already have verified the table limit.
  *
  * Returns 0 on success.
  * Returns 1 if an exception was injected into the guest.
  * Returns -1 otherwise.
  */
 static int
 desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
     uint16_t sel, struct user_segment_descriptor *desc, bool doread)
 {
 	struct iovec iov[2];
 	uint64_t base;
 	uint32_t limit, access;
 	int error, prot, reg;
 
 	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
 	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
 	assert(error == 0);
 	assert(limit >= SEL_LIMIT(sel));
 
 	prot = doread ? PROT_READ : PROT_WRITE;
 	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
 	    sizeof(*desc), prot, iov, nitems(iov));
 	if (error != 0)
 		return (error);
 
 	if (doread)
 		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
 	else
 		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
 	return (0);
 }
 
 /*
  * Read the descriptor referenced by 'sel' into 'desc'.
  * Return values are those of desc_table_rw().
  */
 static int
 desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
     uint16_t sel, struct user_segment_descriptor *desc)
 {
 	const bool doread = true;
 
 	return (desc_table_rw(ctx, vcpu, paging, sel, desc, doread));
 }
 
 /*
  * Write 'desc' into the descriptor table slot referenced by 'sel'.
  * Return values are those of desc_table_rw().
  */
 static int
 desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
     uint16_t sel, struct user_segment_descriptor *desc)
 {
 	const bool doread = false;
 
 	return (desc_table_rw(ctx, vcpu, paging, sel, desc, doread));
 }
 
 /*
  * Read the TSS descriptor referenced by 'sel' into 'desc'.  The selector
  * must be a non-NULL GDT selector.  If it lies beyond the table limit a
  * #TS (task switch caused by IRET) or #GP (any other reason) is injected.
  *
  * Returns 0 on success.
  * Returns 1 if an exception was injected into the guest.
  * Returns -1 otherwise.
  */
 static int
 read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
     uint16_t sel, struct user_segment_descriptor *desc)
 {
 	struct vm_guest_paging sup_paging;
 	int vector;
 
 	assert(!ISLDT(sel));
 	assert(IDXSEL(sel) != 0);
 
 	/* Fetch the new TSS descriptor */
 	if (desc_table_limit_check(ctx, vcpu, sel) != 0) {
 		vector = (ts->reason == TSR_IRET) ? IDT_TS : IDT_GP;
 		sel_exception(ctx, vcpu, vector, sel, ts->ext);
 		return (1);
 	}
 
 	/* Descriptor table accesses are implicitly supervisor mode. */
 	sup_paging = ts->paging;
 	sup_paging.cpl = 0;
 	return (desc_table_read(ctx, vcpu, &sup_paging, sel, desc));
 }
 
 /* True when 'sd_type' has both bit 4 and bit 3 set: a code descriptor. */
 static bool
 code_desc(int sd_type)
 {
 	const int codebits = 0x18;
 
 	return ((sd_type & codebits) == codebits);
 }
 
 /*
  * True for a writable data descriptor: bit 4 set, bit 3 (code) clear and
  * bit 1 (writable) set.
  */
 static bool
 stack_desc(int sd_type)
 {
 	if ((sd_type & 0x10) == 0)
 		return (false);
 	if ((sd_type & 0x08) != 0)
 		return (false);
 	return ((sd_type & 0x02) != 0);
 }
 
 /* True for a data descriptor or a readable code descriptor. */
 static bool
 data_desc(int sd_type)
 {
 	/* data descriptor: bit 4 set, bit 3 clear */
 	if ((sd_type & 0x18) == 0x10)
 		return (true);
 
 	/* readable code descriptor: bits 4, 3 and 1 all set */
 	return ((sd_type & 0x1A) == 0x1A);
 }
 
 /* True when 'sd_type' is exactly the LDT system descriptor type. */
 static bool
 ldt_desc(int sd_type)
 {
 	if (sd_type == SDT_SYSLDT)
 		return (true);
 	return (false);
 }
 
 /*
  * Validate the descriptor 'seg_desc' associated with 'segment'.
  *
  * The selector currently loaded in the guest register 'segment' is read,
  * its descriptor is fetched from the GDT/LDT and checked (table limit,
  * descriptor type, present bit and privilege rules).  On success the
  * converted descriptor is stored in '*seg_desc'; nothing is written to
  * the vcpu's segment state here.
  *
  * Returns 0 on success.
  * Returns 1 if an exception was injected into the guest.
  * Returns -1 otherwise.
  */
 static int
 validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
     int segment, struct seg_desc *seg_desc)
 {
 	struct vm_guest_paging sup_paging;
 	struct user_segment_descriptor usd;
 	int error, idtvec;
 	int cpl, dpl, rpl;
 	uint16_t sel, cs;
 	bool ldtseg, codeseg, stackseg, dataseg, conforming;
 
 	/* Classify the register; exactly one of the four flags ends up set. */
 	ldtseg = codeseg = stackseg = dataseg = false;
 	switch (segment) {
 	case VM_REG_GUEST_LDTR:
 		ldtseg = true;
 		break;
 	case VM_REG_GUEST_CS:
 		codeseg = true;
 		break;
 	case VM_REG_GUEST_SS:
 		stackseg = true;
 		break;
 	case VM_REG_GUEST_DS:
 	case VM_REG_GUEST_ES:
 	case VM_REG_GUEST_FS:
 	case VM_REG_GUEST_GS:
 		dataseg = true;
 		break;
 	default:
 		assert(0);
 	}
 
 	/* Get the segment selector */
 	sel = GETREG(ctx, vcpu, segment);
 
 	/* LDT selector must point into the GDT */
 	if (ldtseg && ISLDT(sel)) {
 		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
 		return (1);
 	}
 
 	/* Descriptor table limit check */
 	if (desc_table_limit_check(ctx, vcpu, sel)) {
 		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
 		return (1);
 	}
 
 	/* NULL selector */
 	if (IDXSEL(sel) == 0) {
 		/* Code and stack segment selectors cannot be NULL */
 		if (codeseg || stackseg) {
 			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
 			return (1);
 		}
 		seg_desc->base = 0;
 		seg_desc->limit = 0;
 		seg_desc->access = 0x10000;	/* unusable */
 		return (0);
 	}
 
 	/* Read the descriptor from the GDT/LDT */
 	sup_paging = ts->paging;
 	sup_paging.cpl = 0;	/* implicit supervisor mode */
 	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
 	if (error)
 		return (error);
 
 	/* Verify that the descriptor type is compatible with the segment */
 	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
 	    (codeseg && !code_desc(usd.sd_type)) ||
 	    (dataseg && !data_desc(usd.sd_type)) ||
 	    (stackseg && !stack_desc(usd.sd_type))) {
 		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
 		return (1);
 	}
 
 	/* Segment must be marked present */
 	if (!usd.sd_p) {
 		/* The vector depends on which kind of segment is missing. */
 		if (ldtseg)
 			idtvec = IDT_TS;
 		else if (stackseg)
 			idtvec = IDT_SS;
 		else
 			idtvec = IDT_NP;
 		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
 		return (1);
 	}
 
 	/*
 	 * Privilege checks: the CPL is taken from the RPL bits of the
 	 * selector currently held in the guest's %cs register.
 	 */
 	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
 	cpl = cs & SEL_RPL_MASK;
 	rpl = sel & SEL_RPL_MASK;
 	dpl = usd.sd_dpl;
 
 	/* Stack segment: both RPL and DPL must equal CPL. */
 	if (stackseg && (rpl != cpl || dpl != cpl)) {
 		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
 		return (1);
 	}
 
 	if (codeseg) {
 		/* Bit 2 of a code descriptor type is the conforming bit. */
 		conforming = (usd.sd_type & 0x4) ? true : false;
 		if ((conforming && (cpl < dpl)) ||
 		    (!conforming && (cpl != dpl))) {
 			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
 			return (1);
 		}
 	}
 
 	if (dataseg) {
 		/*
 		 * A data segment is always non-conforming except when its
 		 * descriptor is a readable, conforming code segment.
 		 */
 		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
 			conforming = true;
 		else
 			conforming = false;
 
 		if (!conforming && (rpl > dpl || cpl > dpl)) {
 			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
 			return (1);
 		}
 	}
 	*seg_desc = usd_to_seg_desc(&usd);
 	return (0);
 }
 
 /*
  * Snapshot the current vcpu state (general purpose registers, segment
  * selectors, eflags) plus the supplied 'eip' into 'tss', then copy the
  * whole structure out to guest memory through 'iov'.  PSL_NT is cleared in
  * the saved eflags when the task switch was caused by IRET.
  */
 static void
 tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
     uint32_t eip, struct tss32 *tss, struct iovec *iov)
 {
 	uint32_t eflags;
 
 	/* General purpose registers */
 	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
 	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
 	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
 	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
 	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
 	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
 	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
 	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
 
 	/* Segment selectors */
 	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
 	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
 	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
 	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
 	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
 	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
 
 	/* eflags (without NT when leaving via IRET) and eip */
 	eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
 	if (task_switch->reason == TSR_IRET)
 		eflags &= ~PSL_NT;
 	tss->tss_eflags = eflags;
 	tss->tss_eip = eip;
 
 	/* Copy updated old TSS into guest memory */
 	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
 }
 
 static void
 update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
 {
 	int error;
 
 	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
 	assert(error == 0);
 }
 
 /*
  * Update the vcpu registers to reflect the state of the new task.
  *
  * 'tss' is the new task's TSS as read from guest memory and 'iov'
  * describes where it lives, so that the updated previous-link field can be
  * written back for nested tasks.  'ot_sel' is the selector of the old task.
  * On the way 'ts->paging' is updated with the new %cr3 and CPL.
  *
  * Returns 0 on success.
  * Returns 1 if an exception was injected into the guest.
  * Returns -1 otherwise.
  */
 static int
 tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
     uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
 {
 	struct seg_desc seg_desc, seg_desc2;
 	uint64_t *pdpte, maxphyaddr, reserved;
 	uint32_t eflags;
 	int error, i;
 	bool nested;
 
 	/*
 	 * Any task switch not caused by IRET or JMP nests the new task
 	 * inside the old one: record the old selector as the back link.
 	 */
 	nested = false;
 	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
 		tss->tss_link = ot_sel;
 		nested = true;
 	}
 
 	eflags = tss->tss_eflags;
 	if (nested)
 		eflags |= PSL_NT;
 
 	/* LDTR */
 	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
 
 	/* PDBR */
 	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
 		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
 			/*
 			 * XXX Assuming 36-bit MAXPHYADDR.
 			 */
 			maxphyaddr = (1UL << 36) - 1;
 			/*
 			 * NOTE(review): the pointer returned below is
 			 * dereferenced without a NULL check -- confirm that
 			 * paddr_guest2host() cannot fail for this address.
 			 */
 			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
 			for (i = 0; i < 4; i++) {
 				/* Check reserved bits if the PDPTE is valid */
 				if (!(pdpte[i] & 0x1))
 					continue;
 				/*
 				 * Bits 2:1, 8:5 and bits above the processor's
 				 * maximum physical address are reserved.
 				 */
 				reserved = ~maxphyaddr | 0x1E6;
 				if (pdpte[i] & reserved) {
 					vm_inject_gp(ctx, vcpu);
 					return (1);
 				}
 			}
 			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
 			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
 			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
 			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
 		}
 		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
 		/* Later descriptor table reads must use the new address space. */
 		ts->paging.cr3 = tss->tss_cr3;
 	}
 
 	/* eflags and eip */
 	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
 	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);
 
 	/* General purpose registers */
 	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
 	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
 	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
 	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
 	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
 	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
 	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
 	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);
 
 	/*
 	 * Segment selectors.  Only the selectors are loaded here; the
 	 * matching descriptors are validated and installed further down,
 	 * where validate_seg_desc() reads the selectors back from the vcpu.
 	 */
 	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
 	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
 	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
 	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
 	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
 	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);
 
 	/*
 	 * If this is a nested task then write out the new TSS to update
 	 * the previous link field.
 	 */
 	if (nested)
 		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));
 
 	/* Validate segment descriptors */
 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
 	if (error)
 		return (error);
 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);
 
 	/*
 	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
 	 *
 	 * The SS and CS attribute checks on VM-entry are inter-dependent so
 	 * we need to make sure that both segments are valid before updating
 	 * either of them. This ensures that the VMCS state can pass the
 	 * VM-entry checks so the guest can handle any exception injected
 	 * during task switch emulation.
 	 */
 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
 	if (error)
 		return (error);
 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
 	if (error)
 		return (error);
 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
 	/* The new CPL is the RPL of the new task's %cs selector. */
 	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
 
 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
 	if (error)
 		return (error);
 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);
 
 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
 	if (error)
 		return (error);
 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);
 
 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
 	if (error)
 		return (error);
 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);
 
 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
 	if (error)
 		return (error);
 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);
 
 	return (0);
 }
 
 /*
  * Push an error code on the stack of the new task. This is needed if the
  * task switch was triggered by a hardware exception that causes an error
  * code to be saved (e.g. #PF).
  *
  * 'task_type' is the descriptor type of the new TSS and selects the width
  * of the push (4 bytes for a 32-bit TSS, 2 bytes otherwise).  %rsp is only
  * updated after the error code has been copied out successfully.
  *
  * Returns 0 on success.
  * Returns 1 if an exception was injected into the guest.
  * Returns -1 otherwise.
  */
 static int
 push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
     int task_type, uint32_t errcode)
 {
 	struct iovec iov[2];
 	struct seg_desc seg_desc;
 	int stacksize, bytes, error;
 	uint64_t gla, cr0, rflags;
 	uint32_t esp;
 	uint16_t stacksel;
 
 	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
 	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
 	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
 
 	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
 	    &seg_desc.limit, &seg_desc.access);
 	assert(error == 0);
 
 	/*
 	 * Section "Error Code" in the Intel SDM vol 3: the error code is
 	 * pushed on the stack as a doubleword or word (depending on the
 	 * default interrupt, trap or task gate size).
 	 */
 	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
 		bytes = 4;
 	else
 		bytes = 2;
 
 	/*
 	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
 	 * stack-segment descriptor determines the size of the stack
 	 * pointer outside of 64-bit mode.
 	 */
 	if (SEG_DESC_DEF32(seg_desc.access))
 		stacksize = 4;
 	else
 		stacksize = 2;
 
 	/* Compute the new stack pointer; it is committed only at the end. */
 	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
 	esp -= bytes;
 
 	/* A failed linear address calculation is reported as #SS. */
 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
 	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
 		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
 		return (1);
 	}
 
 	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
 		vm_inject_ac(ctx, vcpu, 1);
 		return (1);
 	}
 
 	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
 	    iov, nitems(iov));
 	if (error)
 		return (error);
 
 	/* Only the low 'bytes' bytes of the error code are written. */
 	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
 	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
 	return (0);
 }
 
 /*
  * Evaluate return value from helper functions and potentially return to
  * the VM run loop.
  *  0: success
  * +1: an exception was injected into the guest vcpu
  * -1: unrecoverable/programming error
  *
  * The argument is evaluated exactly once, so it is safe to pass an
  * expression with side effects (e.g. a function call).
  */
 #define	CHKERR(x)							\
 	do {								\
 		const int chkerr_val = (x);				\
 									\
 		assert(chkerr_val == 0 || chkerr_val == 1 ||		\
 		    chkerr_val == -1);					\
 		if (chkerr_val == -1)					\
 			return (VMEXIT_ABORT);				\
 		else if (chkerr_val == 1)				\
 			return (VMEXIT_CONTINUE);			\
 	} while (0)
 
 int
 vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 	struct seg_desc nt;
 	struct tss32 oldtss, newtss;
 	struct vm_task_switch *task_switch;
 	struct vm_guest_paging *paging, sup_paging;
 	struct user_segment_descriptor nt_desc, ot_desc;
 	struct iovec nt_iov[2], ot_iov[2];
 	uint64_t cr0, ot_base;
 	uint32_t eip, ot_lim, access;
 	int error, ext, minlimit, nt_type, ot_type, vcpu;
 	enum task_switch_reason reason;
 	uint16_t nt_sel, ot_sel;
 
 	task_switch = &vmexit->u.task_switch;
 	nt_sel = task_switch->tsssel;
 	ext = vmexit->u.task_switch.ext;
 	reason = vmexit->u.task_switch.reason;
 	paging = &vmexit->u.task_switch.paging;
 	vcpu = *pvcpu;
 
 	assert(paging->cpu_mode == CPU_MODE_PROTECTED);
 
 	/*
+	 * Calculate the %eip to store in the old TSS before modifying the
+	 * 'inst_length'.
+	 */
+	eip = vmexit->rip + vmexit->inst_length;
+
+	/*
+	 * Set the 'inst_length' to '0'.
+	 *
+	 * If an exception is triggered during emulation of the task switch
+	 * then the exception handler should return to the instruction that
+	 * caused the task switch as opposed to the subsequent instruction.
+	 */
+	vmexit->inst_length = 0;
+
+	/*
 	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
 	 * The following page table accesses are implicitly supervisor mode:
 	 * - accesses to GDT or LDT to load segment descriptors
 	 * - accesses to the task state segment during task switch
 	 */
 	sup_paging = *paging;
 	sup_paging.cpl = 0;	/* implicit supervisor mode */
 
 	/* Fetch the new TSS descriptor */
 	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);
 	CHKERR(error);
 
 	nt = usd_to_seg_desc(&nt_desc);
 
 	/* Verify the type of the new TSS */
 	nt_type = SEG_DESC_TYPE(nt.access);
 	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
 	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
 		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
 		goto done;
 	}
 
 	/* TSS descriptor must have present bit set */
 	if (!SEG_DESC_PRESENT(nt.access)) {
 		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
 		goto done;
 	}
 
 	/*
 	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
 	 * 44 bytes for a 16-bit TSS.
 	 */
 	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
 		minlimit = 104 - 1;
 	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
 		minlimit = 44 - 1;
 	else
 		minlimit = 0;
 
 	assert(minlimit > 0);
 	if (nt.limit < minlimit) {
 		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
 		goto done;
 	}
 
 	/* TSS must be busy if task switch is due to IRET */
 	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
 		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
 		goto done;
 	}
 
 	/*
 	 * TSS must be available (not busy) if task switch reason is
 	 * CALL, JMP, exception or interrupt.
 	 */
 	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
 		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
 		goto done;
 	}
 
 	/* Fetch the new TSS */
 	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
 	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
 	CHKERR(error);
 	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
 
 	/* Get the old TSS selector from the guest's task register */
 	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
 	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
 		/*
 		 * This might happen if a task switch was attempted without
 		 * ever loading the task register with LTR. In this case the
 		 * TR would contain the values from power-on:
 		 * (sel = 0, base = 0, limit = 0xffff).
 		 */
 		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
 		goto done;
 	}
 
 	/* Get the old TSS base and limit from the guest's task register */
 	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
 	    &access);
 	assert(error == 0);
 	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
 	ot_type = SEG_DESC_TYPE(access);
 	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
 
 	/* Fetch the old TSS descriptor */
 	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc);
 	CHKERR(error);
 
 	/* Get the old TSS */
 	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
 	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
 	CHKERR(error);
 	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
 
 	/*
 	 * Clear the busy bit in the old TSS descriptor if the task switch
 	 * due to an IRET or JMP instruction.
 	 */
 	if (reason == TSR_IRET || reason == TSR_JMP) {
 		ot_desc.sd_type &= ~0x2;
 		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
 		    &ot_desc);
 		CHKERR(error);
 	}
 
 	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
 		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
 		return (VMEXIT_ABORT);
 	}
 
 	/* Save processor state in old TSS */
-	eip = vmexit->rip + vmexit->inst_length;
 	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
 
 	/*
 	 * If the task switch was triggered for any reason other than IRET
 	 * then set the busy bit in the new TSS descriptor.
 	 */
 	if (reason != TSR_IRET) {
 		nt_desc.sd_type |= 0x2;
 		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
 		    &nt_desc);
 		CHKERR(error);
 	}
 
 	/* Update task register to point at the new TSS */
 	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);
 
 	/* Update the hidden descriptor state of the task register */
 	nt = usd_to_seg_desc(&nt_desc);
 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);
 
 	/* Set CR0.TS */
 	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
 	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
 
 	/*
 	 * We are now committed to the task switch. Any exceptions encountered
 	 * after this point will be handled in the context of the new task and
 	 * the saved instruction pointer will belong to the new task.
 	 */
 	vmexit->rip = newtss.tss_eip;
-	vmexit->inst_length = 0;
+	assert(vmexit->inst_length == 0);
 
 	/* Load processor state from new TSS */
 	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
 	CHKERR(error);
 
 	/*
 	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
 	 * caused an error code to be generated, this error code is copied
 	 * to the stack of the new task.
 	 */
 	if (task_switch->errcode_valid) {
 		assert(task_switch->ext);
 		assert(task_switch->reason == TSR_IDT_GATE);
 		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
 		    task_switch->errcode);
 		CHKERR(error);
 	}
 
 	/*
 	 * Treatment of virtual-NMI blocking if NMI is delivered through
 	 * a task gate.
 	 *
 	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
 	 * If the virtual NMIs VM-execution control is 1, VM entry injects
 	 * an NMI, and delivery of the NMI causes a task switch that causes
 	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
 	 * commences.
 	 *
 	 * Thus, virtual-NMI blocking is in effect at the time of the task
 	 * switch VM exit.
 	 */
 
 	/*
 	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
 	 *
 	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
 	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
 	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
 	 *
 	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
 	 * VM exit.
 	 */
 
 	/*
 	 * If the task switch was triggered by an event delivered through
 	 * the IDT then extinguish the pending event from the vcpu's
 	 * exitintinfo.
 	 */
 	if (task_switch->reason == TSR_IDT_GATE) {
 		error = vm_set_intinfo(ctx, vcpu, 0);
 		assert(error == 0);
 	}
 
 	/*
 	 * XXX should inject debug exception if 'T' bit is 1
 	 */
 done:
 	return (VMEXIT_CONTINUE);
 }
Index: stable/10/usr.sbin/bhyve/virtio.c
===================================================================
--- stable/10/usr.sbin/bhyve/virtio.c	(revision 276348)
+++ stable/10/usr.sbin/bhyve/virtio.c	(revision 276349)
@@ -1,754 +1,758 @@
 /*-
  * Copyright (c) 2013  Chris Torek <torek @ torek net>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/uio.h>
 
 #include <stdio.h>
 #include <stdint.h>
 #include <pthread.h>
+#include <pthread_np.h>
 
 #include "bhyverun.h"
 #include "pci_emul.h"
 #include "virtio.h"
 
 /*
  * Functions for dealing with generalized "virtual devices" as
  * defined by <https://www.google.com/#output=search&q=virtio+spec>
  */
 
 /*
  * In case we decide to relax the "virtio softc comes at the
  * front of virtio-based device softc" constraint, let's use
  * this to convert.
  */
 #define DEV_SOFTC(vs) ((void *)(vs))
 
 /*
  * Link a virtio_softc to its constants, the device softc, and
  * the PCI emulation.
  */
 void
 vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
 		void *dev_softc, struct pci_devinst *pi,
 		struct vqueue_info *queues)
 {
 	int i;
 
 	/* vs and dev_softc addresses must match */
 	assert((void *)vs == dev_softc);
 	vs->vs_vc = vc;
 	vs->vs_pi = pi;
 	pi->pi_arg = vs;
 
 	vs->vs_queues = queues;
 	for (i = 0; i < vc->vc_nvq; i++) {
 		queues[i].vq_vs = vs;
 		queues[i].vq_num = i;
 	}
 }
 
 /*
  * Reset device (device-wide).  This erases all queues, i.e.,
  * all the queues become invalid (though we don't wipe out the
  * internal pointers, we just clear the VQ_ALLOC flag).
  *
  * It resets negotiated features to "none".
  *
  * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR.
  */
 void
 vi_reset_dev(struct virtio_softc *vs)
 {
 	struct vqueue_info *vq;
 	int i, nvq;
 
+	if (vs->vs_mtx)
+		assert(pthread_mutex_isowned_np(vs->vs_mtx));
+
 	nvq = vs->vs_vc->vc_nvq;
 	for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
 		vq->vq_flags = 0;
 		vq->vq_last_avail = 0;
 		vq->vq_pfn = 0;
 		vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
 	}
 	vs->vs_negotiated_caps = 0;
 	vs->vs_curq = 0;
 	/* vs->vs_status = 0; -- redundant */
-	VS_LOCK(vs);
 	if (vs->vs_isr)
 		pci_lintr_deassert(vs->vs_pi);
 	vs->vs_isr = 0;
-	VS_UNLOCK(vs);
 	vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR;
 }
 
 /*
  * Set I/O BAR (usually 0) to map PCI config registers.
  */
 void
 vi_set_io_bar(struct virtio_softc *vs, int barnum)
 {
 	size_t size;
 
 	/*
 	 * ??? should we use CFG0 if MSI-X is disabled?
 	 * Existing code did not...
 	 */
 	size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize;
 	pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size);
 }
 
 /*
  * Initialize MSI-X vector capabilities if we're to use MSI-X,
  * or MSI capabilities if not.
  *
  * We assume we want one MSI-X vector per queue, here, plus one
  * for the config vec.
  */
 int
 vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix)
 {
 	int nvec;
 
 	if (use_msix) {
 		vs->vs_flags |= VIRTIO_USE_MSIX;
+		VS_LOCK(vs);
 		vi_reset_dev(vs); /* set all vectors to NO_VECTOR */
+		VS_UNLOCK(vs);
 		nvec = vs->vs_vc->vc_nvq + 1;
 		if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum))
 			return (1);
 	} else
 		vs->vs_flags &= ~VIRTIO_USE_MSIX;
 	/* Only 1 MSI vector for bhyve */
 	pci_emul_add_msicap(vs->vs_pi, 1);
 	return (0);
 }
 
 /*
  * Initialize the currently-selected virtio queue (vs->vs_curq).
  * The guest just gave us a page frame number, from which we can
  * calculate the addresses of the queue.
  */
 void
 vi_vq_init(struct virtio_softc *vs, uint32_t pfn)
 {
 	struct vqueue_info *vq;
 	uint64_t phys;
 	size_t size;
 	char *base;
 
 	vq = &vs->vs_queues[vs->vs_curq];
 	vq->vq_pfn = pfn;
 	phys = (uint64_t)pfn << VRING_PFN;
 	size = vring_size(vq->vq_qsize);
 	base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size);
 
 	/* First page(s) are descriptors... */
 	vq->vq_desc = (struct virtio_desc *)base;
 	base += vq->vq_qsize * sizeof(struct virtio_desc);
 
 	/* ... immediately followed by "avail" ring (entirely uint16_t's) */
 	vq->vq_avail = (struct vring_avail *)base;
 	base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t);
 
 	/* Then it's rounded up to the next page... */
 	base = (char *)roundup2((uintptr_t)base, VRING_ALIGN);
 
 	/* ... and the last page(s) are the used ring. */
 	vq->vq_used = (struct vring_used *)base;
 
 	/* Mark queue as allocated, and start at 0 when we use it. */
 	vq->vq_flags = VQ_ALLOC;
 	vq->vq_last_avail = 0;
 }
 
 /*
  * Helper inline for vq_getchain(): record the i'th "real"
  * descriptor.
  */
 static inline void
 _vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx,
 	   struct iovec *iov, int n_iov, uint16_t *flags) {
 
 	if (i >= n_iov)
 		return;
 	iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len);
 	iov[i].iov_len = vd->vd_len;
 	if (flags != NULL)
 		flags[i] = vd->vd_flags;
 }
 #define	VQ_MAX_DESCRIPTORS	512	/* see below */
 
 /*
  * Examine the chain of descriptors starting at the "next one" to
  * make sure that they describe a sensible request.  If so, return
  * the number of "real" descriptors that would be needed/used in
  * acting on this request.  This may be smaller than the number of
  * available descriptors, e.g., if there are two available but
  * they are two separate requests, this just returns 1.  Or, it
  * may be larger: if there are indirect descriptors involved,
  * there may only be one descriptor available but it may be an
  * indirect pointing to eight more.  We return 8 in this case,
  * i.e., we do not count the indirect descriptors, only the "real"
  * ones.
  *
  * Basically, this vets the vd_flags and vd_next field of each
  * descriptor and tells you how many are involved.  Since some may
  * be indirect, this also needs the vmctx (in the pci_devinst
  * at vs->vs_pi) so that it can find indirect descriptors.
  *
  * As we process each descriptor, we copy and adjust it (guest to
  * host address wise, also using the vmtctx) into the given iov[]
  * array (of the given size).  If the array overflows, we stop
  * placing values into the array but keep processing descriptors,
  * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
  * So you, the caller, must not assume that iov[] is as big as the
  * return value (you can process the same thing twice to allocate
  * a larger iov array if needed, or supply a zero length to find
  * out how much space is needed).
  *
  * If you want to verify the WRITE flag on each descriptor, pass a
  * non-NULL "flags" pointer to an array of "uint16_t" of the same size
  * as n_iov and we'll copy each vd_flags field after unwinding any
  * indirects.
  *
  * If some descriptor(s) are invalid, this prints a diagnostic message
  * and returns -1.  If no descriptors are ready now it simply returns 0.
  *
  * You are assumed to have done a vq_ring_ready() if needed (note
  * that vq_has_descs() does one).
  */
 int
 vq_getchain(struct vqueue_info *vq,
 	    struct iovec *iov, int n_iov, uint16_t *flags)
 {
 	int i;
 	u_int ndesc, n_indir;
 	u_int idx, head, next;
 	volatile struct virtio_desc *vdir, *vindir, *vp;
 	struct vmctx *ctx;
 	struct virtio_softc *vs;
 	const char *name;
 
 	vs = vq->vq_vs;
 	name = vs->vs_vc->vc_name;
 
 	/*
 	 * Note: it's the responsibility of the guest not to
 	 * update vq->vq_avail->va_idx until all of the descriptors
          * the guest has written are valid (including all their
          * vd_next fields and vd_flags).
 	 *
 	 * Compute (last_avail - va_idx) in integers mod 2**16.  This is
 	 * the number of descriptors the device has made available
 	 * since the last time we updated vq->vq_last_avail.
 	 *
 	 * We just need to do the subtraction as an unsigned int,
 	 * then trim off excess bits.
 	 */
 	idx = vq->vq_last_avail;
 	ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx);
 	if (ndesc == 0)
 		return (0);
 	if (ndesc > vq->vq_qsize) {
 		/* XXX need better way to diagnose issues */
 		fprintf(stderr,
 		    "%s: ndesc (%u) out of range, driver confused?\r\n",
 		    name, (u_int)ndesc);
 		return (-1);
 	}
 
 	/*
 	 * Now count/parse "involved" descriptors starting from
 	 * the head of the chain.
 	 *
 	 * To prevent loops, we could be more complicated and
 	 * check whether we're re-visiting a previously visited
 	 * index, but we just abort if the count gets excessive.
 	 */
 	ctx = vs->vs_pi->pi_vmctx;
 	head = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)];
 	next = head;
 	for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
 		if (next >= vq->vq_qsize) {
 			fprintf(stderr,
 			    "%s: descriptor index %u out of range, "
 			    "driver confused?\r\n",
 			    name, next);
 			return (-1);
 		}
 		vdir = &vq->vq_desc[next];
 		if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
 			_vq_record(i, vdir, ctx, iov, n_iov, flags);
 			i++;
 		} else if ((vs->vs_negotiated_caps &
 		    VIRTIO_RING_F_INDIRECT_DESC) == 0) {
 			fprintf(stderr,
 			    "%s: descriptor has forbidden INDIRECT flag, "
 			    "driver confused?\r\n",
 			    name);
 			return (-1);
 		} else {
 			n_indir = vdir->vd_len / 16;
 			if ((vdir->vd_len & 0xf) || n_indir == 0) {
 				fprintf(stderr,
 				    "%s: invalid indir len 0x%x, "
 				    "driver confused?\r\n",
 				    name, (u_int)vdir->vd_len);
 				return (-1);
 			}
 			vindir = paddr_guest2host(ctx,
 			    vdir->vd_addr, vdir->vd_len);
 			/*
 			 * Indirects start at the 0th, then follow
 			 * their own embedded "next"s until those run
 			 * out.  Each one's indirect flag must be off
 			 * (we don't really have to check, could just
 			 * ignore errors...).
 			 */
 			next = 0;
 			for (;;) {
 				vp = &vindir[next];
 				if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
 					fprintf(stderr,
 					    "%s: indirect desc has INDIR flag,"
 					    " driver confused?\r\n",
 					    name);
 					return (-1);
 				}
 				_vq_record(i, vp, ctx, iov, n_iov, flags);
 				if (++i > VQ_MAX_DESCRIPTORS)
 					goto loopy;
 				if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
 					break;
 				next = vp->vd_next;
 				if (next >= n_indir) {
 					fprintf(stderr,
 					    "%s: invalid next %u > %u, "
 					    "driver confused?\r\n",
 					    name, (u_int)next, n_indir);
 					return (-1);
 				}
 			}
 		}
 		if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0)
 			return (i);
 	}
 loopy:
 	fprintf(stderr,
 	    "%s: descriptor loop? count > %d - driver confused?\r\n",
 	    name, i);
 	return (-1);
 }
 
 /*
  * Return the currently-first request chain to the guest, setting
  * its I/O length to the provided value.
  *
  * (This chain is the one you handled when you called vq_getchain()
  * and used its positive return value.)
  */
 void
 vq_relchain(struct vqueue_info *vq, uint32_t iolen)
 {
 	uint16_t head, uidx, mask;
 	volatile struct vring_used *vuh;
 	volatile struct virtio_used *vue;
 
 	/*
 	 * Notes:
 	 *  - mask is N-1 where N is a power of 2 so computes x % N
 	 *  - vuh points to the "used" data shared with guest
 	 *  - vue points to the "used" ring entry we want to update
 	 *  - head is the same value we compute in vq_iovecs().
 	 *
 	 * (I apologize for the two fields named vu_idx; the
 	 * virtio spec calls the one that vue points to, "id"...)
 	 */
 	mask = vq->vq_qsize - 1;
 	vuh = vq->vq_used;
 	head = vq->vq_avail->va_ring[vq->vq_last_avail++ & mask];
 
 	uidx = vuh->vu_idx;
 	vue = &vuh->vu_ring[uidx++ & mask];
 	vue->vu_idx = head; /* ie, vue->id = head */
 	vue->vu_tlen = iolen;
 	vuh->vu_idx = uidx;
 }
 
 /*
  * Driver has finished processing "available" chains and calling
  * vq_relchain on each one.  If driver used all the available
  * chains, used_all should be set.
  *
  * If the "used" index moved we may need to inform the guest, i.e.,
  * deliver an interrupt.  Even if the used index did NOT move we
  * may need to deliver an interrupt, if the avail ring is empty and
  * we are supposed to interrupt on empty.
  *
  * Note that used_all_avail is provided by the caller because it's
  * a snapshot of the ring state when he decided to finish interrupt
  * processing -- it's possible that descriptors became available after
  * that point.  (It's also typically a constant 1/True as well.)
  */
 void
 vq_endchains(struct vqueue_info *vq, int used_all_avail)
 {
 	struct virtio_softc *vs;
 	uint16_t event_idx, new_idx, old_idx;
 	int intr;
 
 	/*
 	 * Interrupt generation: if we're using EVENT_IDX,
 	 * interrupt if we've crossed the event threshold.
 	 * Otherwise interrupt is generated if we added "used" entries,
 	 * but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
 	 *
 	 * In any case, though, if NOTIFY_ON_EMPTY is set and the
 	 * entire avail was processed, we need to interrupt always.
 	 */
 	vs = vq->vq_vs;
 	new_idx = vq->vq_used->vu_idx;
 	old_idx = vq->vq_save_used;
 	if (used_all_avail &&
 	    (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
 		intr = 1;
 	else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
 		event_idx = VQ_USED_EVENT_IDX(vq);
 		/*
 		 * This calculation is per docs and the kernel
 		 * (see src/sys/dev/virtio/virtio_ring.h).
 		 */
 		intr = (uint16_t)(new_idx - event_idx - 1) <
 			(uint16_t)(new_idx - old_idx);
 	} else {
 		intr = new_idx != old_idx &&
 		    !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT);
 	}
 	if (intr)
 		vq_interrupt(vs, vq);
 }
 
 /* Note: these are in sorted order to make for a fast search */
 static struct config_reg {
 	uint16_t	cr_offset;	/* register offset */
 	uint8_t		cr_size;	/* size (bytes) */
 	uint8_t		cr_ro;		/* true => reg is read only */
 	const char	*cr_name;	/* name of reg */
 } config_regs[] = {
 	{ VTCFG_R_HOSTCAP,	4, 1, "HOSTCAP" },
 	{ VTCFG_R_GUESTCAP,	4, 0, "GUESTCAP" },
 	{ VTCFG_R_PFN,		4, 0, "PFN" },
 	{ VTCFG_R_QNUM,		2, 1, "QNUM" },
 	{ VTCFG_R_QSEL,		2, 0, "QSEL" },
 	{ VTCFG_R_QNOTIFY,	2, 0, "QNOTIFY" },
 	{ VTCFG_R_STATUS,	1, 0, "STATUS" },
 	{ VTCFG_R_ISR,		1, 0, "ISR" },
 	{ VTCFG_R_CFGVEC,	2, 0, "CFGVEC" },
 	{ VTCFG_R_QVEC,		2, 0, "QVEC" },
 };
 
 static inline struct config_reg *
 vi_find_cr(int offset) {
 	u_int hi, lo, mid;
 	struct config_reg *cr;
 
 	lo = 0;
 	hi = sizeof(config_regs) / sizeof(*config_regs) - 1;
 	while (hi >= lo) {
 		mid = (hi + lo) >> 1;
 		cr = &config_regs[mid];
 		if (cr->cr_offset == offset)
 			return (cr);
 		if (cr->cr_offset < offset)
 			lo = mid + 1;
 		else
 			hi = mid - 1;
 	}
 	return (NULL);
 }
 
 /*
  * Handle pci config space reads.
  * If it's to the MSI-X info, do that.
  * If it's part of the virtio standard stuff, do that.
  * Otherwise dispatch to the actual driver.
  */
 uint64_t
 vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 	    int baridx, uint64_t offset, int size)
 {
 	struct virtio_softc *vs = pi->pi_arg;
 	struct virtio_consts *vc;
 	struct config_reg *cr;
 	uint64_t virtio_config_size, max;
 	const char *name;
 	uint32_t newoff;
 	uint32_t value;
 	int error;
 
 	if (vs->vs_flags & VIRTIO_USE_MSIX) {
 		if (baridx == pci_msix_table_bar(pi) ||
 		    baridx == pci_msix_pba_bar(pi)) {
 			return (pci_emul_msix_tread(pi, offset, size));
 		}
 	}
 
 	/* XXX probably should do something better than just assert() */
 	assert(baridx == 0);
 
 	if (vs->vs_mtx)
 		pthread_mutex_lock(vs->vs_mtx);
 
 	vc = vs->vs_vc;
 	name = vc->vc_name;
 	value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff;
 
 	if (size != 1 && size != 2 && size != 4)
 		goto bad;
 
 	if (pci_msix_enabled(pi))
 		virtio_config_size = VTCFG_R_CFG1;
 	else
 		virtio_config_size = VTCFG_R_CFG0;
 
 	if (offset >= virtio_config_size) {
 		/*
 		 * Subtract off the standard size (including MSI-X
 		 * registers if enabled) and dispatch to underlying driver.
 		 * If that fails, fall into general code.
 		 */
 		newoff = offset - virtio_config_size;
 		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
 		if (newoff + size > max)
 			goto bad;
 		error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value);
 		if (!error)
 			goto done;
 	}
 
 bad:
 	cr = vi_find_cr(offset);
 	if (cr == NULL || cr->cr_size != size) {
 		if (cr != NULL) {
 			/* offset must be OK, so size must be bad */
 			fprintf(stderr,
 			    "%s: read from %s: bad size %d\r\n",
 			    name, cr->cr_name, size);
 		} else {
 			fprintf(stderr,
 			    "%s: read from bad offset/size %jd/%d\r\n",
 			    name, (uintmax_t)offset, size);
 		}
 		goto done;
 	}
 
 	switch (offset) {
 	case VTCFG_R_HOSTCAP:
 		value = vc->vc_hv_caps;
 		break;
 	case VTCFG_R_GUESTCAP:
 		value = vs->vs_negotiated_caps;
 		break;
 	case VTCFG_R_PFN:
 		if (vs->vs_curq < vc->vc_nvq)
 			value = vs->vs_queues[vs->vs_curq].vq_pfn;
 		break;
 	case VTCFG_R_QNUM:
 		value = vs->vs_curq < vc->vc_nvq ?
 		    vs->vs_queues[vs->vs_curq].vq_qsize : 0;
 		break;
 	case VTCFG_R_QSEL:
 		value = vs->vs_curq;
 		break;
 	case VTCFG_R_QNOTIFY:
 		value = 0;	/* XXX */
 		break;
 	case VTCFG_R_STATUS:
 		value = vs->vs_status;
 		break;
 	case VTCFG_R_ISR:
 		value = vs->vs_isr;
 		vs->vs_isr = 0;		/* a read clears this flag */
 		if (value)
 			pci_lintr_deassert(pi);
 		break;
 	case VTCFG_R_CFGVEC:
 		value = vs->vs_msix_cfg_idx;
 		break;
 	case VTCFG_R_QVEC:
 		value = vs->vs_curq < vc->vc_nvq ?
 		    vs->vs_queues[vs->vs_curq].vq_msix_idx :
 		    VIRTIO_MSI_NO_VECTOR;
 		break;
 	}
 done:
 	if (vs->vs_mtx)
 		pthread_mutex_unlock(vs->vs_mtx);
 	return (value);
 }
 
 /*
  * Handle pci config space writes.
  * If it's to the MSI-X info, do that.
  * If it's part of the virtio standard stuff, do that.
  * Otherwise dispatch to the actual driver.
  */
 void
 vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 	     int baridx, uint64_t offset, int size, uint64_t value)
 {
 	struct virtio_softc *vs = pi->pi_arg;
 	struct vqueue_info *vq;
 	struct virtio_consts *vc;
 	struct config_reg *cr;
 	uint64_t virtio_config_size, max;
 	const char *name;
 	uint32_t newoff;
 	int error;
 
 	if (vs->vs_flags & VIRTIO_USE_MSIX) {
 		if (baridx == pci_msix_table_bar(pi) ||
 		    baridx == pci_msix_pba_bar(pi)) {
 			pci_emul_msix_twrite(pi, offset, size, value);
 			return;
 		}
 	}
 
 	/* XXX probably should do something better than just assert() */
 	assert(baridx == 0);
 
 	if (vs->vs_mtx)
 		pthread_mutex_lock(vs->vs_mtx);
 
 	vc = vs->vs_vc;
 	name = vc->vc_name;
 
 	if (size != 1 && size != 2 && size != 4)
 		goto bad;
 
 	if (pci_msix_enabled(pi))
 		virtio_config_size = VTCFG_R_CFG1;
 	else
 		virtio_config_size = VTCFG_R_CFG0;
 
 	if (offset >= virtio_config_size) {
 		/*
 		 * Subtract off the standard size (including MSI-X
 		 * registers if enabled) and dispatch to underlying driver.
 		 */
 		newoff = offset - virtio_config_size;
 		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
 		if (newoff + size > max)
 			goto bad;
 		error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
 		if (!error)
 			goto done;
 	}
 
 bad:
 	cr = vi_find_cr(offset);
 	if (cr == NULL || cr->cr_size != size || cr->cr_ro) {
 		if (cr != NULL) {
 			/* offset must be OK, wrong size and/or reg is R/O */
 			if (cr->cr_size != size)
 				fprintf(stderr,
 				    "%s: write to %s: bad size %d\r\n",
 				    name, cr->cr_name, size);
 			if (cr->cr_ro)
 				fprintf(stderr,
 				    "%s: write to read-only reg %s\r\n",
 				    name, cr->cr_name);
 		} else {
 			fprintf(stderr,
 			    "%s: write to bad offset/size %jd/%d\r\n",
 			    name, (uintmax_t)offset, size);
 		}
 		goto done;
 	}
 
 	switch (offset) {
 	case VTCFG_R_GUESTCAP:
 		vs->vs_negotiated_caps = value & vc->vc_hv_caps;
 		if (vc->vc_apply_features)
 			(*vc->vc_apply_features)(DEV_SOFTC(vs),
 			    vs->vs_negotiated_caps);
 		break;
 	case VTCFG_R_PFN:
 		if (vs->vs_curq >= vc->vc_nvq)
 			goto bad_qindex;
 		vi_vq_init(vs, value);
 		break;
 	case VTCFG_R_QSEL:
 		/*
 		 * Note that the guest is allowed to select an
 		 * invalid queue; we just need to return a QNUM
 		 * of 0 while the bad queue is selected.
 		 */
 		vs->vs_curq = value;
 		break;
 	case VTCFG_R_QNOTIFY:
 		if (value >= vc->vc_nvq) {
 			fprintf(stderr, "%s: queue %d notify out of range\r\n",
 				name, (int)value);
 			goto done;
 		}
 		vq = &vs->vs_queues[value];
 		if (vq->vq_notify)
 			(*vq->vq_notify)(DEV_SOFTC(vs), vq);
 		else if (vc->vc_qnotify)
 			(*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
 		else
 			fprintf(stderr,
 			    "%s: qnotify queue %d: missing vq/vc notify\r\n",
 				name, (int)value);
 		break;
 	case VTCFG_R_STATUS:
 		vs->vs_status = value;
 		if (value == 0)
 			(*vc->vc_reset)(DEV_SOFTC(vs));
 		break;
 	case VTCFG_R_CFGVEC:
 		vs->vs_msix_cfg_idx = value;
 		break;
 	case VTCFG_R_QVEC:
 		if (vs->vs_curq >= vc->vc_nvq)
 			goto bad_qindex;
 		vq = &vs->vs_queues[vs->vs_curq];
 		vq->vq_msix_idx = value;
 		break;
 	}
 	goto done;
 
 bad_qindex:
 	fprintf(stderr,
 	    "%s: write config reg %s: curq %d >= max %d\r\n",
 	    name, cr->cr_name, vs->vs_curq, vc->vc_nvq);
 done:
 	if (vs->vs_mtx)
 		pthread_mutex_unlock(vs->vs_mtx);
 }
Index: stable/10/usr.sbin/bhyve/xmsr.c
===================================================================
--- stable/10/usr.sbin/bhyve/xmsr.c	(revision 276348)
+++ stable/10/usr.sbin/bhyve/xmsr.c	(revision 276349)
@@ -1,63 +1,121 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 
+#include <machine/cpufunc.h>
 #include <machine/vmm.h>
+#include <machine/specialreg.h>
+
 #include <vmmapi.h>
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "xmsr.h"
 
+static int cpu_vendor_intel, cpu_vendor_amd;
+
 int
 emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
 {
 
-	switch (code) {
-	case 0xd04:			/* Sandy Bridge uncore PMC MSRs */
-	case 0xc24:
-		return (0);
-	case 0x79:
-		return (0);		/* IA32_BIOS_UPDT_TRIG MSR */
-	default:
-		break;
+	if (cpu_vendor_intel) {
+		switch (code) {
+		case 0xd04:		/* Sandy Bridge uncore PMCs */
+		case 0xc24:
+			return (0);
+		case MSR_BIOS_UPDT_TRIG:
+			return (0);
+		case MSR_BIOS_SIGN:
+			return (0);
+		default:
+			break;
+		}
 	}
 	return (-1);
 }
 
 int
-emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val)
+emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val)
 {
+	int error = 0;
 
-	return (-1);
+	if (cpu_vendor_intel) {
+		switch (num) {
+		case MSR_BIOS_SIGN:
+		case MSR_IA32_PLATFORM_ID:
+		case MSR_PKG_ENERGY_STATUS:
+		case MSR_PP0_ENERGY_STATUS:
+		case MSR_PP1_ENERGY_STATUS:
+		case MSR_DRAM_ENERGY_STATUS:
+			*val = 0;
+			break;
+		case MSR_RAPL_POWER_UNIT:
+			/*
+			 * Use the default value documented in section
+			 * "RAPL Interfaces" in Intel SDM vol3.
+			 */
+			*val = 0x000a1003;
+			break;
+		default:
+			error = -1;
+			break;
+		}
+	}
+	return (error);
+}
+
+int
+init_msr(void)
+{
+	int error;
+	u_int regs[4];
+	char cpu_vendor[13];
+
+	do_cpuid(0, regs);
+	((u_int *)&cpu_vendor)[0] = regs[1];
+	((u_int *)&cpu_vendor)[1] = regs[3];
+	((u_int *)&cpu_vendor)[2] = regs[2];
+	cpu_vendor[12] = '\0';
+
+	error = 0;
+	if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
+		cpu_vendor_amd = 1;
+	} else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
+		cpu_vendor_intel = 1;
+	} else {
+		fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor);
+		error = -1;
+	}
+	return (error);
 }
Index: stable/10/usr.sbin/bhyve/xmsr.h
===================================================================
--- stable/10/usr.sbin/bhyve/xmsr.h	(revision 276348)
+++ stable/10/usr.sbin/bhyve/xmsr.h	(revision 276349)
@@ -1,35 +1,36 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_XMSR_H_
 #define	_XMSR_H_
 
+int init_msr(void);
 int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
 int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val);
 
 #endif
Index: stable/10/usr.sbin/bhyvectl/bhyvectl.c
===================================================================
--- stable/10/usr.sbin/bhyvectl/bhyvectl.c	(revision 276348)
+++ stable/10/usr.sbin/bhyvectl/bhyvectl.c	(revision 276349)
@@ -1,1635 +1,1635 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #include <sys/errno.h>
 #include <sys/mman.h>
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <libgen.h>
 #include <libutil.h>
 #include <fcntl.h>
 #include <string.h>
 #include <getopt.h>
 #include <assert.h>
 
 #include <machine/vmm.h>
 #include <vmmapi.h>
 
 #include "intel/vmcs.h"
 
 #define	MB	(1UL << 20)
 #define	GB	(1UL << 30)
 
 #define	REQ_ARG		required_argument
 #define	NO_ARG		no_argument
 #define	OPT_ARG		optional_argument
 
 static const char *progname;
 
 static void
 usage(void)
 {
 
 	(void)fprintf(stderr,
 	"Usage: %s --vm=<vmname>\n"
 	"       [--cpu=<vcpu_number>]\n"
 	"       [--create]\n"
 	"       [--destroy]\n"
 	"       [--get-all]\n"
 	"       [--get-stats]\n"
 	"       [--set-desc-ds]\n"
 	"       [--get-desc-ds]\n"
 	"       [--set-desc-es]\n"
 	"       [--get-desc-es]\n"
 	"       [--set-desc-gs]\n"
 	"       [--get-desc-gs]\n"
 	"       [--set-desc-fs]\n"
 	"       [--get-desc-fs]\n"
 	"       [--set-desc-cs]\n"
 	"       [--get-desc-cs]\n"
 	"       [--set-desc-ss]\n"
 	"       [--get-desc-ss]\n"
 	"       [--set-desc-tr]\n"
 	"       [--get-desc-tr]\n"
 	"       [--set-desc-ldtr]\n"
 	"       [--get-desc-ldtr]\n"
 	"       [--set-desc-gdtr]\n"
 	"       [--get-desc-gdtr]\n"
 	"       [--set-desc-idtr]\n"
 	"       [--get-desc-idtr]\n"
 	"       [--run]\n"
 	"       [--capname=<capname>]\n"
 	"       [--getcap]\n"
 	"       [--setcap=<0|1>]\n"
 	"       [--desc-base=<BASE>]\n"
 	"       [--desc-limit=<LIMIT>]\n"
 	"       [--desc-access=<ACCESS>]\n"
 	"       [--set-cr0=<CR0>]\n"
 	"       [--get-cr0]\n"
 	"       [--set-cr3=<CR3>]\n"
 	"       [--get-cr3]\n"
 	"       [--set-cr4=<CR4>]\n"
 	"       [--get-cr4]\n"
 	"       [--set-dr7=<DR7>]\n"
 	"       [--get-dr7]\n"
 	"       [--set-rsp=<RSP>]\n"
 	"       [--get-rsp]\n"
 	"       [--set-rip=<RIP>]\n"
 	"       [--get-rip]\n"
 	"       [--get-rax]\n"
 	"       [--set-rax=<RAX>]\n"
 	"       [--get-rbx]\n"
 	"       [--get-rcx]\n"
 	"       [--get-rdx]\n"
 	"       [--get-rsi]\n"
 	"       [--get-rdi]\n"
 	"       [--get-rbp]\n"
 	"       [--get-r8]\n"
 	"       [--get-r9]\n"
 	"       [--get-r10]\n"
 	"       [--get-r11]\n"
 	"       [--get-r12]\n"
 	"       [--get-r13]\n"
 	"       [--get-r14]\n"
 	"       [--get-r15]\n"
 	"       [--set-rflags=<RFLAGS>]\n"
 	"       [--get-rflags]\n"
 	"       [--set-cs]\n"
 	"       [--get-cs]\n"
 	"       [--set-ds]\n"
 	"       [--get-ds]\n"
 	"       [--set-es]\n"
 	"       [--get-es]\n"
 	"       [--set-fs]\n"
 	"       [--get-fs]\n"
 	"       [--set-gs]\n"
 	"       [--get-gs]\n"
 	"       [--set-ss]\n"
 	"       [--get-ss]\n"
 	"       [--get-tr]\n"
 	"       [--get-ldtr]\n"
 	"       [--get-vmcs-pinbased-ctls]\n"
 	"       [--get-vmcs-procbased-ctls]\n"
 	"       [--get-vmcs-procbased-ctls2]\n"
 	"       [--get-vmcs-entry-interruption-info]\n"
 	"       [--set-vmcs-entry-interruption-info=<info>]\n"
 	"       [--get-vmcs-eptp]\n"
 	"       [--get-vmcs-guest-physical-address\n"
 	"       [--get-vmcs-guest-linear-address\n"
 	"       [--set-vmcs-exception-bitmap]\n"
 	"       [--get-vmcs-exception-bitmap]\n"
 	"       [--get-vmcs-io-bitmap-address]\n"
 	"       [--get-vmcs-tsc-offset]\n"
 	"       [--get-vmcs-guest-pat]\n"
 	"       [--get-vmcs-host-pat]\n"
 	"       [--get-vmcs-host-cr0]\n"
 	"       [--get-vmcs-host-cr3]\n"
 	"       [--get-vmcs-host-cr4]\n"
 	"       [--get-vmcs-host-rip]\n"
 	"       [--get-vmcs-host-rsp]\n"
 	"       [--get-vmcs-cr0-mask]\n"
 	"       [--get-vmcs-cr0-shadow]\n"
 	"       [--get-vmcs-cr4-mask]\n"
 	"       [--get-vmcs-cr4-shadow]\n"
 	"       [--get-vmcs-cr3-targets]\n"
 	"       [--get-vmcs-apic-access-address]\n"
 	"       [--get-vmcs-virtual-apic-address]\n"
 	"       [--get-vmcs-tpr-threshold]\n"
 	"       [--get-vmcs-msr-bitmap]\n"
 	"       [--get-vmcs-msr-bitmap-address]\n"
 	"       [--get-vmcs-vpid]\n"
 	"       [--get-vmcs-ple-gap]\n"
 	"       [--get-vmcs-ple-window]\n"
 	"       [--get-vmcs-instruction-error]\n"
 	"       [--get-vmcs-exit-ctls]\n"
 	"       [--get-vmcs-entry-ctls]\n"
 	"       [--get-vmcs-guest-sysenter]\n"
 	"       [--get-vmcs-link]\n"
 	"       [--get-vmcs-exit-reason]\n"
 	"       [--get-vmcs-exit-qualification]\n"
 	"       [--get-vmcs-exit-interruption-info]\n"
 	"       [--get-vmcs-exit-interruption-error]\n"
 	"       [--get-vmcs-interruptibility]\n"
 	"       [--set-x2apic-state=<state>]\n"
 	"       [--get-x2apic-state]\n"
 	"       [--unassign-pptdev=<bus/slot/func>]\n"
 	"       [--set-mem=<memory in units of MB>]\n"
 	"       [--get-lowmem]\n"
 	"       [--get-highmem]\n"
 	"       [--get-gpa-pmap]\n"
 	"       [--assert-lapic-lvt=<pin>]\n"
 	"       [--inject-nmi]\n"
 	"       [--force-reset]\n"
 	"       [--force-poweroff]\n"
 	"       [--get-active-cpus]\n"
 	"       [--get-suspended-cpus]\n"
 	"       [--get-intinfo]\n",
 	progname);
 	exit(1);
 }
 
 static int get_stats, getcap, setcap, capval, get_gpa_pmap;
 static int inject_nmi, assert_lapic_lvt;
 static int force_reset, force_poweroff;
 static const char *capname;
 static int create, destroy, get_lowmem, get_highmem;
 static int get_intinfo;
 static int get_active_cpus, get_suspended_cpus;
 static uint64_t memsize;
 static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
 static int set_efer, get_efer;
 static int set_dr7, get_dr7;
 static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags;
 static int set_rax, get_rax;
 static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp;
 static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15;
 static int set_desc_ds, get_desc_ds;
 static int set_desc_es, get_desc_es;
 static int set_desc_fs, get_desc_fs;
 static int set_desc_gs, get_desc_gs;
 static int set_desc_cs, get_desc_cs;
 static int set_desc_ss, get_desc_ss;
 static int set_desc_gdtr, get_desc_gdtr;
 static int set_desc_idtr, get_desc_idtr;
 static int set_desc_tr, get_desc_tr;
 static int set_desc_ldtr, get_desc_ldtr;
 static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr;
 static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr;
 static int set_x2apic_state, get_x2apic_state;
 enum x2apic_state x2apic_state;
 static int unassign_pptdev, bus, slot, func;
 static int run;
 
 /*
  * VMCS-specific fields
  */
 static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2;
 static int get_eptp, get_io_bitmap, get_tsc_offset;
 static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info;
 static int get_vmcs_interruptibility;
 uint32_t vmcs_entry_interruption_info;
 static int get_vmcs_gpa, get_vmcs_gla;
 static int get_exception_bitmap, set_exception_bitmap, exception_bitmap;
 static int get_cr0_mask, get_cr0_shadow;
 static int get_cr4_mask, get_cr4_shadow;
 static int get_cr3_targets;
 static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold;
 static int get_msr_bitmap, get_msr_bitmap_address;
 static int get_vpid, get_ple_gap, get_ple_window;
 static int get_inst_err, get_exit_ctls, get_entry_ctls;
 static int get_host_cr0, get_host_cr3, get_host_cr4;
 static int get_host_rip, get_host_rsp;
 static int get_guest_pat, get_host_pat;
 static int get_guest_sysenter, get_vmcs_link;
 static int get_vmcs_exit_reason, get_vmcs_exit_qualification;
 static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error;
 
 static uint64_t desc_base;
 static uint32_t desc_limit, desc_access;
 
 static int get_all;
 
 static void
 dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu)
 {
 	printf("vm exit[%d]\n", vcpu);
 	printf("\trip\t\t0x%016lx\n", vmexit->rip);
 	printf("\tinst_length\t%d\n", vmexit->inst_length);
 	switch (vmexit->exitcode) {
 	case VM_EXITCODE_INOUT:
 		printf("\treason\t\tINOUT\n");
 		printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT");
 		printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes);
 		printf("\tflags\t\t%s%s\n",
 			vmexit->u.inout.string ? "STRING " : "",
 			vmexit->u.inout.rep ? "REP " : "");
 		printf("\tport\t\t0x%04x\n", vmexit->u.inout.port);
 		printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax);
 		break;
 	case VM_EXITCODE_VMX:
 		printf("\treason\t\tVMX\n");
 		printf("\tstatus\t\t%d\n", vmexit->u.vmx.status);
 		printf("\texit_reason\t0x%08x (%u)\n",
 		    vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason);
 		printf("\tqualification\t0x%016lx\n",
 			vmexit->u.vmx.exit_qualification);
 		printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
 		printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
 		break;
 	default:
 		printf("*** unknown vm run exitcode %d\n", vmexit->exitcode);
 		break;
 	}
 }
 
 static int
 dump_vmcs_msr_bitmap(int vcpu, u_long addr)
 {
 	int error, fd, byte, bit, readable, writeable;
 	u_int msr;
 	const char *bitmap;
 
 	error = -1;
 	bitmap = MAP_FAILED;
 
 	fd = open("/dev/mem", O_RDONLY, 0);
 	if (fd < 0)
 		goto done;
 
-	bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr);
+	bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, addr);
 	if (bitmap == MAP_FAILED)
 		goto done;
 
 	for (msr = 0; msr < 0x2000; msr++) {
 		byte = msr / 8;
 		bit = msr & 0x7;
 
 		/* Look at MSRs in the range 0x00000000 to 0x00001FFF */
 		readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
 		writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
 		if (readable || writeable) {
 			printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu,
 				readable ? 'R' : '-',
 				writeable ? 'W' : '-');
 		}
 
 		/* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */
 		byte += 1024;
 		readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
 		writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
 		if (readable || writeable) {
 			printf("msr 0x%08x[%d]\t\t%c%c\n",
 				0xc0000000 + msr, vcpu,
 				readable ? 'R' : '-',
 				writeable ? 'W' : '-');
 		}
 	}
 
 	error = 0;
 done:
 	if (bitmap != MAP_FAILED)
 		munmap((void *)bitmap, PAGE_SIZE);
 	if (fd >= 0)
 		close(fd);
 	return (error);
 }
 
 static int
 vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val)
 {
 
 	return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val));
 }
 
 static int
 vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val)
 {
 
 	return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val));
 }
 
 enum {
 	VMNAME = 1000,	/* avoid collision with return values from getopt */
 	VCPU,
 	SET_MEM,
 	SET_EFER,
 	SET_CR0,
 	SET_CR3,
 	SET_CR4,
 	SET_DR7,
 	SET_RSP,
 	SET_RIP,
 	SET_RAX,
 	SET_RFLAGS,
 	DESC_BASE,
 	DESC_LIMIT,
 	DESC_ACCESS,
 	SET_CS,
 	SET_DS,
 	SET_ES,
 	SET_FS,
 	SET_GS,
 	SET_SS,
 	SET_TR,
 	SET_LDTR,
 	SET_X2APIC_STATE,
 	SET_VMCS_EXCEPTION_BITMAP,
 	SET_VMCS_ENTRY_INTERRUPTION_INFO,
 	SET_CAP,
 	CAPNAME,
 	UNASSIGN_PPTDEV,
 	GET_GPA_PMAP,
 	ASSERT_LAPIC_LVT,
 };
 
 static void
 print_cpus(const char *banner, const cpuset_t *cpus)
 {
 	int i, first;
 
 	first = 1;
 	printf("%s:\t", banner);
 	if (!CPU_EMPTY(cpus)) {
 		for (i = 0; i < CPU_SETSIZE; i++) {
 			if (CPU_ISSET(i, cpus)) {
 				printf("%s%d", first ? " " : ", ", i);
 				first = 0;
 			}
 		}
 	} else
 		printf(" (none)");
 	printf("\n");
 }
 
 static void
 print_intinfo(const char *banner, uint64_t info)
 {
 	int type;
 
 	printf("%s:\t", banner);
 	if (info & VM_INTINFO_VALID) {
 		type = info & VM_INTINFO_TYPE;
 		switch (type) {
 		case VM_INTINFO_HWINTR:
 			printf("extint");
 			break;
 		case VM_INTINFO_NMI:
 			printf("nmi");
 			break;
 		case VM_INTINFO_SWINTR:
 			printf("swint");
 			break;
 		default:
 			printf("exception");
 			break;
 		}
 		printf(" vector %d", (int)VM_INTINFO_VECTOR(info));
 		if (info & VM_INTINFO_DEL_ERRCODE)
 			printf(" errcode %#x", (u_int)(info >> 32));
 	} else {
 		printf("n/a");
 	}
 	printf("\n");
 }
 
 int
 main(int argc, char *argv[])
 {
 	char *vmname;
 	int error, ch, vcpu, ptenum;
 	vm_paddr_t gpa, gpa_pmap;
 	size_t len;
 	struct vm_exit vmexit;
 	uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte, info[2];
 	struct vmctx *ctx;
 	int wired;
 	cpuset_t cpus;
 
 	uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
 	uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
 	uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
 	uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
 
 	struct option opts[] = {
 		{ "vm",		REQ_ARG,	0,	VMNAME },
 		{ "cpu",	REQ_ARG,	0,	VCPU },
 		{ "set-mem",	REQ_ARG,	0,	SET_MEM },
 		{ "set-efer",	REQ_ARG,	0,	SET_EFER },
 		{ "set-cr0",	REQ_ARG,	0,	SET_CR0 },
 		{ "set-cr3",	REQ_ARG,	0,	SET_CR3 },
 		{ "set-cr4",	REQ_ARG,	0,	SET_CR4 },
 		{ "set-dr7",	REQ_ARG,	0,	SET_DR7 },
 		{ "set-rsp",	REQ_ARG,	0,	SET_RSP },
 		{ "set-rip",	REQ_ARG,	0,	SET_RIP },
 		{ "set-rax",	REQ_ARG,	0,	SET_RAX },
 		{ "set-rflags",	REQ_ARG,	0,	SET_RFLAGS },
 		{ "desc-base",	REQ_ARG,	0,	DESC_BASE },
 		{ "desc-limit",	REQ_ARG,	0,	DESC_LIMIT },
 		{ "desc-access",REQ_ARG,	0,	DESC_ACCESS },
 		{ "set-cs",	REQ_ARG,	0,	SET_CS },
 		{ "set-ds",	REQ_ARG,	0,	SET_DS },
 		{ "set-es",	REQ_ARG,	0,	SET_ES },
 		{ "set-fs",	REQ_ARG,	0,	SET_FS },
 		{ "set-gs",	REQ_ARG,	0,	SET_GS },
 		{ "set-ss",	REQ_ARG,	0,	SET_SS },
 		{ "set-tr",	REQ_ARG,	0,	SET_TR },
 		{ "set-ldtr",	REQ_ARG,	0,	SET_LDTR },
 		{ "set-x2apic-state",REQ_ARG,	0,	SET_X2APIC_STATE },
 		{ "set-vmcs-exception-bitmap",
 				REQ_ARG,	0, SET_VMCS_EXCEPTION_BITMAP },
 		{ "set-vmcs-entry-interruption-info",
 				REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO },
 		{ "capname",	REQ_ARG,	0,	CAPNAME },
 		{ "unassign-pptdev", REQ_ARG,	0,	UNASSIGN_PPTDEV },
 		{ "setcap",	REQ_ARG,	0,	SET_CAP },
 		{ "get-gpa-pmap", REQ_ARG,	0,	GET_GPA_PMAP },
 		{ "assert-lapic-lvt", REQ_ARG,	0,	ASSERT_LAPIC_LVT },
 		{ "getcap",	NO_ARG,		&getcap,	1 },
 		{ "get-stats",	NO_ARG,		&get_stats,	1 },
 		{ "get-desc-ds",NO_ARG,		&get_desc_ds,	1 },
 		{ "set-desc-ds",NO_ARG,		&set_desc_ds,	1 },
 		{ "get-desc-es",NO_ARG,		&get_desc_es,	1 },
 		{ "set-desc-es",NO_ARG,		&set_desc_es,	1 },
 		{ "get-desc-ss",NO_ARG,		&get_desc_ss,	1 },
 		{ "set-desc-ss",NO_ARG,		&set_desc_ss,	1 },
 		{ "get-desc-cs",NO_ARG,		&get_desc_cs,	1 },
 		{ "set-desc-cs",NO_ARG,		&set_desc_cs,	1 },
 		{ "get-desc-fs",NO_ARG,		&get_desc_fs,	1 },
 		{ "set-desc-fs",NO_ARG,		&set_desc_fs,	1 },
 		{ "get-desc-gs",NO_ARG,		&get_desc_gs,	1 },
 		{ "set-desc-gs",NO_ARG,		&set_desc_gs,	1 },
 		{ "get-desc-tr",NO_ARG,		&get_desc_tr,	1 },
 		{ "set-desc-tr",NO_ARG,		&set_desc_tr,	1 },
 		{ "set-desc-ldtr", NO_ARG,	&set_desc_ldtr,	1 },
 		{ "get-desc-ldtr", NO_ARG,	&get_desc_ldtr,	1 },
 		{ "set-desc-gdtr", NO_ARG,	&set_desc_gdtr, 1 },
 		{ "get-desc-gdtr", NO_ARG,	&get_desc_gdtr, 1 },
 		{ "set-desc-idtr", NO_ARG,	&set_desc_idtr, 1 },
 		{ "get-desc-idtr", NO_ARG,	&get_desc_idtr, 1 },
 		{ "get-lowmem", NO_ARG,		&get_lowmem,	1 },
 		{ "get-highmem",NO_ARG,		&get_highmem,	1 },
 		{ "get-efer",	NO_ARG,		&get_efer,	1 },
 		{ "get-cr0",	NO_ARG,		&get_cr0,	1 },
 		{ "get-cr3",	NO_ARG,		&get_cr3,	1 },
 		{ "get-cr4",	NO_ARG,		&get_cr4,	1 },
 		{ "get-dr7",	NO_ARG,		&get_dr7,	1 },
 		{ "get-rsp",	NO_ARG,		&get_rsp,	1 },
 		{ "get-rip",	NO_ARG,		&get_rip,	1 },
 		{ "get-rax",	NO_ARG,		&get_rax,	1 },
 		{ "get-rbx",	NO_ARG,		&get_rbx,	1 },
 		{ "get-rcx",	NO_ARG,		&get_rcx,	1 },
 		{ "get-rdx",	NO_ARG,		&get_rdx,	1 },
 		{ "get-rsi",	NO_ARG,		&get_rsi,	1 },
 		{ "get-rdi",	NO_ARG,		&get_rdi,	1 },
 		{ "get-rbp",	NO_ARG,		&get_rbp,	1 },
 		{ "get-r8",	NO_ARG,		&get_r8,	1 },
 		{ "get-r9",	NO_ARG,		&get_r9,	1 },
 		{ "get-r10",	NO_ARG,		&get_r10,	1 },
 		{ "get-r11",	NO_ARG,		&get_r11,	1 },
 		{ "get-r12",	NO_ARG,		&get_r12,	1 },
 		{ "get-r13",	NO_ARG,		&get_r13,	1 },
 		{ "get-r14",	NO_ARG,		&get_r14,	1 },
 		{ "get-r15",	NO_ARG,		&get_r15,	1 },
 		{ "get-rflags",	NO_ARG,		&get_rflags,	1 },
 		{ "get-cs",	NO_ARG,		&get_cs,	1 },
 		{ "get-ds",	NO_ARG,		&get_ds,	1 },
 		{ "get-es",	NO_ARG,		&get_es,	1 },
 		{ "get-fs",	NO_ARG,		&get_fs,	1 },
 		{ "get-gs",	NO_ARG,		&get_gs,	1 },
 		{ "get-ss",	NO_ARG,		&get_ss,	1 },
 		{ "get-tr",	NO_ARG,		&get_tr,	1 },
 		{ "get-ldtr",	NO_ARG,		&get_ldtr,	1 },
 		{ "get-vmcs-pinbased-ctls",
 				NO_ARG,		&get_pinbased_ctls, 1 },
 		{ "get-vmcs-procbased-ctls",
 				NO_ARG,		&get_procbased_ctls, 1 },
 		{ "get-vmcs-procbased-ctls2",
 				NO_ARG,		&get_procbased_ctls2, 1 },
 		{ "get-vmcs-guest-linear-address",
 				NO_ARG,		&get_vmcs_gla,	1 },
 		{ "get-vmcs-guest-physical-address",
 				NO_ARG,		&get_vmcs_gpa,	1 },
 		{ "get-vmcs-entry-interruption-info",
 				NO_ARG, &get_vmcs_entry_interruption_info, 1},
 		{ "get-vmcs-eptp", NO_ARG,	&get_eptp,	1 },
 		{ "get-vmcs-exception-bitmap",
 				NO_ARG,		&get_exception_bitmap, 1 },
 		{ "get-vmcs-io-bitmap-address",
 				NO_ARG,		&get_io_bitmap,	1 },
 		{ "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 },
 		{ "get-vmcs-cr0-mask", NO_ARG,	&get_cr0_mask,	1 },
 		{ "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 },
 		{ "get-vmcs-cr4-mask", NO_ARG,	&get_cr4_mask,	1 },
 		{ "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 },
 		{ "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1},
 		{ "get-vmcs-apic-access-address",
 				NO_ARG,		&get_apic_access_addr, 1},
 		{ "get-vmcs-virtual-apic-address",
 				NO_ARG,		&get_virtual_apic_addr, 1},
 		{ "get-vmcs-tpr-threshold",
 				NO_ARG,		&get_tpr_threshold, 1 },
 		{ "get-vmcs-msr-bitmap",
 				NO_ARG,		&get_msr_bitmap, 1 },
 		{ "get-vmcs-msr-bitmap-address",
 				NO_ARG,		&get_msr_bitmap_address, 1 },
 		{ "get-vmcs-vpid", NO_ARG,	&get_vpid,	1 },
 		{ "get-vmcs-ple-gap", NO_ARG,	&get_ple_gap,	1 },
 		{ "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 },
 		{ "get-vmcs-instruction-error",
 				NO_ARG,		&get_inst_err,	1 },
 		{ "get-vmcs-exit-ctls", NO_ARG,	&get_exit_ctls,	1 },
 		{ "get-vmcs-entry-ctls",
 					NO_ARG,	&get_entry_ctls, 1 },
 		{ "get-vmcs-guest-pat",	NO_ARG,	&get_guest_pat,	1 },
 		{ "get-vmcs-host-pat",	NO_ARG,	&get_host_pat,	1 },
 		{ "get-vmcs-host-cr0",
 				NO_ARG,		&get_host_cr0,	1 },
 		{ "get-vmcs-host-cr3",
 				NO_ARG,		&get_host_cr3,	1 },
 		{ "get-vmcs-host-cr4",
 				NO_ARG,		&get_host_cr4,	1 },
 		{ "get-vmcs-host-rip",
 				NO_ARG,		&get_host_rip,	1 },
 		{ "get-vmcs-host-rsp",
 				NO_ARG,		&get_host_rsp,	1 },
 		{ "get-vmcs-guest-sysenter",
 				NO_ARG,		&get_guest_sysenter, 1 },
 		{ "get-vmcs-link", NO_ARG,	&get_vmcs_link, 1 },
 		{ "get-vmcs-exit-reason",
 				NO_ARG,		&get_vmcs_exit_reason, 1 },
 		{ "get-vmcs-exit-qualification",
 			NO_ARG,		&get_vmcs_exit_qualification, 1 },
 		{ "get-vmcs-exit-interruption-info",
 				NO_ARG,	&get_vmcs_exit_interruption_info, 1},
 		{ "get-vmcs-exit-interruption-error",
 				NO_ARG,	&get_vmcs_exit_interruption_error, 1},
 		{ "get-vmcs-interruptibility",
 				NO_ARG, &get_vmcs_interruptibility, 1 },
 		{ "get-x2apic-state",NO_ARG,	&get_x2apic_state, 1 },
 		{ "get-all",	NO_ARG,		&get_all,	1 },
 		{ "run",	NO_ARG,		&run,		1 },
 		{ "create",	NO_ARG,		&create,	1 },
 		{ "destroy",	NO_ARG,		&destroy,	1 },
 		{ "inject-nmi",	NO_ARG,		&inject_nmi,	1 },
 		{ "force-reset",	NO_ARG,	&force_reset,	1 },
 		{ "force-poweroff", NO_ARG,	&force_poweroff, 1 },
 		{ "get-active-cpus", NO_ARG,	&get_active_cpus, 1 },
 		{ "get-suspended-cpus", NO_ARG,	&get_suspended_cpus, 1 },
 		{ "get-intinfo", NO_ARG,	&get_intinfo,	1 },
 		{ NULL,		0,		NULL,		0 }
 	};
 
 	vcpu = 0;
 	vmname = NULL;
 	assert_lapic_lvt = -1;
 	progname = basename(argv[0]);
 
 	while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) {
 		switch (ch) {
 		case 0:
 			break;
 		case VMNAME:
 			vmname = optarg;
 			break;
 		case VCPU:
 			vcpu = atoi(optarg);
 			break;
 		case SET_MEM:
 			memsize = atoi(optarg) * MB;
 			memsize = roundup(memsize, 2 * MB);
 			break;
 		case SET_EFER:
 			efer = strtoul(optarg, NULL, 0);
 			set_efer = 1;
 			break;
 		case SET_CR0:
 			cr0 = strtoul(optarg, NULL, 0);
 			set_cr0 = 1;
 			break;
 		case SET_CR3:
 			cr3 = strtoul(optarg, NULL, 0);
 			set_cr3 = 1;
 			break;
 		case SET_CR4:
 			cr4 = strtoul(optarg, NULL, 0);
 			set_cr4 = 1;
 			break;
 		case SET_DR7:
 			dr7 = strtoul(optarg, NULL, 0);
 			set_dr7 = 1;
 			break;
 		case SET_RSP:
 			rsp = strtoul(optarg, NULL, 0);
 			set_rsp = 1;
 			break;
 		case SET_RIP:
 			rip = strtoul(optarg, NULL, 0);
 			set_rip = 1;
 			break;
 		case SET_RAX:
 			rax = strtoul(optarg, NULL, 0);
 			set_rax = 1;
 			break;
 		case SET_RFLAGS:
 			rflags = strtoul(optarg, NULL, 0);
 			set_rflags = 1;
 			break;
 		case DESC_BASE:
 			desc_base = strtoul(optarg, NULL, 0);
 			break;
 		case DESC_LIMIT:
 			desc_limit = strtoul(optarg, NULL, 0);
 			break;
 		case DESC_ACCESS:
 			desc_access = strtoul(optarg, NULL, 0);
 			break;
 		case SET_CS:
 			cs = strtoul(optarg, NULL, 0);
 			set_cs = 1;
 			break;
 		case SET_DS:
 			ds = strtoul(optarg, NULL, 0);
 			set_ds = 1;
 			break;
 		case SET_ES:
 			es = strtoul(optarg, NULL, 0);
 			set_es = 1;
 			break;
 		case SET_FS:
 			fs = strtoul(optarg, NULL, 0);
 			set_fs = 1;
 			break;
 		case SET_GS:
 			gs = strtoul(optarg, NULL, 0);
 			set_gs = 1;
 			break;
 		case SET_SS:
 			ss = strtoul(optarg, NULL, 0);
 			set_ss = 1;
 			break;
 		case SET_TR:
 			tr = strtoul(optarg, NULL, 0);
 			set_tr = 1;
 			break;
 		case SET_LDTR:
 			ldtr = strtoul(optarg, NULL, 0);
 			set_ldtr = 1;
 			break;
 		case SET_X2APIC_STATE:
 			x2apic_state = strtol(optarg, NULL, 0);
 			set_x2apic_state = 1;
 			break;
 		case SET_VMCS_EXCEPTION_BITMAP:
 			exception_bitmap = strtoul(optarg, NULL, 0);
 			set_exception_bitmap = 1;
 			break;
 		case SET_VMCS_ENTRY_INTERRUPTION_INFO:
 			vmcs_entry_interruption_info = strtoul(optarg, NULL, 0);
 			set_vmcs_entry_interruption_info = 1;
 			break;
 		case SET_CAP:
 			capval = strtoul(optarg, NULL, 0);
 			setcap = 1;
 			break;
 		case GET_GPA_PMAP:
 			gpa_pmap = strtoul(optarg, NULL, 0);
 			get_gpa_pmap = 1;
 			break;
 		case CAPNAME:
 			capname = optarg;
 			break;
 		case UNASSIGN_PPTDEV:
 			unassign_pptdev = 1;
 			if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3)
 				usage();
 			break;
 		case ASSERT_LAPIC_LVT:
 			assert_lapic_lvt = atoi(optarg);
 			break;
 		default:
 			usage();
 		}
 	}
 	argc -= optind;
 	argv += optind;
 
 	if (vmname == NULL)
 		usage();
 
 	error = 0;
 
 	if (!error && create)
 		error = vm_create(vmname);
 
 	if (!error) {
 		ctx = vm_open(vmname);
 		if (ctx == NULL)
 			error = -1;
 	}
 
 	if (!error && memsize)
 		error = vm_setup_memory(ctx, memsize, VM_MMAP_NONE);
 
 	if (!error && set_efer)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer);
 
 	if (!error && set_cr0)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0);
 
 	if (!error && set_cr3)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3);
 
 	if (!error && set_cr4)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4);
 
 	if (!error && set_dr7)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7);
 
 	if (!error && set_rsp)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp);
 
 	if (!error && set_rip)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip);
 
 	if (!error && set_rax)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax);
 
 	if (!error && set_rflags) {
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
 					rflags);
 	}
 
 	if (!error && set_desc_ds) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS,
 				    desc_base, desc_limit, desc_access);
 	}
 
 	if (!error && set_desc_es) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES,
 				    desc_base, desc_limit, desc_access);
 	}
 
 	if (!error && set_desc_ss) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS,
 				    desc_base, desc_limit, desc_access);
 	}
 
 	if (!error && set_desc_cs) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS,
 				    desc_base, desc_limit, desc_access);
 	}
 
 	if (!error && set_desc_fs) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS,
 				    desc_base, desc_limit, desc_access);
 	}
 
 	if (!error && set_desc_gs) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS,
 				    desc_base, desc_limit, desc_access);
 	}
 
 	if (!error && set_desc_tr) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR,
 				    desc_base, desc_limit, desc_access);
 	}
 
 	if (!error && set_desc_ldtr) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
 				    desc_base, desc_limit, desc_access);
 	}
 
 	if (!error && set_desc_gdtr) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
 				    desc_base, desc_limit, 0);
 	}
 
 	if (!error && set_desc_idtr) {
 		error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
 				    desc_base, desc_limit, 0);
 	}
 
 	if (!error && set_cs)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs);
 
 	if (!error && set_ds)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds);
 
 	if (!error && set_es)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es);
 
 	if (!error && set_fs)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs);
 
 	if (!error && set_gs)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs);
 
 	if (!error && set_ss)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss);
 
 	if (!error && set_tr)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr);
 
 	if (!error && set_ldtr)
 		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr);
 
 	if (!error && set_x2apic_state)
 		error = vm_set_x2apic_state(ctx, vcpu, x2apic_state);
 
 	if (!error && unassign_pptdev)
 		error = vm_unassign_pptdev(ctx, bus, slot, func);
 
 	if (!error && set_exception_bitmap) {
 		error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
 					  exception_bitmap);
 	}
 
 	if (!error && set_vmcs_entry_interruption_info) {
 		error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,
 					  vmcs_entry_interruption_info);
 	}
 
 	if (!error && inject_nmi) {
 		error = vm_inject_nmi(ctx, vcpu);
 	}
 
 	if (!error && assert_lapic_lvt != -1) {
 		error = vm_lapic_local_irq(ctx, vcpu, assert_lapic_lvt);
 	}
 
 	if (!error && (get_lowmem || get_all)) {
 		gpa = 0;
 		error = vm_get_memory_seg(ctx, gpa, &len, &wired);
 		if (error == 0)
 			printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len,
 			    wired ? " wired" : "");
 	}
 
 	if (!error && (get_highmem || get_all)) {
 		gpa = 4 * GB;
 		error = vm_get_memory_seg(ctx, gpa, &len, &wired);
 		if (error == 0)
 			printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len,
 			    wired ? " wired" : "");
 	}
 
 	if (!error && (get_efer || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer);
 		if (error == 0)
 			printf("efer[%d]\t\t0x%016lx\n", vcpu, efer);
 	}
 
 	if (!error && (get_cr0 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0);
 		if (error == 0)
 			printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
 	}
 
 	if (!error && (get_cr3 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3);
 		if (error == 0)
 			printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
 	}
 
 	if (!error && (get_cr4 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4);
 		if (error == 0)
 			printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
 	}
 
 	if (!error && (get_dr7 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7);
 		if (error == 0)
 			printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7);
 	}
 
 	if (!error && (get_rsp || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp);
 		if (error == 0)
 			printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
 	}
 
 	if (!error && (get_rip || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
 		if (error == 0)
 			printf("rip[%d]\t\t0x%016lx\n", vcpu, rip);
 	}
 
 	if (!error && (get_rax || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax);
 		if (error == 0)
 			printf("rax[%d]\t\t0x%016lx\n", vcpu, rax);
 	}
 
 	if (!error && (get_rbx || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx);
 		if (error == 0)
 			printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx);
 	}
 
 	if (!error && (get_rcx || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx);
 		if (error == 0)
 			printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx);
 	}
 
 	if (!error && (get_rdx || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx);
 		if (error == 0)
 			printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx);
 	}
 
 	if (!error && (get_rsi || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi);
 		if (error == 0)
 			printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi);
 	}
 
 	if (!error && (get_rdi || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi);
 		if (error == 0)
 			printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi);
 	}
 
 	if (!error && (get_rbp || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp);
 		if (error == 0)
 			printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp);
 	}
 
 	if (!error && (get_r8 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8);
 		if (error == 0)
 			printf("r8[%d]\t\t0x%016lx\n", vcpu, r8);
 	}
 
 	if (!error && (get_r9 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9);
 		if (error == 0)
 			printf("r9[%d]\t\t0x%016lx\n", vcpu, r9);
 	}
 
 	if (!error && (get_r10 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10);
 		if (error == 0)
 			printf("r10[%d]\t\t0x%016lx\n", vcpu, r10);
 	}
 
 	if (!error && (get_r11 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11);
 		if (error == 0)
 			printf("r11[%d]\t\t0x%016lx\n", vcpu, r11);
 	}
 
 	if (!error && (get_r12 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12);
 		if (error == 0)
 			printf("r12[%d]\t\t0x%016lx\n", vcpu, r12);
 	}
 
 	if (!error && (get_r13 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13);
 		if (error == 0)
 			printf("r13[%d]\t\t0x%016lx\n", vcpu, r13);
 	}
 
 	if (!error && (get_r14 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14);
 		if (error == 0)
 			printf("r14[%d]\t\t0x%016lx\n", vcpu, r14);
 	}
 
 	if (!error && (get_r15 || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15);
 		if (error == 0)
 			printf("r15[%d]\t\t0x%016lx\n", vcpu, r15);
 	}
 
 	if (!error && (get_rflags || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
 					&rflags);
 		if (error == 0)
 			printf("rflags[%d]\t0x%016lx\n", vcpu, rflags);
 	}
 
 	if (!error && (get_stats || get_all)) {
 		int i, num_stats;
 		uint64_t *stats;
 		struct timeval tv;
 		const char *desc;
 
 		stats = vm_get_stats(ctx, vcpu, &tv, &num_stats);
 		if (stats != NULL) {
 			printf("vcpu%d\n", vcpu);
 			for (i = 0; i < num_stats; i++) {
 				desc = vm_get_stat_desc(ctx, i);
 				printf("%-40s\t%ld\n", desc, stats[i]);
 			}
 		}
 	}
 
 	if (!error && (get_desc_ds || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
 			       vcpu, desc_base, desc_limit, desc_access);	
 		}
 	}
 
 	if (!error && (get_desc_es || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
 			       vcpu, desc_base, desc_limit, desc_access);	
 		}
 	}
 
 	if (!error && (get_desc_fs || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
 			       vcpu, desc_base, desc_limit, desc_access);	
 		}
 	}
 
 	if (!error && (get_desc_gs || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
 			       vcpu, desc_base, desc_limit, desc_access);	
 		}
 	}
 
 	if (!error && (get_desc_ss || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
 			       vcpu, desc_base, desc_limit, desc_access);	
 		}
 	}
 
 	if (!error && (get_desc_cs || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
 			       vcpu, desc_base, desc_limit, desc_access);	
 		}
 	}
 
 	if (!error && (get_desc_tr || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
 			       vcpu, desc_base, desc_limit, desc_access);	
 		}
 	}
 
 	if (!error && (get_desc_ldtr || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
 			       vcpu, desc_base, desc_limit, desc_access);	
 		}
 	}
 
 	if (!error && (get_desc_gdtr || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("gdtr[%d]\t\t0x%016lx/0x%08x\n",
 			       vcpu, desc_base, desc_limit);	
 		}
 	}
 
 	if (!error && (get_desc_idtr || get_all)) {
 		error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
 				    &desc_base, &desc_limit, &desc_access);
 		if (error == 0) {
 			printf("idtr[%d]\t\t0x%016lx/0x%08x\n",
 			       vcpu, desc_base, desc_limit);	
 		}
 	}
 
 	if (!error && (get_cs || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs);
 		if (error == 0)
 			printf("cs[%d]\t\t0x%04lx\n", vcpu, cs);
 	}
 
 	if (!error && (get_ds || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds);
 		if (error == 0)
 			printf("ds[%d]\t\t0x%04lx\n", vcpu, ds);
 	}
 
 	if (!error && (get_es || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es);
 		if (error == 0)
 			printf("es[%d]\t\t0x%04lx\n", vcpu, es);
 	}
 
 	if (!error && (get_fs || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs);
 		if (error == 0)
 			printf("fs[%d]\t\t0x%04lx\n", vcpu, fs);
 	}
 
 	if (!error && (get_gs || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs);
 		if (error == 0)
 			printf("gs[%d]\t\t0x%04lx\n", vcpu, gs);
 	}
 
 	if (!error && (get_ss || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss);
 		if (error == 0)
 			printf("ss[%d]\t\t0x%04lx\n", vcpu, ss);
 	}
 
 	if (!error && (get_tr || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr);
 		if (error == 0)
 			printf("tr[%d]\t\t0x%04lx\n", vcpu, tr);
 	}
 
 	if (!error && (get_ldtr || get_all)) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr);
 		if (error == 0)
 			printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr);
 	}
 
 	if (!error && (get_x2apic_state || get_all)) {
 		error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state);
 		if (error == 0)
 			printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state);
 	}
 
 	if (!error && (get_pinbased_ctls || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl);
 		if (error == 0)
 			printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
 	}
 
 	if (!error && (get_procbased_ctls || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu,
 					  VMCS_PRI_PROC_BASED_CTLS, &ctl);
 		if (error == 0)
 			printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
 	}
 
 	if (!error && (get_procbased_ctls2 || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu,
 					  VMCS_SEC_PROC_BASED_CTLS, &ctl);
 		if (error == 0)
 			printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl);
 	}
 
 	if (!error && (get_vmcs_gla || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu,
 					  VMCS_GUEST_LINEAR_ADDRESS, &u64);
 		if (error == 0)
 			printf("gla[%d]\t\t0x%016lx\n", vcpu, u64);
 	}
 
 	if (!error && (get_vmcs_gpa || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu,
 					  VMCS_GUEST_PHYSICAL_ADDRESS, &u64);
 		if (error == 0)
 			printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64);
 	}
 
 	if (!error && (get_vmcs_entry_interruption_info || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64);
 		if (error == 0) {
 			printf("entry_interruption_info[%d]\t0x%08lx\n",
 				vcpu, u64);
 		}
 	}
 
 	if (!error && (get_eptp || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp);
 		if (error == 0)
 			printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp);
 	}
 
 	if (!error && (get_exception_bitmap || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
 					  &bm);
 		if (error == 0)
 			printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm);
 	}
 
 	if (!error && (get_io_bitmap || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm);
 		if (error == 0)
 			printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm);
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm);
 		if (error == 0)
 			printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm);
 	}
 
 	if (!error && (get_tsc_offset || get_all)) {
 		uint64_t tscoff;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff);
 		if (error == 0)
 			printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff);
 	}
 
 	if (!error && (get_cr0_mask || get_all)) {
 		uint64_t cr0mask;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask);
 		if (error == 0)
 			printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask);
 	}
 
 	if (!error && (get_cr0_shadow || get_all)) {
 		uint64_t cr0shadow;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW,
 					  &cr0shadow);
 		if (error == 0)
 			printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow);
 	}
 
 	if (!error && (get_cr4_mask || get_all)) {
 		uint64_t cr4mask;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask);
 		if (error == 0)
 			printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask);
 	}
 
 	if (!error && (get_cr4_shadow || get_all)) {
 		uint64_t cr4shadow;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW,
 					  &cr4shadow);
 		if (error == 0)
 			printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow);
 	}
 	
 	if (!error && (get_cr3_targets || get_all)) {
 		uint64_t target_count, target_addr;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT,
 					  &target_count);
 		if (error == 0) {
 			printf("cr3_target_count[%d]\t0x%08lx\n",
 				vcpu, target_count);
 		}
 
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0,
 					  &target_addr);
 		if (error == 0) {
 			printf("cr3_target0[%d]\t\t0x%016lx\n",
 				vcpu, target_addr);
 		}
 
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1,
 					  &target_addr);
 		if (error == 0) {
 			printf("cr3_target1[%d]\t\t0x%016lx\n",
 				vcpu, target_addr);
 		}
 
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2,
 					  &target_addr);
 		if (error == 0) {
 			printf("cr3_target2[%d]\t\t0x%016lx\n",
 				vcpu, target_addr);
 		}
 
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3,
 					  &target_addr);
 		if (error == 0) {
 			printf("cr3_target3[%d]\t\t0x%016lx\n",
 				vcpu, target_addr);
 		}
 	}
 
 	if (!error && (get_apic_access_addr || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr);
 		if (error == 0)
 			printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr);
 	}
 
 	if (!error && (get_virtual_apic_addr || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr);
 		if (error == 0)
 			printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr);
 	}
 
 	if (!error && (get_tpr_threshold || get_all)) {
 		uint64_t threshold;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD,
 					  &threshold);
 		if (error == 0)
 			printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold);
 	}
 
 	if (!error && (get_msr_bitmap_address || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
 		if (error == 0)
 			printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr);
 	}
 
 	if (!error && (get_msr_bitmap || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
 		if (error == 0)
 			error = dump_vmcs_msr_bitmap(vcpu, addr);
 	}
 
 	if (!error && (get_vpid || get_all)) {
 		uint64_t vpid;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid);
 		if (error == 0)
 			printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid);
 	}
 	
 	if (!error && (get_ple_window || get_all)) {
 		uint64_t window;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window);
 		if (error == 0)
 			printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window);
 	}
 
 	if (!error && (get_ple_gap || get_all)) {
 		uint64_t gap;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap);
 		if (error == 0)
 			printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap);
 	}
 
 	if (!error && (get_inst_err || get_all)) {
 		uint64_t insterr;
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR,
 					  &insterr);
 		if (error == 0) {
 			printf("instruction_error[%d]\t0x%08lx\n",
 				vcpu, insterr);
 		}
 	}
 
 	if (!error && (get_exit_ctls || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl);
 		if (error == 0)
 			printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
 	}
 
 	if (!error && (get_entry_ctls || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl);
 		if (error == 0)
 			printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
 	}
 
 	if (!error && (get_host_pat || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat);
 		if (error == 0)
 			printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat);
 	}
 
 	if (!error && (get_guest_pat || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat);
 		if (error == 0)
 			printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat);
 	}
 
 	if (!error && (get_host_cr0 || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0);
 		if (error == 0)
 			printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
 	}
 
 	if (!error && (get_host_cr3 || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3);
 		if (error == 0)
 			printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
 	}
 
 	if (!error && (get_host_cr4 || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4);
 		if (error == 0)
 			printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
 	}
 
 	if (!error && (get_host_rip || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip);
 		if (error == 0)
 			printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip);
 	}
 
 	if (!error && (get_host_rsp || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp);
 		if (error == 0)
 			printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
 	}
 
 	if (!error && (get_guest_sysenter || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu,
 					  VMCS_GUEST_IA32_SYSENTER_CS, &cs);
 		if (error == 0)
 			printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs);
 
 		error = vm_get_vmcs_field(ctx, vcpu,
 					  VMCS_GUEST_IA32_SYSENTER_ESP, &rsp);
 		if (error == 0)
 			printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp);
 		error = vm_get_vmcs_field(ctx, vcpu,
 					  VMCS_GUEST_IA32_SYSENTER_EIP, &rip);
 		if (error == 0)
 			printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip);
 	}
 
 	if (!error && (get_vmcs_link || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr);
 		if (error == 0)
 			printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr);
 	}
 
 	if (!error && (get_vmcs_exit_reason || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64);
 		if (error == 0)
 			printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64);
 	}
 
 	if (!error && (get_vmcs_exit_qualification || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION,
 					  &u64);
 		if (error == 0)
 			printf("vmcs_exit_qualification[%d]\t0x%016lx\n",
 				vcpu, u64);
 	}
 
 	if (!error && (get_vmcs_exit_interruption_info || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64);
 		if (error == 0) {
 			printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n",
 				vcpu, u64);
 		}
 	}
 
 	if (!error && (get_vmcs_exit_interruption_error || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE,
 		    &u64);
 		if (error == 0) {
 			printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n",
 				vcpu, u64);
 		}
 	}
 
 	if (!error && (get_vmcs_interruptibility || get_all)) {
 		error = vm_get_vmcs_field(ctx, vcpu,
 					  VMCS_GUEST_INTERRUPTIBILITY, &u64);
 		if (error == 0) {
 			printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n",
 				vcpu, u64);
 		}
 	}
 
 	if (!error && setcap) {
 		int captype;
 		captype = vm_capability_name2type(capname);
 		error = vm_set_capability(ctx, vcpu, captype, capval);
 		if (error != 0 && errno == ENOENT)
 			printf("Capability \"%s\" is not available\n", capname);
 	}
 
 	if (!error && get_gpa_pmap) {
 		error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum);
 		if (error == 0) {
 			printf("gpa %#lx:", gpa_pmap);
 			pte = &pteval[0];
 			while (ptenum-- > 0)
 				printf(" %#lx", *pte++);
 			printf("\n");
 		}
 	}
 
 	if (!error && (getcap || get_all)) {
 		int captype, val, getcaptype;
 
 		if (getcap && capname)
 			getcaptype = vm_capability_name2type(capname);
 		else
 			getcaptype = -1;
 
 		for (captype = 0; captype < VM_CAP_MAX; captype++) {
 			if (getcaptype >= 0 && captype != getcaptype)
 				continue;
 			error = vm_get_capability(ctx, vcpu, captype, &val);
 			if (error == 0) {
 				printf("Capability \"%s\" is %s on vcpu %d\n",
 					vm_capability_type2name(captype),
 					val ? "set" : "not set", vcpu);
 			} else if (errno == ENOENT) {
 				error = 0;
 				printf("Capability \"%s\" is not available\n",
 					vm_capability_type2name(captype));
 			} else {
 				break;
 			}
 		}
 	}
 
 	if (!error && (get_active_cpus || get_all)) {
 		error = vm_active_cpus(ctx, &cpus);
 		if (!error)
 			print_cpus("active cpus", &cpus);
 	}
 
 	if (!error && (get_suspended_cpus || get_all)) {
 		error = vm_suspended_cpus(ctx, &cpus);
 		if (!error)
 			print_cpus("suspended cpus", &cpus);
 	}
 
 	if (!error && (get_intinfo || get_all)) {
 		error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]);
 		if (!error) {
 			print_intinfo("pending", info[0]);
 			print_intinfo("current", info[1]);
 		}
 	}
 
 	if (!error && run) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
 		assert(error == 0);
 
 		error = vm_run(ctx, vcpu, rip, &vmexit);
 		if (error == 0)
 			dump_vm_run_exitcode(&vmexit, vcpu);
 		else
 			printf("vm_run error %d\n", error);
 	}
 
 	if (!error && force_reset)
 		error = vm_suspend(ctx, VM_SUSPEND_RESET);
 
 	if (!error && force_poweroff)
 		error = vm_suspend(ctx, VM_SUSPEND_POWEROFF);
 
 	if (error)
 		printf("errno = %d\n", errno);
 
 	if (!error && destroy)
 		vm_destroy(ctx);
 
 	exit(error);
 }
Index: stable/10
===================================================================
--- stable/10	(revision 276348)
+++ stable/10	(revision 276349)

Property changes on: stable/10
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r270326,270434,270436-270437,270855,270857,271439,271451,271888,271890-271891,272197,272395,272670,272710,272838-272839,273108,273212