Index: head/sys/amd64/include/vmm.h
===================================================================
--- head/sys/amd64/include/vmm.h	(revision 362599)
+++ head/sys/amd64/include/vmm.h	(revision 362600)
@@ -1,770 +1,773 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _VMM_H_
 #define	_VMM_H_
 
 #include <sys/sdt.h>
 #include <x86/segments.h>
 
 struct vm_snapshot_meta;
 
 #ifdef _KERNEL
 SDT_PROVIDER_DECLARE(vmm);
 #endif
 
 enum vm_suspend_how {
 	VM_SUSPEND_NONE,
 	VM_SUSPEND_RESET,
 	VM_SUSPEND_POWEROFF,
 	VM_SUSPEND_HALT,
 	VM_SUSPEND_TRIPLEFAULT,
 	VM_SUSPEND_LAST
 };
 
 /*
  * Identifiers for architecturally defined registers.
  */
 enum vm_reg_name {
 	VM_REG_GUEST_RAX,
 	VM_REG_GUEST_RBX,
 	VM_REG_GUEST_RCX,
 	VM_REG_GUEST_RDX,
 	VM_REG_GUEST_RSI,
 	VM_REG_GUEST_RDI,
 	VM_REG_GUEST_RBP,
 	VM_REG_GUEST_R8,
 	VM_REG_GUEST_R9,
 	VM_REG_GUEST_R10,
 	VM_REG_GUEST_R11,
 	VM_REG_GUEST_R12,
 	VM_REG_GUEST_R13,
 	VM_REG_GUEST_R14,
 	VM_REG_GUEST_R15,
 	VM_REG_GUEST_CR0,
 	VM_REG_GUEST_CR3,
 	VM_REG_GUEST_CR4,
 	VM_REG_GUEST_DR7,
 	VM_REG_GUEST_RSP,
 	VM_REG_GUEST_RIP,
 	VM_REG_GUEST_RFLAGS,
 	VM_REG_GUEST_ES,
 	VM_REG_GUEST_CS,
 	VM_REG_GUEST_SS,
 	VM_REG_GUEST_DS,
 	VM_REG_GUEST_FS,
 	VM_REG_GUEST_GS,
 	VM_REG_GUEST_LDTR,
 	VM_REG_GUEST_TR,
 	VM_REG_GUEST_IDTR,
 	VM_REG_GUEST_GDTR,
 	VM_REG_GUEST_EFER,
 	VM_REG_GUEST_CR2,
 	VM_REG_GUEST_PDPTE0,
 	VM_REG_GUEST_PDPTE1,
 	VM_REG_GUEST_PDPTE2,
 	VM_REG_GUEST_PDPTE3,
 	VM_REG_GUEST_INTR_SHADOW,
 	VM_REG_GUEST_DR0,
 	VM_REG_GUEST_DR1,
 	VM_REG_GUEST_DR2,
 	VM_REG_GUEST_DR3,
 	VM_REG_GUEST_DR6,
 	VM_REG_GUEST_ENTRY_INST_LENGTH,
 	VM_REG_LAST
 };
 
 enum x2apic_state {
 	X2APIC_DISABLED,
 	X2APIC_ENABLED,
 	X2APIC_STATE_LAST
 };
 
 #define	VM_INTINFO_VECTOR(info)	((info) & 0xff)
 #define	VM_INTINFO_DEL_ERRCODE	0x800
 #define	VM_INTINFO_RSVD		0x7ffff000
 #define	VM_INTINFO_VALID	0x80000000
 #define	VM_INTINFO_TYPE		0x700
 #define	VM_INTINFO_HWINTR	(0 << 8)
 #define	VM_INTINFO_NMI		(2 << 8)
 #define	VM_INTINFO_HWEXCEPTION	(3 << 8)
 #define	VM_INTINFO_SWINTR	(4 << 8)
 
 /*
  * The VM name has to fit into the pathname length constraints of devfs,
  * governed primarily by SPECNAMELEN.  The length is the total number of
  * characters in the full path, relative to the mount point and not 
  * including any leading '/' characters.
  * A prefix and a suffix are added to the name specified by the user.
  * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters
  * longer for future use.
  * The suffix is a string that identifies a bootrom image or some similar
  * image that is attached to the VM. A separator character gets added to
  * the suffix automatically when generating the full path, so it must be
  * accounted for, reducing the effective length by 1.
  * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37
  * bytes for FreeBSD 12.  A minimum length is set for safety and supports
  * a SPECNAMELEN as small as 32 on old systems.
  */
 #define VM_MAX_PREFIXLEN 10
 #define VM_MAX_SUFFIXLEN 15
 #define VM_MIN_NAMELEN   6
 #define VM_MAX_NAMELEN \
     (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1)
 
 #ifdef _KERNEL
 CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN);
 
 struct vm;
 struct vm_exception;
 struct seg_desc;
 struct vm_exit;
 struct vm_run;
 struct vhpet;
 struct vioapic;
 struct vlapic;
 struct vmspace;
 struct vm_object;
 struct vm_guest_paging;
 struct pmap;
 enum snapshot_req;
 
 struct vm_eventinfo {
 	void	*rptr;		/* rendezvous cookie */
 	int	*sptr;		/* suspend cookie */
 	int	*iptr;		/* reqidle cookie */
 };
 
 typedef int	(*vmm_init_func_t)(int ipinum);
 typedef int	(*vmm_cleanup_func_t)(void);
 typedef void	(*vmm_resume_func_t)(void);
 typedef void *	(*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
 typedef int	(*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
 		    struct pmap *pmap, struct vm_eventinfo *info);
 typedef void	(*vmi_cleanup_func_t)(void *vmi);
 typedef int	(*vmi_get_register_t)(void *vmi, int vcpu, int num,
 				      uint64_t *retval);
 typedef int	(*vmi_set_register_t)(void *vmi, int vcpu, int num,
 				      uint64_t val);
 typedef int	(*vmi_get_desc_t)(void *vmi, int vcpu, int num,
 				  struct seg_desc *desc);
 typedef int	(*vmi_set_desc_t)(void *vmi, int vcpu, int num,
 				  struct seg_desc *desc);
 typedef int	(*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
 typedef int	(*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
 typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
 typedef void	(*vmi_vmspace_free)(struct vmspace *vmspace);
 typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
 typedef void	(*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
 typedef int	(*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta);
 typedef int	(*vmi_snapshot_vmcx_t)(void *vmi, struct vm_snapshot_meta *meta,
 				       int vcpu);
 typedef int	(*vmi_restore_tsc_t)(void *vmi, int vcpuid, uint64_t now);
 
 struct vmm_ops {
 	vmm_init_func_t		init;		/* module wide initialization */
 	vmm_cleanup_func_t	cleanup;
 	vmm_resume_func_t	resume;
 
 	vmi_init_func_t		vminit;		/* vm-specific initialization */
 	vmi_run_func_t		vmrun;
 	vmi_cleanup_func_t	vmcleanup;
 	vmi_get_register_t	vmgetreg;
 	vmi_set_register_t	vmsetreg;
 	vmi_get_desc_t		vmgetdesc;
 	vmi_set_desc_t		vmsetdesc;
 	vmi_get_cap_t		vmgetcap;
 	vmi_set_cap_t		vmsetcap;
 	vmi_vmspace_alloc	vmspace_alloc;
 	vmi_vmspace_free	vmspace_free;
 	vmi_vlapic_init		vlapic_init;
 	vmi_vlapic_cleanup	vlapic_cleanup;
 
 	/* checkpoint operations */
 	vmi_snapshot_t		vmsnapshot;
 	vmi_snapshot_vmcx_t	vmcx_snapshot;
 	vmi_restore_tsc_t	vm_restore_tsc;
 };
 
 extern struct vmm_ops vmm_ops_intel;
 extern struct vmm_ops vmm_ops_amd;
 
 int vm_create(const char *name, struct vm **retvm);
 void vm_destroy(struct vm *vm);
 int vm_reinit(struct vm *vm);
 const char *vm_name(struct vm *vm);
 uint16_t vm_get_maxcpus(struct vm *vm);
 void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
     uint16_t *threads, uint16_t *maxcpus);
 int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
     uint16_t threads, uint16_t maxcpus);
 
 /*
  * APIs that modify the guest memory map require all vcpus to be frozen.
  */
 int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
     size_t len, int prot, int flags);
 int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem);
 void vm_free_memseg(struct vm *vm, int ident);
 int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
 int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
 int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
 int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
 
 /*
  * APIs that inspect the guest memory map require only a *single* vcpu to
  * be frozen. This acts like a read lock on the guest memory map since any
  * modification requires *all* vcpus to be frozen.
  */
 int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
 int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
     struct vm_object **objptr);
 vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm);
 void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len,
     int prot, void **cookie);
 void vm_gpa_release(void *cookie);
 bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa);
 
 int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
 int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
 int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 		    struct seg_desc *ret_desc);
 int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 		    struct seg_desc *desc);
 int vm_run(struct vm *vm, struct vm_run *vmrun);
 int vm_suspend(struct vm *vm, enum vm_suspend_how how);
 int vm_inject_nmi(struct vm *vm, int vcpu);
 int vm_nmi_pending(struct vm *vm, int vcpuid);
 void vm_nmi_clear(struct vm *vm, int vcpuid);
 int vm_inject_extint(struct vm *vm, int vcpu);
 int vm_extint_pending(struct vm *vm, int vcpuid);
 void vm_extint_clear(struct vm *vm, int vcpuid);
 struct vlapic *vm_lapic(struct vm *vm, int cpu);
 struct vioapic *vm_ioapic(struct vm *vm);
 struct vhpet *vm_hpet(struct vm *vm);
 int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
 int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
 int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
 int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
 int vm_apicid2vcpuid(struct vm *vm, int apicid);
 int vm_activate_cpu(struct vm *vm, int vcpu);
 int vm_suspend_cpu(struct vm *vm, int vcpu);
 int vm_resume_cpu(struct vm *vm, int vcpu);
 struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
 void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
 int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta);
 int vm_restore_time(struct vm *vm);
 
 
 #ifdef _SYS__CPUSET_H_
 /*
  * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
  * The rendezvous 'func(arg)' is not allowed to do anything that will
  * cause the thread to be put to sleep.
  *
  * If the rendezvous is being initiated from a vcpu context then the
  * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
  *
  * The caller cannot hold any locks when initiating the rendezvous.
  *
  * The implementation of this API may cause vcpus other than those specified
  * by 'dest' to be stalled. The caller should not rely on any vcpus making
  * forward progress when the rendezvous is in progress.
  */
 typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
 int vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg);
 cpuset_t vm_active_cpus(struct vm *vm);
 cpuset_t vm_debug_cpus(struct vm *vm);
 cpuset_t vm_suspended_cpus(struct vm *vm);
 #endif	/* _SYS__CPUSET_H_ */
 
 static __inline int
 vcpu_rendezvous_pending(struct vm_eventinfo *info)
 {
 	/* True when the uintptr_t behind the rendezvous cookie is non-zero. */
 	return (*((uintptr_t *)(info->rptr)) != 0);
 }
 
 static __inline int
 vcpu_suspended(struct vm_eventinfo *info)
 {
 	/* Returns the current int value behind the suspend cookie 'sptr'. */
 	return (*info->sptr);
 }
 
 static __inline int
 vcpu_reqidle(struct vm_eventinfo *info)
 {
 	/* Returns the current int value behind the reqidle cookie 'iptr'. */
 	return (*info->iptr);
 }
 
 int vcpu_debugged(struct vm *vm, int vcpuid);
 
 /*
  * Return true if device indicated by bus/slot/func is supposed to be a
  * pci passthrough device.
  *
  * Return false otherwise.
  */
 bool vmm_is_pptdev(int bus, int slot, int func);
 
 void *vm_iommu_domain(struct vm *vm);
 
 enum vcpu_state {
 	VCPU_IDLE,
 	VCPU_FROZEN,
 	VCPU_RUNNING,
 	VCPU_SLEEPING,
 };
 
 int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
     bool from_idle);
 enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);
 
 static int __inline
 vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
 {
 	return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
 }
 
 #ifdef _SYS_PROC_H_
 static int __inline
 vcpu_should_yield(struct vm *vm, int vcpu)
 {
 	/* 1 if curthread has an AST/resched flag or td_owepreempt set. */
 	if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED))
 		return (1);
 	else if (curthread->td_owepreempt)
 		return (1);
 	else
 		return (0);	/* 'vm' and 'vcpu' are not examined */
 }
 #endif
 
 void *vcpu_stats(struct vm *vm, int vcpu);
 void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
 struct vmspace *vm_get_vmspace(struct vm *vm);
 struct vatpic *vm_atpic(struct vm *vm);
 struct vatpit *vm_atpit(struct vm *vm);
 struct vpmtmr *vm_pmtmr(struct vm *vm);
 struct vrtc *vm_rtc(struct vm *vm);
 
 /*
  * Inject exception 'vector' into the guest vcpu. This function returns 0 on
  * success and non-zero on failure.
  *
  * Wrapper functions like 'vm_inject_gp()' should be preferred to calling
  * this function directly because they enforce the trap-like or fault-like
  * behavior of an exception.
  *
  * This function should only be called in the context of the thread that is
  * executing this vcpu.
  */
 int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid,
     uint32_t errcode, int restart_instruction);
 
 /*
  * This function is called after a VM-exit that occurred during exception or
  * interrupt delivery through the IDT. The format of 'intinfo' is described
  * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
  *
  * If a VM-exit handler completes the event delivery successfully then it
  * should call vm_exit_intinfo() to extinguish the pending event. For example,
  * if the task switch emulation is triggered via a task gate then it should
  * call this function with 'intinfo=0' to indicate that the external event
  * is not pending anymore.
  *
  * Return value is 0 on success and non-zero on failure.
  */
 int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
 
 /*
  * This function is called before every VM-entry to retrieve a pending
  * event that should be injected into the guest. This function combines
  * nested events into a double or triple fault.
  *
  * Returns 0 if there are no events that need to be injected into the guest
  * and non-zero otherwise.
  */
 int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
 
 int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
 
 /*
  * Function used to keep track of the guest's TSC offset. The
  * offset is used by the virtualization extensions to provide a consistent
  * value for the Time Stamp Counter to the guest.
  *
  * Return value is 0 on success and non-zero on failure.
  */
 int vm_set_tsc_offset(struct vm *vm, int vcpu_id, uint64_t offset);
 
 enum vm_reg_name vm_segment_name(int seg_encoding);
 
 struct vm_copyinfo {
 	uint64_t	gpa;
 	size_t		len;
 	void		*hva;
 	void		*cookie;
 };
 
 /*
  * Set up 'copyinfo[]' to copy to/from guest linear address space starting
  * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
  * a copyin or PROT_WRITE for a copyout. 
  *
  * retval	is_fault	Interpretation
  *   0		   0		Success
  *   0		   1		An exception was injected into the guest
  * EFAULT	  N/A		Unrecoverable error
  *
  * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
  * the return value is 0. The 'copyinfo[]' resources should be freed by calling
  * 'vm_copy_teardown()' after the copy is done.
  */
 int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
     int num_copyinfo, int *is_fault);
 void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     int num_copyinfo);
 void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     void *kaddr, size_t len);
 void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
     struct vm_copyinfo *copyinfo, size_t len);
 
 int vcpu_trace_exceptions(struct vm *vm, int vcpuid);
 #endif	/* _KERNEL */
 
 #define	VM_MAXCPU	16			/* maximum virtual cpus */
 
 /*
  * Identifiers for optional vmm capabilities
  */
 enum vm_cap_type {
 	VM_CAP_HALT_EXIT,
 	VM_CAP_MTRAP_EXIT,
 	VM_CAP_PAUSE_EXIT,
 	VM_CAP_UNRESTRICTED_GUEST,
 	VM_CAP_ENABLE_INVPCID,
 	VM_CAP_BPT_EXIT,
 	VM_CAP_MAX
 };
 
 enum vm_intr_trigger {
 	EDGE_TRIGGER,
 	LEVEL_TRIGGER
 };
 	
 /*
  * The 'access' field has the format specified in Table 21-2 of the Intel
  * Architecture Manual vol 3b.
  *
  * XXX The contents of the 'access' field are architecturally defined except
  * bit 16 - Segment Unusable.
  */
 struct seg_desc {
 	uint64_t	base;
 	uint32_t	limit;
 	uint32_t	access;
 };
 #define	SEG_DESC_TYPE(access)		((access) & 0x001f)
 #define	SEG_DESC_DPL(access)		(((access) >> 5) & 0x3)
 #define	SEG_DESC_PRESENT(access)	(((access) & 0x0080) ? 1 : 0)
 #define	SEG_DESC_DEF32(access)		(((access) & 0x4000) ? 1 : 0)
 #define	SEG_DESC_GRANULARITY(access)	(((access) & 0x8000) ? 1 : 0)
 #define	SEG_DESC_UNUSABLE(access)	(((access) & 0x10000) ? 1 : 0)
 
 enum vm_cpu_mode {
 	CPU_MODE_REAL,
 	CPU_MODE_PROTECTED,
 	CPU_MODE_COMPATIBILITY,		/* IA-32E mode (CS.L = 0) */
 	CPU_MODE_64BIT,			/* IA-32E mode (CS.L = 1) */
 };
 
 enum vm_paging_mode {
 	PAGING_MODE_FLAT,
 	PAGING_MODE_32,
 	PAGING_MODE_PAE,
 	PAGING_MODE_64,
 };
 
 struct vm_guest_paging {
 	uint64_t	cr3;
 	int		cpl;
 	enum vm_cpu_mode cpu_mode;
 	enum vm_paging_mode paging_mode;
 };
 
 /*
  * The data structures 'vie' and 'vie_op' are meant to be opaque to the
  * consumers of instruction decoding. The only reason why their contents
  * need to be exposed is because they are part of the 'vm_exit' structure.
  */
 struct vie_op {
 	uint8_t		op_byte;	/* actual opcode byte */
 	uint8_t		op_type;	/* type of operation (e.g. MOV) */
 	uint16_t	op_flags;
 };
 _Static_assert(sizeof(struct vie_op) == 4, "ABI");
 _Static_assert(_Alignof(struct vie_op) == 2, "ABI");
 
 #define	VIE_INST_SIZE	15
 struct vie {
 	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
 	uint8_t		num_valid;		/* size of the instruction */
+
+/* The following fields are all zeroed upon restart. */
+#define	vie_startzero	num_processed
 	uint8_t		num_processed;
 
 	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
 	uint8_t		rex_w:1,		/* REX prefix */
 			rex_r:1,
 			rex_x:1,
 			rex_b:1,
 			rex_present:1,
 			repz_present:1,		/* REP/REPE/REPZ prefix */
 			repnz_present:1,	/* REPNE/REPNZ prefix */
 			opsize_override:1,	/* Operand size override */
 			addrsize_override:1,	/* Address size override */
 			segment_override:1;	/* Segment override */
 
 	uint8_t		mod:2,			/* ModRM byte */
 			reg:4,
 			rm:4;
 
 	uint8_t		ss:2,			/* SIB byte */
 			vex_present:1,		/* VEX prefixed */
 			vex_l:1,		/* L bit */
 			index:4,		/* SIB byte */
 			base:4;			/* SIB byte */
 
 	uint8_t		disp_bytes;
 	uint8_t		imm_bytes;
 
 	uint8_t		scale;
 
 	uint8_t		vex_reg:4,		/* vvvv: first source register specifier */
 			vex_pp:2,		/* pp */
 			_sparebits:2;
 
 	uint8_t		_sparebytes[2];
 
 	int		base_register;		/* VM_REG_GUEST_xyz */
 	int		index_register;		/* VM_REG_GUEST_xyz */
 	int		segment_register;	/* VM_REG_GUEST_xyz */
 
 	int64_t		displacement;		/* optional addr displacement */
 	int64_t		immediate;		/* optional immediate operand */
 
 	uint8_t		decoded;	/* set to 1 if successfully decoded */
 
 	uint8_t		_sparebyte;
 
 	struct vie_op	op;			/* opcode description */
 };
 _Static_assert(sizeof(struct vie) == 64, "ABI");
 _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI");
 _Static_assert(__offsetof(struct vie, scale) == 24, "ABI");
 _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI");
 
 enum vm_exitcode {
 	VM_EXITCODE_INOUT,
 	VM_EXITCODE_VMX,
 	VM_EXITCODE_BOGUS,
 	VM_EXITCODE_RDMSR,
 	VM_EXITCODE_WRMSR,
 	VM_EXITCODE_HLT,
 	VM_EXITCODE_MTRAP,
 	VM_EXITCODE_PAUSE,
 	VM_EXITCODE_PAGING,
 	VM_EXITCODE_INST_EMUL,
 	VM_EXITCODE_SPINUP_AP,
 	VM_EXITCODE_DEPRECATED1,	/* used to be SPINDOWN_CPU */
 	VM_EXITCODE_RENDEZVOUS,
 	VM_EXITCODE_IOAPIC_EOI,
 	VM_EXITCODE_SUSPENDED,
 	VM_EXITCODE_INOUT_STR,
 	VM_EXITCODE_TASK_SWITCH,
 	VM_EXITCODE_MONITOR,
 	VM_EXITCODE_MWAIT,
 	VM_EXITCODE_SVM,
 	VM_EXITCODE_REQIDLE,
 	VM_EXITCODE_DEBUG,
 	VM_EXITCODE_VMINSN,
 	VM_EXITCODE_BPT,
 	VM_EXITCODE_MAX
 };
 
 struct vm_inout {
 	uint16_t	bytes:3;	/* 1 or 2 or 4 */
 	uint16_t	in:1;
 	uint16_t	string:1;
 	uint16_t	rep:1;
 	uint16_t	port;
 	uint32_t	eax;		/* valid for out */
 };
 
 struct vm_inout_str {
 	struct vm_inout	inout;		/* must be the first element */
 	struct vm_guest_paging paging;
 	uint64_t	rflags;
 	uint64_t	cr0;
 	uint64_t	index;
 	uint64_t	count;		/* rep=1 (%rcx), rep=0 (1) */
 	int		addrsize;
 	enum vm_reg_name seg_name;
 	struct seg_desc seg_desc;
 };
 
 enum task_switch_reason {
 	TSR_CALL,
 	TSR_IRET,
 	TSR_JMP,
 	TSR_IDT_GATE,	/* task gate in IDT */
 };
 
 struct vm_task_switch {
 	uint16_t	tsssel;		/* new TSS selector */
 	int		ext;		/* task switch due to external event */
 	uint32_t	errcode;
 	int		errcode_valid;	/* push 'errcode' on the new stack */
 	enum task_switch_reason reason;
 	struct vm_guest_paging paging;
 };
 
 struct vm_exit {
 	enum vm_exitcode	exitcode;
 	int			inst_length;	/* 0 means unknown */
 	uint64_t		rip;
 	union {
 		struct vm_inout	inout;
 		struct vm_inout_str inout_str;
 		struct {
 			uint64_t	gpa;
 			int		fault_type;
 		} paging;
 		struct {
 			uint64_t	gpa;
 			uint64_t	gla;
 			uint64_t	cs_base;
 			int		cs_d;		/* CS.D */
 			struct vm_guest_paging paging;
 			struct vie	vie;
 		} inst_emul;
 		/*
 		 * VMX specific payload. Used when there is no "better"
 		 * exitcode to represent the VM-exit.
 		 */
 		struct {
 			int		status;		/* vmx inst status */
 			/*
 			 * 'exit_reason' and 'exit_qualification' are valid
 			 * only if 'status' is zero.
 			 */
 			uint32_t	exit_reason;
 			uint64_t	exit_qualification;
 			/*
 			 * 'inst_error' and 'inst_type' are valid
 			 * only if 'status' is non-zero.
 			 */
 			int		inst_type;
 			int		inst_error;
 		} vmx;
 		/*
 		 * SVM specific payload.
 		 */
 		struct {
 			uint64_t	exitcode;
 			uint64_t	exitinfo1;
 			uint64_t	exitinfo2;
 		} svm;
 		struct {
 			int		inst_length;
 		} bpt;
 		struct {
 			uint32_t	code;		/* ecx value */
 			uint64_t	wval;
 		} msr;
 		struct {
 			int		vcpu;
 			uint64_t	rip;
 		} spinup_ap;
 		struct {
 			uint64_t	rflags;
 			uint64_t	intr_status;
 		} hlt;
 		struct {
 			int		vector;
 		} ioapic_eoi;
 		struct {
 			enum vm_suspend_how how;
 		} suspended;
 		struct vm_task_switch task_switch;
 	} u;
 };
 
 /* APIs to inject faults into the guest */
 void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
     int errcode);
 
 static __inline void
 vm_inject_ud(void *vm, int vcpuid)
 {
 	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
 }
 
 static __inline void
 vm_inject_gp(void *vm, int vcpuid)
 {
 	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
 }
 
 static __inline void
 vm_inject_ac(void *vm, int vcpuid, int errcode)
 {
 	vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
 }
 
 static __inline void
 vm_inject_ss(void *vm, int vcpuid, int errcode)
 {
 	vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
 }
 
 void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
 
 int vm_restart_instruction(void *vm, int vcpuid);
 
 #endif	/* _VMM_H_ */
Index: head/sys/amd64/include/vmm_instruction_emul.h
===================================================================
--- head/sys/amd64/include/vmm_instruction_emul.h	(revision 362599)
+++ head/sys/amd64/include/vmm_instruction_emul.h	(revision 362600)
@@ -1,134 +1,135 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_VMM_INSTRUCTION_EMUL_H_
 #define	_VMM_INSTRUCTION_EMUL_H_
 
 #include <sys/mman.h>
 
 /*
  * Callback functions to read and write memory regions.
  */
 typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
 				 uint64_t *rval, int rsize, void *arg);
 
 typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
 				  uint64_t wval, int wsize, void *arg);
 
 /*
  * Emulate the decoded 'vie' instruction.
  *
  * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
  * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
  * callback functions.
  *
  * 'void *vm' should be 'struct vm *' when called from kernel context and
  * 'struct vmctx *' when called from user context.
  *
  */
 int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t mrr,
     mem_region_write_t mrw, void *mrarg);
 
 int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
     uint64_t val, int size);
 
 /*
  * Returns 1 if an alignment check exception should be injected and 0 otherwise.
  */
 int vie_alignment_check(int cpl, int operand_size, uint64_t cr0,
     uint64_t rflags, uint64_t gla);
 
 /* Returns 1 if the 'gla' is not canonical and 0 otherwise. */
 int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
 
 uint64_t vie_size2mask(int size);
 
 int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
     struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot,
     uint64_t *gla);
 
 #ifdef _KERNEL
 /*
  * APIs to fetch and decode the instruction from nested page fault handler.
  *
  * 'vie' must be initialized before calling 'vmm_fetch_instruction()'
  */
 int vmm_fetch_instruction(struct vm *vm, int cpuid,
 			  struct vm_guest_paging *guest_paging,
 			  uint64_t rip, int inst_length, struct vie *vie,
 			  int *is_fault);
 
 /*
  * Translate the guest linear address 'gla' to a guest physical address.
  *
  * retval	is_fault	Interpretation
  *   0		   0		'gpa' contains result of the translation
  *   0		   1		An exception was injected into the guest
  * EFAULT	  N/A		An unrecoverable hypervisor error occurred
  */
 int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, int prot, uint64_t *gpa, int *is_fault);
 
 /*
  * Like vm_gla2gpa, but no exceptions are injected into the guest and
  * PTEs are not changed.
  */
 int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, int prot, uint64_t *gpa, int *is_fault);
 #endif /* _KERNEL */
 
+void vie_restart(struct vie *vie);
 void vie_init(struct vie *vie, const char *inst_bytes, int inst_length);
 
 /*
  * Decode the instruction fetched into 'vie' so it can be emulated.
  *
  * 'gla' is the guest linear address provided by the hardware assist
  * that caused the nested page table fault. It is used to verify that
  * the software instruction decoding is in agreement with the hardware.
  * 
  * Some hardware assists do not provide the 'gla' to the hypervisor.
  * To skip the 'gla' verification for this or any other reason pass
  * in VIE_INVALID_GLA instead.
  */
 #ifdef _KERNEL
 #define	VIE_INVALID_GLA		(1UL << 63)	/* a non-canonical address */
 int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
 			   enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);
 #else /* !_KERNEL */
 /*
  * Permit instruction decoding logic to be compiled outside of the kernel for
  * rapid iteration and validation.  No GLA validation is performed, obviously.
  */
 int vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int csd,
     struct vie *vie);
 #endif	/* _KERNEL */
 
 #endif	/* _VMM_INSTRUCTION_EMUL_H_ */
Index: head/sys/amd64/vmm/vmm_instruction_emul.c
===================================================================
--- head/sys/amd64/vmm/vmm_instruction_emul.c	(revision 362599)
+++ head/sys/amd64/vmm/vmm_instruction_emul.c	(revision 362600)
@@ -1,2922 +1,2938 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Sandvine, Inc.
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef _KERNEL
 #include <sys/param.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/vmparam.h>
 #include <machine/vmm.h>
 #else	/* !_KERNEL */
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/_iovec.h>
 
 #include <machine/vmm.h>
 
 #include <err.h>
 #include <assert.h>
 #include <stdbool.h>
+#include <stddef.h>
 #include <stdio.h>
+#include <string.h>
 #include <strings.h>
 #include <vmmapi.h>
 #define	KASSERT(exp,msg)	assert((exp))
 #define	panic(...)		errx(4, __VA_ARGS__)
 #endif	/* _KERNEL */
 
 #include <machine/vmm_instruction_emul.h>
 #include <x86/psl.h>
 #include <x86/specialreg.h>
 
 /*
  * struct vie_op.op_type
  *
  * Instruction class recorded in each entry of the opcode tables below.
  * Table slots that are not explicitly initialized are zero-filled, so they
  * read back as VIE_OP_TYPE_NONE.
  */
 enum {
 	VIE_OP_TYPE_NONE = 0,
 	VIE_OP_TYPE_MOV,
 	VIE_OP_TYPE_MOVSX,
 	VIE_OP_TYPE_MOVZX,
 	VIE_OP_TYPE_AND,
 	VIE_OP_TYPE_OR,
 	VIE_OP_TYPE_SUB,
 	VIE_OP_TYPE_TWO_BYTE,	/* type of the 0x0F entry in one_byte_opcodes */
 	VIE_OP_TYPE_PUSH,
 	VIE_OP_TYPE_CMP,
 	VIE_OP_TYPE_POP,
 	VIE_OP_TYPE_MOVS,
 	VIE_OP_TYPE_GROUP1,	/* opcodes 0x80, 0x81 and 0x83 */
 	VIE_OP_TYPE_STOS,
 	VIE_OP_TYPE_BITTEST,
 	VIE_OP_TYPE_TWOB_GRP15,	/* two-byte opcode 0xAE */
 	VIE_OP_TYPE_ADD,
 	VIE_OP_TYPE_TEST,
 	VIE_OP_TYPE_BEXTR,
 	VIE_OP_TYPE_LAST	/* presumably an end marker, not a real class */
 };
 
 /* struct vie_op.op_flags (bit flags, may be OR-ed together) */
 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
 #define	VIE_OP_F_NO_MODRM	(1 << 3)  /* presumably: no ModRM byte follows */
 #define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) /* set on the MOVS/STOS entries */
 
 /*
  * Opcode table for the three-byte (0F 38) opcode map, indexed by the final
  * opcode byte.  BEXTR (0xF7) is the only entry; every other slot is
  * zero-filled and so has op_type VIE_OP_TYPE_NONE.
  */
 static const struct vie_op three_byte_opcodes_0f38[256] = {
 	[0xF7] = {
 		.op_byte = 0xF7,
 		.op_type = VIE_OP_TYPE_BEXTR,
 	},
 };
 
 /*
  * Opcode table for two-byte opcodes, indexed by the second opcode byte
  * (presumably the byte following the 0x0F escape, which one_byte_opcodes
  * marks as VIE_OP_TYPE_TWO_BYTE).  Slots not listed here are zero-filled
  * and so have op_type VIE_OP_TYPE_NONE.
  */
 static const struct vie_op two_byte_opcodes[256] = {
 	[0xAE] = {
 		.op_byte = 0xAE,
 		.op_type = VIE_OP_TYPE_TWOB_GRP15,
 	},
 	[0xB6] = {
 		/* movzx reg, r/m8 */
 		.op_byte = 0xB6,
 		.op_type = VIE_OP_TYPE_MOVZX,
 	},
 	[0xB7] = {
 		/* movzx reg, r/m16 */
 		.op_byte = 0xB7,
 		.op_type = VIE_OP_TYPE_MOVZX,
 	},
 	[0xBA] = {
 		/* bit test group; carries an 8-bit immediate */
 		.op_byte = 0xBA,
 		.op_type = VIE_OP_TYPE_BITTEST,
 		.op_flags = VIE_OP_F_IMM8,
 	},
 	[0xBE] = {
 		/* movsx reg, r/m8 */
 		.op_byte = 0xBE,
 		.op_type = VIE_OP_TYPE_MOVSX,
 	},
 };
 
 /*
  * Decode table for single-byte opcodes, indexed by the opcode byte itself.
  * Entries are listed in ascending opcode order.  Slots that are not named
  * here are zero-filled and therefore carry VIE_OP_TYPE_NONE.
  */
 static const struct vie_op one_byte_opcodes[256] = {
 	[0x03] = {
 		.op_byte = 0x03,
 		.op_type = VIE_OP_TYPE_ADD,
 	},
 	[0x0B] = {
 		.op_byte = 0x0B,
 		.op_type = VIE_OP_TYPE_OR,
 	},
 	[0x0F] = {
 		/* escape to the two-byte opcode table */
 		.op_byte = 0x0F,
 		.op_type = VIE_OP_TYPE_TWO_BYTE
 	},
 	[0x23] = {
 		.op_byte = 0x23,
 		.op_type = VIE_OP_TYPE_AND,
 	},
 	[0x2B] = {
 		.op_byte = 0x2B,
 		.op_type = VIE_OP_TYPE_SUB,
 	},
 	[0x39] = {
 		.op_byte = 0x39,
 		.op_type = VIE_OP_TYPE_CMP,
 	},
 	[0x3B] = {
 		.op_byte = 0x3B,
 		.op_type = VIE_OP_TYPE_CMP,
 	},
 	[0x80] = {
 		/* Group 1 extended opcode */
 		.op_byte = 0x80,
 		.op_type = VIE_OP_TYPE_GROUP1,
 		.op_flags = VIE_OP_F_IMM8,
 	},
 	[0x81] = {
 		/* Group 1 extended opcode */
 		.op_byte = 0x81,
 		.op_type = VIE_OP_TYPE_GROUP1,
 		.op_flags = VIE_OP_F_IMM,
 	},
 	[0x83] = {
 		/* Group 1 extended opcode */
 		.op_byte = 0x83,
 		.op_type = VIE_OP_TYPE_GROUP1,
 		.op_flags = VIE_OP_F_IMM8,
 	},
 	[0x88] = {
 		.op_byte = 0x88,
 		.op_type = VIE_OP_TYPE_MOV,
 	},
 	[0x89] = {
 		.op_byte = 0x89,
 		.op_type = VIE_OP_TYPE_MOV,
 	},
 	[0x8A] = {
 		.op_byte = 0x8A,
 		.op_type = VIE_OP_TYPE_MOV,
 	},
 	[0x8B] = {
 		.op_byte = 0x8B,
 		.op_type = VIE_OP_TYPE_MOV,
 	},
 	[0x8F] = {
 		/* XXX Group 1A extended opcode - not just POP */
 		.op_byte = 0x8F,
 		.op_type = VIE_OP_TYPE_POP,
 	},
 	[0xA1] = {
 		.op_byte = 0xA1,
 		.op_type = VIE_OP_TYPE_MOV,
 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
 	},
 	[0xA3] = {
 		.op_byte = 0xA3,
 		.op_type = VIE_OP_TYPE_MOV,
 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
 	},
 	[0xA4] = {
 		.op_byte = 0xA4,
 		.op_type = VIE_OP_TYPE_MOVS,
 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
 	},
 	[0xA5] = {
 		.op_byte = 0xA5,
 		.op_type = VIE_OP_TYPE_MOVS,
 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
 	},
 	[0xAA] = {
 		.op_byte = 0xAA,
 		.op_type = VIE_OP_TYPE_STOS,
 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
 	},
 	[0xAB] = {
 		.op_byte = 0xAB,
 		.op_type = VIE_OP_TYPE_STOS,
 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
 	},
 	[0xC6] = {
 		/* XXX Group 11 extended opcode - not just MOV */
 		.op_byte = 0xC6,
 		.op_type = VIE_OP_TYPE_MOV,
 		.op_flags = VIE_OP_F_IMM8,
 	},
 	[0xC7] = {
 		.op_byte = 0xC7,
 		.op_type = VIE_OP_TYPE_MOV,
 		.op_flags = VIE_OP_F_IMM,
 	},
 	[0xF7] = {
 		/* XXX Group 3 extended opcode - not just TEST */
 		.op_byte = 0xF7,
 		.op_type = VIE_OP_TYPE_TEST,
 		.op_flags = VIE_OP_F_IMM,
 	},
 	[0xFF] = {
 		/* XXX Group 5 extended opcode - not just PUSH */
 		.op_byte = 0xFF,
 		.op_type = VIE_OP_TYPE_PUSH,
 	}
 };
 
 /* struct vie.mod: addressing form selected by the ModRM 'mod' field */
 #define	VIE_MOD_INDIRECT		0
 #define	VIE_MOD_INDIRECT_DISP8		1
 #define	VIE_MOD_INDIRECT_DISP32		2
 #define	VIE_MOD_DIRECT			3
 
 /* struct vie.rm: special values of the ModRM 'r/m' field */
 #define	VIE_RM_SIB			4
 #define	VIE_RM_DISP32			5
 
 /* One gibibyte, expressed in bytes. */
 #define	GB				(1024 * 1024 * 1024)
 
 /*
  * Translation from a 4-bit general purpose register number (the value kept
  * in vie->reg) to the corresponding vm_reg_name identifier.
  *
  * The table is only ever read, so it is declared const like the opcode
  * tables above.
  */
 static const enum vm_reg_name gpr_map[16] = {
 	VM_REG_GUEST_RAX,
 	VM_REG_GUEST_RCX,
 	VM_REG_GUEST_RDX,
 	VM_REG_GUEST_RBX,
 	VM_REG_GUEST_RSP,
 	VM_REG_GUEST_RBP,
 	VM_REG_GUEST_RSI,
 	VM_REG_GUEST_RDI,
 	VM_REG_GUEST_R8,
 	VM_REG_GUEST_R9,
 	VM_REG_GUEST_R10,
 	VM_REG_GUEST_R11,
 	VM_REG_GUEST_R12,
 	VM_REG_GUEST_R13,
 	VM_REG_GUEST_R14,
 	VM_REG_GUEST_R15
 };
 
 /*
  * Mask covering the low 'size' bytes of a 64-bit value, indexed by the
  * operand size in bytes.  Only indices 1, 2, 4 and 8 are populated; the
  * remaining slots are zero.
  *
  * The table is only ever read, so it is declared const like the opcode
  * tables above.
  */
 static const uint64_t size2mask[] = {
 	[1] = 0xff,
 	[2] = 0xffff,
 	[4] = 0xffffffff,
 	[8] = 0xffffffffffffffff,
 };
 
 /*
  * Fetch the current value of guest register 'reg' on 'vcpuid' into '*rval'.
  *
  * Thin wrapper around vm_get_register(); its return value is passed
  * through unchanged.
  */
 static int
 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
 {
 
 	return (vm_get_register(vm, vcpuid, reg, rval));
 }
 
 /*
  * Work out which full-width register backs the byte register named by
  * 'ModRM:reg', and whether the byte is a legacy high byte register.
  *
  * On return '*reg' is the backing register and '*lhbr' is 1 when the byte
  * lives in bits 15:8 of it, 0 when it lives in bits 7:0.
  */
 static void
 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
 {
 	int high;
 
 	/*
 	 * 64-bit mode imposes limitations on accessing legacy high byte
 	 * registers (lhbr).
 	 *
 	 * With a REX prefix the legacy high-byte registers cannot be
 	 * addressed: 'ModRM:reg' values 4, 5, 6 and 7 then select %spl,
 	 * %bpl, %sil and %dil respectively.
 	 *
 	 * Without a REX prefix the same values select the legacy high-byte
 	 * registers %ah, %ch, %dh and %bh respectively, which are backed by
 	 * registers 0 through 3.
 	 */
 	high = (!vie->rex_present && (vie->reg & 0x4) != 0);
 
 	if (high) {
 		*lhbr = 1;
 		*reg = gpr_map[vie->reg & 0x3];
 	} else {
 		*lhbr = 0;
 		*reg = gpr_map[vie->reg];
 	}
 }
 
 /*
  * Read the byte register selected by 'ModRM:reg' into '*rval'.
  *
  * Returns 0 on success, otherwise the error from vm_get_register().  On
  * failure '*rval' is left untouched: the local 'val' may never have been
  * written in that case, so it must not be read.
  */
 static int
 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
 {
 	uint64_t val;
 	int error, lhbr;
 	enum vm_reg_name reg;
 
 	vie_calc_bytereg(vie, &reg, &lhbr);
 	error = vm_get_register(vm, vcpuid, reg, &val);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * To obtain the value of a legacy high byte register shift the
 	 * base register right by 8 bits (%ah = %rax >> 8).
 	 */
 	if (lhbr)
 		*rval = val >> 8;
 	else
 		*rval = val;
 	return (0);
 }
 
 static int
 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
 {
 	uint64_t origval, val, mask;
 	int error, lhbr;
 	enum vm_reg_name reg;
 
 	vie_calc_bytereg(vie, &reg, &lhbr);
 	error = vm_get_register(vm, vcpuid, reg, &origval);
 	if (error == 0) {
 		val = byte;
 		mask = 0xff;
 		if (lhbr) {
 			/*
 			 * Shift left by 8 to store 'byte' in a legacy high
 			 * byte register.
 			 */
 			val <<= 8;
 			mask <<= 8;
 		}
 		val |= origval & ~mask;
 		error = vm_set_register(vm, vcpuid, reg, val);
 	}
 	return (error);
 }
 
 int
 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
 		    uint64_t val, int size)
 {
 	int error;
 	uint64_t origval;
 
 	switch (size) {
 	case 1:
 	case 2:
 		error = vie_read_register(vm, vcpuid, reg, &origval);
 		if (error)
 			return (error);
 		val &= size2mask[size];
 		val |= origval & ~size2mask[size];
 		break;
 	case 4:
 		val &= 0xffffffffUL;
 		break;
 	case 8:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = vm_set_register(vm, vcpuid, reg, val);
 	return (error);
 }
 
 /*
  * The RFLAGS status bits that the emulation routines below clear and then
  * recompute from the result of the emulated operation.
  */
 #define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
 
 /*
  * Return the status flags that would result from doing (x - y).
  *
  * GETCC(sz) defines getcc<sz>(), which performs the subtraction on
  * 'sz'-bit operands and then copies the flags register out via
  * pushfq/popq.  The whole flags value is returned; callers mask it down
  * to the bits they want.
  *
  * The trailing 'struct __hack' lets each instantiation below end with a
  * semicolon.
  */
 #define	GETCC(sz)							\
 static u_long								\
 getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
 {									\
 	u_long rflags;							\
 									\
 	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
 	    "=r" (rflags), "+r" (x) : "m" (y));				\
 	return (rflags);						\
 } struct __hack
 
 /* Instantiate getcc8(), getcc16(), getcc32() and getcc64(). */
 GETCC(8);
 GETCC(16);
 GETCC(32);
 GETCC(64);
 
 /*
  * Return the flags produced by (x - y) for an operand of 'opsize' bytes by
  * dispatching to the matching width-specific helper.  'opsize' must be
  * 1, 2, 4 or 8.
  */
 static u_long
 getcc(int opsize, uint64_t x, uint64_t y)
 {
 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
 	    ("getcc: invalid operand size %d", opsize));
 
 	switch (opsize) {
 	case 1:
 		return (getcc8(x, y));
 	case 2:
 		return (getcc16(x, y));
 	case 4:
 		return (getcc32(x, y));
 	default:
 		return (getcc64(x, y));
 	}
 }
 
 /*
  * Macro creation of functions getaddflags{8,16,32,64}
  *
  * Each generated function performs (x + y) on operands of the given width
  * and then copies the flags register out via pushfq/popq.  The whole flags
  * value is returned; callers mask it down to the bits they want.
  *
  * The trailing 'struct __hack' lets each instantiation below end with a
  * semicolon.
  */
 #define	GETADDFLAGS(sz)							\
 static u_long								\
 getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
 {									\
 	u_long rflags;							\
 									\
 	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
 	    "=r" (rflags), "+r" (x) : "m" (y));				\
 	return (rflags);						\
 } struct __hack
 
 GETADDFLAGS(8);
 GETADDFLAGS(16);
 GETADDFLAGS(32);
 GETADDFLAGS(64);
 
 /*
  * Return the flags produced by (x + y) for an operand of 'opsize' bytes by
  * dispatching to the matching width-specific helper.  'opsize' must be
  * 1, 2, 4 or 8.
  */
 static u_long
 getaddflags(int opsize, uint64_t x, uint64_t y)
 {
 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
 	    ("getaddflags: invalid operand size %d", opsize));
 
 	switch (opsize) {
 	case 1:
 		return (getaddflags8(x, y));
 	case 2:
 		return (getaddflags16(x, y));
 	case 4:
 		return (getaddflags32(x, y));
 	default:
 		return (getaddflags64(x, y));
 	}
 }
 
 /*
  * Return the status flags that would result from doing (x & y).
  *
  * GETANDFLAGS(sz) defines getandflags<sz>(), which performs the AND on
  * 'sz'-bit operands and then copies the flags register out via
  * pushfq/popq.  The whole flags value is returned; callers mask it down
  * to the bits they want.
  *
  * The trailing 'struct __hack' lets each instantiation below end with a
  * semicolon.
  */
 #define	GETANDFLAGS(sz)							\
 static u_long								\
 getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
 {									\
 	u_long rflags;							\
 									\
 	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
 	    "=r" (rflags), "+r" (x) : "m" (y));				\
 	return (rflags);						\
 } struct __hack
 
 /* Instantiate getandflags8/16/32/64(). */
 GETANDFLAGS(8);
 GETANDFLAGS(16);
 GETANDFLAGS(32);
 GETANDFLAGS(64);
 
 /*
  * Return the flags produced by (x & y) for an operand of 'opsize' bytes by
  * dispatching to the matching width-specific helper.  'opsize' must be
  * 1, 2, 4 or 8.
  */
 static u_long
 getandflags(int opsize, uint64_t x, uint64_t y)
 {
 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
 	    ("getandflags: invalid operand size %d", opsize));
 
 	switch (opsize) {
 	case 1:
 		return (getandflags8(x, y));
 	case 2:
 		return (getandflags16(x, y));
 	case 4:
 		return (getandflags32(x, y));
 	default:
 		return (getandflags64(x, y));
 	}
 }
 
 /*
  * Emulate the MOV variants listed in the opcode table against the memory
  * location 'gpa', using 'memread'/'memwrite' for the memory side and the
  * guest register file for the register side.
  *
  * Returns 0 on success, EINVAL for an opcode that is not handled here, or
  * the error reported by a register or memory accessor.
  */
 static int
 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	enum vm_reg_name reg;
 	uint64_t data;
 	uint8_t lowbyte;
 	int error, nbytes;
 
 	nbytes = vie->opsize;
 
 	switch (vie->op.op_byte) {
 	case 0x88:
 		/*
 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
 		 * 88/r:	mov r/m8, r8
 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
 		 */
 		nbytes = 1;	/* byte operation regardless of opsize */
 		error = vie_read_bytereg(vm, vcpuid, vie, &lowbyte);
 		if (error != 0)
 			break;
 		error = memwrite(vm, vcpuid, gpa, lowbyte, nbytes, arg);
 		break;
 	case 0x89:
 		/*
 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
 		 * 89/r:	mov r/m16, r16
 		 * 89/r:	mov r/m32, r32
 		 * REX.W + 89/r	mov r/m64, r64
 		 */
 		reg = gpr_map[vie->reg];
 		error = vie_read_register(vm, vcpuid, reg, &data);
 		if (error != 0)
 			break;
 		data &= size2mask[nbytes];
 		error = memwrite(vm, vcpuid, gpa, data, nbytes, arg);
 		break;
 	case 0x8A:
 		/*
 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
 		 * 8A/r:	mov r8, r/m8
 		 * REX + 8A/r:	mov r8, r/m8
 		 */
 		nbytes = 1;	/* byte operation regardless of opsize */
 		error = memread(vm, vcpuid, gpa, &data, nbytes, arg);
 		if (error != 0)
 			break;
 		error = vie_write_bytereg(vm, vcpuid, vie, data);
 		break;
 	case 0x8B:
 		/*
 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
 		 * 8B/r:	mov r16, r/m16
 		 * 8B/r:	mov r32, r/m32
 		 * REX.W 8B/r:	mov r64, r/m64
 		 */
 		error = memread(vm, vcpuid, gpa, &data, nbytes, arg);
 		if (error != 0)
 			break;
 		reg = gpr_map[vie->reg];
 		error = vie_update_register(vm, vcpuid, reg, data, nbytes);
 		break;
 	case 0xA1:
 		/*
 		 * MOV from seg:moffset to AX/EAX/RAX
 		 * A1:		mov AX, moffs16
 		 * A1:		mov EAX, moffs32
 		 * REX.W + A1:	mov RAX, moffs64
 		 */
 		error = memread(vm, vcpuid, gpa, &data, nbytes, arg);
 		if (error != 0)
 			break;
 		reg = VM_REG_GUEST_RAX;
 		error = vie_update_register(vm, vcpuid, reg, data, nbytes);
 		break;
 	case 0xA3:
 		/*
 		 * MOV from AX/EAX/RAX to seg:moffset
 		 * A3:		mov moffs16, AX
 		 * A3:		mov moffs32, EAX
 		 * REX.W + A3:	mov moffs64, RAX
 		 */
 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &data);
 		if (error != 0)
 			break;
 		data &= size2mask[nbytes];
 		error = memwrite(vm, vcpuid, gpa, data, nbytes, arg);
 		break;
 	case 0xC6:
 		/*
 		 * MOV from imm8 to mem (ModRM:r/m)
 		 * C6/0		mov r/m8, imm8
 		 * REX + C6/0	mov r/m8, imm8
 		 */
 		nbytes = 1;	/* byte operation regardless of opsize */
 		error = memwrite(vm, vcpuid, gpa, vie->immediate, nbytes, arg);
 		break;
 	case 0xC7:
 		/*
 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
 		 * C7/0		mov r/m16, imm16
 		 * C7/0		mov r/m32, imm32
 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
 		 */
 		data = vie->immediate & size2mask[nbytes];
 		error = memwrite(vm, vcpuid, gpa, data, nbytes, arg);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Emulate MOVZX/MOVSX from memory at 'gpa' into the register selected by
  * ModRM:reg.
  *
  * The source is read with its own width (1 or 2 bytes), extended to 64
  * bits, and then written to the destination with the instruction's
  * operand size.
  *
  * Returns 0 on success, EINVAL for an opcode that is not handled here, or
  * the error reported by 'memread' or the register update.
  */
 static int
 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	     mem_region_read_t memread, mem_region_write_t memwrite,
 	     void *arg)
 {
 	uint64_t val;
 	int error, signext, srcsize;
 
 	switch (vie->op.op_byte) {
 	case 0xB6:
 		/*
 		 * MOV and zero extend byte from mem (ModRM:r/m) to
 		 * reg (ModRM:reg).
 		 *
 		 * 0F B6/r		movzx r16, r/m8
 		 * 0F B6/r		movzx r32, r/m8
 		 * REX.W + 0F B6/r	movzx r64, r/m8
 		 */
 		srcsize = 1;
 		signext = 0;
 		break;
 	case 0xB7:
 		/*
 		 * MOV and zero extend word from mem (ModRM:r/m) to
 		 * reg (ModRM:reg).
 		 *
 		 * 0F B7/r		movzx r32, r/m16
 		 * REX.W + 0F B7/r	movzx r64, r/m16
 		 */
 		srcsize = 2;
 		signext = 0;
 		break;
 	case 0xBE:
 		/*
 		 * MOV and sign extend byte from mem (ModRM:r/m) to
 		 * reg (ModRM:reg).
 		 *
 		 * 0F BE/r		movsx r16, r/m8
 		 * 0F BE/r		movsx r32, r/m8
 		 * REX.W + 0F BE/r	movsx r64, r/m8
 		 */
 		srcsize = 1;
 		signext = 1;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/* fetch the source operand from memory */
 	error = memread(vm, vcpuid, gpa, &val, srcsize, arg);
 	if (error)
 		return (error);
 
 	/* widen it according to the instruction */
 	if (signext)
 		val = (int8_t)val;
 	else if (srcsize == 1)
 		val = (uint8_t)val;
 	else
 		val = (uint16_t)val;
 
 	/* store the result in the destination register */
 	return (vie_update_register(vm, vcpuid, gpr_map[vie->reg], val,
 	    vie->opsize));
 }
 
 /*
  * Helper function to calculate and validate a linear address.
  *
  * The address is formed from segment 'seg' and the offset held in register
  * 'gpr', then checked for being canonical and for alignment.  If any step
  * fails, the matching exception (#SS for stack-segment accesses, #GP for
  * other segments, #AC for alignment) is injected into the guest and
  * '*fault' is set to 1; otherwise '*fault' is 0 and '*gla' holds the
  * result.
  *
  * The function itself always returns 0.
  */
 static int
 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
     int opsize, int addrsize, int prot, enum vm_reg_name seg,
     enum vm_reg_name gpr, uint64_t *gla, int *fault)
 {
 	struct seg_desc desc;
 	uint64_t cr0, offset, rflags;
 	int error;
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 
 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
 	    __func__, error, seg));
 
 	error = vie_read_register(vm, vcpuid, gpr, &offset);
 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
 	    error, gpr));
 
 	/*
 	 * A segmentation failure and a non-canonical address raise the same
 	 * exception.  The canonical check runs only after the address was
 	 * calculated successfully.
 	 */
 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, offset, opsize,
 	    addrsize, prot, gla) != 0 ||
 	    vie_canonical_check(paging->cpu_mode, *gla) != 0) {
 		if (seg == VM_REG_GUEST_SS)
 			vm_inject_ss(vm, vcpuid, 0);
 		else
 			vm_inject_gp(vm, vcpuid);
 		*fault = 1;
 		return (0);
 	}
 
 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla) != 0) {
 		vm_inject_ac(vm, vcpuid, 0);
 		*fault = 1;
 		return (0);
 	}
 
 	*fault = 0;
 	return (0);
 }
 
 /*
  * Emulate one iteration of MOVS (opcodes A4/A5) where at least one side of
  * the copy is MMIO.
  *
  * 'gpa' is the guest physical address passed to 'memread'/'memwrite' for
  * the MMIO side when only one side is MMIO.  After a successful copy %rsi
  * and %rdi are stepped by the operand size (direction given by
  * RFLAGS.DF).  With a rep/repnz prefix the count register is decremented
  * and the instruction is restarted while the count is non-zero.
  *
  * Returns 0 on success and also when the guest must be resumed to handle
  * an injected fault; otherwise the error from a failing accessor.
  */
 static int
 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *arg)
 {
 #ifdef _KERNEL
 	struct vm_copyinfo copyinfo[2];
 #else
 	struct iovec copyinfo[2];
 #endif
 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
 	uint64_t rcx, rdi, rsi, rflags;
 	int error, fault, opsize, seg, repeat;
 
 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
 	val = 0;
 	error = 0;
 
 	/*
 	 * XXX although the MOVS instruction is only supposed to be used with
 	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
 	 *
 	 * Empirically the "repnz" prefix has identical behavior to "rep"
 	 * and the zero flag does not make a difference.
 	 */
 	repeat = vie->repz_present | vie->repnz_present;
 
 	if (repeat) {
 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
 
 		/*
 		 * The count register is %rcx, %ecx or %cx depending on the
 		 * address size of the instruction.
 		 */
 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
 			error = 0;
 			goto done;
 		}
 	}
 
 	/*
 	 *	Source		Destination	Comments
 	 *	--------------------------------------------
 	 * (1)  memory		memory		n/a
 	 * (2)  memory		mmio		emulated
 	 * (3)  mmio		memory		emulated
 	 * (4)  mmio		mmio		emulated
 	 *
 	 * At this point we don't have sufficient information to distinguish
 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
 	 * out because it will succeed only when operating on regular memory.
 	 *
 	 * XXX the emulation doesn't properly handle the case where 'gpa'
 	 * is straddling the boundary between the normal memory and MMIO.
 	 */
 
 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
 	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
 	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
 	if (error || fault)
 		goto done;
 
 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
 	    copyinfo, nitems(copyinfo), &fault);
 	if (error == 0) {
 		if (fault)
 			goto done;	/* Resume guest to handle fault */
 
 		/*
 		 * case (2): read from system memory and write to mmio.
 		 */
 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
 		if (error)
 			goto done;
 	} else {
 		/*
 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
 		 * if 'srcaddr' is in the mmio space.
 		 */
 
 		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
 		    &fault);
 		if (error || fault)
 			goto done;
 
 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
 		if (error == 0) {
 			if (fault)
 				goto done;    /* Resume guest to handle fault */
 
 			/*
 			 * case (3): read from MMIO and write to system memory.
 			 *
 			 * A MMIO read can have side-effects so we
 			 * commit to it only after vm_copy_setup() is
 			 * successful. If a page-fault needs to be
 			 * injected into the guest then it will happen
 			 * before the MMIO read is attempted.
 			 */
 			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
 			if (error) {
 				/*
 				 * The destination was set up successfully
 				 * above, so release it before bailing out,
 				 * as every other path does after a
 				 * successful vm_copy_setup().
 				 */
 				vm_copy_teardown(vm, vcpuid, copyinfo,
 				    nitems(copyinfo));
 				goto done;
 			}
 
 			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
 			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 		} else {
 			/*
 			 * Case (4): read from and write to mmio.
 			 *
 			 * Commit to the MMIO read/write (with potential
 			 * side-effects) only after we are sure that the
 			 * instruction is not going to be restarted due
 			 * to address translation faults.
 			 */
 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
 			    PROT_READ, &srcgpa, &fault);
 			if (error || fault)
 				goto done;
 
 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
 			   PROT_WRITE, &dstgpa, &fault);
 			if (error || fault)
 				goto done;
 
 			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
 			if (error)
 				goto done;
 
 			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
 			if (error)
 				goto done;
 		}
 	}
 
 	/* The copy succeeded: advance the index registers. */
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 
 	if (rflags & PSL_D) {
 		rsi -= opsize;
 		rdi -= opsize;
 	} else {
 		rsi += opsize;
 		rdi += opsize;
 	}
 
 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
 	    vie->addrsize);
 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
 
 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
 	    vie->addrsize);
 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
 
 	if (repeat) {
 		rcx = rcx - 1;
 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
 		    rcx, vie->addrsize);
 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
 
 		/*
 		 * Repeat the instruction if the count register is not zero.
 		 */
 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
 			vm_restart_instruction(vm, vcpuid);
 	}
 done:
 	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
 	    __func__, error));
 	return (error);
 }
 
 /*
  * Emulate one iteration of STOS (opcodes AA/AB): store the low bytes of
  * %rax to the memory location 'gpa' and step %rdi by the operand size
  * (direction given by RFLAGS.DF).
  *
  * With a rep/repnz prefix nothing is done when the count register is
  * already zero; otherwise the count is decremented and the instruction is
  * restarted while it is non-zero.
  *
  * Returns 0 on success or the error reported by 'memwrite'.
  */
 static int
 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *arg)
 {
 	uint64_t count, dest, flags, src;
 	int error, has_rep, width;
 
 	width = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
 	has_rep = vie->repz_present | vie->repnz_present;
 
 	if (has_rep) {
 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &count);
 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
 
 		/*
 		 * The count register is %rcx, %ecx or %cx depending on the
 		 * address size of the instruction.
 		 */
 		if ((count & vie_size2mask(vie->addrsize)) == 0)
 			return (0);
 	}
 
 	/* Store the accumulator to the destination. */
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &src);
 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
 
 	error = memwrite(vm, vcpuid, gpa, src, width, arg);
 	if (error)
 		return (error);
 
 	/* Step the destination index in the direction given by DF. */
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &dest);
 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &flags);
 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 
 	if ((flags & PSL_D) != 0) {
 		dest -= width;
 	} else {
 		dest += width;
 	}
 
 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, dest,
 	    vie->addrsize);
 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
 
 	if (!has_rep)
 		return (0);
 
 	count = count - 1;
 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
 	    count, vie->addrsize);
 	KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
 
 	/*
 	 * Repeat the instruction if the count register is not zero.
 	 */
 	if ((count & vie_size2mask(vie->addrsize)) != 0)
 		vm_restart_instruction(vm, vcpuid);
 
 	return (0);
 }
 
 /*
  * Emulate AND with a memory operand at 'gpa' and update the guest status
  * flags from the result.
  *
  * Returns 0 on success, EINVAL for an opcode that is not handled here, or
  * the error reported by a register or memory accessor.
  */
 static int
 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	enum vm_reg_name dstreg;
 	uint64_t anded, ccflags, lhs, newflags, rhs;
 	int error, nbytes;
 
 	nbytes = vie->opsize;
 
 	switch (vie->op.op_byte) {
 	case 0x23:
 		/*
 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
 		 * result in reg.
 		 *
 		 * 23/r		and r16, r/m16
 		 * 23/r		and r32, r/m32
 		 * REX.W + 23/r	and r64, r/m64
 		 */
 
 		/* register operand */
 		dstreg = gpr_map[vie->reg];
 		error = vie_read_register(vm, vcpuid, dstreg, &lhs);
 		if (error)
 			return (error);
 
 		/* memory operand */
 		error = memread(vm, vcpuid, gpa, &rhs, nbytes, arg);
 		if (error)
 			return (error);
 
 		/* combine and write back to the register */
 		anded = lhs & rhs;
 		error = vie_update_register(vm, vcpuid, dstreg, anded, nbytes);
 		break;
 	case 0x81:
 	case 0x83:
 		/*
 		 * AND mem (ModRM:r/m) with immediate and store the
 		 * result in mem.
 		 *
 		 * 81 /4		and r/m16, imm16
 		 * 81 /4		and r/m32, imm32
 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
 		 *
 		 * 83 /4		and r/m16, imm8 sign-extended to 16
 		 * 83 /4		and r/m32, imm8 sign-extended to 32
 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
 		 */
 
 		/* memory operand */
 		error = memread(vm, vcpuid, gpa, &lhs, nbytes, arg);
 		if (error)
 			return (error);
 
 		/*
 		 * combine with the pre-fetched immediate operand and write
 		 * the result back to memory
 		 */
 		anded = lhs & vie->immediate;
 		error = memwrite(vm, vcpuid, gpa, anded, nbytes, arg);
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (error)
 		return (error);
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &newflags);
 	if (error)
 		return (error);
 
 	/*
 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
 	 * to the result; AF is undefined.
 	 *
 	 * The updated status flags are obtained by subtracting 0 from the
 	 * result.
 	 */
 	ccflags = getcc(nbytes, anded, 0);
 	newflags &= ~RFLAGS_STATUS_BITS;
 	newflags |= ccflags & (PSL_PF | PSL_Z | PSL_N);
 
 	return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, newflags,
 	    8));
 }
 
 /*
  * Emulate OR with a memory operand at 'gpa' and update the guest status
  * flags from the result.
  *
  * Returns 0 on success, EINVAL for an opcode that is not handled here, or
  * the error reported by a register or memory accessor.
  */
 static int
 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	enum vm_reg_name dstreg;
 	uint64_t ccflags, lhs, newflags, ored, rhs;
 	int error, nbytes;
 
 	nbytes = vie->opsize;
 
 	switch (vie->op.op_byte) {
 	case 0x0B:
 		/*
 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
 		 * result in reg.
 		 *
 		 * 0b/r         or r16, r/m16
 		 * 0b/r         or r32, r/m32
 		 * REX.W + 0b/r or r64, r/m64
 		 */
 
 		/* register operand */
 		dstreg = gpr_map[vie->reg];
 		error = vie_read_register(vm, vcpuid, dstreg, &lhs);
 		if (error)
 			return (error);
 
 		/* memory operand */
 		error = memread(vm, vcpuid, gpa, &rhs, nbytes, arg);
 		if (error)
 			return (error);
 
 		/* combine and write back to the register */
 		ored = lhs | rhs;
 		error = vie_update_register(vm, vcpuid, dstreg, ored, nbytes);
 		break;
 	case 0x81:
 	case 0x83:
 		/*
 		 * OR mem (ModRM:r/m) with immediate and store the
 		 * result in mem.
 		 *
 		 * 81 /1		or r/m16, imm16
 		 * 81 /1		or r/m32, imm32
 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
 		 *
 		 * 83 /1		or r/m16, imm8 sign-extended to 16
 		 * 83 /1		or r/m32, imm8 sign-extended to 32
 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
 		 */
 
 		/* memory operand */
 		error = memread(vm, vcpuid, gpa, &lhs, nbytes, arg);
 		if (error)
 			return (error);
 
 		/*
 		 * combine with the pre-fetched immediate operand and write
 		 * the result back to memory
 		 */
 		ored = lhs | vie->immediate;
 		error = memwrite(vm, vcpuid, gpa, ored, nbytes, arg);
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (error)
 		return (error);
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &newflags);
 	if (error)
 		return (error);
 
 	/*
 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
 	 * to the result; AF is undefined.
 	 *
 	 * The updated status flags are obtained by subtracting 0 from the
 	 * result.
 	 */
 	ccflags = getcc(nbytes, ored, 0);
 	newflags &= ~RFLAGS_STATUS_BITS;
 	newflags |= ccflags & (PSL_PF | PSL_Z | PSL_N);
 
 	return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, newflags,
 	    8));
 }
 
 /*
  * Emulate CMP with a memory operand at 'gpa'.  Nothing is stored except
  * the guest status flags, which are replaced by those of
  * (first operand - second operand).
  *
  * Returns 0 on success, EINVAL for an opcode that is not handled here, or
  * the error reported by a register or memory accessor.
  */
 static int
 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	enum vm_reg_name reg;
 	uint64_t ccflags, memval, newflags, regval;
 	int error, nbytes;
 
 	nbytes = vie->opsize;
 
 	switch (vie->op.op_byte) {
 	case 0x39:
 	case 0x3B:
 		/*
 		 * 39/r		CMP r/m16, r16
 		 * 39/r		CMP r/m32, r32
 		 * REX.W 39/r	CMP r/m64, r64
 		 *
 		 * 3B/r		CMP r16, r/m16
 		 * 3B/r		CMP r32, r/m32
 		 * REX.W + 3B/r	CMP r64, r/m64
 		 *
 		 * Compare the first operand with the second operand and
 		 * set status flags in EFLAGS register. The comparison is
 		 * performed by subtracting the second operand from the first
 		 * operand and then setting the status flags.
 		 */
 
 		/* register operand */
 		reg = gpr_map[vie->reg];
 		error = vie_read_register(vm, vcpuid, reg, &regval);
 		if (error)
 			return (error);
 
 		/* memory operand */
 		error = memread(vm, vcpuid, gpa, &memval, nbytes, arg);
 		if (error)
 			return (error);
 
 		/* 3B has the register first, 39 has the memory first */
 		if (vie->op.op_byte == 0x3B)
 			ccflags = getcc(nbytes, regval, memval);
 		else
 			ccflags = getcc(nbytes, memval, regval);
 		break;
 	case 0x80:
 	case 0x81:
 	case 0x83:
 		/*
 		 * 80 /7		cmp r/m8, imm8
 		 * REX + 80 /7		cmp r/m8, imm8
 		 *
 		 * 81 /7		cmp r/m16, imm16
 		 * 81 /7		cmp r/m32, imm32
 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
 		 *
 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
 		 *
 		 * Compare mem (ModRM:r/m) with immediate and set
 		 * status flags according to the results.  The
 		 * comparison is performed by subtracting the
 		 * immediate from the first operand and then setting
 		 * the status flags.
 		 */
 		if (vie->op.op_byte == 0x80)
 			nbytes = 1;
 
 		/* memory operand */
 		error = memread(vm, vcpuid, gpa, &memval, nbytes, arg);
 		if (error)
 			return (error);
 
 		ccflags = getcc(nbytes, memval, vie->immediate);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &newflags);
 	if (error)
 		return (error);
 
 	/* Replace all status bits with those of the subtraction. */
 	newflags &= ~RFLAGS_STATUS_BITS;
 	newflags |= ccflags & RFLAGS_STATUS_BITS;
 
 	return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, newflags,
 	    8));
 }
 
 /*
  * Emulate TEST r/m, imm (opcode F7 /0) against the faulting memory operand.
  *
  * Returns 0 on success, EINVAL for any other opcode byte or ModRM:reg
  * extension, or the error reported by the register/memory accessors.
  */
 static int
 emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	uint64_t memval, flags, andflags;
 	int error, nbytes;
 
 	/*
 	 * F7 /0		test r/m16, imm16
 	 * F7 /0		test r/m32, imm32
 	 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
 	 *
 	 * Test mem (ModRM:r/m) with immediate and set status flags
 	 * according to the results.  The comparison is performed by
 	 * ANDing the immediate with the first operand and then setting
 	 * the status flags.
 	 */
 	if (vie->op.op_byte != 0xF7)
 		return (EINVAL);
 	if ((vie->reg & 7) != 0)
 		return (EINVAL);
 
 	nbytes = vie->opsize;
 	error = memread(vm, vcpuid, gpa, &memval, nbytes, arg);
 	if (error)
 		return (error);
 
 	andflags = getandflags(nbytes, memval, vie->immediate);
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &flags);
 	if (error)
 		return (error);
 
 	/*
 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
 	 * to the result; AF is undefined.
 	 */
 	flags = (flags & ~RFLAGS_STATUS_BITS) |
 	    (andflags & (PSL_PF | PSL_Z | PSL_N));
 
 	return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, flags,
 	    8));
 }
 
 /*
  * Emulate BEXTR with a memory source operand.
  *
  * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
  * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
  *
  * The destination is ModRM:reg; the sources are ModRM:r/m (memory, read
  * through 'memread') and the register named by VEX.vvvv.
  *
  * Returns 0 on success or the error reported by an accessor.
  */
 static int
 emulate_bextr(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *arg)
 {
 	uint64_t source, control, result, flags;
 	unsigned first, count, width;
 	int error, nbytes;
 
 	/*
 	 * Operand size is always 32-bit if not in 64-bit mode (W1 is
 	 * ignored).
 	 */
 	nbytes = vie->opsize;
 	if (paging->cpu_mode != CPU_MODE_64BIT && nbytes != 4)
 		nbytes = 4;
 
 	/*
 	 * Gather the inputs: the value to extract from, the control word
 	 * holding the start index and length, and the current flags.
 	 */
 	error = memread(vm, vcpuid, gpa, &source, nbytes, arg);
 	if (error)
 		return (error);
 	error = vie_read_register(vm, vcpuid, gpr_map[vie->vex_reg],
 	    &control);
 	if (error)
 		return (error);
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &flags);
 	if (error)
 		return (error);
 
 	first = (control & 0xff);
 	count = (control & 0xff00) >> 8;
 	width = nbytes * 8;
 
 	/* The destination stays zero unless some bits are extracted. */
 	result = 0;
 	if (first <= width) {
 		/* Clamp the length to what remains above the start bit. */
 		if (first + count > width)
 			count = width - first;
 		if (count != 0) {
 			result = source >> first;
 			if (count < 64)
 				result &= (1ull << count) - 1;
 		}
 	}
 
 	error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], result,
 	    nbytes);
 	if (error)
 		return (error);
 
 	/*
 	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
 	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
 	 */
 	flags &= ~RFLAGS_STATUS_BITS;
 	if (result == 0)
 		flags |= PSL_Z;
 
 	return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, flags,
 	    8));
 }
 
 /*
  * Emulate ADD r, r/m (opcode 03 /r) where r/m is the faulting memory
  * operand.
  *
  * 03/r            ADD r16, r/m16
  * 03/r            ADD r32, r/m32
  * REX.W + 03/r    ADD r64, r/m64
  *
  * The sum is written back to the register operand and the status bits of
  * %rflags are refreshed from getaddflags().
  *
  * Returns 0 on success, EINVAL for any other opcode byte, or the error
  * reported by the register/memory accessors.
  */
 static int
 emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	uint64_t regval, memval, sum, flags, addflags;
 	enum vm_reg_name dstreg;
 	int error, nbytes;
 
 	if (vie->op.op_byte != 0x03)
 		return (EINVAL);
 
 	nbytes = vie->opsize;
 
 	/* First operand: the destination register. */
 	dstreg = gpr_map[vie->reg];
 	error = vie_read_register(vm, vcpuid, dstreg, &regval);
 	if (error)
 		return (error);
 
 	/* Second operand: the memory location. */
 	error = memread(vm, vcpuid, gpa, &memval, nbytes, arg);
 	if (error)
 		return (error);
 
 	/* Store the result before touching the flags. */
 	sum = regval + memval;
 	error = vie_update_register(vm, vcpuid, dstreg, sum, nbytes);
 	if (error)
 		return (error);
 
 	addflags = getaddflags(nbytes, regval, memval);
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &flags);
 	if (error)
 		return (error);
 
 	flags = (flags & ~RFLAGS_STATUS_BITS) |
 	    (addflags & RFLAGS_STATUS_BITS);
 
 	return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, flags,
 	    8));
 }
 
 /*
  * Emulate SUB r, r/m (opcode 2B /r) where r/m is the faulting memory
  * operand.
  *
  * 2B/r            SUB r16, r/m16
  * 2B/r            SUB r32, r/m32
  * REX.W + 2B/r    SUB r64, r/m64
  *
  * The difference is written back to the register operand and the status
  * bits of %rflags are refreshed from getcc().
  *
  * Returns 0 on success, EINVAL for any other opcode byte, or the error
  * reported by the register/memory accessors.
  */
 static int
 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
 	uint64_t regval, memval, diff, flags, subflags;
 	enum vm_reg_name dstreg;
 	int error, nbytes;
 
 	if (vie->op.op_byte != 0x2B)
 		return (EINVAL);
 
 	nbytes = vie->opsize;
 
 	/* First operand: the destination register. */
 	dstreg = gpr_map[vie->reg];
 	error = vie_read_register(vm, vcpuid, dstreg, &regval);
 	if (error)
 		return (error);
 
 	/* Second operand: the memory location. */
 	error = memread(vm, vcpuid, gpa, &memval, nbytes, arg);
 	if (error)
 		return (error);
 
 	/* Store the result before touching the flags. */
 	diff = regval - memval;
 	error = vie_update_register(vm, vcpuid, dstreg, diff, nbytes);
 	if (error)
 		return (error);
 
 	subflags = getcc(nbytes, regval, memval);
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &flags);
 	if (error)
 		return (error);
 
 	flags = (flags & ~RFLAGS_STATUS_BITS) |
 	    (subflags & RFLAGS_STATUS_BITS);
 
 	return (vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, flags,
 	    8));
 }
 
 /*
  * Common implementation of PUSH and POP with a memory operand located at
  * 'mmio_gpa'.
  *
  * For a push the operand is read through 'memread' and copied out to the
  * guest stack; for a pop the value is copied in from the guest stack and
  * handed to 'memwrite'.  The direction is selected by vie->op.op_type.
  *
  * The stack address is validated with vie_calculate_gla(),
  * vie_canonical_check() and vie_alignment_check(); when one of those
  * fails the matching vm_inject_ss()/vm_inject_ac() call is made and 0 is
  * returned without touching memory or %rsp.
  *
  * Returns 0 on success (or after an exception was injected), otherwise
  * the error from vm_copy_setup() or the memread/memwrite callback.  %rsp
  * is only written back when no error occurred.
  */
 static int
 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *arg)
 {
 #ifdef _KERNEL
 	struct vm_copyinfo copyinfo[2];
 #else
 	struct iovec copyinfo[2];
 #endif
 	struct seg_desc ss_desc;
 	uint64_t cr0, rflags, rsp, stack_gla, val;
 	int error, fault, size, stackaddrsize, pushop;
 
 	val = 0;
 	size = vie->opsize;
 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
 
 	/*
 	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
 	 */
 	if (paging->cpu_mode == CPU_MODE_REAL) {
 		/*
 		 * NOTE(review): 'ss_desc' is not filled in on this path, yet
 		 * it is handed to vie_calculate_gla() below, which reads the
 		 * descriptor for every mode other than 64-bit.  Confirm that
 		 * real mode cannot actually reach this function.
 		 */
 		stackaddrsize = 2;
 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
 		/*
 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
 		 * - Stack pointer size is always 64-bits.
 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
 		 * - 16-bit PUSH/POP is supported by using the operand size
 		 *   override prefix (66H).
 		 */
 		stackaddrsize = 8;
 		size = vie->opsize_override ? 2 : 8;
 	} else {
 		/*
 		 * In protected or compatibility mode the 'B' flag in the
 		 * stack-segment descriptor determines the size of the
 		 * stack pointer.
 		 */
 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
 		    __func__, error));
 		if (SEG_DESC_DEF32(ss_desc.access))
 			stackaddrsize = 4;
 		else
 			stackaddrsize = 2;
 	}
 
 	/* Register state needed by the alignment check and the %rsp update. */
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
 	/* A push stores below the current stack pointer. */
 	if (pushop) {
 		rsp -= size;
 	}
 
 	/* Segment checks on the stack access; failure raises #SS. */
 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
 	    &stack_gla)) {
 		vm_inject_ss(vm, vcpuid, 0);
 		return (0);
 	}
 
 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
 		vm_inject_ss(vm, vcpuid, 0);
 		return (0);
 	}
 
 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
 		vm_inject_ac(vm, vcpuid, 0);
 		return (0);
 	}
 
 	/*
 	 * Set up the copy to/from the stack.  Both a hard error and a
 	 * reported guest fault end the emulation here; in the latter case
 	 * 'error' is returned as-is.
 	 */
 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
 	    &fault);
 	if (error || fault)
 		return (error);
 
 	if (pushop) {
 		/* memory operand -> stack */
 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
 		if (error == 0)
 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
 	} else {
 		/* stack -> memory operand; a pop releases the stack slot */
 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
 		rsp += size;
 	}
 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 
 	/* Commit the new stack pointer only if the transfer succeeded. */
 	if (error == 0) {
 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
 		    stackaddrsize);
 		KASSERT(error == 0, ("error %d updating rsp", error));
 	}
 	return (error);
 }
 
 /*
  * Emulate PUSH r/m.  Rejects every ModRM:reg extension other than /6 with
  * EINVAL and otherwise defers to emulate_stack_op().
  */
 static int
 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *arg)
 {
 	/*
 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
 	 *
 	 * PUSH is part of the group 5 extended opcodes and is identified
 	 * by ModRM:reg = b110.
 	 */
 	if ((vie->reg & 7) == 6) {
 		return (emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging,
 		    memread, memwrite, arg));
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Emulate POP r/m.  Rejects every ModRM:reg extension other than /0 with
  * EINVAL and otherwise defers to emulate_stack_op().
  */
 static int
 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *arg)
 {
 	/*
 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
 	 *
 	 * POP is part of the group 1A extended opcodes and is identified
 	 * by ModRM:reg = b000.
 	 */
 	if ((vie->reg & 7) == 0) {
 		return (emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging,
 		    memread, memwrite, arg));
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Dispatch a Group 1 opcode on its ModRM:reg extension.  Only OR (/1),
  * AND (/4) and CMP (/7) are handled; anything else yields EINVAL.
  */
 static int
 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *memarg)
 {
 	switch (vie->reg & 7) {
 	case 0x1:	/* OR */
 		return (emulate_or(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case 0x4:	/* AND */
 		return (emulate_and(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case 0x7:	/* CMP */
 		return (emulate_cmp(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * Emulate BT r/m, imm8 (0F BA /4): copy the selected bit of the memory
  * operand into the carry flag.
  *
  * Returns 0 on success, EINVAL for any other ModRM:reg extension, or the
  * error reported by 'memread'.
  */
 static int
 emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
 {
 	uint64_t memval, flags;
 	int error, posmask, pos;
 
 	/*
 	 * 0F BA is a Group 8 extended opcode.
 	 *
 	 * Currently we only emulate the 'Bit Test' instruction which is
 	 * identified by a ModR/M:reg encoding of 100b.
 	 */
 	if ((vie->reg & 7) != 4)
 		return (EINVAL);
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &flags);
 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 
 	error = memread(vm, vcpuid, gpa, &memval, vie->opsize, memarg);
 	if (error)
 		return (error);
 
 	/*
 	 * Intel SDM, Vol 2, Table 3-2:
 	 * "Range of Bit Positions Specified by Bit Offset Operands"
 	 *
 	 * The immediate is reduced to a bit position inside the operand.
 	 */
 	posmask = vie->opsize * 8 - 1;
 	pos = vie->immediate & posmask;
 
 	/* Mirror the selected bit into the carry flag of %rflags. */
 	flags = (memval & (1UL << pos)) ? (flags | PSL_C) : (flags & ~PSL_C);
 
 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, flags, 8);
 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
 
 	return (0);
 }
 
 /*
  * Emulate the /7 members of two-byte Group 15: CLFLUSH, CLFLUSHOPT and
  * SFENCE.  Every other ModRM:reg extension yields EINVAL.
  */
 static int
 emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
 {
 	uint64_t scratch;
 
 	if ((vie->reg & 7) != 0x7)
 		return (EINVAL);
 
 	/*
 	 * SFENCE.  Ignore it, VM exit provides enough barriers on its own.
 	 */
 	if (vie->mod == 0x3)
 		return (0);
 
 	/*
 	 * CLFLUSH, CLFLUSHOPT.  Only check for access rights by reading a
 	 * single byte; the value itself is discarded.
 	 */
 	return (memread(vm, vcpuid, gpa, &scratch, 1, memarg));
 }
 
 /*
  * Emulate a previously decoded instruction whose memory operand is at
  * guest-physical address 'gpa', using 'memread'/'memwrite' (with 'memarg')
  * to access that operand.
  *
  * Returns EINVAL if 'vie' has not been decoded or its operation type has
  * no emulation routine; otherwise returns whatever the selected emulation
  * routine returns.
  */
 int
 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
     struct vm_guest_paging *paging, mem_region_read_t memread,
     mem_region_write_t memwrite, void *memarg)
 {
 	if (!vie->decoded)
 		return (EINVAL);
 
 	switch (vie->op.op_type) {
 	case VIE_OP_TYPE_GROUP1:
 		return (emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
 		    memwrite, memarg));
 	case VIE_OP_TYPE_POP:
 		return (emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
 		    memwrite, memarg));
 	case VIE_OP_TYPE_PUSH:
 		return (emulate_push(vm, vcpuid, gpa, vie, paging, memread,
 		    memwrite, memarg));
 	case VIE_OP_TYPE_CMP:
 		return (emulate_cmp(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case VIE_OP_TYPE_MOV:
 		return (emulate_mov(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case VIE_OP_TYPE_MOVSX:
 	case VIE_OP_TYPE_MOVZX:
 		return (emulate_movx(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case VIE_OP_TYPE_MOVS:
 		return (emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
 		    memwrite, memarg));
 	case VIE_OP_TYPE_STOS:
 		return (emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
 		    memwrite, memarg));
 	case VIE_OP_TYPE_AND:
 		return (emulate_and(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case VIE_OP_TYPE_OR:
 		return (emulate_or(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case VIE_OP_TYPE_SUB:
 		return (emulate_sub(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case VIE_OP_TYPE_BITTEST:
 		return (emulate_bittest(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case VIE_OP_TYPE_TWOB_GRP15:
 		return (emulate_twob_group15(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case VIE_OP_TYPE_ADD:
 		return (emulate_add(vm, vcpuid, gpa, vie, memread,
 		    memwrite, memarg));
 	case VIE_OP_TYPE_TEST:
 		return (emulate_test(vm, vcpuid, gpa, vie,
 		    memread, memwrite, memarg));
 	case VIE_OP_TYPE_BEXTR:
 		return (emulate_bextr(vm, vcpuid, gpa, vie, paging,
 		    memread, memwrite, memarg));
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * Decide whether an access of 'size' bytes at linear address 'gla' fails
  * the alignment check.
  *
  * Returns 1 only when cpl is 3, CR0.AM and RFLAGS.AC are both set and
  * 'gla' is not a multiple of 'size'; returns 0 otherwise.
  */
 int
 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
 {
 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
 	    ("%s: invalid size %d", __func__, size));
 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
 
 	/* Checking is armed only with all three conditions in place. */
 	if (cpl == 3 && (cr0 & CR0_AM) != 0 && (rf & PSL_AC) != 0)
 		return ((gla & (size - 1)) != 0);
 
 	return (0);
 }
 
 /*
  * Check whether 'gla' is a non-canonical address.
  *
  * Returns non-zero if the address is not canonical.  Outside 64-bit mode
  * the check is skipped and 0 is always returned.
  */
 int
 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
 {
 	uint64_t himask, hibits, wanted;
 
 	if (cpu_mode != CPU_MODE_64BIT)
 		return (0);
 
 	/*
 	 * The value of the bit 47 in the 'gla' should be replicated in the
 	 * most significant 16 bits.
 	 */
 	himask = ~((1UL << 48) - 1);
 	hibits = gla & himask;
 	wanted = (gla & (1UL << 47)) ? himask : 0;
 
 	return (hibits != wanted);
 }
 
 /*
  * Look up the value mask for an operand of 'size' bytes (1, 2, 4 or 8) in
  * the size2mask[] table.
  */
 uint64_t
 vie_size2mask(int size)
 {
 	uint64_t mask;
 
 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
 	    ("vie_size2mask: invalid size %d", size));
 
 	mask = size2mask[size];
 	return (mask);
 }
 
 /*
  * Compute the guest linear address for an access of 'length' bytes at
  * 'offset' within segment 'seg'.
  *
  * Outside 64-bit mode the access is validated against the segment
  * descriptor 'desc': the descriptor must be usable, the segment type must
  * permit the requested 'prot' (PROT_READ and/or PROT_WRITE), and every
  * byte of the access must lie within the segment limits.  In 64-bit mode
  * none of these checks are made and 'desc' is consulted only for the base
  * of %fs and %gs.
  *
  * 'addrsize' is the effective address size in bytes; the offset is
  * truncated to it before the segment base is added.
  *
  * Returns 0 and stores the result in '*gla' on success.  Returns -1 if
  * one of the segment checks fails, in which case '*gla' is not written.
  */
 int
 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
     int prot, uint64_t *gla)
 {
 	uint64_t firstoff, low_limit, high_limit, segbase;
 	int glasize, type;
 
 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
 	    ("%s: invalid segment %d", __func__, seg));
 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
 	    ("%s: invalid operand size %d", __func__, length));
 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
 	    ("%s: invalid prot %#x", __func__, prot));
 
 	/* 'offset' is advanced by the limit check; keep the start value. */
 	firstoff = offset;
 	if (cpu_mode == CPU_MODE_64BIT) {
 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
 		glasize = 8;
 	} else {
 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
 		glasize = 4;
 		/*
 		 * If the segment selector is loaded with a NULL selector
 		 * then the descriptor is unusable and attempting to use
 		 * it results in a #GP(0).
 		 */
 		if (SEG_DESC_UNUSABLE(desc->access))
 			return (-1);
 
 		/* 
 		 * The processor generates a #NP exception when a segment
 		 * register is loaded with a selector that points to a
 		 * descriptor that is not present. If this was the case then
 		 * it would have been checked before the VM-exit.
 		 */
 		KASSERT(SEG_DESC_PRESENT(desc->access),
 		    ("segment %d not present: %#x", seg, desc->access));
 
 		/*
 		 * The descriptor type must indicate a code/data segment.
 		 */
 		type = SEG_DESC_TYPE(desc->access);
 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
 		    "descriptor type %#x", seg, type));
 
 		if (prot & PROT_READ) {
 			/* #GP on a read access to a exec-only code segment */
 			if ((type & 0xA) == 0x8)
 				return (-1);
 		}
 
 		if (prot & PROT_WRITE) {
 			/*
 			 * #GP on a write access to a code segment or a
 			 * read-only data segment.
 			 */
 			if (type & 0x8)			/* code segment */
 				return (-1);
 
 			if ((type & 0xA) == 0)		/* read-only data seg */
 				return (-1);
 		}
 
 		/*
 		 * 'desc->limit' is fully expanded taking granularity into
 		 * account.
 		 */
 		if ((type & 0xC) == 0x4) {
 			/* expand-down data segment */
 			low_limit = desc->limit + 1;
 			high_limit = SEG_DESC_DEF32(desc->access) ?
 			    0xffffffff : 0xffff;
 		} else {
 			/* code segment or expand-up data segment */
 			low_limit = 0;
 			high_limit = desc->limit;
 		}
 
 		/*
 		 * Check each byte of the access individually; the offset is
 		 * re-truncated to the address size on every step so that it
 		 * wraps around within that size.
 		 */
 		while (length > 0) {
 			offset &= vie_size2mask(addrsize);
 			if (offset < low_limit || offset > high_limit)
 				return (-1);
 			offset++;
 			length--;
 		}
 	}
 
 	/*
 	 * In 64-bit mode all segments except %fs and %gs have a segment
 	 * base address of 0.
 	 */
 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
 	    seg != VM_REG_GUEST_GS) {
 		segbase = 0;
 	} else {
 		segbase = desc->base;
 	}
 
 	/*
 	 * Truncate 'firstoff' to the effective address size before adding
 	 * it to the segment base.
 	 */
 	firstoff &= vie_size2mask(addrsize);
 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
 	return (0);
 }
 
+/*
+ * Prepare a partially decoded vie for a 2nd attempt.
+ *
+ * Everything from 'vie_startzero' to the end of the structure is zeroed and
+ * the base, index and segment register fields are reset to VM_REG_LAST.  The
+ * fetched instruction bytes ('inst') and their count ('num_valid') are laid
+ * out before 'vie_startzero' (see the static assertion below) and are
+ * therefore left untouched.
+ */
 void
-vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
+vie_restart(struct vie *vie)
 {
-	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
-	    ("%s: invalid instruction length (%d)", __func__, inst_length));
+	_Static_assert(
+	    offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) &&
+	    offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero),
+	    "restart should not erase instruction length or contents");
 
-	bzero(vie, sizeof(struct vie));
+	memset((char *)vie + offsetof(struct vie, vie_startzero), 0,
+	    sizeof(*vie) - offsetof(struct vie, vie_startzero));
 
 	vie->base_register = VM_REG_LAST;
 	vie->index_register = VM_REG_LAST;
 	vie->segment_register = VM_REG_LAST;
+}
 
-	if (inst_length) {
-		bcopy(inst_bytes, vie->inst, inst_length);
-		vie->num_valid = inst_length;
-	}
+void
+vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
+{
+	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
+	    ("%s: invalid instruction length (%d)", __func__, inst_length));
+
+	vie_restart(vie);
+	memset(vie->inst, 0, sizeof(vie->inst));
+	if (inst_length != 0)
+		memcpy(vie->inst, inst_bytes, inst_length);
+	vie->num_valid = inst_length;
 }
 
 #ifdef _KERNEL
 /*
  * Assemble a page-fault error code from the properties of the failed
  * access: P if the entry was valid, W for a write, U for user mode, RSV
  * for a reserved-bit violation and I for an instruction fetch.
  */
 static int
 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
 {
 	int code;
 
 	code = (pte & PG_V) ? PGEX_P : 0;
 	code |= (prot & VM_PROT_WRITE) ? PGEX_W : 0;
 	code |= usermode ? PGEX_U : 0;
 	code |= rsvd ? PGEX_RSV : 0;
 	code |= (prot & VM_PROT_EXECUTE) ? PGEX_I : 0;
 
 	return (code);
 }
 
 /*
  * Release the page held through '*cookie', if any, and clear the cookie
  * so that a repeated call is a no-op.
  */
 static void
 ptp_release(void **cookie)
 {
 	if (*cookie == NULL)
 		return;
 
 	vm_gpa_release(*cookie);
 	*cookie = NULL;
 }
 
 /*
  * Drop whatever '*cookie' currently holds and then hold 'len' bytes at
  * guest-physical address 'ptpphys' with VM_PROT_RW.  Returns the pointer
  * obtained from vm_gpa_hold().
  */
 static void *
 ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
 {
 	ptp_release(cookie);
 
 	return (vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie));
 }
 
 /*
  * Translate guest linear address 'gla' to a guest physical address by
  * walking the guest's page tables as described by 'paging'.
  *
  * 'prot' is the access being translated; a write additionally requires
  * PG_RW at every level and, unless 'check_only' is set, causes the dirty
  * bit to be set in the final entry.
  *
  * When 'check_only' is true the walk is side-effect free towards the
  * guest: no exception is injected and the accessed/dirty bits are left
  * alone.
  *
  * Return value and outputs:
  *   0, *guest_fault == 0   translation succeeded, result in '*gpa'
  *   0, *guest_fault == 1   the translation failed the canonical or
  *                          page-table checks ('*gpa' is not written)
  *   EFAULT                 a page-table page could not be held
  */
 static int
 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
 {
 	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
 	u_int retries;
 	uint64_t *ptpbase, ptpphys, pte, pgsize;
 	uint32_t *ptpbase32, pte32;
 	void *cookie;
 
 	*guest_fault = 0;
 
 	usermode = (paging->cpl == 3 ? 1 : 0);
 	writable = prot & VM_PROT_WRITE;
 	cookie = NULL;
 	retval = 0;
 	retries = 0;
 	/*
 	 * The walk starts over from the root whenever an accessed/dirty bit
 	 * update loses its compare-and-set; from the second pass onwards
 	 * maybe_yield() is called first.
 	 */
 restart:
 	ptpphys = paging->cr3;		/* root of the page tables */
 	ptp_release(&cookie);
 	if (retries++ > 0)
 		maybe_yield();
 
 	if (vie_canonical_check(paging->cpu_mode, gla)) {
 		/*
 		 * XXX assuming a non-stack reference otherwise a stack fault
 		 * should be generated.
 		 */
 		if (!check_only)
 			vm_inject_gp(vm, vcpuid);
 		goto fault;
 	}
 
 	/* No paging: linear and physical addresses coincide. */
 	if (paging->paging_mode == PAGING_MODE_FLAT) {
 		*gpa = gla;
 		goto done;
 	}
 
 	/* Two-level walk over 32-bit entries, 10 index bits per level. */
 	if (paging->paging_mode == PAGING_MODE_32) {
 		nlevels = 2;
 		while (--nlevels >= 0) {
 			/* Zero out the lower 12 bits. */
 			ptpphys &= ~0xfff;
 
 			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
 			    &cookie);
 
 			if (ptpbase32 == NULL)
 				goto error;
 
 			ptpshift = PAGE_SHIFT + nlevels * 10;
 			ptpindex = (gla >> ptpshift) & 0x3FF;
 			pgsize = 1UL << ptpshift;
 
 			pte32 = ptpbase32[ptpindex];
 
 			if ((pte32 & PG_V) == 0 ||
 			    (usermode && (pte32 & PG_U) == 0) ||
 			    (writable && (pte32 & PG_RW) == 0)) {
 				if (!check_only) {
 					pfcode = pf_error_code(usermode, prot, 0,
 					    pte32);
 					vm_inject_pf(vm, vcpuid, pfcode, gla);
 				}
 				goto fault;
 			}
 
 			/*
 			 * Emulate the x86 MMU's management of the accessed
 			 * and dirty flags. While the accessed flag is set
 			 * at every level of the page table, the dirty flag
 			 * is only set at the last level providing the guest
 			 * physical address.
 			 */
 			if (!check_only && (pte32 & PG_A) == 0) {
 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
 				    pte32, pte32 | PG_A) == 0) {
 					goto restart;
 				}
 			}
 
 			/* XXX must be ignored if CR4.PSE=0 */
 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
 				break;
 
 			ptpphys = pte32;
 		}
 
 		/* Set the dirty bit in the page table entry if necessary */
 		if (!check_only && writable && (pte32 & PG_M) == 0) {
 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
 			    pte32, pte32 | PG_M) == 0) {
 				goto restart;
 			}
 		}
 
 		/* Zero out the lower 'ptpshift' bits */
 		pte32 >>= ptpshift; pte32 <<= ptpshift;
 		*gpa = pte32 | (gla & (pgsize - 1));
 		goto done;
 	}
 
 	/*
 	 * PAE first consults the 4-entry table addressed by %cr3 and then
 	 * shares the 64-bit entry walk below with two levels remaining.
 	 * Every other paging mode reaching this point walks four levels.
 	 */
 	if (paging->paging_mode == PAGING_MODE_PAE) {
 		/* Zero out the lower 5 bits and the upper 32 bits */
 		ptpphys &= 0xffffffe0UL;
 
 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
 		    &cookie);
 		if (ptpbase == NULL)
 			goto error;
 
 		ptpindex = (gla >> 30) & 0x3;
 
 		pte = ptpbase[ptpindex];
 
 		if ((pte & PG_V) == 0) {
 			if (!check_only) {
 				pfcode = pf_error_code(usermode, prot, 0, pte);
 				vm_inject_pf(vm, vcpuid, pfcode, gla);
 			}
 			goto fault;
 		}
 
 		ptpphys = pte;
 
 		nlevels = 2;
 	} else
 		nlevels = 4;
 	while (--nlevels >= 0) {
 		/* Zero out the lower 12 bits and the upper 12 bits */
 		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
 
 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
 		if (ptpbase == NULL)
 			goto error;
 
 		ptpshift = PAGE_SHIFT + nlevels * 9;
 		ptpindex = (gla >> ptpshift) & 0x1FF;
 		pgsize = 1UL << ptpshift;
 
 		pte = ptpbase[ptpindex];
 
 		if ((pte & PG_V) == 0 ||
 		    (usermode && (pte & PG_U) == 0) ||
 		    (writable && (pte & PG_RW) == 0)) {
 			if (!check_only) {
 				pfcode = pf_error_code(usermode, prot, 0, pte);
 				vm_inject_pf(vm, vcpuid, pfcode, gla);
 			}
 			goto fault;
 		}
 
 		/* Set the accessed bit in the page table entry */
 		if (!check_only && (pte & PG_A) == 0) {
 			if (atomic_cmpset_64(&ptpbase[ptpindex],
 			    pte, pte | PG_A) == 0) {
 				goto restart;
 			}
 		}
 
 		/*
 		 * A large-page entry ends the walk early.  PG_PS on a
 		 * mapping bigger than 1GB is reported as a reserved-bit
 		 * page fault.
 		 */
 		if (nlevels > 0 && (pte & PG_PS) != 0) {
 			if (pgsize > 1 * GB) {
 				if (!check_only) {
 					pfcode = pf_error_code(usermode, prot, 1,
 					    pte);
 					vm_inject_pf(vm, vcpuid, pfcode, gla);
 				}
 				goto fault;
 			}
 			break;
 		}
 
 		ptpphys = pte;
 	}
 
 	/* Set the dirty bit in the page table entry if necessary */
 	if (!check_only && writable && (pte & PG_M) == 0) {
 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
 			goto restart;
 	}
 
 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
 	*gpa = pte | (gla & (pgsize - 1));
 	/* Common exit: drop the held page-table page, if any. */
 done:
 	ptp_release(&cookie);
 	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
 	    __func__, retval));
 	return (retval);
 	/* A page-table page could not be held. */
 error:
 	retval = EFAULT;
 	goto done;
 	/* The guest-visible translation failed; retval stays 0. */
 fault:
 	*guest_fault = 1;
 	goto done;
 }
 
 /*
  * Translate 'gla' to a guest physical address via _vm_gla2gpa() with
  * check_only disabled.
  */
 int
 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
 {
 	int error;
 
 	error = _vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
 	    false);
 	return (error);
 }
 
 /*
  * Translate 'gla' to a guest physical address via _vm_gla2gpa() with
  * check_only enabled.
  */
 int
 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
 {
 	int error;
 
 	error = _vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
 	    true);
 	return (error);
 }
 
 int
 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
 {
 	struct vm_copyinfo copyinfo[2];
 	int error, prot;
 
 	if (inst_length > VIE_INST_SIZE)
 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
 
 	prot = PROT_READ | PROT_EXEC;
 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
 	    copyinfo, nitems(copyinfo), faultptr);
 	if (error || *faultptr)
 		return (error);
 
 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 	vie->num_valid = inst_length;
 	return (0);
 }
 #endif	/* _KERNEL */
 
 /*
  * Read the next unprocessed instruction byte into '*x' without consuming
  * it.  Returns 0 on success or -1 when all valid bytes have already been
  * processed, in which case '*x' is not written.
  */
 static int
 vie_peek(struct vie *vie, uint8_t *x)
 {
 	if (vie->num_processed >= vie->num_valid)
 		return (-1);
 
 	*x = vie->inst[vie->num_processed];
 	return (0);
 }
 
 /*
  * Consume one instruction byte.
  */
 static void
 vie_advance(struct vie *vie)
 {
 	vie->num_processed += 1;
 }
 
 /*
  * Map a segment-override prefix byte to its segment register.
  *
  * Returns true and stores the register in '*seg' when 'x' is one of the
  * six override prefixes; returns false and leaves '*seg' alone otherwise.
  */
 static bool
 segment_override(uint8_t x, int *seg)
 {
 	int segreg;
 
 	switch (x) {
 	case 0x26:
 		segreg = VM_REG_GUEST_ES;
 		break;
 	case 0x2E:
 		segreg = VM_REG_GUEST_CS;
 		break;
 	case 0x36:
 		segreg = VM_REG_GUEST_SS;
 		break;
 	case 0x3E:
 		segreg = VM_REG_GUEST_DS;
 		break;
 	case 0x64:
 		segreg = VM_REG_GUEST_FS;
 		break;
 	case 0x65:
 		segreg = VM_REG_GUEST_GS;
 		break;
 	default:
 		return (false);
 	}
 
 	*seg = segreg;
 	return (true);
 }
 
 /*
  * Consume the prefix bytes of the instruction held in 'vie' and derive
  * the effective operand and address sizes.
  *
  * Handled are the legacy prefixes (0x66, 0x67, 0xF2, 0xF3 and the segment
  * overrides), a REX prefix in 64-bit mode, and the 3-byte VEX prefix
  * (0xC4) in 64-bit and compatibility mode.  For a VEX-encoded instruction
  * the opcode byte is consumed here as well and 'vie->op' is filled in.
  *
  * 'cs_d' selects 32-bit (non-zero) or 16-bit (zero) default sizes outside
  * 64-bit mode.
  *
  * Returns 0 on success.  Returns -1 if the instruction bytes run out, the
  * VEX opcode map is not supported, or the VEX opcode has no table entry.
  */
 static int
 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
 {
 	uint8_t x;
 
 	/* Legacy prefixes may appear in any number and order. */
 	while (1) {
 		if (vie_peek(vie, &x))
 			return (-1);
 
 		if (x == 0x66)
 			vie->opsize_override = 1;
 		else if (x == 0x67)
 			vie->addrsize_override = 1;
 		else if (x == 0xF3)
 			vie->repz_present = 1;
 		else if (x == 0xF2)
 			vie->repnz_present = 1;
 		else if (segment_override(x, &vie->segment_register))
 			vie->segment_override = 1;
 		else
 			break;
 
 		vie_advance(vie);
 	}
 
 	/*
 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
 	 * - Only one REX prefix is allowed per instruction.
 	 * - The REX prefix must immediately precede the opcode byte or the
 	 *   escape opcode byte.
 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
 	 *   the mandatory prefix must come before the REX prefix.
 	 */
 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
 		vie->rex_present = 1;
 		vie->rex_w = x & 0x8 ? 1 : 0;
 		vie->rex_r = x & 0x4 ? 1 : 0;
 		vie->rex_x = x & 0x2 ? 1 : 0;
 		vie->rex_b = x & 0x1 ? 1 : 0;
 		vie_advance(vie);
 	}
 
 	/*
 	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
 	 *
 	 * NOTE(review): 'x' is not re-read after a REX byte is consumed, so
 	 * the test below still sees the REX value and a 0xC4 byte following
 	 * REX is not treated as a VEX prefix here.  Presumably intentional;
 	 * confirm.
 	 */
 	if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
 	    && x == 0xC4) {
 		const struct vie_op *optab;
 
 		/* 3-byte VEX prefix. */
 		vie->vex_present = 1;
 
 		vie_advance(vie);
 		if (vie_peek(vie, &x))
 			return (-1);
 
 		/*
 		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
 		 * relative to REX encoding.
 		 */
 		vie->rex_r = x & 0x80 ? 0 : 1;
 		vie->rex_x = x & 0x40 ? 0 : 1;
 		vie->rex_b = x & 0x20 ? 0 : 1;
 
 		/* The low five bits select the opcode map. */
 		switch (x & 0x1F) {
 		case 0x2:
 			/* 0F 38. */
 			optab = three_byte_opcodes_0f38;
 			break;
 		case 0x1:
 			/* 0F class - nothing handled here yet. */
 			/* FALLTHROUGH */
 		case 0x3:
 			/* 0F 3A class - nothing handled here yet. */
 			/* FALLTHROUGH */
 		default:
 			/* Reserved (#UD). */
 			return (-1);
 		}
 
 		vie_advance(vie);
 		if (vie_peek(vie, &x))
 			return (-1);
 
 		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
 		vie->rex_w = x & 0x80 ? 1 : 0;
 
 		/* vvvv is stored inverted in the prefix; undo that here. */
 		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
 		vie->vex_l = !!(x & 0x4);
 		vie->vex_pp = (x & 0x3);
 
 		/* PP: 1=66 2=F3 3=F2 prefixes. */
 		switch (vie->vex_pp) {
 		case 0x1:
 			vie->opsize_override = 1;
 			break;
 		case 0x2:
 			vie->repz_present = 1;
 			break;
 		case 0x3:
 			vie->repnz_present = 1;
 			break;
 		}
 
 		vie_advance(vie);
 
 		/* Opcode, sans literal prefix. */
 		if (vie_peek(vie, &x))
 			return (-1);
 
 		vie->op = optab[x];
 		if (vie->op.op_type == VIE_OP_TYPE_NONE)
 			return (-1);
 
 		vie_advance(vie);
 	}
 
 	/*
 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
 	 */
 	if (cpu_mode == CPU_MODE_64BIT) {
 		/*
 		 * Default address size is 64-bits and default operand size
 		 * is 32-bits.
 		 */
 		vie->addrsize = vie->addrsize_override ? 4 : 8;
 		if (vie->rex_w)
 			vie->opsize = 8;
 		else if (vie->opsize_override)
 			vie->opsize = 2;
 		else
 			vie->opsize = 4;
 	} else if (cs_d) {
 		/* Default address and operand sizes are 32-bits */
 		vie->addrsize = vie->addrsize_override ? 2 : 4;
 		vie->opsize = vie->opsize_override ? 2 : 4;
 	} else {
 		/* Default address and operand sizes are 16-bits */
 		vie->addrsize = vie->addrsize_override ? 4 : 2;
 		vie->opsize = vie->opsize_override ? 4 : 2;
 	}
 	return (0);
 }
 
 /*
  * Decode the second byte of a two-byte opcode.
  *
  * The next instruction byte is looked up in two_byte_opcodes[] and the
  * entry is stored in vie->op.  Returns 0 and advances past the byte on
  * success; returns -1 if vie_peek() fails or the table entry has type
  * VIE_OP_TYPE_NONE.
  */
 static int
 decode_two_byte_opcode(struct vie *vie)
 {
 	uint8_t opbyte;
 
 	if (vie_peek(vie, &opbyte) != 0)
 		return (-1);
 
 	vie->op = two_byte_opcodes[opbyte];
 	if (vie->op.op_type != VIE_OP_TYPE_NONE) {
 		vie_advance(vie);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Decode the primary opcode byte.
  *
  * If vie->op has already been filled in (the VEX prefix path does this)
  * no byte is consumed.  Otherwise the byte is looked up in
  * one_byte_opcodes[]; an entry of type VIE_OP_TYPE_TWO_BYTE hands off to
  * decode_two_byte_opcode().  Returns 0 on success and -1 if vie_peek()
  * fails or the table entry has type VIE_OP_TYPE_NONE.
  */
 static int
 decode_opcode(struct vie *vie)
 {
 	uint8_t opbyte;
 
 	if (vie_peek(vie, &opbyte) != 0)
 		return (-1);
 
 	/* Already did this via VEX prefix. */
 	if (vie->op.op_type != VIE_OP_TYPE_NONE)
 		return (0);
 
 	vie->op = one_byte_opcodes[opbyte];
 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
 		return (-1);
 
 	vie_advance(vie);
 
 	return (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE ?
 	    decode_two_byte_opcode(vie) : 0);
 }
 
 /*
  * Decode the ModRM byte, if the opcode has one.
  *
  * Fills in vie->mod, vie->rm and vie->reg (with the REX.B / REX.R
  * extension bits applied) and, for the forms without a SIB byte, also
  * selects vie->base_register and vie->disp_bytes.  When a SIB byte
  * follows, those two fields are left untouched here.
  *
  * Returns 0 on success (including opcodes flagged VIE_OP_F_NO_MODRM, for
  * which nothing is consumed) and -1 when the CPU is in real mode, when
  * vie_peek() fails, or when the byte encodes register-direct addressing.
  */
 static int
 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
 {
 	uint8_t x;
 
 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
 		return (0);
 
 	if (cpu_mode == CPU_MODE_REAL)
 		return (-1);
 
 	if (vie_peek(vie, &x))
 		return (-1);
 
 	/* ModRM layout: mod[7:6], reg[5:3], r/m[2:0]. */
 	vie->mod = (x >> 6) & 0x3;
 	vie->rm =  (x >> 0) & 0x7;
 	vie->reg = (x >> 3) & 0x7;
 
 	/*
 	 * A direct addressing mode makes no sense in the context of an EPT
 	 * fault. There has to be a memory access involved to cause the
 	 * EPT fault.
 	 */
 	if (vie->mod == VIE_MOD_DIRECT)
 		return (-1);
 
 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
 		/*
 		 * Table 2-5: Special Cases of REX Encodings
 		 *
 		 * mod=0, r/m=5 is used in the compatibility mode to
 		 * indicate a disp32 without a base register.
 		 *
 		 * mod!=3, r/m=4 is used in the compatibility mode to
 		 * indicate that the SIB byte is present.
 		 *
 		 * The 'b' bit in the REX prefix is don't care in
 		 * this case.
 		 */
 	} else {
 		vie->rm |= (vie->rex_b << 3);
 	}
 
 	vie->reg |= (vie->rex_r << 3);
 
 	/* SIB */
 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
 		goto done;
 
 	vie->base_register = gpr_map[vie->rm];
 
 	switch (vie->mod) {
 	case VIE_MOD_INDIRECT_DISP8:
 		vie->disp_bytes = 1;
 		break;
 	case VIE_MOD_INDIRECT_DISP32:
 		vie->disp_bytes = 4;
 		break;
 	case VIE_MOD_INDIRECT:
 		if (vie->rm == VIE_RM_DISP32) {
 			vie->disp_bytes = 4;
 			/*
 			 * Table 2-7. RIP-Relative Addressing
 			 *
 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
 			 * whereas in compatibility mode it just implies disp32.
 			 */
 
 			if (cpu_mode == CPU_MODE_64BIT)
 				vie->base_register = VM_REG_GUEST_RIP;
 			else
 				vie->base_register = VM_REG_LAST;
 		}
 		break;
 	}
 
 done:
 	/* Consume the ModRM byte itself. */
 	vie_advance(vie);
 
 	return (0);
 }
 
 /*
  * Decode the SIB byte when the ModRM byte called for one.
  *
  * Fills in vie->ss, vie->index and vie->base (with REX.X / REX.B applied),
  * then derives vie->disp_bytes, vie->base_register, vie->index_register
  * and vie->scale from them.  Returns 0 on success, or when no SIB byte is
  * present, and -1 if vie_peek() fails.
  */
 static int
 decode_sib(struct vie *vie)
 {
 	uint8_t sib;
 
 	/* Nothing to do unless ModRM selected a SIB byte. */
 	if (vie->rm != VIE_RM_SIB || vie->mod == VIE_MOD_DIRECT)
 		return (0);
 
 	if (vie_peek(vie, &sib) != 0)
 		return (-1);
 
 	/* SIB layout: ss[7:6], index[5:3], base[2:0]; REX.X/REX.B extend. */
 	vie->ss = (sib >> 6) & 0x3;
 	vie->index = ((sib >> 3) & 0x7) | (vie->rex_x << 3);
 	vie->base = ((sib >> 0) & 0x7) | (vie->rex_b << 3);
 
 	if (vie->mod == VIE_MOD_INDIRECT_DISP8)
 		vie->disp_bytes = 1;
 	else if (vie->mod == VIE_MOD_INDIRECT_DISP32)
 		vie->disp_bytes = 4;
 
 	if (vie->mod != VIE_MOD_INDIRECT ||
 	    (vie->base != 5 && vie->base != 13)) {
 		vie->base_register = gpr_map[vie->base];
 	} else {
 		/*
 		 * With mod = 0 and base = %rbp or %r13 there is no base
 		 * register; a 32-bit displacement follows instead.
 		 *
 		 * Documented in:
 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
 		 * Table 2-5: Special Cases of REX Encodings
 		 */
 		vie->disp_bytes = 4;
 	}
 
 	/*
 	 * An index encoding of 4 (%rsp) means "no index register"; every
 	 * other encoding is valid (same tables as above).
 	 */
 	if (vie->index != 4)
 		vie->index_register = gpr_map[vie->index];
 
 	/* The scale factor only matters when an index register is in use. */
 	if (vie->index_register < VM_REG_LAST)
 		vie->scale = 1 << vie->ss;
 
 	vie_advance(vie);
 
 	return (0);
 }
 
 /*
  * Read the displacement that follows ModRM/SIB, if any.
  *
  * vie->disp_bytes (0, 1 or 4) says how many bytes to consume; any other
  * value is a decoder bug and panics.  The bytes are stored in the order
  * they appear in the instruction stream and read back through the signed
  * union member, so the value placed in vie->displacement is sign-extended.
  * Returns 0 on success and -1 if vie_peek() fails part-way through.
  */
 static int
 decode_displacement(struct vie *vie)
 {
 	union {
 		char	buf[4];
 		int8_t	signed8;
 		int32_t	signed32;
 	} disp;
 	uint8_t byte;
 	int idx, nbytes;
 
 	nbytes = vie->disp_bytes;
 	if (nbytes == 0)
 		return (0);
 
 	if (nbytes != 1 && nbytes != 4)
 		panic("decode_displacement: invalid disp_bytes %d", nbytes);
 
 	idx = 0;
 	while (idx < nbytes) {
 		if (vie_peek(vie, &byte) != 0)
 			return (-1);
 
 		disp.buf[idx++] = byte;
 		vie_advance(vie);
 	}
 
 	/* Either member is sign-extended by the assignment. */
 	vie->displacement = (nbytes == 1) ? disp.signed8 : disp.signed32;
 
 	return (0);
 }
 
 /*
  * Read the immediate operand, if the opcode takes one.
  *
  * The size comes from the opcode flags: VIE_OP_F_IMM selects 4 bytes for
  * 32/64-bit operand sizes and 2 bytes otherwise, VIE_OP_F_IMM8 selects a
  * single byte.  The value stored in vie->immediate is sign-extended.
  * Returns 0 on success (or when there is no immediate) and -1 if
  * vie_peek() fails part-way through.
  */
 static int
 decode_immediate(struct vie *vie)
 {
 	union {
 		char	buf[4];
 		int8_t	signed8;
 		int16_t	signed16;
 		int32_t	signed32;
 	} imm;
 	uint8_t byte;
 	int idx, nbytes;
 
 	if ((vie->op.op_flags & VIE_OP_F_IMM) != 0) {
 		/*
 		 * Section 2.2.1.5 "Immediates", Intel SDM:
 		 * In 64-bit mode the typical size of immediate operands
 		 * remains 32-bits. When the operand size is 64-bits, the
 		 * processor sign-extends all immediates to 64-bits prior
 		 * to their use.
 		 */
 		vie->imm_bytes = (vie->opsize == 4 || vie->opsize == 8) ? 4 : 2;
 	} else if ((vie->op.op_flags & VIE_OP_F_IMM8) != 0) {
 		vie->imm_bytes = 1;
 	}
 
 	nbytes = vie->imm_bytes;
 	if (nbytes == 0)
 		return (0);
 
 	KASSERT(nbytes == 1 || nbytes == 2 || nbytes == 4,
 	    ("%s: invalid number of immediate bytes: %d", __func__, nbytes));
 
 	for (idx = 0; idx < nbytes; idx++) {
 		if (vie_peek(vie, &byte) != 0)
 			return (-1);
 
 		imm.buf[idx] = byte;
 		vie_advance(vie);
 	}
 
 	/* Pick the member matching the width so the value is sign-extended. */
 	switch (nbytes) {
 	case 1:
 		vie->immediate = imm.signed8;
 		break;
 	case 2:
 		vie->immediate = imm.signed16;
 		break;
 	default:
 		vie->immediate = imm.signed32;
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Read the direct memory offset of a "moffset" MOV.
  *
  * Only opcodes flagged VIE_OP_F_MOFFSET carry one.  Its width equals the
  * instruction's address size (2, 4 or 8 bytes); the upper bytes of the
  * result are zero because the accumulator is cleared first.  The offset is
  * stored in vie->displacement.  Returns 0 on success (or when the opcode
  * has no moffset) and -1 if vie_peek() fails part-way through.
  */
 static int
 decode_moffset(struct vie *vie)
 {
 	union {
 		char	buf[8];
 		uint64_t u64;
 	} off;
 	uint8_t byte;
 	int idx, nbytes;
 
 	if (!(vie->op.op_flags & VIE_OP_F_MOFFSET))
 		return (0);
 
 	/*
 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
 	 * The memory offset size follows the address-size of the instruction.
 	 */
 	nbytes = vie->addrsize;
 	KASSERT(nbytes == 2 || nbytes == 4 || nbytes == 8,
 	    ("invalid moffset bytes: %d", nbytes));
 
 	off.u64 = 0;
 	idx = 0;
 	while (idx < nbytes) {
 		if (vie_peek(vie, &byte) != 0)
 			return (-1);
 
 		off.buf[idx++] = byte;
 		vie_advance(vie);
 	}
 
 	vie->displacement = off.u64;
 	return (0);
 }
 
 #ifdef _KERNEL
 /*
  * Verify that the 'guest linear address' provided as collateral of the nested
  * page table fault matches with our instruction decoding.
  *
  * The address is recomputed as
  *	segment base + base register + scale * index register + displacement
  * truncated to the instruction's address size, and compared with 'gla'.
  *
  * Returns 0 when the two agree or when 'gla' is VIE_INVALID_GLA (nothing
  * to check).  Returns -1 on a mismatch, or when a register or segment
  * descriptor cannot be read; a diagnostic is printed in each failure case.
  */
 static int
 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
     enum vm_cpu_mode cpu_mode)
 {
 	int error;
 	uint64_t base, segbase, idx, gla2;
 	enum vm_reg_name seg;
 	struct seg_desc desc;
 
 	/* Skip 'gla' verification */
 	if (gla == VIE_INVALID_GLA)
 		return (0);
 
 	base = 0;
 	if (vie->base_register != VM_REG_LAST) {
 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
 		if (error) {
 			printf("verify_gla: error %d getting base reg %d\n",
 				error, vie->base_register);
 			return (-1);
 		}
 
 		/*
 		 * RIP-relative addressing starts from the following
 		 * instruction
 		 */
 		if (vie->base_register == VM_REG_GUEST_RIP)
 			base += vie->num_processed;
 	}
 
 	idx = 0;
 	if (vie->index_register != VM_REG_LAST) {
 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
 		if (error) {
 			printf("verify_gla: error %d getting index reg %d\n",
 				error, vie->index_register);
 			return (-1);
 		}
 	}
 
 	/*
 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
 	 *
 	 * In 64-bit mode, segmentation is generally (but not
 	 * completely) disabled.  The exceptions are the FS and GS
 	 * segments.
 	 *
 	 * In legacy IA-32 mode, when the ESP or EBP register is used
 	 * as the base, the SS segment is the default segment.  For
 	 * other data references, except when relative to stack or
 	 * string destination the DS segment is the default.  These
 	 * can be overridden to allow other segments to be accessed.
 	 */
 	if (vie->segment_override)
 		seg = vie->segment_register;
 	else if (vie->base_register == VM_REG_GUEST_RSP ||
 	    vie->base_register == VM_REG_GUEST_RBP)
 		seg = VM_REG_GUEST_SS;
 	else
 		seg = VM_REG_GUEST_DS;
 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
 	    seg != VM_REG_GUEST_GS) {
 		segbase = 0;
 	} else {
 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
 		if (error) {
 			/*
 			 * Report the segment that was actually looked up.
 			 * vie->segment_register is only the segment in use
 			 * when an override prefix was present.
 			 */
 			printf("verify_gla: error %d getting segment"
 			       " descriptor %d\n", error, seg);
 			return (-1);
 		}
 		segbase = desc.base;
 	}
 
 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
 	gla2 &= size2mask[vie->addrsize];
 	if (gla != gla2) {
 		printf("verify_gla mismatch: segbase(0x%0lx)"
 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
 		       segbase, base, vie->scale, idx, vie->displacement,
 		       gla, gla2);
 		return (-1);
 	}
 
 	return (0);
 }
 #endif	/* _KERNEL */
 
 /*
  * Decode the instruction bytes held in 'vie'.
  *
  * The decode stages run in the order the fields appear in the instruction
  * stream (prefixes, opcode, ModRM, SIB, displacement, immediate, moffset)
  * and decoding stops at the first stage that fails.  In the kernel build
  * the decoded effective address is additionally checked against 'gla'
  * unless the opcode is flagged VIE_OP_F_NO_GLA_VERIFICATION.
  *
  * Returns 0 and sets vie->decoded on success, -1 otherwise.
  */
 int
 #ifdef _KERNEL
 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
 #else
 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
 #endif
 {
 
 	/* Short-circuit evaluation keeps the stages strictly ordered. */
 	if (decode_prefixes(vie, cpu_mode, cs_d) != 0 ||
 	    decode_opcode(vie) != 0 ||
 	    decode_modrm(vie, cpu_mode) != 0 ||
 	    decode_sib(vie) != 0 ||
 	    decode_displacement(vie) != 0 ||
 	    decode_immediate(vie) != 0 ||
 	    decode_moffset(vie) != 0)
 		return (-1);
 
 #ifdef _KERNEL
 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0 &&
 	    verify_gla(vm, cpuid, gla, vie, cpu_mode) != 0)
 		return (-1);
 #endif
 
 	vie->decoded = 1;	/* success */
 
 	return (0);
 }
Index: head/usr.sbin/bhyve/bhyverun.c
===================================================================
--- head/usr.sbin/bhyve/bhyverun.c	(revision 362599)
+++ head/usr.sbin/bhyve/bhyverun.c	(revision 362600)
@@ -1,1430 +1,1445 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #ifndef WITHOUT_CAPSICUM
 #include <sys/capsicum.h>
 #endif
 #include <sys/mman.h>
 #ifdef BHYVE_SNAPSHOT
 #include <sys/socket.h>
 #include <sys/stat.h>
 #endif
 #include <sys/time.h>
 #ifdef BHYVE_SNAPSHOT
 #include <sys/un.h>
 #endif
 
 #include <amd64/vmm/intel/vmcs.h>
 
 #include <machine/atomic.h>
 #include <machine/segments.h>
 
 #ifndef WITHOUT_CAPSICUM
 #include <capsicum_helpers.h>
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <err.h>
 #include <errno.h>
 #ifdef BHYVE_SNAPSHOT
 #include <fcntl.h>
 #endif
 #include <libgen.h>
 #include <unistd.h>
 #include <assert.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include <sysexits.h>
 #include <stdbool.h>
 #include <stdint.h>
 #ifdef BHYVE_SNAPSHOT
 #include <ucl.h>
 #include <unistd.h>
 
 #include <libxo/xo.h>
 #endif
 
 #include <machine/vmm.h>
 #ifndef WITHOUT_CAPSICUM
 #include <machine/vmm_dev.h>
 #endif
+#include <machine/vmm_instruction_emul.h>
 #include <vmmapi.h>
 
 #include "bhyverun.h"
 #include "acpi.h"
 #include "atkbdc.h"
 #include "bootrom.h"
 #include "inout.h"
 #include "dbgport.h"
 #include "debug.h"
 #include "fwctl.h"
 #include "gdb.h"
 #include "ioapic.h"
 #include "kernemu_dev.h"
 #include "mem.h"
 #include "mevent.h"
 #include "mptbl.h"
 #include "pci_emul.h"
 #include "pci_irq.h"
 #include "pci_lpc.h"
 #include "smbiostbl.h"
 #ifdef BHYVE_SNAPSHOT
 #include "snapshot.h"
 #endif
 #include "xmsr.h"
 #include "spinup_ap.h"
 #include "rtc.h"
 #include "vmgenc.h"
 
 #define GUEST_NIO_PORT		0x488	/* guest upcalls via i/o port */
 
 #define MB		(1024UL * 1024)
 #define GB		(1024UL * MB)
 
 /*
  * Human-readable descriptions of VMX exit reasons, indexed by the
  * EXIT_REASON_* value.  Indices without an initializer are NULL;
  * vmexit_vmx_desc() reports those (and out-of-range values) as "Unknown".
  */
 static const char * const vmx_exit_reason_desc[] = {
 	[EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
 	[EXIT_REASON_EXT_INTR] = "External interrupt",
 	[EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
 	[EXIT_REASON_INIT] = "INIT signal",
 	[EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
 	[EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
 	[EXIT_REASON_SMI] = "Other SMI",
 	[EXIT_REASON_INTR_WINDOW] = "Interrupt window",
 	[EXIT_REASON_NMI_WINDOW] = "NMI window",
 	[EXIT_REASON_TASK_SWITCH] = "Task switch",
 	[EXIT_REASON_CPUID] = "CPUID",
 	[EXIT_REASON_GETSEC] = "GETSEC",
 	[EXIT_REASON_HLT] = "HLT",
 	[EXIT_REASON_INVD] = "INVD",
 	[EXIT_REASON_INVLPG] = "INVLPG",
 	[EXIT_REASON_RDPMC] = "RDPMC",
 	[EXIT_REASON_RDTSC] = "RDTSC",
 	[EXIT_REASON_RSM] = "RSM",
 	[EXIT_REASON_VMCALL] = "VMCALL",
 	[EXIT_REASON_VMCLEAR] = "VMCLEAR",
 	[EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
 	[EXIT_REASON_VMPTRLD] = "VMPTRLD",
 	[EXIT_REASON_VMPTRST] = "VMPTRST",
 	[EXIT_REASON_VMREAD] = "VMREAD",
 	[EXIT_REASON_VMRESUME] = "VMRESUME",
 	[EXIT_REASON_VMWRITE] = "VMWRITE",
 	[EXIT_REASON_VMXOFF] = "VMXOFF",
 	[EXIT_REASON_VMXON] = "VMXON",
 	[EXIT_REASON_CR_ACCESS] = "Control-register accesses",
 	[EXIT_REASON_DR_ACCESS] = "MOV DR",
 	[EXIT_REASON_INOUT] = "I/O instruction",
 	[EXIT_REASON_RDMSR] = "RDMSR",
 	[EXIT_REASON_WRMSR] = "WRMSR",
 	[EXIT_REASON_INVAL_VMCS] =
 	    "VM-entry failure due to invalid guest state",
 	[EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
 	[EXIT_REASON_MWAIT] = "MWAIT",
 	[EXIT_REASON_MTF] = "Monitor trap flag",
 	[EXIT_REASON_MONITOR] = "MONITOR",
 	[EXIT_REASON_PAUSE] = "PAUSE",
 	[EXIT_REASON_MCE_DURING_ENTRY] =
 	    "VM-entry failure due to machine-check event",
 	[EXIT_REASON_TPR] = "TPR below threshold",
 	[EXIT_REASON_APIC_ACCESS] = "APIC access",
 	[EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
 	[EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
 	[EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
 	[EXIT_REASON_EPT_FAULT] = "EPT violation",
 	[EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
 	[EXIT_REASON_INVEPT] = "INVEPT",
 	[EXIT_REASON_RDTSCP] = "RDTSCP",
 	[EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
 	[EXIT_REASON_INVVPID] = "INVVPID",
 	[EXIT_REASON_WBINVD] = "WBINVD",
 	[EXIT_REASON_XSETBV] = "XSETBV",
 	[EXIT_REASON_APIC_WRITE] = "APIC write",
 	[EXIT_REASON_RDRAND] = "RDRAND",
 	[EXIT_REASON_INVPCID] = "INVPCID",
 	[EXIT_REASON_VMFUNC] = "VMFUNC",
 	[EXIT_REASON_ENCLS] = "ENCLS",
 	[EXIT_REASON_RDSEED] = "RDSEED",
 	[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
 	[EXIT_REASON_XSAVES] = "XSAVES",
 	[EXIT_REASON_XRSTORS] = "XRSTORS"
 };
 
 /* Signature shared by every entry of the handler[] dispatch table. */
 typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
 extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
 
 const char *vmname;
 
 /* CPU count and topology; set by topology_parse(). */
 int guest_ncpus;
 uint16_t cores, maxcpus, sockets, threads;
 
 char *guest_uuid_str;
 
 int raw_stdio = 0;
 
 static int gdb_port = 0;
 /* Returned by fbsdrun_vmexit_on_hlt() / fbsdrun_vmexit_on_pause(). */
 static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
 /* Returned by fbsdrun_virtio_msix(). */
 static int virtio_msix = 1;
 static int x2apic_mode = 0;	/* default is xAPIC */
 
 /* Passed through to emulate_inout() by vmexit_inout(). */
 static int strictio;
 /* When set, a failed MSR emulation injects #GP into the guest. */
 static int strictmsr = 1;
 
 static int acpi;
 
 static char *progname;
 static const int BSP = 0;
 
 /* vcpus added by fbsdrun_addcpu() and not yet removed by fbsdrun_deletecpu(). */
 static cpuset_t cpumask;
 
 static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
 
 /* Per-vcpu exit information filled in by vm_run() in vm_loop(). */
 static struct vm_exit vmexit[VM_MAXCPU];
 
 /* Event counters bumped by the corresponding vmexit_*() handlers. */
 struct bhyvestats {
 	uint64_t	vmexit_bogus;
 	uint64_t	vmexit_reqidle;
 	uint64_t	vmexit_hlt;
 	uint64_t	vmexit_pause;
 	uint64_t	vmexit_mtrap;
 	uint64_t	vmexit_inst_emul;
 	uint64_t	cpu_switch_rotate;
 	uint64_t	cpu_switch_direct;
 } stats;
 
 /* Per-vcpu thread start arguments, handed to fbsdrun_start_thread(). */
 struct mt_vmm_info {
 	pthread_t	mt_thr;
 	struct vmctx	*mt_ctx;
 	int		mt_vcpu;	
 } mt_vmm_info[VM_MAXCPU];
 
 /* Host CPU sets from pincpu_parse(); NULL means the vcpu is not pinned. */
 static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
 
 /*
  * Print the command-line synopsis to stderr and exit with status 'code'.
  * Continuation lines of the synopsis are padded to the width of the
  * program name so that they line up under the options.
  */
 static void
 usage(int code)
 {
 	int pad;
 
 	pad = (int)strlen(progname);
 
 	fprintf(stderr,
 		"Usage: %s [-abehuwxACHPSWY]\n"
 		"       %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
 		"       %*s [-g <gdb port>] [-l <lpc>]\n"
 		"       %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
 		"       -a: local apic is in xAPIC mode (deprecated)\n"
 		"       -A: create ACPI tables\n"
 		"       -c: number of cpus and/or topology specification\n"
 		"       -C: include guest memory in core file\n"
 		"       -e: exit on unhandled I/O access\n"
 		"       -g: gdb port\n"
 		"       -h: help\n"
 		"       -H: vmexit from the guest on hlt\n"
 		"       -l: LPC device configuration\n"
 		"       -m: memory size in MB\n"
 #ifdef BHYVE_SNAPSHOT
 		"       -r: path to checkpoint file\n"
 #endif
 		"       -p: pin 'vcpu' to 'hostcpu'\n"
 		"       -P: vmexit from the guest on pause\n"
 		"       -s: <slot,driver,configinfo> PCI slot config\n"
 		"       -S: guest memory cannot be swapped\n"
 		"       -u: RTC keeps UTC time\n"
 		"       -U: uuid\n"
 		"       -w: ignore unimplemented MSRs\n"
 		"       -W: force virtio to use single-vector MSI\n"
 		"       -x: local apic is in x2APIC mode\n"
 		"       -Y: disable MPtable generation\n",
 		progname, pad, "", pad, "", pad, "");
 
 	exit(code);
 }
 
 /*
  * Parse the argument of -c: a comma-separated list of
  * [cpus=]n, sockets=n, cores=n and threads=n tokens.
  *
  * On success sets guest_ncpus, sockets, cores and threads and returns 0.
  * Returns -1 on a malformed token, an out-of-range value, an inconsistent
  * cpu count, or if the option string cannot be duplicated.
  *
  * XXX This parser is known to have the following issues:
  * 1.  It accepts null key=value tokens ",,".
  * 2.  It accepts whitespace after = and before value.
  * 3.  Values out of range of INT are silently wrapped.
  * 4.  It doesn't check non-final values.
  * 5.  The apparently bogus limits of UINT16_MAX are for future expansion.
  *
  * The acceptance of a null specification ('-c ""') is by design to match the
  * manual page syntax specification, this results in a topology of 1 vCPU.
  */
 static int
 topology_parse(const char *opt)
 {
 	uint64_t ncpus;
 	int c, chk, n, s, t, tmp;
 	char *cp, *str, *tofree;
 	bool ns, scts;
 
 	c = 1, n = 1, s = 1, t = 1;
 	ns = false, scts = false;
 
 	/*
 	 * strsep() advances 'str' through the buffer and leaves it NULL at
 	 * the end, so the pointer returned by strdup() is kept separately
 	 * for free().
 	 */
 	tofree = str = strdup(opt);
 	if (str == NULL)
 		goto out;
 
 	while ((cp = strsep(&str, ",")) != NULL) {
 		if (sscanf(cp, "%i%n", &tmp, &chk) == 1) {
 			n = tmp;
 			ns = true;
 		} else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) {
 			n = tmp;
 			ns = true;
 		} else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) {
 			s = tmp;
 			scts = true;
 		} else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) {
 			c = tmp;
 			scts = true;
 		} else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) {
 			t = tmp;
 			scts = true;
 #ifdef notyet  /* Do not expose this until vmm.ko implements it */
 		} else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) {
 			m = tmp;
 #endif
 		/* Skip the empty argument case from -c "" */
 		} else if (cp[0] == '\0')
 			continue;
 		else
 			goto out;
 		/* Any trailing garbage causes an error */
 		if (cp[chk] != '\0')
 			goto out;
 	}
 	free(tofree);
 	tofree = NULL;
 
 	/*
 	 * Range check 1 <= n <= UINT16_MAX all values
 	 */
 	if (n < 1 || s < 1 || c < 1 || t < 1 ||
 	    n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX  ||
 	    t > UINT16_MAX)
 		return (-1);
 
 	/* If only the cpus was specified, use that as sockets */
 	if (!scts)
 		s = n;
 	/*
 	 * Compute sockets * cores * threads avoiding overflow
 	 * The range check above ensures these are 16 bit values
 	 * If n was specified check it against computed ncpus
 	 */
 	ncpus = (uint64_t)s * c * t;
 	if (ncpus > UINT16_MAX || (ns && (uint64_t)n != ncpus))
 		return (-1);
 
 	guest_ncpus = ncpus;
 	sockets = s;
 	cores = c;
 	threads = t;
 	return(0);
 
 out:
 	free(tofree);
 	return (-1);
 }
 
 static int
 pincpu_parse(const char *opt)
 {
 	int vcpu, pcpu;
 
 	if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
 		fprintf(stderr, "invalid format: %s\n", opt);
 		return (-1);
 	}
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU) {
 		fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
 		    vcpu, VM_MAXCPU - 1);
 		return (-1);
 	}
 
 	if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
 		fprintf(stderr, "hostcpu '%d' outside valid range from "
 		    "0 to %d\n", pcpu, CPU_SETSIZE - 1);
 		return (-1);
 	}
 
 	if (vcpumap[vcpu] == NULL) {
 		if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) {
 			perror("malloc");
 			return (-1);
 		}
 		CPU_ZERO(vcpumap[vcpu]);
 	}
 	CPU_SET(pcpu, vcpumap[vcpu]);
 	return (0);
 }
 
 /*
  * Inject exception 'vector' into 'vcpu', asking for the faulting
  * instruction to be restarted.  'arg' is the struct vmctx of the VM.
  * Failure to inject is treated as fatal (assert).
  */
 void
 vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
     int errcode)
 {
 	struct vmctx *ctx = arg;
 	const int restart_instruction = 1;
 	int error;
 
 	error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
 	    restart_instruction);
 	assert(error == 0);
 }
 
 /*
  * Translate the guest physical range [gaddr, gaddr + len) to a host
  * pointer; a thin wrapper around vm_map_gpa(), whose result is returned
  * unchanged.
  */
 void *
 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
 {
 
 	return (vm_map_gpa(ctx, gaddr, len));
 }
 
 #ifdef BHYVE_SNAPSHOT
 /*
  * Reverse of paddr_guest2host(): map a host pointer back to a guest
  * physical address via vm_rev_map_gpa().  Only built with BHYVE_SNAPSHOT.
  */
 uintptr_t
 paddr_host2guest(struct vmctx *ctx, void *addr)
 {
 	return (vm_rev_map_gpa(ctx, addr));
 }
 #endif
 
 /* Return the current value of the guest_vmexit_on_pause setting. */
 int
 fbsdrun_vmexit_on_pause(void)
 {
 
 	return (guest_vmexit_on_pause);
 }
 
 /* Return the current value of the guest_vmexit_on_hlt setting. */
 int
 fbsdrun_vmexit_on_hlt(void)
 {
 
 	return (guest_vmexit_on_hlt);
 }
 
 /* Return the current value of the virtio_msix setting (defaults to 1). */
 int
 fbsdrun_virtio_msix(void)
 {
 
 	return (virtio_msix);
 }
 
 /*
  * Thread entry point for a vcpu.
  *
  * 'param' is the vcpu's struct mt_vmm_info.  The thread names itself
  * "vcpu N", registers with the checkpoint and gdb code where enabled, and
  * then enters vm_loop() starting at the RIP stored in vmexit[vcpu].  The
  * process exits if vm_loop() ever returns.
  */
 static void *
 fbsdrun_start_thread(void *param)
 {
 	struct mt_vmm_info *info = param;
 	const int vcpu = info->mt_vcpu;
 	char tname[MAXCOMLEN + 1];
 
 	snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
 	pthread_set_name_np(info->mt_thr, tname);
 
 #ifdef BHYVE_SNAPSHOT
 	checkpoint_cpu_add(vcpu);
 #endif
 	if (gdb_port != 0)
 		gdb_cpu_add(vcpu);
 
 	vm_loop(info->mt_ctx, vcpu, vmexit[vcpu].rip);
 
 	/* not reached */
 	exit(1);
 	return (NULL);
 }
 
 void
 fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
 {
 	int error;
 
 	assert(fromcpu == BSP);
 
 	/*
 	 * The 'newcpu' must be activated in the context of 'fromcpu'. If
 	 * vm_activate_cpu() is delayed until newcpu's pthread starts running
 	 * then vmm.ko is out-of-sync with bhyve and this can create a race
 	 * with vm_suspend().
 	 */
 	error = vm_activate_cpu(ctx, newcpu);
 	if (error != 0)
 		err(EX_OSERR, "could not activate CPU %d", newcpu);
 
 	CPU_SET_ATOMIC(newcpu, &cpumask);
 
 	/*
 	 * Set up the vmexit struct to allow execution to start
 	 * at the given RIP
 	 */
 	vmexit[newcpu].rip = rip;
 	vmexit[newcpu].inst_length = 0;
 
 	mt_vmm_info[newcpu].mt_ctx = ctx;
 	mt_vmm_info[newcpu].mt_vcpu = newcpu;
 
 	error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
 	    fbsdrun_start_thread, &mt_vmm_info[newcpu]);
 	assert(error == 0);
 }
 
 /*
  * Remove 'vcpu' from the set of running vcpus.  Returns non-zero when the
  * set is empty afterwards.  Exits the process with status 4 if the vcpu
  * was not in the set.
  */
 static int
 fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
 {
 
 	if (CPU_ISSET(vcpu, &cpumask)) {
 		CPU_CLR_ATOMIC(vcpu, &cpumask);
 		return (CPU_EMPTY(&cpumask));
 	}
 
 	fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
 	exit(4);
 }
 
 /*
  * Handle a guest write to GUEST_NIO_PORT (called from vmexit_inout()).
  * Currently a placeholder: it does nothing and always returns
  * VMEXIT_CONTINUE.
  */
 static int
 vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
 		     uint32_t eax)
 {
 #if BHYVE_DEBUG
 	/*
 	 * put guest-driven debug here
 	 */
 #endif
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handle an I/O port exit (VM_EXITCODE_INOUT / VM_EXITCODE_INOUT_STR).
  *
  * Writes to GUEST_NIO_PORT are routed to vmexit_handle_notify(); every
  * other access goes to emulate_inout().  Returns VMEXIT_CONTINUE when the
  * access was handled, or VMEXIT_ABORT after logging the unhandled access
  * together with the RIP of the exit being processed.
  */
 static int
 vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 {
 	int error;
 	int bytes, port, in, out;
 	int vcpu;
 
 	vcpu = *pvcpu;
 
 	port = vme->u.inout.port;
 	bytes = vme->u.inout.bytes;
 	in = vme->u.inout.in;
 	out = !in;
 
 	/* Extra-special case of host notifications */
 	if (out && port == GUEST_NIO_PORT) {
 		error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
 		return (error);
 	}
 
 	error = emulate_inout(ctx, vcpu, vme, strictio);
 	if (error) {
 		/*
 		 * Report the RIP from this exit ('vme').  The file-scope
 		 * vmexit[] array decays to &vmexit[0], so using it here
 		 * would print vcpu 0's RIP whichever vcpu did the access.
 		 */
 		fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
 		    in ? "in" : "out",
 		    bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
 		    port, vme->rip);
 		return (VMEXIT_ABORT);
 	} else {
 		return (VMEXIT_CONTINUE);
 	}
 }
 
 static int
 vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 {
 	uint64_t val;
 	uint32_t eax, edx;
 	int error;
 
 	val = 0;
 	error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
 	if (error != 0) {
 		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
 		    vme->u.msr.code, *pvcpu);
 		if (strictmsr) {
 			vm_inject_gp(ctx, *pvcpu);
 			return (VMEXIT_CONTINUE);
 		}
 	}
 
 	eax = val;
 	error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
 	assert(error == 0);
 
 	edx = val >> 32;
 	error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
 	assert(error == 0);
 
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handle a guest WRMSR.  A failed emulate_wrmsr() is logged and, with
  * strictmsr set, answered with a #GP; otherwise the write is dropped.
  * Always returns VMEXIT_CONTINUE.
  */
 static int
 vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 {
 
 	if (emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,
 	    vme->u.msr.wval) != 0) {
 		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
 		    vme->u.msr.code, vme->u.msr.wval, *pvcpu);
 		if (strictmsr)
 			vm_inject_gp(ctx, *pvcpu);
 	}
 
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handle VM_EXITCODE_SPINUP_AP by passing the vcpu and RIP recorded in the
  * exit to spinup_ap().  The result of spinup_ap() is deliberately ignored;
  * always returns VMEXIT_CONTINUE.
  */
 static int
 vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 {
 
 	(void)spinup_ap(ctx, *pvcpu,
 		    vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
 
 	return (VMEXIT_CONTINUE);
 }
 
 #define	DEBUG_EPT_MISCONFIG
 #ifdef DEBUG_EPT_MISCONFIG
 #define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400
 
 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
 static int ept_misconfig_ptenum;
 #endif
 
 /*
  * Map a VMX exit reason to its description from vmx_exit_reason_desc[],
  * or "Unknown" when the value is out of range or has no entry.
  */
 static const char *
 vmexit_vmx_desc(uint32_t exit_reason)
 {
 	const char *desc;
 
 	if (exit_reason < nitems(vmx_exit_reason_desc)) {
 		desc = vmx_exit_reason_desc[exit_reason];
 		if (desc != NULL)
 			return (desc);
 	}
 	return ("Unknown");
 }
 
 /*
  * Handler for VM_EXITCODE_VMX: dump the exit details to stderr and return
  * VMEXIT_ABORT.  For an EPT misconfiguration exit (when built with
  * DEBUG_EPT_MISCONFIG) the faulting guest physical address and the page
  * table entries returned by vm_get_gpa_pmap() are printed as well.
  */
 static int
 vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	fprintf(stderr, "vm exit[%d]\n", *pvcpu);
 	fprintf(stderr, "\treason\t\tVMX\n");
 	fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
 	fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
 	fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
 	fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason,
 	    vmexit_vmx_desc(vmexit->u.vmx.exit_reason));
 	fprintf(stderr, "\tqualification\t0x%016lx\n",
 	    vmexit->u.vmx.exit_qualification);
 	fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
 	fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
 #ifdef DEBUG_EPT_MISCONFIG
 	if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
 		/* Return values are not checked: this is best-effort output. */
 		vm_get_register(ctx, *pvcpu,
 		    VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
 		    &ept_misconfig_gpa);
 		vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
 		    &ept_misconfig_ptenum);
 		fprintf(stderr, "\tEPT misconfiguration:\n");
 		fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
 		fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
 		    ept_misconfig_ptenum, ept_misconfig_pte[0],
 		    ept_misconfig_pte[1], ept_misconfig_pte[2],
 		    ept_misconfig_pte[3]);
 	}
 #endif	/* DEBUG_EPT_MISCONFIG */
 	return (VMEXIT_ABORT);
 }
 
 /*
  * Handler for VM_EXITCODE_SVM: dump the exit code and both exitinfo
  * fields to stderr and return VMEXIT_ABORT.
  */
 static int
 vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	fprintf(stderr, "vm exit[%d]\n", *pvcpu);
 	fprintf(stderr, "\treason\t\tSVM\n");
 	fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
 	fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
 	fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
 	fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
 	fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
 	return (VMEXIT_ABORT);
 }
 
 /*
  * Handler for VM_EXITCODE_BOGUS: count the event and resume the guest.
  * The exit is expected to carry a zero instruction length.
  */
 static int
 vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	assert(vmexit->inst_length == 0);
 
 	stats.vmexit_bogus++;
 
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handler for VM_EXITCODE_REQIDLE: count the event and resume the guest.
  * The exit is expected to carry a zero instruction length.
  */
 static int
 vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	assert(vmexit->inst_length == 0);
 
 	stats.vmexit_reqidle++;
 
 	return (VMEXIT_CONTINUE);
 }
 
 /* HLT exit handler: count the event and resume the guest. */
 static int
 vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	stats.vmexit_hlt++;
 
 	/*
 	 * Just continue execution with the next instruction. We use
 	 * the HLT VM exit as a way to be friendly with the host
 	 * scheduler.
 	 */
 	return (VMEXIT_CONTINUE);
 }
 
 /* PAUSE exit handler: count the event and resume the guest. */
 static int
 vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	stats.vmexit_pause++;
 
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handler for VM_EXITCODE_MTRAP: count the event and, when a gdb port is
  * configured, hand the vcpu to gdb_cpu_mtrap().  With BHYVE_SNAPSHOT the
  * call is bracketed by checkpoint_cpu_suspend()/checkpoint_cpu_resume().
  * The exit is expected to carry a zero instruction length.
  */
 static int
 vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	assert(vmexit->inst_length == 0);
 
 	stats.vmexit_mtrap++;
 
 #ifdef BHYVE_SNAPSHOT
 	checkpoint_cpu_suspend(*pvcpu);
 #endif
 	if (gdb_port != 0)
 		gdb_cpu_mtrap(*pvcpu);
 #ifdef BHYVE_SNAPSHOT
 	checkpoint_cpu_resume(*pvcpu);
 #endif
 
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handler for VM_EXITCODE_INST_EMUL: emulate the memory-accessing
  * instruction described by the exit.
  *
  * Returns VMEXIT_CONTINUE when emulate_mem() succeeds; otherwise the raw
  * instruction bytes are printed and VMEXIT_ABORT is returned.
  */
 static int
 vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
-	int err, i;
+	int err, i, cs_d;
 	struct vie *vie;
+	enum vm_cpu_mode mode;
 
 	stats.vmexit_inst_emul++;
 
 	vie = &vmexit->u.inst_emul.vie;
+	if (!vie->decoded) {
+		/*
+		 * Attempt to decode in userspace as a fallback.  This allows
+		 * updating instruction decode in bhyve without rebooting the
+		 * kernel (rapid prototyping), albeit with much slower
+		 * emulation.
+		 */
+		vie_restart(vie);
+		mode = vmexit->u.inst_emul.paging.cpu_mode;
+		cs_d = vmexit->u.inst_emul.cs_d;
+		/*
+		 * NOTE(review): the decode result is discarded here;
+		 * presumably emulate_mem() rejects a vie that is still
+		 * undecoded -- confirm.
+		 */
+		(void)vmm_decode_instruction(mode, cs_d, vie);
+	}
+
 	err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
 	    vie, &vmexit->u.inst_emul.paging);
 
 	if (err) {
 		if (err == ESRCH) {
 			EPRINTLN("Unhandled memory access to 0x%lx\n",
 			    vmexit->u.inst_emul.gpa);
 		}
 
 		fprintf(stderr, "Failed to emulate instruction sequence [ ");
 		for (i = 0; i < vie->num_valid; i++)
 			fprintf(stderr, "%02x", vie->inst[i]);
 		FPRINTLN(stderr, " ] at 0x%lx", vmexit->rip);
 		return (VMEXIT_ABORT);
 	}
 
 	return (VMEXIT_CONTINUE);
 }
 
 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
 
 /*
  * Handler for VM_EXITCODE_SUSPENDED.
  *
  * Every vcpu removes itself from cpumask.  Non-BSP vcpus then signal
  * resetcpu_cond and terminate their thread.  The BSP waits until cpumask
  * is empty and exits the process with a status that encodes the suspend
  * reason: 0 reset, 1 poweroff, 2 halt, 3 triple fault, 100 anything else.
  * This function does not return to its caller.
  */
 static int
 vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 	enum vm_suspend_how how;
 
 	how = vmexit->u.suspended.how;
 
 	fbsdrun_deletecpu(ctx, *pvcpu);
 
 	if (*pvcpu != BSP) {
 		/* Wake the BSP so it can re-check whether cpumask is empty. */
 		pthread_mutex_lock(&resetcpu_mtx);
 		pthread_cond_signal(&resetcpu_cond);
 		pthread_mutex_unlock(&resetcpu_mtx);
 		pthread_exit(NULL);
 	}
 
 	/* BSP: wait for every other vcpu thread to have left cpumask. */
 	pthread_mutex_lock(&resetcpu_mtx);
 	while (!CPU_EMPTY(&cpumask)) {
 		pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
 	}
 	pthread_mutex_unlock(&resetcpu_mtx);
 
 	switch (how) {
 	case VM_SUSPEND_RESET:
 		exit(0);
 	case VM_SUSPEND_POWEROFF:
 		exit(1);
 	case VM_SUSPEND_HALT:
 		exit(2);
 	case VM_SUSPEND_TRIPLEFAULT:
 		exit(3);
 	default:
 		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
 		exit(100);
 	}
 	return (0);	/* NOTREACHED */
 }
 
 /*
  * Handler for VM_EXITCODE_DEBUG: when a gdb port is configured, park the
  * vcpu in gdb_cpu_suspend().  With BHYVE_SNAPSHOT the call is bracketed
  * by checkpoint_cpu_suspend()/checkpoint_cpu_resume().
  * Always returns VMEXIT_CONTINUE.
  */
 static int
 vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 #ifdef BHYVE_SNAPSHOT
 	checkpoint_cpu_suspend(*pvcpu);
 #endif
 	if (gdb_port != 0)
 		gdb_cpu_suspend(*pvcpu);
 #ifdef BHYVE_SNAPSHOT
 	checkpoint_cpu_resume(*pvcpu);
 #endif
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Handler for VM_EXITCODE_BPT.
  *
  * Breakpoint exits are only expected while the gdb stub is enabled
  * (gdb_port != 0); otherwise the stray exit is reported and the process
  * terminates with status 4.  With the stub enabled, the exit is passed to
  * gdb_cpu_breakpoint() and the guest is resumed.
  */
 static int
 vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 {
 
 	if (gdb_port == 0) {
 		/* Name the exit code this handler is actually registered for. */
 		fprintf(stderr, "vm_loop: unexpected VM_EXITCODE_BPT\n");
 		exit(4);
 	}
 	gdb_cpu_breakpoint(*pvcpu, vmexit);
 	return (VMEXIT_CONTINUE);
 }
 
 /*
  * Dispatch table mapping VM exit codes to their handlers.  vm_loop() treats
  * a NULL entry as an unexpected exit.  The VM_EXITCODE_HLT and
  * VM_EXITCODE_PAUSE slots are not listed here; they are filled in by
  * fbsdrun_set_capabilities() when the corresponding exits are requested.
  */
 static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
 	[VM_EXITCODE_INOUT]  = vmexit_inout,
 	[VM_EXITCODE_INOUT_STR]  = vmexit_inout,
 	[VM_EXITCODE_VMX]    = vmexit_vmx,
 	[VM_EXITCODE_SVM]    = vmexit_svm,
 	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
 	[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
 	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
 	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
 	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
 	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
 	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
 	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
 	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
 	[VM_EXITCODE_DEBUG] = vmexit_debug,
 	[VM_EXITCODE_BPT] = vmexit_breakpoint,
 };
 
 /*
  * Run loop for a single vcpu.
  *
  * Applies the affinity mask recorded for the vcpu (if any), verifies that
  * the vcpu is active, sets its starting RIP to 'startrip' and then
  * repeatedly calls vm_run(), dispatching each exit through handler[].
  *
  * The loop only ends when vm_run() fails, in which case the error is logged
  * and the function returns.  An exit code without a handler, or a handler
  * result other than VMEXIT_CONTINUE, terminates the process.
  */
 static void
 vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
 {
 	int error, rc;
 	enum vm_exitcode exitcode;
 	cpuset_t active_cpus;
 
 	if (vcpumap[vcpu] != NULL) {
 		error = pthread_setaffinity_np(pthread_self(),
 		    sizeof(cpuset_t), vcpumap[vcpu]);
 		assert(error == 0);
 	}
 
 	/*
 	 * Check the query itself before looking at its result: if it failed,
 	 * active_cpus may not have been filled in and testing it would read
 	 * indeterminate data.
 	 */
 	error = vm_active_cpus(ctx, &active_cpus);
 	assert(error == 0);
 	assert(CPU_ISSET(vcpu, &active_cpus));
 
 	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
 	assert(error == 0);
 
 	while (1) {
 		error = vm_run(ctx, vcpu, &vmexit[vcpu]);
 		if (error != 0)
 			break;
 
 		/* Reject exit codes outside the table or without a handler. */
 		exitcode = vmexit[vcpu].exitcode;
 		if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
 			fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
 			    exitcode);
 			exit(4);
 		}
 
 		rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
 
 		switch (rc) {
 		case VMEXIT_CONTINUE:
 			break;
 		case VMEXIT_ABORT:
 			abort();
 		default:
 			exit(4);
 		}
 	}
 	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
 }
 
 /*
  * Return the number of vcpus the guest may use: VM_MAXCPU when the
  * VM_CAP_UNRESTRICTED_GUEST capability can be queried on the BSP, and 1
  * otherwise.
  */
 static int
 num_vcpus_allowed(struct vmctx *ctx)
 {
 	int capval;
 
 	/*
 	 * The guest is allowed to spinup more than one processor only if the
 	 * UNRESTRICTED_GUEST capability is available.  Only the success of
 	 * the query matters here; the returned value is not used.
 	 */
 	if (vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST,
 	    &capval) != 0)
 		return (1);
 
 	return (VM_MAXCPU);
 }
 
 /*
  * Configure the capabilities of vcpu 'cpu'.
  *
  * Enables HLT and PAUSE exits when they have been requested, installing the
  * matching entries in handler[] while processing the BSP, then selects the
  * x2apic state according to x2apic_mode and finally requests
  * VM_CAP_ENABLE_INVPCID.  A requested exit that cannot be queried, or a
  * failure to set the x2apic state, terminates the process with status 4.
  */
 void
 fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
 {
 	int rc, val;
 
 	if (fbsdrun_vmexit_on_hlt()) {
 		rc = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &val);
 		if (rc < 0) {
 			fprintf(stderr, "VM exit on HLT not supported\n");
 			exit(4);
 		}
 		vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
 		if (cpu == BSP)
 			handler[VM_EXITCODE_HLT] = vmexit_hlt;
 	}
 
 	if (fbsdrun_vmexit_on_pause()) {
 		/*
 		 * pause exit support required for this mode
 		 */
 		rc = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &val);
 		if (rc < 0) {
 			fprintf(stderr,
 			    "SMP mux requested, no pause support\n");
 			exit(4);
 		}
 		vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
 		if (cpu == BSP)
 			handler[VM_EXITCODE_PAUSE] = vmexit_pause;
 	}
 
 	rc = vm_set_x2apic_state(ctx, cpu,
 	    x2apic_mode ? X2APIC_ENABLED : X2APIC_DISABLED);
 	if (rc != 0) {
 		fprintf(stderr, "Unable to set x2apic state (%d)\n", rc);
 		exit(4);
 	}
 
 	/* The result of this request is deliberately not checked. */
 	vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
 }
 
 /*
  * Create (or attach to) the virtual machine 'vmname' and return an open
  * handle to it.
  *
  * A freshly created VM requires a bootrom; an already existing VM is
  * reinitialized when a bootrom is configured and used as-is otherwise.
  * Unless built WITHOUT_CAPSICUM, the VM device descriptor is limited to
  * CAP_IOCTL and CAP_MMAP_RW and to the ioctl list from vm_get_ioctls().
  * The CPU topology is set before returning.  Any failure terminates the
  * process.
  */
 static struct vmctx *
 do_open(const char *vmname)
 {
 	struct vmctx *vm;
 	bool have_bootrom, need_reinit;
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_t rights;
 	const cap_ioctl_t *cmds;
 	size_t ncmds;
 #endif
 
 	have_bootrom = lpc_bootrom() ? true : false;
 	need_reinit = false;
 
 	if (vm_create(vmname) == 0) {
 		/*
 		 * If the virtual machine was just created then a
 		 * bootrom must be configured to boot it.
 		 */
 		if (!have_bootrom) {
 			fprintf(stderr, "virtual machine cannot be booted\n");
 			exit(4);
 		}
 	} else if (errno != EEXIST) {
 		perror("vm_create");
 		exit(4);
 	} else if (have_bootrom) {
 		/* Existing VM that will be booted from a bootrom. */
 		need_reinit = true;
 	}
 	/*
 	 * Remaining case: the VM exists and no bootrom is configured, i.e.
 	 * the virtual machine has been setup by the userspace bootloader.
 	 */
 
 	vm = vm_open(vmname);
 	if (vm == NULL) {
 		perror("vm_open");
 		exit(4);
 	}
 
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
 	if (caph_rights_limit(vm_get_device_fd(vm), &rights) == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 
 	/* First call reports the count, second returns the list itself. */
 	vm_get_ioctls(&ncmds);
 	cmds = vm_get_ioctls(NULL);
 	if (cmds == NULL)
 		errx(EX_OSERR, "out of memory");
 	if (caph_ioctls_limit(vm_get_device_fd(vm), cmds, ncmds) == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 	free((cap_ioctl_t *)cmds);
 #endif
 
 	if (need_reinit && vm_reinit(vm) != 0) {
 		perror("vm_reinit");
 		exit(4);
 	}
 
 	if (vm_set_topology(vm, sockets, cores, threads, maxcpus) != 0)
 		errx(EX_OSERR, "vm_set_topology");
 
 	return (vm);
 }
 
 void
 spinup_vcpu(struct vmctx *ctx, int vcpu)
 {
 	int error;
 	uint64_t rip;
 
 	error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
 	assert(error == 0);
 
 	fbsdrun_set_capabilities(ctx, vcpu);
 	error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
 	assert(error == 0);
 
 	fbsdrun_addcpu(ctx, BSP, vcpu, rip);
 }
 
 /*
  * Program entry point.
  *
  * Parses the command-line options, opens the virtual machine, sets up guest
  * memory and the emulated devices, optionally restores a snapshot (snapshot
  * builds only), builds the guest firmware tables, enters the Capsicum
  * sandbox unless built WITHOUT_CAPSICUM, starts the BSP and finally runs
  * the mevent dispatch loop.  Errors during setup terminate the process.
  */
 int
 main(int argc, char *argv[])
 {
 	int c, error, dbg_port, err, bvmcons;
 	int max_vcpus, mptgen, memflags;
 	int rtc_localtime;
 	bool gdb_stop;
 	struct vmctx *ctx;
 	uint64_t rip;
 	size_t memsize;
 	char *optstr;
 #ifdef BHYVE_SNAPSHOT
 	char *restore_file;
 	struct restore_state rstate;
 	int vcpu;
 
 	restore_file = NULL;
 #endif
 
 	/* Defaults, possibly overridden by the options parsed below. */
 	bvmcons = 0;
 	progname = basename(argv[0]);
 	dbg_port = 0;
 	gdb_stop = false;
 	guest_ncpus = 1;
 	sockets = cores = threads = 1;
 	maxcpus = 0;
 	memsize = 256 * MB;
 	mptgen = 1;
 	rtc_localtime = 1;
 	memflags = 0;
 
 	/* The "-r" (restore file) option only exists in snapshot builds. */
 #ifdef BHYVE_SNAPSHOT
 	optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:r:";
 #else
 	optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:";
 #endif
 	while ((c = getopt(argc, argv, optstr)) != -1) {
 		switch (c) {
 		case 'a':
 			x2apic_mode = 0;
 			break;
 		case 'A':
 			acpi = 1;
 			break;
 		case 'b':
 			bvmcons = 1;
 			break;
 		case 'p':
                         if (pincpu_parse(optarg) != 0) {
                             errx(EX_USAGE, "invalid vcpu pinning "
                                  "configuration '%s'", optarg);
                         }
 			break;
                 case 'c':
 			if (topology_parse(optarg) != 0) {
 			    errx(EX_USAGE, "invalid cpu topology "
 				"'%s'", optarg);
 			}
 			break;
 		case 'C':
 			memflags |= VM_MEM_F_INCORE;
 			break;
 		case 'g':
 			dbg_port = atoi(optarg);
 			break;
 		case 'G':
 			/* A leading 'w' sets gdb_stop; the rest is the port. */
 			if (optarg[0] == 'w') {
 				gdb_stop = true;
 				optarg++;
 			}
 			gdb_port = atoi(optarg);
 			break;
 		case 'l':
 			/*
 			 * Only strlen(optarg) bytes are compared, so any
 			 * prefix of "help" (including an empty argument)
 			 * selects the device listing.
 			 */
 			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
 				lpc_print_supported_devices();
 				exit(0);
 			} else if (lpc_device_parse(optarg) != 0) {
 				errx(EX_USAGE, "invalid lpc device "
 				    "configuration '%s'", optarg);
 			}
 			break;
 #ifdef BHYVE_SNAPSHOT
 		case 'r':
 			restore_file = optarg;
 			break;
 #endif
 		case 's':
 			/*
 			 * Same prefix match on "help" as for -l.  Every
 			 * branch below either exits or breaks, so control
 			 * never falls through into case 'S'.
 			 */
 			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
 				pci_print_supported_devices();
 				exit(0);
 			} else if (pci_parse_slot(optarg) != 0)
 				exit(4);
 			else
 				break;
 		case 'S':
 			memflags |= VM_MEM_F_WIRED;
 			break;
                 case 'm':
 			error = vm_parse_memsize(optarg, &memsize);
 			if (error)
 				errx(EX_USAGE, "invalid memsize '%s'", optarg);
 			break;
 		case 'H':
 			guest_vmexit_on_hlt = 1;
 			break;
 		case 'I':
 			/*
 			 * The "-I" option was used to add an ioapic to the
 			 * virtual machine.
 			 *
 			 * An ioapic is now provided unconditionally for each
 			 * virtual machine and this option is now deprecated.
 			 */
 			break;
 		case 'P':
 			guest_vmexit_on_pause = 1;
 			break;
 		case 'e':
 			strictio = 1;
 			break;
 		case 'u':
 			rtc_localtime = 0;
 			break;
 		case 'U':
 			guest_uuid_str = optarg;
 			break;
 		case 'w':
 			strictmsr = 0;
 			break;
 		case 'W':
 			virtio_msix = 0;
 			break;
 		case 'x':
 			x2apic_mode = 1;
 			break;
 		case 'Y':
 			mptgen = 0;
 			break;
 		case 'h':
 			usage(0);			
 			/*
 			 * NOTE(review): no break here; presumably usage()
 			 * does not return -- confirm.
 			 */
 		default:
 			usage(1);
 		}
 	}
 	argc -= optind;
 	argv += optind;
 
 #ifdef BHYVE_SNAPSHOT
 	/*
 	 * At most one positional argument (the VM name) is accepted; it may
 	 * only be omitted when a restore file was given.
 	 */
 	if (argc > 1 || (argc == 0 && restore_file == NULL))
 		usage(1);
 
 	if (restore_file != NULL) {
 		error = load_restore_file(restore_file, &rstate);
 		if (error) {
 			fprintf(stderr, "Failed to read checkpoint info from "
 					"file: '%s'.\n", restore_file);
 			exit(1);
 		}
 	}
 
 	/* Without an explicit name, take it from the restore state. */
 	if (argc == 1) {
 		vmname = argv[0];
 	} else {
 		vmname = lookup_vmname(&rstate);
 		if (vmname == NULL) {
 			fprintf(stderr, "Cannot find VM name in restore file. "
 					"Please specify one.\n");
 			exit(1);
 		}
 	}
 #else
 	if (argc != 1)
 		usage(1);
 
 	vmname = argv[0];
 #endif
 	ctx = do_open(vmname);
 
 #ifdef BHYVE_SNAPSHOT
 	/*
 	 * When restoring, the vcpu count, memory flags and memory size come
 	 * from the restore state and replace the command-line values.
 	 */
 	if (restore_file != NULL) {
 		guest_ncpus = lookup_guest_ncpus(&rstate);
 		memflags = lookup_memflags(&rstate);
 		memsize = lookup_memsize(&rstate);
 	}
 
 	if (guest_ncpus < 1) {
 		fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
 		exit(1);
 	}
 #endif
 
 	max_vcpus = num_vcpus_allowed(ctx);
 	if (guest_ncpus > max_vcpus) {
 		fprintf(stderr, "%d vCPUs requested but only %d available\n",
 			guest_ncpus, max_vcpus);
 		exit(4);
 	}
 
 	fbsdrun_set_capabilities(ctx, BSP);
 
 	vm_set_memflags(ctx, memflags);
 	err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
 	if (err) {
 		fprintf(stderr, "Unable to setup memory (%d)\n", errno);
 		exit(4);
 	}
 
 	error = init_msr();
 	if (error) {
 		fprintf(stderr, "init_msr error %d", error);
 		exit(4);
 	}
 
 	init_mem();
 	init_inout();
 	kernemu_dev_init();
 	init_bootrom(ctx);
 	atkbdc_init(ctx);
 	pci_irq_init(ctx);
 	ioapic_init(ctx);
 
 	rtc_init(ctx, rtc_localtime);
 	sci_init(ctx);
 
 	/*
 	 * Exit if a device emulation finds an error in its initialization
 	 */
 	if (init_pci(ctx) != 0) {
 		perror("device emulation initialization error");
 		exit(4);
 	}
 
 	/*
 	 * Initialize after PCI, to allow a bootrom file to reserve the high
 	 * region.
 	 */
 	if (acpi)
 		vmgenc_init(ctx);
 
 	if (dbg_port != 0)
 		init_dbgport(dbg_port);
 
 	if (gdb_port != 0)
 		init_gdb(ctx, gdb_port, gdb_stop);
 
 	if (bvmcons)
 		init_bvmcons();
 
 	/*
 	 * Booting from a bootrom requires the unrestricted guest capability
 	 * on the BSP, which is then reset.
 	 */
 	if (lpc_bootrom()) {
 		if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) {
 			fprintf(stderr, "ROM boot failed: unrestricted guest "
 			    "capability not available\n");
 			exit(4);
 		}
 		error = vcpu_reset(ctx, BSP);
 		assert(error == 0);
 	}
 
 #ifdef BHYVE_SNAPSHOT
 	/*
 	 * Restore sequence: pause user devices, restore guest memory, restore
 	 * user device state, restore kernel structures, resume user devices.
 	 */
 	if (restore_file != NULL) {
 		fprintf(stdout, "Pausing pci devs...\r\n");
 		if (vm_pause_user_devs(ctx) != 0) {
 			fprintf(stderr, "Failed to pause PCI device state.\n");
 			exit(1);
 		}
 
 		fprintf(stdout, "Restoring vm mem...\r\n");
 		if (restore_vm_mem(ctx, &rstate) != 0) {
 			fprintf(stderr, "Failed to restore VM memory.\n");
 			exit(1);
 		}
 
 		fprintf(stdout, "Restoring pci devs...\r\n");
 		if (vm_restore_user_devs(ctx, &rstate) != 0) {
 			fprintf(stderr, "Failed to restore PCI device state.\n");
 			exit(1);
 		}
 
 		fprintf(stdout, "Restoring kernel structs...\r\n");
 		if (vm_restore_kern_structs(ctx, &rstate) != 0) {
 			fprintf(stderr, "Failed to restore kernel structs.\n");
 			exit(1);
 		}
 
 		fprintf(stdout, "Resuming pci devs...\r\n");
 		if (vm_resume_user_devs(ctx) != 0) {
 			fprintf(stderr, "Failed to resume PCI device state.\n");
 			exit(1);
 		}
 	}
 #endif
 
 	/* RIP at which the BSP will be started further below. */
 	error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
 	assert(error == 0);
 
 	/*
 	 * build the guest tables, MP etc.
 	 */
 	if (mptgen) {
 		error = mptable_build(ctx, guest_ncpus);
 		if (error) {
 			perror("error to build the guest tables");
 			exit(4);
 		}
 	}
 
 	error = smbios_build(ctx);
 	assert(error == 0);
 
 	if (acpi) {
 		error = acpi_build(ctx, guest_ncpus);
 		assert(error == 0);
 	}
 
 	if (lpc_bootrom())
 		fwctl_init();
 
 	/*
 	 * Change the proc title to include the VM name.
 	 */
 	setproctitle("%s", vmname);
 
 #ifndef WITHOUT_CAPSICUM
 	caph_cache_catpages();
 
 	if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 
 	if (caph_enter() == -1)
 		errx(EX_OSERR, "cap_enter() failed");
 #endif
 
 #ifdef BHYVE_SNAPSHOT
 	if (restore_file != NULL)
 		destroy_restore_state(&rstate);
 
 	/*
 	 * checkpointing thread for communication with bhyvectl
 	 */
 	if (init_checkpoint_thread(ctx) < 0)
 		printf("Failed to start checkpoint thread!\r\n");
 
 	if (restore_file != NULL)
 		vm_restore_time(ctx);
 #endif
 
 	/*
 	 * Add CPU 0
 	 */
 	fbsdrun_addcpu(ctx, BSP, BSP, rip);
 
 #ifdef BHYVE_SNAPSHOT
 	/*
 	 * If we restore a VM, start all vCPUs now (including APs), otherwise,
 	 * let the guest OS spin them up later via vmexits.
 	 */
 	if (restore_file != NULL) {
 		for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
 			if (vcpu == BSP)
 				continue;
 
 			fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu);
 			spinup_vcpu(ctx, vcpu);
 		}
 	}
 #endif
 
 	/*
 	 * Head off to the main event dispatch loop
 	 */
 	mevent_dispatch();
 
 	/* Only reached if the dispatch loop returns. */
 	exit(4);
 }