diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 0b3daed4f69e..e35119af8572 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -1,788 +1,789 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include struct vcpu; struct vm_snapshot_meta; #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, + VM_SUSPEND_DESTROY, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_GUEST_FS_BASE, VM_REG_GUEST_GS_BASE, VM_REG_GUEST_KGS_BASE, VM_REG_GUEST_TPR, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. 
* A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. */ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MIN_NAMELEN 6 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL #include CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vm; struct vm_exception; struct vm_mem; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; enum snapshot_req; struct vm_eventinfo { cpuset_t *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_suspend_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef void * (*vmi_vcpu_init_func_t)(void *vmi, struct vcpu *vcpu, int vcpu_id); typedef void (*vmi_vcpu_cleanup_func_t)(void *vcpui); typedef int (*vmi_get_register_t)(void *vcpui, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vcpui, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vcpui, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vcpui, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vcpui); typedef void (*vmi_vlapic_cleanup)(struct vlapic *vlapic); typedef int (*vmi_snapshot_vcpu_t)(void *vcpui, struct vm_snapshot_meta *meta); typedef int (*vmi_restore_tsc_t)(void *vcpui, uint64_t now); struct vmm_ops { vmm_init_func_t modinit; /* module wide initialization */ vmm_cleanup_func_t modcleanup; vmm_resume_func_t modsuspend; vmm_resume_func_t modresume; vmi_init_func_t init; /* vm-specific initialization */ vmi_run_func_t run; vmi_cleanup_func_t cleanup; vmi_vcpu_init_func_t vcpu_init; vmi_vcpu_cleanup_func_t vcpu_cleanup; vmi_get_register_t getreg; vmi_set_register_t setreg; vmi_get_desc_t getdesc; vmi_set_desc_t setdesc; vmi_get_cap_t getcap; vmi_set_cap_t setcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; /* checkpoint operations */ vmi_snapshot_vcpu_t vcpu_snapshot; vmi_restore_tsc_t restore_tsc; }; extern const struct vmm_ops vmm_ops_intel; extern const struct vmm_ops vmm_ops_amd; extern u_int vm_maxcpu; /* maximum virtual cpus */ int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct 
vm *vm); void vm_slock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc); int vm_run(struct vcpu *vcpu); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vcpu *vcpu); int vm_nmi_pending(struct vcpu *vcpu); void vm_nmi_clear(struct vcpu *vcpu); int vm_inject_extint(struct vcpu *vcpu); int vm_extint_pending(struct vcpu *vcpu); void vm_extint_clear(struct vcpu *vcpu); int vcpu_vcpuid(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vcpu *vcpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vcpu *vcpu, int type, int *val); int vm_set_capability(struct vcpu *vcpu, int type, int val); int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); int vm_restart_instruction(struct vcpu *vcpu); struct vm_exit *vm_exitinfo(struct vcpu *vcpu); cpuset_t *vm_exitinfo_cpuset(struct vcpu *vcpu); void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip); void vm_exit_debug(struct vcpu *vcpu, uint64_t rip); void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip); void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip); void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip); int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); int vm_restore_time(struct vm *vm); #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. 
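*
* A minimal usage sketch ('count_cb' and 'count' are hypothetical names,
* not part of this interface):
*
*	static void
*	count_cb(struct vcpu *vcpu, void *arg)
*	{
*		atomic_add_int(arg, 1);	/* per-vcpu work; must not sleep */
*	}
*
*	error = vm_smp_rendezvous(vcpu, vm_active_cpus(vm), count_cb, &count);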
*/ typedef void (*vm_rendezvous_func_t)(struct vcpu *vcpu, void *arg); int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); cpuset_t vm_start_cpus(struct vm *vm, const cpuset_t *tostart); void vm_await_start(struct vm *vm, const cpuset_t *waiting); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vcpu *vcpu, struct vm_eventinfo *info) { /* * This check isn't done with atomic operations or under a lock because * there's no need to. If the vcpuid bit is set, the vcpu is part of a * rendezvous and the bit won't be cleared until the vcpu enters the * rendezvous. On rendezvous exit, the cpuset is cleared and the vcpu * will see an empty cpuset. So, the races are harmless. */ return (CPU_ISSET(vcpu_vcpuid(vcpu), info->rptr)); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vcpu *vcpu); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. */ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vcpu *vcpu, int *hostcpu) { return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vcpu *vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vcpu *vcpu, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For example, * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vcpu *vcpu, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest.
This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. */ int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *info); int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2); /* * Function used to keep track of the guest's TSC offset. The * offset is used by the virtualization extensions to provide a consistent * value for the Time Stamp Counter to the guest. */ void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vcpu *vcpu); int vcpu_trap_wbinvd(struct vcpu *vcpu); #endif /* KERNEL */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_RDPID, VM_CAP_RDTSCP, VM_CAP_IPI_EXIT, VM_CAP_MASK_HWINTR, VM_CAP_RFLAGS_TF, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, PAGING_MODE_64_LA57, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. 
MOV) */ uint16_t op_flags; }; _Static_assert(sizeof(struct vie_op) == 4, "ABI"); _Static_assert(_Alignof(struct vie_op) == 2, "ABI"); #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ /* The following fields are all zeroed upon restart. */ #define vie_startzero num_processed uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ vex_present:1, /* VEX prefixed */ vex_l:1, /* L bit */ index:4, /* SIB byte */ base:4; /* SIB byte */ uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; uint8_t vex_reg:4, /* vvvv: first source register specifier */ vex_pp:2, /* pp */ _sparebits:2; uint8_t _sparebytes[2]; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ uint8_t _sparebyte; struct vie_op op; /* opcode description */ }; _Static_assert(sizeof(struct vie) == 64, "ABI"); _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI"); _Static_assert(__offsetof(struct vie, scale) == 24, "ABI"); _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI"); enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, VM_EXITCODE_IPI, VM_EXITCODE_DB, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; int cs_d; uint64_t cs_base; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. 
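*
* Userspace handlers generally treat this exitcode as unrecoverable and
* log the raw fields below for post-mortem diagnosis.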
*/ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. */ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { int inst_length; } bpt; struct { int trace_trap; int pushf_intercept; int tf_shadow_val; struct vm_guest_paging paging; } dbg; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct { /* * The destination vCPU mask is saved in vcpu->cpuset * and is copied out to userspace separately to avoid * ABI concerns. */ uint32_t mode; uint8_t vector; } ipi; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_UD, 0, 0); } static __inline void vm_inject_gp(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_GP, 1, 0); } static __inline void vm_inject_ac(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_SS, 1, errcode); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2); #endif /* _VMM_H_ */ diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h index 73b5b4a09591..e839b5dd92c9 100644 --- a/sys/arm64/include/vmm.h +++ b/sys/arm64/include/vmm.h @@ -1,347 +1,348 @@ /* * Copyright (C) 2015 Mihai Carabas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include #include #include "pte.h" #include "pmap.h" struct vcpu; enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, + VM_SUSPEND_DESTROY, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. 
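* These cover the general-purpose register file (X0-X29, LR, SP, PC, CPSR)
* plus a small set of EL1 system registers, such as the translation
* controls and MPIDR_EL1.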
*/ enum vm_reg_name { VM_REG_GUEST_X0 = 0, VM_REG_GUEST_X1, VM_REG_GUEST_X2, VM_REG_GUEST_X3, VM_REG_GUEST_X4, VM_REG_GUEST_X5, VM_REG_GUEST_X6, VM_REG_GUEST_X7, VM_REG_GUEST_X8, VM_REG_GUEST_X9, VM_REG_GUEST_X10, VM_REG_GUEST_X11, VM_REG_GUEST_X12, VM_REG_GUEST_X13, VM_REG_GUEST_X14, VM_REG_GUEST_X15, VM_REG_GUEST_X16, VM_REG_GUEST_X17, VM_REG_GUEST_X18, VM_REG_GUEST_X19, VM_REG_GUEST_X20, VM_REG_GUEST_X21, VM_REG_GUEST_X22, VM_REG_GUEST_X23, VM_REG_GUEST_X24, VM_REG_GUEST_X25, VM_REG_GUEST_X26, VM_REG_GUEST_X27, VM_REG_GUEST_X28, VM_REG_GUEST_X29, VM_REG_GUEST_LR, VM_REG_GUEST_SP, VM_REG_GUEST_PC, VM_REG_GUEST_CPSR, VM_REG_GUEST_SCTLR_EL1, VM_REG_GUEST_TTBR0_EL1, VM_REG_GUEST_TTBR1_EL1, VM_REG_GUEST_TCR_EL1, VM_REG_GUEST_TCR2_EL1, VM_REG_GUEST_MPIDR_EL1, VM_REG_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) #define VM_GUEST_BASE_IPA 0x80000000UL /* Guest kernel start ipa */ /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. * A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. 
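*
* As a worked check of the arithmetic: FreeBSD 13 has a SPECNAMELEN of 255,
* so VM_MAX_NAMELEN = 255 - 10 (prefix) - 15 (suffix) - 1 (separator) = 229;
* FreeBSD 12 had a SPECNAMELEN of 63, giving 63 - 10 - 15 - 1 = 37.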
*/ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL struct vm; struct vm_exception; struct vm_exit; struct vm_run; struct vm_object; struct vm_guest_paging; struct vm_vgic_descr; struct pmap; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); void vm_slock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_run(struct vcpu *vcpu); int vm_suspend(struct vm *vm, enum vm_suspend_how how); void* vm_get_cookie(struct vm *vm); int vcpu_vcpuid(struct vcpu *vcpu); void *vcpu_get_cookie(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); int vm_get_capability(struct vcpu *vcpu, int type, int *val); int vm_set_capability(struct vcpu *vcpu, int type, int val); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); int vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far); int vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr); int vm_assert_irq(struct vm *vm, uint32_t irq); int vm_deassert_irq(struct vm *vm, uint32_t irq); int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, int func); struct vm_exit *vm_exitinfo(struct vcpu *vcpu); void vm_exit_suspended(struct vcpu *vcpu, uint64_t pc); void vm_exit_debug(struct vcpu *vcpu, uint64_t pc); void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t pc); void vm_exit_astpending(struct vcpu *vcpu, uint64_t pc); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } int vcpu_debugged(struct vcpu *vcpu); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vcpu *vcpu, int *hostcpu) { return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vcpu *vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu); struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; #endif /* _KERNEL */ #define VM_DIR_READ 0 #define VM_DIR_WRITE 1 #define VM_GP_M_MASK 0x1f 
#define VM_GP_MMU_ENABLED (1 << 5) struct vm_guest_paging { uint64_t ttbr0_addr; uint64_t ttbr1_addr; uint64_t tcr_el1; uint64_t tcr2_el1; int flags; int padding; }; struct vie { uint8_t access_size:4, sign_extend:1, dir:1, unused:2; enum vm_reg_name reg; }; struct vre { uint32_t inst_syndrome; uint8_t dir:1, unused:7; enum vm_reg_name reg; }; /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_BRK_EXIT, VM_CAP_SS_EXIT, VM_CAP_MASK_HWINTR, VM_CAP_MAX }; enum vm_exitcode { VM_EXITCODE_BOGUS, VM_EXITCODE_INST_EMUL, VM_EXITCODE_REG_EMUL, VM_EXITCODE_HVC, VM_EXITCODE_SUSPENDED, VM_EXITCODE_HYP, VM_EXITCODE_WFI, VM_EXITCODE_PAGING, VM_EXITCODE_SMCCC, VM_EXITCODE_DEBUG, VM_EXITCODE_BRK, VM_EXITCODE_SS, VM_EXITCODE_MAX }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; uint64_t pc; union { /* * ARM specific payload. */ struct { uint32_t exception_nr; uint32_t pad; uint64_t esr_el2; /* Exception Syndrome Register */ uint64_t far_el2; /* Fault Address Register */ uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ } hyp; struct { struct vre vre; } reg_emul; struct { uint64_t gpa; uint64_t esr; } paging; struct { uint64_t gpa; struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * A SMCCC call, e.g. starting a core via PSCI. * Further arguments can be read by asking the kernel for * all register values. */ struct { uint64_t func_id; uint64_t args[7]; } smccc_call; struct { enum vm_suspend_how how; } suspended; } u; }; #endif /* _VMM_H_ */ diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c index 3082d2941221..1dcefa1489e9 100644 --- a/sys/arm64/vmm/vmm.c +++ b/sys/arm64/vmm/vmm.c @@ -1,1541 +1,1547 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2015 Mihai Carabas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "arm64.h" #include "mmu.h" #include "io/vgic.h" #include "io/vtimer.h" struct vcpu { int flags; enum vcpu_state state; struct mtx mtx; int hostcpu; /* host cpuid this vcpu last ran on */ int vcpuid; void *stats; struct vm_exit exitinfo; uint64_t nextpc; /* (x) next instruction to execute */ struct vm *vm; /* (o) */ void *cookie; /* (i) cpu-specific data */ struct vfpstate *guestfpu; /* (a,i) guest fpu state */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct vmm_mmio_region { uint64_t start; uint64_t end; mem_region_read_t read; mem_region_write_t write; }; #define VM_MAX_MMIO_REGIONS 4 struct vmm_special_reg { uint32_t esr_iss; uint32_t esr_mask; reg_read_t reg_read; reg_write_t reg_write; void *arg; }; #define VM_MAX_SPECIAL_REGS 16 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ int suspend; /* (i) stop VM execution */ bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ struct vmspace *vmspace; /* (o) guest's address space */ struct vm_mem mem; /* (i) guest memory */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (i) guest vcpus */ struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS]; /* (o) guest MMIO regions */ struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS]; /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ struct sx vcpus_init_lock; /* (o) */ }; static bool vmm_initialized = false; static int vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu); static MALLOC_DEFINE(M_VMM, "vmm", "vmm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); struct vmm_regs { uint64_t id_aa64afr0; uint64_t id_aa64afr1; uint64_t id_aa64dfr0; uint64_t id_aa64dfr1; uint64_t id_aa64isar0; uint64_t id_aa64isar1; uint64_t id_aa64isar2; uint64_t id_aa64mmfr0; uint64_t id_aa64mmfr1; uint64_t id_aa64mmfr2; uint64_t id_aa64pfr0; uint64_t id_aa64pfr1; }; static const struct vmm_regs vmm_arch_regs_masks = { .id_aa64dfr0 = ID_AA64DFR0_CTX_CMPs_MASK | ID_AA64DFR0_WRPs_MASK | ID_AA64DFR0_BRPs_MASK | ID_AA64DFR0_PMUVer_3 | ID_AA64DFR0_DebugVer_8, .id_aa64isar0 = ID_AA64ISAR0_TLB_TLBIOSR | ID_AA64ISAR0_SHA3_IMPL | 
ID_AA64ISAR0_RDM_IMPL | ID_AA64ISAR0_Atomic_IMPL | ID_AA64ISAR0_CRC32_BASE | ID_AA64ISAR0_SHA2_512 | ID_AA64ISAR0_SHA1_BASE | ID_AA64ISAR0_AES_PMULL, .id_aa64mmfr0 = ID_AA64MMFR0_TGran4_IMPL | ID_AA64MMFR0_TGran64_IMPL | ID_AA64MMFR0_TGran16_IMPL | ID_AA64MMFR0_ASIDBits_16 | ID_AA64MMFR0_PARange_4P, .id_aa64mmfr1 = ID_AA64MMFR1_SpecSEI_IMPL | ID_AA64MMFR1_PAN_ATS1E1 | ID_AA64MMFR1_HAFDBS_AF, .id_aa64pfr0 = ID_AA64PFR0_GIC_CPUIF_NONE | ID_AA64PFR0_AdvSIMD_HP | ID_AA64PFR0_FP_HP | ID_AA64PFR0_EL3_64 | ID_AA64PFR0_EL2_64 | ID_AA64PFR0_EL1_64 | ID_AA64PFR0_EL0_64, }; /* Host registers masked by vmm_arch_regs_masks. */ static struct vmm_regs vmm_arch_regs; u_int vm_maxcpu; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_maxcpu, 0, "Maximum number of vCPUs"); static void vcpu_notify_event_locked(struct vcpu *vcpu); /* global statistics */ VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception"); VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted"); VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted"); VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted"); VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted"); VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort"); VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort"); VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception"); VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq"); VMM_STAT(VMEXIT_FIQ, "number of vmexits for an interrupt"); VMM_STAT(VMEXIT_BRK, "number of vmexits for a breakpoint exception"); VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception"); VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception"); VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception"); /* * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this * is a safe value for now. 
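*
* Concretely, VM_MAXCPU below evaluates to min(0xfffe, CPU_SETSIZE): the
* 0xffff - 1 term keeps every vcpuid representable in the uint16_t used by
* the topology interfaces, while CPU_SETSIZE bounds what a cpuset_t can
* track.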
*/ #define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) static int vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks) { #define _FETCH_KERN_REG(reg, field) do { \ regs->field = vmm_arch_regs_masks.field; \ if (!get_kernel_reg_iss_masked(reg ## _ISS, ®s->field, \ masks->field)) \ regs->field = 0; \ } while (0) _FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0); _FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1); _FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0); _FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1); _FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0); _FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1); _FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2); _FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0); _FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1); _FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2); _FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0); _FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1); #undef _FETCH_KERN_REG return (0); } static void vcpu_cleanup(struct vcpu *vcpu, bool destroy) { vmmops_vcpu_cleanup(vcpu->cookie); vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); } } static struct vcpu * vcpu_alloc(struct vm *vm, int vcpu_id) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_alloc: invalid vcpu %d", vcpu_id)); vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vm = vm; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); return (vcpu); } static void vcpu_init(struct vcpu *vcpu) { vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); MPASS(vcpu->cookie != NULL); fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } struct vm_exit * vm_exitinfo(struct vcpu *vcpu) { return (&vcpu->exitinfo); } static int vmm_unsupported_quirk(void) { /* * Known to not load on Ampere eMAG * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=285051 */ if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_APM, CPU_PART_EMAG8180, 0, 0)) return (ENXIO); return (0); } static int vmm_init(void) { int error; vm_maxcpu = mp_ncpus; TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); if (vm_maxcpu > VM_MAXCPU) { printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); vm_maxcpu = VM_MAXCPU; } if (vm_maxcpu == 0) vm_maxcpu = 1; error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks); if (error != 0) return (error); return (vmmops_modinit(0)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: error = vmm_unsupported_quirk(); if (error != 0) break; error = vmmdev_init(); if (error != 0) break; error = vmm_init(); if (error == 0) vmm_initialized = true; else (void)vmmdev_cleanup(); break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0 && vmm_initialized) { error = vmmops_modcleanup(); if (error) { /* * Something bad happened - prevent new * VMs from being created */ vmm_initialized = false; } } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - HYP initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). * - vmm device initialization requires an initialized devfs. 
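* Hence the MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1 subsystem order passed to
* DECLARE_MODULE() below.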
*/ DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); MPASS(vm->cookie != NULL); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); memset(vm->mmio_region, 0, sizeof(vm->mmio_region)); memset(vm->special_reg, 0, sizeof(vm->special_reg)); if (!create) { for (i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_init(vm->vcpu[i]); } } } void vm_disable_vcpu_creation(struct vm *vm) { sx_xlock(&vm->vcpus_init_lock); vm->dying = true; sx_xunlock(&vm->vcpus_init_lock); } struct vcpu * vm_alloc_vcpu(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) return (NULL); /* Some interrupt controllers may have a CPU limit */ if (vcpuid >= vgic_max_cpu_count(vm->cookie)) return (NULL); vcpu = (struct vcpu *) atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]); if (__predict_true(vcpu != NULL)) return (vcpu); sx_xlock(&vm->vcpus_init_lock); vcpu = vm->vcpu[vcpuid]; if (vcpu == NULL && !vm->dying) { vcpu = vcpu_alloc(vm, vcpuid); vcpu_init(vcpu); /* * Ensure vCPU is fully created before updating pointer * to permit unlocked reads above. */ atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid], (uintptr_t)vcpu); } sx_xunlock(&vm->vcpus_init_lock); return (vcpu); } void vm_slock_vcpus(struct vm *vm) { sx_slock(&vm->vcpus_init_lock); } void vm_unlock_vcpus(struct vm *vm) { sx_unlock(&vm->vcpus_init_lock); } int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vmspace = vmmops_vmspace_alloc(0, 1ul << 39); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; vm_mem_init(&vm->mem); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->sockets = 1; vm->cores = 1; /* XXX backwards compatibility */ vm->threads = 1; /* XXX backwards compatibility */ vm->maxcpus = vm_maxcpu; vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM, M_WAITOK | M_ZERO); vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { /* Ignore maxcpus. 
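* The incoming value is not stored anywhere: a VM's maxcpus is fixed at
* creation time in vm_create().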
*/ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); vm->sockets = sockets; vm->cores = cores; vm->threads = threads; return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { pmap_t pmap __diagused; int i; if (destroy) { vm_xlock_memsegs(vm); pmap = vmspace_pmap(vm->vmspace); sched_pin(); PCPU_SET(curvmpmap, NULL); sched_unpin(); CPU_FOREACH(i) { MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap); } } else vm_assert_memseg_xlocked(vm); vgic_detach_from_vm(vm->cookie); for (i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_cleanup(vm->vcpu[i], destroy); } vmmops_cleanup(vm->cookie); vm_mem_cleanup(vm); if (destroy) { vm_mem_destroy(vm); vmmops_vmspace_free(vm->vmspace); vm->vmspace = NULL; for (i = 0; i < vm->maxcpus; i++) free(vm->vcpu[i], M_VMM); free(vm->vcpu, M_VMM); sx_destroy(&vm->vcpus_init_lock); } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VMM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *is_fault) { return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault)); } static int vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg) { *rval = 0; return (0); } static int vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg) { *rval = *(uint64_t *)arg; return (0); } static int vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg) { return (0); } static const struct vmm_special_reg vmm_special_regs[] = { #define SPECIAL_REG(_reg, _read, _write) \ { \ .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ .esr_mask = ISS_MSR_REG_MASK, \ .reg_read = (_read), \ .reg_write = (_write), \ .arg = NULL, \ } #define ID_SPECIAL_REG(_reg, _name) \ { \ .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ .esr_mask = ISS_MSR_REG_MASK, \ .reg_read = vmm_reg_read_arg, \ .reg_write = vmm_reg_wi, \ .arg = &(vmm_arch_regs._name), \ } /* ID registers */ ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0), ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0), ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0), ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0), ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1), /* * All other ID registers are read as zero. * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space. 
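*
* The wildcard entry below matches that whole space with a single mask:
* op0, op1 and CRn are compared exactly, while only bit 3 of CRm is part
* of the mask (0x8 << ISS_MSR_CRm_SHIFT), so every CRm in the range 0-7
* compares equal to the CRm of 0 encoded in esr_iss.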
*/ { .esr_iss = (3 << ISS_MSR_OP0_SHIFT) | (0 << ISS_MSR_OP1_SHIFT) | (0 << ISS_MSR_CRn_SHIFT) | (0 << ISS_MSR_CRm_SHIFT), .esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK | ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT), .reg_read = vmm_reg_raz, .reg_write = vmm_reg_wi, .arg = NULL, }, /* Counter physical registers */ SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write), SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read, vtimer_phys_cval_write), SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read, vtimer_phys_tval_write), SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write), #undef SPECIAL_REG }; void vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask, reg_read_t reg_read, reg_write_t reg_write, void *arg) { int i; for (i = 0; i < nitems(vm->special_reg); i++) { if (vm->special_reg[i].esr_iss == 0 && vm->special_reg[i].esr_mask == 0) { vm->special_reg[i].esr_iss = iss; vm->special_reg[i].esr_mask = mask; vm->special_reg[i].reg_read = reg_read; vm->special_reg[i].reg_write = reg_write; vm->special_reg[i].arg = arg; return; } } panic("%s: No free special register slot", __func__); } void vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask) { int i; for (i = 0; i < nitems(vm->special_reg); i++) { if (vm->special_reg[i].esr_iss == iss && vm->special_reg[i].esr_mask == mask) { memset(&vm->special_reg[i], 0, sizeof(vm->special_reg[i])); return; } } panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss, mask); } static int vm_handle_reg_emul(struct vcpu *vcpu, bool *retu) { struct vm *vm; struct vm_exit *vme; struct vre *vre; int i, rv; vm = vcpu->vm; vme = &vcpu->exitinfo; vre = &vme->u.reg_emul.vre; for (i = 0; i < nitems(vm->special_reg); i++) { if (vm->special_reg[i].esr_iss == 0 && vm->special_reg[i].esr_mask == 0) continue; if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) == vm->special_reg[i].esr_iss) { rv = vmm_emulate_register(vcpu, vre, vm->special_reg[i].reg_read, vm->special_reg[i].reg_write, vm->special_reg[i].arg); if (rv == 0) { *retu = false; } return (rv); } } for (i = 0; i < nitems(vmm_special_regs); i++) { if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) == vmm_special_regs[i].esr_iss) { rv = vmm_emulate_register(vcpu, vre, vmm_special_regs[i].reg_read, vmm_special_regs[i].reg_write, vmm_special_regs[i].arg); if (rv == 0) { *retu = false; } return (rv); } } *retu = true; return (0); } void vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size, mem_region_read_t mmio_read, mem_region_write_t mmio_write) { int i; for (i = 0; i < nitems(vm->mmio_region); i++) { if (vm->mmio_region[i].start == 0 && vm->mmio_region[i].end == 0) { vm->mmio_region[i].start = start; vm->mmio_region[i].end = start + size; vm->mmio_region[i].read = mmio_read; vm->mmio_region[i].write = mmio_write; return; } } panic("%s: No free MMIO region", __func__); } void vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size) { int i; for (i = 0; i < nitems(vm->mmio_region); i++) { if (vm->mmio_region[i].start == start && vm->mmio_region[i].end == start + size) { memset(&vm->mmio_region[i], 0, sizeof(vm->mmio_region[i])); return; } } panic("%s: Invalid MMIO region: %lx - %lx", __func__, start, start + size); } static int vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) { struct vm *vm; struct vm_exit *vme; struct vie *vie; struct hyp *hyp; uint64_t fault_ipa; struct vm_guest_paging *paging; struct vmm_mmio_region *vmr; int error, i; vm = vcpu->vm; hyp = vm->cookie; if (!hyp->vgic_attached) 
goto out_user; vme = &vcpu->exitinfo; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; fault_ipa = vme->u.inst_emul.gpa; vmr = NULL; for (i = 0; i < nitems(vm->mmio_region); i++) { if (vm->mmio_region[i].start <= fault_ipa && vm->mmio_region[i].end > fault_ipa) { vmr = &vm->mmio_region[i]; break; } } if (vmr == NULL) goto out_user; error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging, vmr->read, vmr->write, retu); return (error); out_user: *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i)); } return (0); } void vm_exit_suspended(struct vcpu *vcpu, uint64_t pc) { struct vm *vm = vcpu->vm; struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vcpu); vmexit->pc = pc; vmexit->inst_length = 4; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vcpu *vcpu, uint64_t pc) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->pc = pc; vmexit->inst_length = 4; vmexit->exitcode = VM_EXITCODE_DEBUG; } int vm_activate_cpu(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EBUSY); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { vm->debug_cpus = vm->active_cpus; for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i)); } } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); vcpu_notify_event(vcpu); } return (0); } int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vcpu *vcpu) { return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } void * vcpu_stats(struct vcpu *vcpu) { return (vcpu->stats); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void vcpu_notify_event_locked(struct vcpu *vcpu) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { ipi_cpu(hostcpu, vmm_ipinum); } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. 
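* No IPI is needed in that case, which is why one is only sent above
* when hostcpu != curcpu.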
*/ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vcpu *vcpu) { vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu); vcpu_unlock(vcpu); } struct vmspace * vm_vmspace(struct vm *vm) { return (vm->vmspace); } struct vm_mem * vm_mem(struct vm *vm) { return (&vm->mem); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ vfp_save_state(curthread, curthread->td_pcb); /* Ensure the VFP state will be re-loaded when exiting the guest */ PCPU_SET(fpcurthread, NULL); /* restore guest FPU state */ vfp_enable(); vfp_restore(vcpu->guestfpu); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ vfp_disable(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) != CPACR_FPEN_TRAP_ALL1) panic("VFP not enabled in host!"); /* save guest FPU state */ vfp_enable(); vfp_store(vcpu->guestfpu); vfp_disable(); KASSERT(PCPU_GET(fpcurthread) == NULL, ("%s: fpcurthread set with guest registers", __func__)); } static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu_notify_event_locked(vcpu); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } int vm_get_capability(struct vcpu *vcpu, int type, int *retval) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_getcap(vcpu->cookie, type, retval)); } int vm_set_capability(struct vcpu *vcpu, int type, int val) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_setcap(vcpu->cookie, type, val)); } struct vm * vcpu_vm(struct vcpu *vcpu) { return (vcpu->vm); } int vcpu_vcpuid(struct vcpu *vcpu) { return (vcpu->vcpuid); } void * vcpu_get_cookie(struct vcpu *vcpu) { return (vcpu->cookie); } struct vcpu * vm_vcpu(struct 
vm *vm, int vcpuid) { return (vm->vcpu[vcpuid]); } int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu) { enum vcpu_state state; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) { if (reg >= VM_REG_LAST) return (EINVAL); return (vmmops_getreg(vcpu->cookie, reg, retval)); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; if (reg >= VM_REG_LAST) return (EINVAL); error = vmmops_setreg(vcpu->cookie, reg, val); if (error || reg != VM_REG_GUEST_PC) return (error); vcpu->nextpc = val; return (0); } void * vm_get_cookie(struct vm *vm) { return (vm->cookie); } int vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far) { return (vmmops_exception(vcpu->cookie, esr, far)); } int vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr) { return (vgic_attach_to_vm(vm->cookie, descr)); } int vm_assert_irq(struct vm *vm, uint32_t irq) { return (vgic_inject_irq(vm->cookie, -1, irq, true)); } int vm_deassert_irq(struct vm *vm, uint32_t irq) { return (vgic_inject_irq(vm->cookie, -1, irq, false)); } int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, int func) { /* TODO: Should we raise an SError? */ return (vgic_inject_msi(vm->cookie, msg, addr)); } static int vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { struct hypctx *hypctx; int i; hypctx = vcpu_get_cookie(vcpu); if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0) return (1); vme->exitcode = VM_EXITCODE_SMCCC; vme->u.smccc_call.func_id = hypctx->tf.tf_x[0]; for (i = 0; i < nitems(vme->u.smccc_call.args); i++) vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1]; *retu = true; return (0); } static int vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { + struct vm *vm; + + vm = vcpu->vm; vcpu_lock(vcpu); while (1) { + if (vm->suspend) + break; + if (vgic_has_pending_irq(vcpu->cookie)) break; if (vcpu_should_yield(vcpu)) break; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); } vcpu_unlock(vcpu); *retu = false; return (0); } static int vm_handle_paging(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; struct vm_exit *vme; struct vm_map *map; uint64_t addr, esr; pmap_t pmap; int ftype, rv; vme = &vcpu->exitinfo; pmap = vmspace_pmap(vcpu->vm->vmspace); addr = vme->u.paging.gpa; esr = vme->u.paging.esr; /* The page exists, but the page table needs to be updated. 
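 * pmap_fault() covers the cases the pmap can resolve on its own,
 * e.g. access/dirty flag updates, and spares us a full vm_fault()
 * round trip when it succeeds.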
*/ if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS) return (0); switch (ESR_ELx_EXCEPTION(esr)) { case EXCP_INSN_ABORT_L: case EXCP_DATA_ABORT_L: ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE; break; default: panic("%s: Invalid exception (esr = %lx)", __func__, esr); } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); if (rv != KERN_SUCCESS) return (EFAULT); return (0); } static int vm_handle_suspend(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int error, i; struct thread *td; error = 0; td = curthread; CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) break; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm_vcpu(vm, i)); } } *retu = true; return (error); } int vm_run(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct vm_eventinfo evinfo; int error, vcpuid; struct vm_exit *vme; bool retu; pmap_t pmap; vcpuid = vcpu->vcpuid; if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vme = &vcpu->exitinfo; evinfo.rptr = NULL; evinfo.sptr = &vm->suspend; evinfo.iptr = NULL; restart: critical_enter(); restore_guest_fpustate(vcpu); vcpu_require_state(vcpu, VCPU_RUNNING); error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo); vcpu_require_state(vcpu, VCPU_FROZEN); save_guest_fpustate(vcpu); critical_exit(); if (error == 0) { retu = false; switch (vme->exitcode) { case VM_EXITCODE_INST_EMUL: vcpu->nextpc = vme->pc + vme->inst_length; error = vm_handle_inst_emul(vcpu, &retu); break; case VM_EXITCODE_REG_EMUL: vcpu->nextpc = vme->pc + vme->inst_length; error = vm_handle_reg_emul(vcpu, &retu); break; case VM_EXITCODE_HVC: /* * The HVC instruction saves the address for the * next instruction as the return address. */ vcpu->nextpc = vme->pc; /* * The PSCI call can change the exit information in the * case of suspend/reset/poweroff/cpu off/cpu on. */ error = vm_handle_smccc_call(vcpu, vme, &retu); break; case VM_EXITCODE_WFI: vcpu->nextpc = vme->pc + vme->inst_length; error = vm_handle_wfi(vcpu, vme, &retu); break; case VM_EXITCODE_PAGING: vcpu->nextpc = vme->pc; error = vm_handle_paging(vcpu, &retu); break; case VM_EXITCODE_SUSPENDED: vcpu->nextpc = vme->pc; error = vm_handle_suspend(vcpu, &retu); break; default: /* Handle in userland */ vcpu->nextpc = vme->pc; retu = true; break; } } if (error == 0 && retu == false) goto restart; return (error); } diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 9f2b009d02ec..460a508a60dc 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -1,1208 +1,1209 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * Copyright (C) 2015 Mihai Carabas * All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __amd64__ #ifdef COMPAT_FREEBSD12 struct vm_memseg_12 { int segid; size_t len; char name[64]; }; _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI"); #define VM_ALLOC_MEMSEG_12 \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12) #define VM_GET_MEMSEG_12 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12) #endif /* COMPAT_FREEBSD12 */ #ifdef COMPAT_FREEBSD14 struct vm_memseg_14 { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; }; _Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16), "COMPAT_FREEBSD14 ABI"); #define VM_ALLOC_MEMSEG_14 \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14) #define VM_GET_MEMSEG_14 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14) #endif /* COMPAT_FREEBSD14 */ #endif /* __amd64__ */ struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; struct ucred *ucred; SLIST_ENTRY(vmmdev_softc) link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; static struct sx vmmdev_mtx; SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex"); static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static void devmem_destroy(void *arg); static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && !(ucred->cr_prison->pr_allow & pr_allow_flag)) return (EPERM); return (0); } static int vcpu_lock_one(struct vcpu *vcpu) { return (vcpu_set_state(vcpu, VCPU_FROZEN, true)); } static void vcpu_unlock_one(struct vcpu *vcpu) { enum vcpu_state state; state = vcpu_get_state(vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state); } vcpu_set_state(vcpu, VCPU_IDLE, false); } static int vcpu_lock_all(struct vmmdev_softc *sc) { struct vcpu *vcpu; int error; uint16_t i, j, maxcpus; error = 0; vm_slock_vcpus(sc->vm); maxcpus = vm_get_maxcpus(sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(sc->vm, i); if (vcpu == NULL) continue; error = vcpu_lock_one(vcpu); if (error) break; } if (error) { for (j = 0; j < i; j++) { vcpu = vm_vcpu(sc->vm, j); if (vcpu == NULL) continue; vcpu_unlock_one(vcpu); } vm_unlock_vcpus(sc->vm); } return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { struct vcpu *vcpu; uint16_t i, maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(sc->vm, i); if (vcpu == NULL) continue; vcpu_unlock_one(vcpu); } vm_unlock_vcpus(sc->vm); } static struct vmmdev_softc * vmmdev_lookup(const char *name, struct ucred *cred) { struct vmmdev_softc *sc; sx_assert(&vmmdev_mtx, SA_XLOCKED); SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } if (sc == NULL) return (NULL); if (cr_cansee(cred, sc->ucred)) return (NULL); return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest 
memory map. */ vm_slock_memsegs(sc->vm); error = 0; prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vm_unlock_memsegs(sc->vm); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, len, NULL); } else { bzero(mseg->name, len); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len, struct domainset *domainset) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * be stripped off when devfs processes the full string. */ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(len, M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, len, NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(sc, mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } #if defined(__amd64__) && \ (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12)) /* * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts. */ static void adjust_segid(struct vm_memseg *mseg) { if (mseg->segid != VM_SYSMEM) { mseg->segid += (VM_BOOTROM - 1); } } #endif static int vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vcpu, regnum[i], &regval[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vcpu, regnum[i], regval[i]); if (error) break; } return (error); } static int vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { int error; /* * A jail without vmm access shouldn't be able to access vmm device * files at all, but check here just to be thorough.
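 * Per-VM visibility across jails is enforced separately:
 * vmmdev_lookup() refuses to return a softc when cr_cansee()
 * fails against the creating credential.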
*/ error = vmm_priv_check(td->td_ucred); if (error != 0) return (error); return (0); } static const struct vmmdev_ioctl vmmdev_ioctls[] = { VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_STAT_DESC, 0), #ifdef __amd64__ #ifdef COMPAT_FREEBSD12 VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #endif #ifdef COMPAT_FREEBSD14 VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #endif #endif /* __amd64__ */ VMMDEV_IOCTL(VM_ALLOC_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_MMAP_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_MUNMAP_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_REINIT, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #ifdef __amd64__ #if defined(COMPAT_FREEBSD12) VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS), #endif #ifdef COMPAT_FREEBSD14 VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS), #endif #endif /* __amd64__ */ VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS), VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS), VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU), VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU), VMMDEV_IOCTL(VM_SUSPEND, 0), VMMDEV_IOCTL(VM_GET_CPUS, 0), VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0), VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0), }; static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct vmmdev_softc *sc; struct vcpu *vcpu; const struct vmmdev_ioctl *ioctl; struct vm_memseg *mseg; int error, vcpuid; sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); ioctl = NULL; for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) { if (vmmdev_ioctls[i].cmd == cmd) { ioctl = &vmmdev_ioctls[i]; break; } } if (ioctl == NULL) { for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) { if (vmmdev_machdep_ioctls[i].cmd == cmd) { ioctl = &vmmdev_machdep_ioctls[i]; break; } } } if (ioctl == NULL) return (ENOTTY); if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0) vm_xlock_memsegs(sc->vm); else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0) vm_slock_memsegs(sc->vm); vcpu = NULL; vcpuid = -1; if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU | VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) { vcpuid = *(int *)data; if (vcpuid == -1) { if ((ioctl->flags & VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) { error = EINVAL; goto lockfail; } } else { vcpu = vm_alloc_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto lockfail; } if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) { error = vcpu_lock_one(vcpu); if (error) goto lockfail; } } } if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) { error = vcpu_lock_all(sc); if (error) goto lockfail; } switch (cmd) { case VM_SUSPEND: { struct vm_suspend *vmsuspend; vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; } case VM_REINIT: error = vm_reinit(sc->vm); break; case 
VM_STAT_DESC: { struct vm_stat_desc *statdesc; statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } case VM_STATS: { struct vm_stats *vmstats; vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(vcpu, vmstats->index, nitems(vmstats->statbuf), &vmstats->num_entries, vmstats->statbuf); break; } case VM_MMAP_GETNEXT: { struct vm_memmap *mm; mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; } case VM_MMAP_MEMSEG: { struct vm_memmap *mm; mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; } case VM_MUNMAP_MEMSEG: { struct vm_munmap *mu; mu = (struct vm_munmap *)data; error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); break; } #ifdef __amd64__ #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_12: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = alloc_memseg(sc, mseg, sizeof(((struct vm_memseg_12 *)0)->name), NULL); break; case VM_GET_MEMSEG_12: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = get_memseg(sc, mseg, sizeof(((struct vm_memseg_12 *)0)->name)); break; #endif /* COMPAT_FREEBSD12 */ #ifdef COMPAT_FREEBSD14 case VM_ALLOC_MEMSEG_14: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = alloc_memseg(sc, mseg, sizeof(((struct vm_memseg_14 *)0)->name), NULL); break; case VM_GET_MEMSEG_14: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = get_memseg(sc, mseg, sizeof(((struct vm_memseg_14 *)0)->name)); break; #endif /* COMPAT_FREEBSD14 */ #endif /* __amd64__ */ case VM_ALLOC_MEMSEG: { domainset_t *mask; struct domainset *domainset, domain; domainset = NULL; mseg = (struct vm_memseg *)data; if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) { if (mseg->ds_mask_size < sizeof(domainset_t) || mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) { error = ERANGE; break; } memset(&domain, 0, sizeof(domain)); mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK); error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size); if (error) { free(mask, M_VMMDEV); break; } error = domainset_populate(&domain, mask, mseg->ds_policy, mseg->ds_mask_size); if (error) { free(mask, M_VMMDEV); break; } domainset = domainset_create(&domain); if (domainset == NULL) { error = EINVAL; free(mask, M_VMMDEV); break; } free(mask, M_VMMDEV); } error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset); break; } case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; case VM_GET_REGISTER: { struct vm_register *vmreg; vmreg = (struct vm_register *)data; error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval); break; } case VM_SET_REGISTER: { struct vm_register *vmreg; vmreg = (struct vm_register *)data; error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval); break; } case VM_GET_REGISTER_SET: { struct vm_register_set *vmregset; uint64_t *regvals; int *regnums; vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(vcpu, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, 
vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; } case VM_SET_REGISTER_SET: { struct vm_register_set *vmregset; uint64_t *regvals; int *regnums; vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(vcpu, vmregset->count, regnums, regvals); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; } case VM_GET_CAPABILITY: { struct vm_capability *vmcap; vmcap = (struct vm_capability *)data; error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval); break; } case VM_SET_CAPABILITY: { struct vm_capability *vmcap; vmcap = (struct vm_capability *)data; error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval); break; } case VM_ACTIVATE_CPU: error = vm_activate_cpu(vcpu); break; case VM_GET_CPUS: { struct vm_cpuset *vm_cpuset; cpuset_t *cpuset; int size; error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < 1 || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY)) error = ERANGE; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; } case VM_SUSPEND_CPU: error = vm_suspend_cpu(sc->vm, vcpu); break; case VM_RESUME_CPU: error = vm_resume_cpu(sc->vm, vcpu); break; case VM_SET_TOPOLOGY: { struct vm_cpu_topology *topology; topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, topology->maxcpus); break; } case VM_GET_TOPOLOGY: { struct vm_cpu_topology *topology; topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; } default: error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag, td); break; } if ((ioctl->flags & (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0) vm_unlock_memsegs(sc->vm); if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) vcpu_unlock_all(sc); else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) vcpu_unlock_one(vcpu); /* * Make sure that no handler returns a kernel-internal * error value to userspace. 
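 * ERESTART is tolerated because the syscall return path consumes
 * it and restarts the call rather than passing the value to
 * userspace.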
*/ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); lockfail: if ((ioctl->flags & (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0) vm_unlock_memsegs(sc->vm); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; bool sysmem; first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map. */ vm_slock_memsegs(sc->vm); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vm_unlock_memsegs(sc->vm); return (error); } static void vmmdev_destroy(struct vmmdev_softc *sc) { struct devmem_softc *dsc; int error __diagused; KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__)); /* * Destroy all cdevs: * * - any new operations on the 'cdev' will return an error (ENXIO). * * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); devmem_destroy(dsc); } vm_disable_vcpu_creation(sc->vm); error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); vm_unlock_vcpus(sc->vm); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } if (sc->vm != NULL) vm_destroy(sc->vm); if (sc->ucred != NULL) crfree(sc->ucred); sx_xlock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); sx_xunlock(&vmmdev_mtx); free(sc, M_VMMDEV); } static int vmmdev_lookup_and_destroy(const char *name, struct ucred *cred) { struct cdev *cdev; struct vmmdev_softc *sc; sx_xlock(&vmmdev_mtx); sc = vmmdev_lookup(name, cred); if (sc == NULL || sc->cdev == NULL) { sx_xunlock(&vmmdev_mtx); return (EINVAL); } /* * Setting 'sc->cdev' to NULL is used to indicate that the VM * is scheduled for destruction. 
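 * Subsequent lookups fail with EINVAL from that point on.  As an
 * illustrative sketch only (the names 'ctl_fd' and "myvm" are
 * hypothetical), a userspace manager would trigger this path with:
 *
 *	struct vmmctl_vm_destroy vmd = { 0 };
 *	strlcpy(vmd.name, "myvm", sizeof(vmd.name));
 *	ioctl(ctl_fd, VMMCTL_VM_DESTROY, &vmd);
 *
 * Only the first of any concurrent destroy requests succeeds.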
*/ cdev = sc->cdev; sc->cdev = NULL; sx_xunlock(&vmmdev_mtx); + vm_suspend(sc->vm, VM_SUSPEND_DESTROY); destroy_dev(cdev); vmmdev_destroy(sc); return (0); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); error = sysctl_handle_string(oidp, buf, buflen, req); if (error == 0 && req->newptr != NULL) error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred); free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", "Destroy a vmm(4) instance (legacy interface)"); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_open = vmmdev_open, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static struct vmmdev_softc * vmmdev_alloc(struct vm *vm, struct ucred *cred) { struct vmmdev_softc *sc; sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO); SLIST_INIT(&sc->devmem); sc->vm = vm; sc->ucred = crhold(cred); return (sc); } static int vmmdev_create(const char *name, struct ucred *cred) { struct make_dev_args mda; struct cdev *cdev; struct vmmdev_softc *sc; struct vm *vm; int error; sx_xlock(&vmmdev_mtx); sc = vmmdev_lookup(name, cred); if (sc != NULL) { sx_xunlock(&vmmdev_mtx); return (EEXIST); } error = vm_create(name, &vm); if (error != 0) { sx_xunlock(&vmmdev_mtx); return (error); } sc = vmmdev_alloc(vm, cred); SLIST_INSERT_HEAD(&head, sc, link); make_dev_args_init(&mda); mda.mda_devsw = &vmmdevsw; mda.mda_cr = sc->ucred; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; error = make_dev_s(&mda, &cdev, "vmm/%s", name); if (error != 0) { sx_xunlock(&vmmdev_mtx); vmmdev_destroy(sc); return (error); } sc->cdev = cdev; sx_xunlock(&vmmdev_mtx); return (0); } static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error != 0) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); error = sysctl_handle_string(oidp, buf, buflen, req); if (error == 0 && req->newptr != NULL) error = vmmdev_create(buf, req->td->td_ucred); free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", "Create a vmm(4) instance (legacy interface)"); static int vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td) { int error; error = vmm_priv_check(td->td_ucred); if (error != 0) return (error); if ((flags & FWRITE) == 0) return (EPERM); return (0); } static int vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error; switch (cmd) { case VMMCTL_VM_CREATE: { struct vmmctl_vm_create *vmc; vmc = (struct vmmctl_vm_create *)data; vmc->name[VM_MAX_NAMELEN] = '\0'; for (size_t i = 0; i < nitems(vmc->reserved); i++) { if (vmc->reserved[i] != 0) { error = EINVAL; return (error); } } error = vmmdev_create(vmc->name, td->td_ucred); break; } case VMMCTL_VM_DESTROY: { struct vmmctl_vm_destroy *vmd; vmd = (struct vmmctl_vm_destroy *)data; vmd->name[VM_MAX_NAMELEN] = '\0'; for (size_t i = 0; i < nitems(vmd->reserved); i++) { if (vmd->reserved[i] != 0) { error = EINVAL; return (error); } } 
error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred); break; } default: error = ENOTTY; break; } return (error); } static struct cdev *vmmctl_cdev; static struct cdevsw vmmctlsw = { .d_name = "vmmctl", .d_version = D_VERSION, .d_open = vmmctl_open, .d_ioctl = vmmctl_ioctl, }; int vmmdev_init(void) { int error; sx_xlock(&vmmdev_mtx); error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmmctl"); if (error == 0) pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail."); sx_xunlock(&vmmdev_mtx); return (error); } int vmmdev_cleanup(void) { sx_xlock(&vmmdev_mtx); if (!SLIST_EMPTY(&head)) { sx_xunlock(&vmmdev_mtx); return (EBUSY); } if (vmmctl_cdev != NULL) { destroy_dev(vmmctl_cdev); vmmctl_cdev = NULL; } sx_xunlock(&vmmdev_mtx); return (0); } static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { struct devmem_softc *dsc; vm_ooffset_t first, last; size_t seglen; int error; bool sysmem; dsc = cdev->si_drv1; if (dsc == NULL) { /* 'cdev' has been created but is not ready for use */ return (ENXIO); } first = *offset; last = *offset + len; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); vm_slock_memsegs(dsc->sc->vm); error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); KASSERT(error == 0 && !sysmem && *objp != NULL, ("%s: invalid devmem segment %d", __func__, dsc->segid)); if (seglen >= last) vm_object_reference(*objp); else error = EINVAL; vm_unlock_memsegs(dsc->sc->vm); return (error); } static struct cdevsw devmemsw = { .d_name = "devmem", .d_version = D_VERSION, .d_mmap_single = devmem_mmap_single, }; static int devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname) { struct make_dev_args mda; struct devmem_softc *dsc; int error; sx_xlock(&vmmdev_mtx); dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); dsc->segid = segid; dsc->name = devname; dsc->sc = sc; SLIST_INSERT_HEAD(&sc->devmem, dsc, link); make_dev_args_init(&mda); mda.mda_devsw = &devmemsw; mda.mda_cr = sc->ucred; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = dsc; mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm), devname); if (error != 0) { SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } sx_xunlock(&vmmdev_mtx); return (error); } static void devmem_destroy(void *arg) { struct devmem_softc *dsc = arg; destroy_dev(dsc->cdev); dsc->cdev = NULL; dsc->sc = NULL; } diff --git a/sys/riscv/include/vmm.h b/sys/riscv/include/vmm.h index 1221521be368..de7119dd534a 100644 --- a/sys/riscv/include/vmm.h +++ b/sys/riscv/include/vmm.h @@ -1,298 +1,299 @@ /* * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015 Mihai Carabas * Copyright (c) 2024 Ruslan Bukin * * This software was developed by the University of Cambridge Computer * Laboratory (Department of Computer Science and Technology) under Innovate * UK project 105694, "Digital Security by Design (DSbD) Technology Platform * Prototype". * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include #include #include "pte.h" #include "pmap.h" struct vcpu; enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, + VM_SUSPEND_DESTROY, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_ZERO = 0, VM_REG_GUEST_RA, VM_REG_GUEST_SP, VM_REG_GUEST_GP, VM_REG_GUEST_TP, VM_REG_GUEST_T0, VM_REG_GUEST_T1, VM_REG_GUEST_T2, VM_REG_GUEST_S0, VM_REG_GUEST_S1, VM_REG_GUEST_A0, VM_REG_GUEST_A1, VM_REG_GUEST_A2, VM_REG_GUEST_A3, VM_REG_GUEST_A4, VM_REG_GUEST_A5, VM_REG_GUEST_A6, VM_REG_GUEST_A7, VM_REG_GUEST_S2, VM_REG_GUEST_S3, VM_REG_GUEST_S4, VM_REG_GUEST_S5, VM_REG_GUEST_S6, VM_REG_GUEST_S7, VM_REG_GUEST_S8, VM_REG_GUEST_S9, VM_REG_GUEST_S10, VM_REG_GUEST_S11, VM_REG_GUEST_T3, VM_REG_GUEST_T4, VM_REG_GUEST_T5, VM_REG_GUEST_T6, VM_REG_GUEST_SEPC, VM_REG_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) #define VM_MAX_NAMELEN 32 #define VM_MAX_SUFFIXLEN 15 #ifdef _KERNEL struct vm; struct vm_exception; struct vm_exit; struct vm_run; struct vm_object; struct vm_guest_paging; struct vm_aplic_descr; struct pmap; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); void vm_slock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_run(struct vcpu *vcpu); int vm_suspend(struct vm *vm, enum vm_suspend_how how); void* vm_get_cookie(struct vm *vm); int vcpu_vcpuid(struct vcpu *vcpu); void *vcpu_get_cookie(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); int 
vm_get_capability(struct vcpu *vcpu, int type, int *val); int vm_set_capability(struct vcpu *vcpu, int type, int val); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); int vm_inject_exception(struct vcpu *vcpu, uint64_t scause); int vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr); int vm_assert_irq(struct vm *vm, uint32_t irq); int vm_deassert_irq(struct vm *vm, uint32_t irq); int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, int func); struct vm_exit *vm_exitinfo(struct vcpu *vcpu); void vm_exit_suspended(struct vcpu *vcpu, uint64_t pc); void vm_exit_debug(struct vcpu *vcpu, uint64_t pc); void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t pc); void vm_exit_astpending(struct vcpu *vcpu, uint64_t pc); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } int vcpu_debugged(struct vcpu *vcpu); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vcpu *vcpu, int *hostcpu) { return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vcpu *vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu); struct vmspace *vm_vmspace(struct vm *vm); struct vm_mem *vm_mem(struct vm *vm); enum vm_reg_name vm_segment_name(int seg_encoding); #endif /* _KERNEL */ #define VM_DIR_READ 0 #define VM_DIR_WRITE 1 #define VM_GP_M_MASK 0x1f #define VM_GP_MMU_ENABLED (1 << 5) struct vm_guest_paging { int flags; int padding; }; struct vie { uint8_t access_size:4, sign_extend:1, dir:1, unused:2; enum vm_reg_name reg; }; struct vre { uint32_t inst_syndrome; uint8_t dir:1, unused:7; enum vm_reg_name reg; }; /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_UNRESTRICTED_GUEST, VM_CAP_SSTC, VM_CAP_MAX }; enum vm_exitcode { VM_EXITCODE_BOGUS, VM_EXITCODE_ECALL, VM_EXITCODE_HYP, VM_EXITCODE_PAGING, VM_EXITCODE_SUSPENDED, VM_EXITCODE_DEBUG, VM_EXITCODE_INST_EMUL, VM_EXITCODE_WFI, VM_EXITCODE_MAX }; struct vm_exit { uint64_t scause; uint64_t sepc; uint64_t stval; uint64_t htval; uint64_t htinst; enum vm_exitcode exitcode; int inst_length; uint64_t pc; union { struct { uint64_t gpa; } paging; struct { uint64_t gpa; struct vm_guest_paging paging; struct vie vie; } inst_emul; struct { uint64_t args[8]; } ecall; struct { enum vm_suspend_how how; } suspended; struct { uint64_t scause; } hyp; } u; }; #endif /* _VMM_H_ */ diff --git a/sys/riscv/vmm/vmm.c b/sys/riscv/vmm/vmm.c index 7528ef6e4698..ec4514f70fa6 100644 --- a/sys/riscv/vmm/vmm.c +++ b/sys/riscv/vmm/vmm.c @@ -1,1247 +1,1251 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015 Mihai Carabas * Copyright (c) 2024 Ruslan Bukin * * This software was developed by the University of Cambridge Computer * Laboratory (Department of Computer Science and Technology) under Innovate * UK project 105694, "Digital Security by Design (DSbD) Technology 
Platform * Prototype". * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_stat.h" #include "riscv.h" #include "vmm_aplic.h" struct vcpu { int flags; enum vcpu_state state; struct mtx mtx; int hostcpu; /* host cpuid this vcpu last ran on */ int vcpuid; void *stats; struct vm_exit exitinfo; uint64_t nextpc; /* (x) next instruction to execute */ struct vm *vm; /* (o) */ void *cookie; /* (i) cpu-specific data */ struct fpreg *guestfpu; /* (a,i) guest fpu state */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct vmm_mmio_region { uint64_t start; uint64_t end; mem_region_read_t read; mem_region_write_t write; }; #define VM_MAX_MMIO_REGIONS 4 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug*/ int suspend; /* (i) stop VM execution */ bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ struct vmspace *vmspace; /* (o) guest's address space */ struct vm_mem mem; /* (i) [m+v] guest memory */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (i) guest vcpus */ struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS]; /* (o) guest MMIO regions */ /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max 
pluggable cpus */ struct sx vcpus_init_lock; /* (o) */ }; static bool vmm_initialized = false; static MALLOC_DEFINE(M_VMM, "vmm", "vmm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); u_int vm_maxcpu; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_maxcpu, 0, "Maximum number of vCPUs"); static void vcpu_notify_event_locked(struct vcpu *vcpu); /* global statistics */ VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq"); VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception"); /* * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this * is a safe value for now. */ #define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) static void vcpu_cleanup(struct vcpu *vcpu, bool destroy) { vmmops_vcpu_cleanup(vcpu->cookie); vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); } } static struct vcpu * vcpu_alloc(struct vm *vm, int vcpu_id) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_alloc: invalid vcpu %d", vcpu_id)); vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vm = vm; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); return (vcpu); } static void vcpu_init(struct vcpu *vcpu) { vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); MPASS(vcpu->cookie != NULL); fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } struct vm_exit * vm_exitinfo(struct vcpu *vcpu) { return (&vcpu->exitinfo); } static int vmm_init(void) { vm_maxcpu = mp_ncpus; TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); if (vm_maxcpu > VM_MAXCPU) { printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); vm_maxcpu = VM_MAXCPU; } if (vm_maxcpu == 0) vm_maxcpu = 1; return (vmmops_modinit()); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: error = vmmdev_init(); if (error != 0) break; error = vmm_init(); if (error == 0) vmm_initialized = true; else (void)vmmdev_cleanup(); break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0 && vmm_initialized) { error = vmmops_modcleanup(); if (error) { /* * Something bad happened - prevent new * VMs from being created */ vmm_initialized = false; } } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - vmm device initialization requires an initialized devfs. 
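 * Hence the SI_SUB_DEVFS + 1 ordering in the DECLARE_MODULE()
 * below.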
*/ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_DEVFS + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); MPASS(vm->cookie != NULL); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); memset(vm->mmio_region, 0, sizeof(vm->mmio_region)); if (!create) { for (i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_init(vm->vcpu[i]); } } } void vm_disable_vcpu_creation(struct vm *vm) { sx_xlock(&vm->vcpus_init_lock); vm->dying = true; sx_xunlock(&vm->vcpus_init_lock); } struct vcpu * vm_alloc_vcpu(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) return (NULL); /* Some interrupt controllers may have a CPU limit */ if (vcpuid >= aplic_max_cpu_count(vm->cookie)) return (NULL); vcpu = (struct vcpu *) atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]); if (__predict_true(vcpu != NULL)) return (vcpu); sx_xlock(&vm->vcpus_init_lock); vcpu = vm->vcpu[vcpuid]; if (vcpu == NULL && !vm->dying) { vcpu = vcpu_alloc(vm, vcpuid); vcpu_init(vcpu); /* * Ensure vCPU is fully created before updating pointer * to permit unlocked reads above. */ atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid], (uintptr_t)vcpu); } sx_xunlock(&vm->vcpus_init_lock); return (vcpu); } void vm_slock_vcpus(struct vm *vm) { sx_slock(&vm->vcpus_init_lock); } void vm_unlock_vcpus(struct vm *vm) { sx_unlock(&vm->vcpus_init_lock); } int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vmspace = vmmops_vmspace_alloc(0, 1ul << 39); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; vm_mem_init(&vm->mem); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->sockets = 1; vm->cores = 1; /* XXX backwards compatibility */ vm->threads = 1; /* XXX backwards compatibility */ vm->maxcpus = vm_maxcpu; vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM, M_WAITOK | M_ZERO); vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { /* Ignore maxcpus. 
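 * The caller-supplied maxcpus is accepted for compatibility only;
 * the requested topology must still fit within the vm->maxcpus
 * fixed at creation time.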
*/ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); vm->sockets = sockets; vm->cores = cores; vm->threads = threads; return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { int i; if (destroy) vm_xlock_memsegs(vm); else vm_assert_memseg_xlocked(vm); aplic_detach_from_vm(vm->cookie); for (i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_cleanup(vm->vcpu[i], destroy); } vmmops_cleanup(vm->cookie); vm_mem_cleanup(vm); if (destroy) { vm_mem_destroy(vm); vmmops_vmspace_free(vm->vmspace); vm->vmspace = NULL; for (i = 0; i < vm->maxcpus; i++) free(vm->vcpu[i], M_VMM); free(vm->vcpu, M_VMM); sx_destroy(&vm->vcpus_init_lock); } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VMM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *is_fault) { return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault)); } void vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size, mem_region_read_t mmio_read, mem_region_write_t mmio_write) { int i; for (i = 0; i < nitems(vm->mmio_region); i++) { if (vm->mmio_region[i].start == 0 && vm->mmio_region[i].end == 0) { vm->mmio_region[i].start = start; vm->mmio_region[i].end = start + size; vm->mmio_region[i].read = mmio_read; vm->mmio_region[i].write = mmio_write; return; } } panic("%s: No free MMIO region", __func__); } void vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size) { int i; for (i = 0; i < nitems(vm->mmio_region); i++) { if (vm->mmio_region[i].start == start && vm->mmio_region[i].end == start + size) { memset(&vm->mmio_region[i], 0, sizeof(vm->mmio_region[i])); return; } } panic("%s: Invalid MMIO region: %lx - %lx", __func__, start, start + size); } static int vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) { struct vm *vm; struct vm_exit *vme; struct vie *vie; struct hyp *hyp; uint64_t fault_ipa; struct vm_guest_paging *paging; struct vmm_mmio_region *vmr; int error, i; vm = vcpu->vm; hyp = vm->cookie; if (!hyp->aplic_attached) goto out_user; vme = &vcpu->exitinfo; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; fault_ipa = vme->u.inst_emul.gpa; vmr = NULL; for (i = 0; i < nitems(vm->mmio_region); i++) { if (vm->mmio_region[i].start <= fault_ipa && vm->mmio_region[i].end > fault_ipa) { vmr = &vm->mmio_region[i]; break; } } if (vmr == NULL) goto out_user; error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging, vmr->read, vmr->write, retu); return (error); out_user: *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. 
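 * A running vcpu observes vm->suspend through the 'sptr' cookie in
 * struct vm_eventinfo and exits with VM_EXITCODE_SUSPENDED; a vcpu
 * sleeping in WFI now re-checks vm->suspend as well (see
 * vm_handle_wfi() below).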
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i)); } return (0); } void vm_exit_suspended(struct vcpu *vcpu, uint64_t pc) { struct vm *vm = vcpu->vm; struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vcpu); vmexit->pc = pc; vmexit->inst_length = 4; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vcpu *vcpu, uint64_t pc) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->pc = pc; vmexit->inst_length = 4; vmexit->exitcode = VM_EXITCODE_DEBUG; } int vm_activate_cpu(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EBUSY); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { vm->debug_cpus = vm->active_cpus; for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i)); } } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); vcpu_notify_event(vcpu); } return (0); } int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vcpu *vcpu) { return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } void * vcpu_stats(struct vcpu *vcpu) { return (vcpu->stats); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void vcpu_notify_event_locked(struct vcpu *vcpu) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { ipi_cpu(hostcpu, vmm_ipinum); } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vcpu *vcpu) { vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu); vcpu_unlock(vcpu); } struct vmspace * vm_vmspace(struct vm *vm) { return (vm->vmspace); } struct vm_mem * vm_mem(struct vm *vm) { return (&vm->mem); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* Flush host state to the pcb. */ fpe_state_save(curthread); /* Ensure the VFP state will be re-loaded when exiting the guest. */ PCPU_SET(fpcurthread, NULL); /* restore guest FPU state */ fpe_enable(); fpe_restore(vcpu->guestfpu); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. 
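 * On riscv this presumably amounts to leaving the FP unit disabled
 * (sstatus FS off) so the next host FP access traps and host state
 * can be restored lazily.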
*/ fpe_disable(); } static void save_guest_fpustate(struct vcpu *vcpu) { /* Save guest FPE state. */ fpe_enable(); fpe_store(vcpu->guestfpu); fpe_disable(); KASSERT(PCPU_GET(fpcurthread) == NULL, ("%s: fpcurthread set with guest registers", __func__)); } static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu_notify_event_locked(vcpu); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } int vm_get_capability(struct vcpu *vcpu, int type, int *retval) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_getcap(vcpu->cookie, type, retval)); } int vm_set_capability(struct vcpu *vcpu, int type, int val) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_setcap(vcpu->cookie, type, val)); } struct vm * vcpu_vm(struct vcpu *vcpu) { return (vcpu->vm); } int vcpu_vcpuid(struct vcpu *vcpu) { return (vcpu->vcpuid); } void * vcpu_get_cookie(struct vcpu *vcpu) { return (vcpu->cookie); } struct vcpu * vm_vcpu(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid]); } int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu) { enum vcpu_state state; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) { if (reg >= VM_REG_LAST) return (EINVAL); return (vmmops_getreg(vcpu->cookie, reg, retval)); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; if (reg >= VM_REG_LAST) return (EINVAL); error = vmmops_setreg(vcpu->cookie, reg, val); if (error || reg != VM_REG_GUEST_SEPC) return (error); vcpu->nextpc = val; return (0); } void * vm_get_cookie(struct vm *vm) { return (vm->cookie); } int 
vm_inject_exception(struct vcpu *vcpu, uint64_t scause) { return (vmmops_exception(vcpu->cookie, scause)); } int vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr) { return (aplic_attach_to_vm(vm->cookie, descr)); } int vm_assert_irq(struct vm *vm, uint32_t irq) { return (aplic_inject_irq(vm->cookie, -1, irq, true)); } int vm_deassert_irq(struct vm *vm, uint32_t irq) { return (aplic_inject_irq(vm->cookie, -1, irq, false)); } int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, int func) { return (aplic_inject_msi(vm->cookie, msg, addr)); } static int vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { + struct vm *vm; + vm = vcpu->vm; vcpu_lock(vcpu); while (1) { + if (vm->suspend) + break; + if (aplic_check_pending(vcpu->cookie)) break; if (riscv_check_ipi(vcpu->cookie, false)) break; if (riscv_check_interrupts_pending(vcpu->cookie)) break; if (vcpu_should_yield(vcpu)) break; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); } vcpu_unlock(vcpu); *retu = false; return (0); } static int vm_handle_paging(struct vcpu *vcpu, bool *retu) { struct vm *vm; struct vm_exit *vme; struct vm_map *map; uint64_t addr; pmap_t pmap; int ftype, rv; vm = vcpu->vm; vme = &vcpu->exitinfo; pmap = vmspace_pmap(vm->vmspace); addr = (vme->htval << 2) & ~(PAGE_SIZE - 1); dprintf("%s: %lx\n", __func__, addr); switch (vme->scause) { case SCAUSE_STORE_GUEST_PAGE_FAULT: ftype = VM_PROT_WRITE; break; case SCAUSE_FETCH_GUEST_PAGE_FAULT: ftype = VM_PROT_EXECUTE; break; case SCAUSE_LOAD_GUEST_PAGE_FAULT: ftype = VM_PROT_READ; break; default: panic("unknown page trap: %lu", vme->scause); } /* The page exists, but the page table needs to be updated. */ if (pmap_fault(pmap, addr, ftype)) return (0); map = &vm->vmspace->vm_map; rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL); if (rv != KERN_SUCCESS) { printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n", __func__, addr, ftype, rv); return (EFAULT); } return (0); } static int vm_handle_suspend(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int error, i; struct thread *td; error = 0; td = curthread; CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) break; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace.
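* Each of them will observe that vm->suspend is set (vm_run() passes &vm->suspend as the suspend cookie in struct vm_eventinfo), leave the guest, and take the VM_EXITCODE_SUSPENDED path in turn.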
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm_vcpu(vm, i)); } } *retu = true; return (error); } int vm_run(struct vcpu *vcpu) { struct vm_eventinfo evinfo; struct vm_exit *vme; struct vm *vm; pmap_t pmap; int error; int vcpuid; bool retu; vm = vcpu->vm; dprintf("%s\n", __func__); vcpuid = vcpu->vcpuid; if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vme = &vcpu->exitinfo; evinfo.rptr = NULL; evinfo.sptr = &vm->suspend; evinfo.iptr = NULL; restart: critical_enter(); restore_guest_fpustate(vcpu); vcpu_require_state(vcpu, VCPU_RUNNING); error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo); vcpu_require_state(vcpu, VCPU_FROZEN); save_guest_fpustate(vcpu); critical_exit(); if (error == 0) { retu = false; switch (vme->exitcode) { case VM_EXITCODE_INST_EMUL: vcpu->nextpc = vme->pc + vme->inst_length; error = vm_handle_inst_emul(vcpu, &retu); break; case VM_EXITCODE_WFI: vcpu->nextpc = vme->pc + vme->inst_length; error = vm_handle_wfi(vcpu, vme, &retu); break; case VM_EXITCODE_ECALL: /* Handle in userland. */ vcpu->nextpc = vme->pc + vme->inst_length; retu = true; break; case VM_EXITCODE_PAGING: vcpu->nextpc = vme->pc; error = vm_handle_paging(vcpu, &retu); break; case VM_EXITCODE_BOGUS: vcpu->nextpc = vme->pc; retu = false; error = 0; break; case VM_EXITCODE_SUSPENDED: vcpu->nextpc = vme->pc; error = vm_handle_suspend(vcpu, &retu); break; default: /* Handle in userland. */ vcpu->nextpc = vme->pc; retu = true; break; } } if (error == 0 && retu == false) goto restart; return (error); } diff --git a/usr.sbin/bhyve/aarch64/vmexit.c b/usr.sbin/bhyve/aarch64/vmexit.c index 3acad4020a3c..2457cbe76b5e 100644 --- a/usr.sbin/bhyve/aarch64/vmexit.c +++ b/usr.sbin/bhyve/aarch64/vmexit.c @@ -1,303 +1,305 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyve_machdep.h" #include "bhyverun.h" #include "config.h" #include "debug.h" #include "gdb.h" #include "mem.h" #include "vmexit.h" cpuset_t running_cpumask; static int vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; struct vie *vie; int err; vme = vmrun->vm_exit; vie = &vme->u.inst_emul.vie; err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction "); FPRINTLN(stderr, "at 0x%lx", vme->pc); return (VMEXIT_ABORT); } static int vmexit_reg_emul(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { struct vm_exit *vme; struct vre *vre; vme = vmrun->vm_exit; vre = &vme->u.reg_emul.vre; EPRINTLN("Unhandled register access: pc %#lx syndrome %#x reg %d\n", vme->pc, vre->inst_syndrome, vre->reg); return (VMEXIT_ABORT); } static int vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; enum vm_suspend_how how; int vcpuid = vcpu_id(vcpu); vme = vmrun->vm_exit; how = vme->u.suspended.how; fbsdrun_deletecpu(vcpuid); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); + case VM_SUSPEND_DESTROY: + exit(4); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun __unused) { gdb_cpu_suspend(vcpu); /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the STARTUP IPI. 
*/ usleep(1000); return (VMEXIT_CONTINUE); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { return (VMEXIT_CONTINUE); } static uint64_t smccc_affinity_info(uint64_t target_affinity, uint32_t lowest_affinity_level) { uint64_t mask = 0; switch (lowest_affinity_level) { case 0: mask |= CPU_AFF0_MASK; /* FALLTHROUGH */ case 1: mask |= CPU_AFF1_MASK; /* FALLTHROUGH */ case 2: mask |= CPU_AFF2_MASK; /* FALLTHROUGH */ case 3: mask |= CPU_AFF3_MASK; break; default: return (PSCI_RETVAL_INVALID_PARAMS); } for (int vcpu = 0; vcpu < guest_ncpus; vcpu++) { if ((cpu_to_mpidr[vcpu] & mask) == (target_affinity & mask) && CPU_ISSET(vcpu, &running_cpumask)) { /* Return ON if any CPUs are on */ return (PSCI_AFFINITY_INFO_ON); } } /* No CPUs in the affinity mask are on, return OFF */ return (PSCI_AFFINITY_INFO_OFF); } static int vmexit_smccc(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vcpu *newvcpu; struct vm_exit *vme; uint64_t mpidr, smccc_rv; enum vm_suspend_how how; int error, newcpu; /* Return the Unknown Function Identifier by default */ smccc_rv = SMCCC_RET_NOT_SUPPORTED; vme = vmrun->vm_exit; switch (vme->u.smccc_call.func_id) { case PSCI_FNID_VERSION: /* We implement PSCI 1.0 */ smccc_rv = PSCI_VER(1, 0); break; case PSCI_FNID_CPU_SUSPEND: break; case PSCI_FNID_CPU_OFF: CPU_CLR_ATOMIC(vcpu_id(vcpu), &running_cpumask); vm_suspend_cpu(vcpu); break; case PSCI_FNID_CPU_ON: mpidr = vme->u.smccc_call.args[0]; for (newcpu = 0; newcpu < guest_ncpus; newcpu++) { if (cpu_to_mpidr[newcpu] == mpidr) break; } if (newcpu == guest_ncpus) { smccc_rv = PSCI_RETVAL_INVALID_PARAMS; break; } if (CPU_TEST_SET_ATOMIC(newcpu, &running_cpumask)) { smccc_rv = PSCI_RETVAL_ALREADY_ON; break; } newvcpu = fbsdrun_vcpu(newcpu); assert(newvcpu != NULL); /* Set the context ID */ error = vm_set_register(newvcpu, VM_REG_GUEST_X0, vme->u.smccc_call.args[2]); assert(error == 0); /* Set the start program counter */ error = vm_set_register(newvcpu, VM_REG_GUEST_PC, vme->u.smccc_call.args[1]); assert(error == 0); vm_resume_cpu(newvcpu); smccc_rv = PSCI_RETVAL_SUCCESS; break; case PSCI_FNID_AFFINITY_INFO: smccc_rv = smccc_affinity_info(vme->u.smccc_call.args[0], vme->u.smccc_call.args[1]); break; case PSCI_FNID_SYSTEM_OFF: case PSCI_FNID_SYSTEM_RESET: if (vme->u.smccc_call.func_id == PSCI_FNID_SYSTEM_OFF) how = VM_SUSPEND_POWEROFF; else how = VM_SUSPEND_RESET; error = vm_suspend(ctx, how); assert(error == 0 || errno == EALREADY); break; default: break; } error = vm_set_register(vcpu, VM_REG_GUEST_X0, smccc_rv); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_hyp(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { /* Raise an unknown reason exception */ if (vm_inject_exception(vcpu, (EXCP_UNKNOWN << ESR_ELx_EC_SHIFT) | ESR_ELx_IL, vmrun->vm_exit->u.hyp.far_el2) != 0) return (VMEXIT_ABORT); return (VMEXIT_CONTINUE); } static int vmexit_brk(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { gdb_cpu_breakpoint(vcpu, vmrun->vm_exit); return (VMEXIT_CONTINUE); } static int vmexit_ss(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { gdb_cpu_debug(vcpu, vmrun->vm_exit); return (VMEXIT_CONTINUE); } const vmexit_handler_t vmexit_handlers[VM_EXITCODE_MAX] = { [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_REG_EMUL] = vmexit_reg_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_SMCCC] = 
vmexit_smccc, [VM_EXITCODE_HYP] = vmexit_hyp, [VM_EXITCODE_BRK] = vmexit_brk, [VM_EXITCODE_SS] = vmexit_ss, }; diff --git a/usr.sbin/bhyve/amd64/vmexit.c b/usr.sbin/bhyve/amd64/vmexit.c index 944f5de34645..14f89563fd0f 100644 --- a/usr.sbin/bhyve/amd64/vmexit.c +++ b/usr.sbin/bhyve/amd64/vmexit.c @@ -1,527 +1,529 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "gdb.h" #include "inout.h" #include "mem.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "spinup_ap.h" #include "vmexit.h" #include "xmsr.h" void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } static int vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; int error; int bytes, port, in; vme = vmrun->vm_exit; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; error = emulate_inout(ctx, vcpu, vme); if (error) { EPRINTLN("Unhandled %s%c 0x%04x at 0x%lx", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; uint64_t val; uint32_t eax, edx; int error; vme = vmrun->vm_exit; val = 0; error = emulate_rdmsr(vcpu, vme->u.msr.code, &val); if (error != 0) { if (get_config_bool("x86.strictmsr") || get_config_bool("x86.verbosemsr")) { EPRINTLN("rdmsr to register %#x on vcpu %d", vme->u.msr.code, vcpu_id(vcpu)); } if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; int error; vme = vmrun->vm_exit; error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { if (get_config_bool("x86.strictmsr") || get_config_bool("x86.verbosemsr")) { EPRINTLN("wrmsr to register %#x(%#lx) on vcpu %d", vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu)); } if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", 
[EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static int vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; EPRINTLN("vm exit[%d]", vcpu_id(vcpu)); EPRINTLN("\treason\t\tVMX"); EPRINTLN("\trip\t\t0x%016lx", vme->rip); EPRINTLN("\tinst_length\t%d", vme->inst_length); EPRINTLN("\tstatus\t\t%d", vme->u.vmx.status); EPRINTLN("\texit_reason\t%u (%s)", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); EPRINTLN("\tqualification\t0x%016lx", vme->u.vmx.exit_qualification); EPRINTLN("\tinst_type\t\t%d", vme->u.vmx.inst_type); EPRINTLN("\tinst_error\t\t%d", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(vcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); EPRINTLN("\tEPT misconfiguration:"); EPRINTLN("\t\tGPA: %#lx", ept_misconfig_gpa); EPRINTLN("\t\tPTE(%d): %#lx %#lx %#lx %#lx", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; EPRINTLN("vm exit[%d]", vcpu_id(vcpu)); EPRINTLN("\treason\t\tSVM"); EPRINTLN("\trip\t\t0x%016lx", vme->rip); EPRINTLN("\tinst_length\t%d", vme->inst_length); EPRINTLN("\texitcode\t%#lx", vme->u.svm.exitcode); EPRINTLN("\texitinfo1\t%#lx", vme->u.svm.exitinfo1); EPRINTLN("\texitinfo2\t%#lx", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. 
*/ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_mtrap(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; struct vie *vie; int err, i, cs_d; enum vm_cpu_mode mode; vme = vmrun->vm_exit; vie = &vme->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. */ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(vcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static int vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; enum vm_suspend_how how; int vcpuid = vcpu_id(vcpu); vme = vmrun->vm_exit; how = vme->u.suspended.how; fbsdrun_deletecpu(vcpuid); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); + case VM_SUSPEND_DESTROY: + exit(4); default: EPRINTLN("vmexit_suspend: invalid reason %d", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun __unused) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_suspend(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the STARTUP IPI. 
*/ usleep(1000); return (VMEXIT_CONTINUE); } static int vmexit_db(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_debug(vcpu, vmrun->vm_exit); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { gdb_cpu_breakpoint(vcpu, vmrun->vm_exit); return (VMEXIT_CONTINUE); } static int vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { struct vm_exit *vme; cpuset_t *dmask; int error = -1; int i; dmask = vmrun->cpuset; vme = vmrun->vm_exit; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: CPU_FOREACH_ISSET(i, dmask) { error = fbsdrun_suspendcpu(i); if (error) { warnx("failed to suspend cpu %d", i); break; } } break; case APIC_DELMODE_STARTUP: CPU_FOREACH_ISSET(i, dmask) { spinup_ap(fbsdrun_vcpu(i), vme->u.ipi.vector << PAGE_SHIFT); } error = 0; break; default: break; } return (error); } int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *); const vmexit_handler_t vmexit_handlers[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, [VM_EXITCODE_IPI] = vmexit_ipi, [VM_EXITCODE_HLT] = vmexit_hlt, [VM_EXITCODE_PAUSE] = vmexit_pause, [VM_EXITCODE_DB] = vmexit_db, }; diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 89c0b23961a8..c902c265da9e 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1,1303 +1,1303 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. 
.\" .Dd August 21, 2024 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl aCDeHhPSuWwxY .Oo .Sm off .Fl c\~ .Oo .Op Cm cpus= .Ar numcpus .Oc .Op Cm ,sockets= Ar n .Op Cm ,cores= Ar n .Op Cm ,threads= Ar n .Oc .Sm on .Oo Fl f .Sm off .Ar name Cm \&, .Oo .Cm string No | Cm file .Oc .Cm \&= Ar data .Sm on .Oc .Oo .Sm off .Fl G\~ .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Oc .Op Fl k Ar config_file .Op Fl K Ar layout .Oo Fl l .Sm off .Ar lpcdev Op Cm \&, Ar conf .Sm on .Oc .Sm off .Oo Fl m\~ .Ar memsize .Oo .Cm K | Cm k | Cm M | Cm m | Cm G | Cm g | Cm T | Cm t .Oc .Sm on .Oc .Op Fl o Ar var Ns Cm = Ns Ar value .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu .Op Fl r Ar file .Sm off .Oo Fl s\~ .Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf .Sm on .Oc .Op Fl U Ar uuid .Ar vmname .Nm .Fl l Cm help .Nm .Fl s Cm help .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. It can run guests on amd64 and arm64 platforms with suitable hardware support. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp .Nm is typically used with a boot ROM that can load the guest operating system. On arm64 platforms, this is currently required. If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise. On amd64, the .Pa edk2-bhyve package provides a UEFI firmware that can be used to boot the guest; on arm64 the .Pa u-boot-bhyve-arm64 package provides a U-Boot image that can be used to boot the guest. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl a The guest's local APIC is configured in xAPIC mode. This option only applies to the amd64 platform. xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl C Include guest memory in core files. .It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .Pp The maximum number of virtual CPUs defaults to the number of active physical CPUs in the system available via the .Va hw.vmm.maxcpu .Xr sysctl 8 variable. The limit can be adjusted via the .Va hw.vmm.maxcpu loader tunable. .It Fl D Destroy the VM on guest initiated power-off. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes and only applies to the amd64 platform. .It Fl f Ar name Ns Cm \&, Ns Oo Cm string Ns No | Ns Cm file Ns Oc Ns Cm \&= Ns Ar data Add a fw_cfg file .Ar name to the fw_cfg interface. If a .Cm string is specified, the fw_cfg file contains the string as data. If a .Cm file is specified, bhyve reads the file and adds the file content as fw_cfg data. .It Fl G Xo .Sm off .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Xc Start a debug server that uses the GDB protocol to export guest state to a debugger. 
An IPv4 TCP socket will be bound to the supplied .Ar bind_address and .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If the option begins with .Sq w , .Nm will pause execution at the first instruction, waiting for a debugger to attach. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. This option applies only to the amd64 platform. .It Fl h Print help message and exit. .It Fl k Ar config_file Set configuration variables from a simple, key-value config file. Each line of the config file is expected to consist of a config variable name, an equals sign .Pq Sq = , and a value. No spaces are permitted between the variable name, equals sign, or value. Blank lines and lines starting with .Sq # are ignored. See .Xr bhyve_config 5 for more details. .It Fl K Ar layout Specify the keyboard layout. The value names a layout file in .Ar /usr/share/bhyve/kbdlayout . This setting only takes effect when the guest is booted in UEFI mode and accessed over VNC. When using a VNC client that supports QEMU Extended Key Event Message (e.g. TigerVNC), this option isn't needed. When using a VNC client that doesn't support QEMU Extended Key Event Message (e.g. tightVNC), the layout defaults to the US keyboard unless specified otherwise. .It Fl l Cm help Print a list of supported LPC devices. .It Fl l Ar lpcdev Ns Op Cm \&, Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Cm com1 , com2 , com3 , and .Cm com4 , the TPM module .Cm tpm , the boot ROM device .Cm bootrom , the .Cm fwcfg type and the debug/test device .Cm pc-testdev . .Pp The possible values for the .Ar conf argument are listed in the .Fl s flag description. .Pp This option applies only to the amd64 platform. On arm64, the console and boot ROM devices are configured using the more generic .Fl o option. .It Xo .Fl m Ar memsize Ns Oo .Sm off .Cm K | k | M | m | G | g | T | t .Sm on .Oc .Xc Set the guest physical memory size. This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of .Cm K , M , G or .Cm T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. The default is 256M. .It Fl n Ar id Ns Cm \&, Ns Ar size Ns Cm \&, Ns Ar cpus Ns Op Cm \&, Ns Ar domain_policy Configure guest NUMA domains. This option applies only to the amd64 platform. .Pp The .Fl n option allows the guest physical address space to be partitioned into domains. The layout of each domain is encoded in an ACPI table visible to the guest operating system. The .Fl n option also allows the specification of a .Xr domainset 9 memory allocation policy for the host memory backing a given NUMA domain. A guest can have up to 8 NUMA domains. This feature requires that the guest use a boot ROM, and in particular cannot be used if the guest was initialized using .Xr bhyveload 8 . .Pp Each domain is identified by a numerical .Em id . The domain memory .Em size is specified using the same format as the .Fl m flag. The sum of all .Em size parameters overrides the total VM memory size specified by the .Fl m flag. However, if at least one domain memory size parameter is missing, the total VM memory size will be equally distributed across all emulated domains.
The .Em cpus parameter specifies the set of CPUs that are part of the domain. The .Em domain_policy parameter may be optionally used to configure the .Xr domainset 9 host NUMA memory allocation policy for an emulated domain. See the .Fl n flag of .Xr cpuset 1 for a list of valid NUMA memory allocation policies and their formats. .It Fl o Ar var Ns Cm = Ns Ar value Set the configuration variable .Ar var to .Ar value . See .Xr bhyve_config 5 for configuration options. .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. This option applies only to the amd64 platform. .It Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu Pin the guest's virtual CPU .Em vcpu to .Em hostcpu . Host CPUs and guest virtual CPUs are numbered starting from 0. A .Fl p option is required for every guest vCPU to be pinned. To map a 4 vCPU guest to host CPUs 12-15: .Bd -literal -p 0:12 -p 1:13 -p 2:14 -p 3:15 .Ed .It Fl r Ar file Resume a guest from a snapshot. The guest memory contents are restored from .Ar file , and the guest device and vCPU state are restored from the file .Dq Ar file Ns .kern . .Pp Note that the current snapshot file format requires that the configuration of devices in the new VM match the VM from which the snapshot was taken by specifying the same .Fl s and .Fl l options. The count of vCPUs and memory configuration are read from the snapshot. .It Fl S Wire guest memory. .It Fl s Cm help Print a list of supported PCI devices. .It Fl s Ar slot Ns Cm \&, Ns Ar emulation Ns Op Cm \&, Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Pp The .Ar slot can be specified in one of the following formats: .Pp .Bl -bullet -compact .It .Ar pcislot .It .Sm off .Ar pcislot Cm \&: Ar function .Sm on .It .Sm off .Ar bus Cm \&: Ar pcislot Cm \&: Ar function .Sm on .El .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. .Pp See .Sx "PCI EMULATION" for available options for the .Ar emulation argument. .It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl u RTC keeps UTC time. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl x The guest's local APIC is configured in x2APIC mode. This option applies only to the amd64 platform. .It Fl Y Disable MPtable generation. This option applies only to the amd64 platform. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh PCI EMULATION .Nm provides emulation for various PCI devices. They are specified by the .Fl s .Ar slot,emulation,conf configuration's .Ar emulation argument, which can be one of the following: .Bl -tag -width "amd_hostbridge" .It Cm hostbridge A simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. .It Cm amd_hostbridge Emulation identical to .Cm hostbridge using a PCI vendor ID of AMD. .It Cm passthru PCI pass-through device.
.It Cm virtio-net Virtio network interface. .It Cm virtio-blk Virtio block storage interface. .It Cm virtio-scsi Virtio SCSI interface. .It Cm virtio-9p Virtio 9p (VirtFS) interface. .It Cm virtio-rnd Virtio RNG interface. .It Cm virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Cm virtio-input Virtio input interface. .It Cm ahci AHCI controller attached to arbitrary devices. .It Cm ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Cm ahci-hd AHCI controller attached to a SATA hard drive. .It Cm e1000 Intel e82545 network interface. .It Cm uart PCI 16550 serial device. .It Cm lpc LPC PCI-ISA bridge with COM1, COM2, COM3, and COM4 16550 serial ports, a boot ROM, and, optionally, a TPM module, a fwcfg type, and the debug/test device. The LPC bridge emulation can only be configured on bus 0. .It Cm fbuf Raw framebuffer device attached to VNC server. .It Cm xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Cm nvme NVM Express (NVMe) controller. .It Cm hda High Definition Audio Controller. .El .Pp The optional parameter .Ar conf describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Ss Network device backends .Sm off .Bl -bullet .It .Xo .Cm tap Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm vmnet Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm netgraph,path= Ar ADDRESS Cm \&,peerhook= Ar HOOK .Op Cm \&,socket= Ar NAME .Op Cm \&,hook= Ar HOOK .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm slirp,hostfwd= Ar proto : Ar hostaddr : Ar hostport - Ar guestaddr : Ar guestport .Xc .El .Sm on .Pp If .Cm mac is not specified, the MAC address is derived from a fixed OUI, and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .Pp With .Cm virtio-net devices, the .Cm mtu parameter can be specified to inform the guest about the largest MTU that should be allowed, expressed in bytes. .Pp With .Cm netgraph backend, the .Cm path and .Cm peerhook parameters must be specified to set the destination node and corresponding hook. The optional parameters .Cm socket and .Cm hook may be used to set the .Xr ng_socket 4 node name and source hook. The .Ar ADDRESS , .Ar HOOK , and .Ar NAME must comply with .Xr netgraph 4 addressing rules. .Pp The slirp backend can be used to provide a NATed network to the guest. This backend has poor performance but does not require any network configuration on the host system. It depends on the .Pa net/libslirp port. The .Cm hostfwd option takes a 5-tuple describing how connections from the host are to be forwarded to the guest. Multiple rules can be specified, separated by semicolons. Note that semicolons must be escaped or quoted to prevent the shell from interpreting them. .Ss Block storage device backends: .Bl -bullet .Sm off .It .Ar /filename Op Cm \&, Ar block-device-options .It .Ar /dev/xxx Op Cm \&, Ar block-device-options .Sm on .El .Pp The .Ar block-device-options are: .Bl -tag -width 10n .It Cm nocache Open the file with .Dv O_DIRECT . .It Cm direct Open the file using .Dv O_SYNC . .It Cm ro Force the file to be opened read-only. 
.It Cm sectorsize= Ns Ar logical Ns Oo Cm \&/ Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .It Cm nodelete Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. .It Li bootindex= Ns Ar index Add the device to the bootorder at .Ar index . A fwcfg file is used to specify the bootorder. The guest firmware may ignore or not support this fwcfg file, in which case this feature will not work as expected. .El .Ss SCSI device backends .Bl -bullet .Sm off .It .Pa /dev/cam/ctl Oo Ar pp Cm \&. Ar vp Oc Oo Cm \&, Ar scsi-device-options Oc .Sm on .El .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Cm iid= Ns Ar IID Initiator ID to use when sending requests to the specified CTL port. The default value is 0. .It Li bootindex= Ns Ar index Add the device to the bootorder at .Ar index . A fwcfg file is used to specify the bootorder. The guest firmware may ignore or not support this fwcfg file, in which case this feature will not work as expected. .El .Ss 9P device backends .Bl -bullet .Sm off .It .Ar sharename Cm = Ar /path/to/share Op Cm \&, Ar 9p-device-options .Sm on .El .Pp The .Ar 9p-device-options are: .Bl -tag -width 10n .It Cm ro Expose the share in read-only mode. .El .Ss TTY device backends .Bl -tag -width 10n .It Cm stdio Connect the serial port to the standard input and output of the .Nm process. .It Ar /dev/xxx Use the host TTY device for serial port I/O. .It Ar tcp=ip:port Use a TCP server for serial port I/O. Configuring this option starts a TCP server that waits for connections. Only one connection is allowed at a time; any further attempt to connect is disconnected immediately. Note that this feature allows unprivileged users to access the guest console, so ensure that access is appropriately restricted. .El .Ss TPM device backends .Bl -bullet .Sm off .It .Ar type Ns \&, Ns Ar path Ns Op Cm \&, Ns Ar tpm-device-options .Sm on .El .Pp Emulate a TPM device. Supported options for .Ar type : .Bl -tag -width 10n .It Cm passthru Use a physical TPM device. The argument .Ar path needs to point to a valid TPM device path, e.g. .Pa /dev/tpm0 . .It Cm swtpm Connect to a running .Cm swtpm instance. The argument .Ar path needs to point to a UNIX domain socket that a .Cm swtpm process is listening on. .El .Pp The .Ar tpm-device-options are: .Bl -tag -width 10n .It Cm version= Ns Ar version Version of the TPM device according to the TCG specification. Defaults to .Cm 2.0 , which is the only version currently supported. .El .Ss Boot ROM device backends .Sm off .Bl -bullet .It .Ar romfile Ns Op Cm \&, Ns Ar varfile .El .Sm on .Pp Map .Ar romfile in the guest address space reserved for boot firmware. .Pp If .Ar varfile is provided, that file is also mapped in the boot firmware guest address space, and any modifications the guest makes will be saved to that file. .Pp Fwcfg types: .Bl -tag -width 10n .It Ar fwcfg The fwcfg interface is used to pass information such as the CPU count or ACPI tables to the guest firmware. Supported values are .Ql bhyve and .Ql qemu . For backward compatibility reasons, .Ql bhyve is the default option. When .Ql bhyve is selected, bhyve's own fwctl interface is used. It currently reports only the CPU count to the guest firmware. The .Ql qemu option uses QEMU's fwcfg interface. This interface is widely used and allows user-defined information to be passed to the guest.
It is used for passing the CPU count, ACPI tables, a boot order, and many other things to the guest. Some operating systems such as Fedora CoreOS can be configured via QEMU's fwcfg interface as well. .El .Ss Pass-through device backends .Sm off .Bl -bullet .It .Cm ppt Ar N Oo , Ar passthru-device-options Oc .It .Ns Ar bus Cm \&/ Ar slot Cm \&/ Ar function .Op , Ar passthru-device-options .It .Cm pci Ar bus Cm : Ar slot Cm : Ns Ar function .Op , Ar passthru-device-options .El .Sm on .Pp Connect to a PCI device on the host either named ppt .Ns Ar N or at the selector described by the .Ar bus , .Ar slot , and .Ar function numbers. .Pp The .Ar passthru-device-options are: .Bl -tag -width 10n .It Cm rom= Ns Ar romfile Add .Ar romfile as option ROM to the PCI device. The ROM will be loaded by firmware and should be capable of initializing the device. .It Li bootindex= Ns Ar index Add the device to the bootorder at .Ar index . A fwcfg file is used to specify the bootorder. The guest firmware may ignore or not support this fwcfg file, in which case this feature will not work as expected. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . .Ss Virtio console device backends .Bl -bullet .Sm off .It .Cm port1= Ns Ar /path/to/port1.sock Ns Op Cm ,port Ns Ar N Cm \&= Ns Ar /path/to/port2.sock No \~ Ar ... .Sm on .El .Pp A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet .It Due to the lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is currently no way to use the .Dq console port feature, nor console port resizing. .It Emergency write is advertised, but is currently a no-op. .El .Ss Virtio input device backends: .Bl -bullet .Sm off .It .Ar /dev/input/eventX .Sm on .El .Pp Send input events from .Ar /dev/input/eventX to the guest via the VirtIO input interface. .Ss Framebuffer device backends .Bl -bullet .Sm off .It .Op Cm rfb= Ar ip-and-port .Op Cm ,w= Ar width .Op Cm ,h= Ar height .Op Cm ,vga= Ar vgaconf .Op Cm ,wait .Op Cm ,password= Ar password .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm rfb= Ns Ar ip-and-port Pq or Cm tcp= Ns Ar ip-and-port The IP address and port that the VNC server should listen on. There are two formats: .Pp .Bl -bullet -compact .It .Sm off .Op Ar IPv4 Cm \&: .Ar port .Sm on .It .Sm off .Cm \&[ Ar IPv6%zone Cm \&] Cm \&: Ar port .Sm on .El .Pp The default is to listen on the localhost IPv4 address and the default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. .It Cm w= Ns Ar width No and Cm h= Ns Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. The minimum supported resolution is 640x480 pixels, and the maximum is 3840x2160 pixels. .It Cm vga= Ns Ar vgaconf Possible values for this option are .Cm io (default), .Cm on , and .Cm off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 .
The default .Cm io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Cm on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Cm off option should be used for UEFI guests that assume a VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It Cm wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It Cm password= Ns Ar password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .Ss xHCI USB device backends .Bl -bullet .Sm off .It .Ar tablet .Sm on .El .Pp A USB tablet device that provides precise cursor synchronization when using VNC. .Ss NVMe device backends .Bl -bullet .Sm off .It .Ar devpath .Op Cm ,maxq= Ar # .Op Cm ,qsz= Ar # .Op Cm ,ioslots= Ar # .Op Cm ,sectsz= Ar # .Op Cm ,ser= Ar # .Op Cm ,eui64= Ar # .Op Cm ,dsm= Ar opt .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Ar devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Cm ram= Ns Ar size_in_MiB . .It Cm maxq Max number of queues. .It Cm qsz Max elements in each queue. .It Cm ioslots Max number of concurrent I/O requests. .It Cm sectsz Sector size (defaults to blockif sector size). .It Cm ser Serial number with maximum 20 characters. .It Cm eui64 IEEE Extended Unique Identifier (8 byte value). .It Cm dsm DataSet Management support. Supported values are: .Cm auto , enable , and .Cm disable . .El .Ss AHCI device backends .Bl -bullet .It .Sm off .Op Oo Cm hd\&: | cd\&: Oc Ar path .Op Cm ,nmrr= Ar nmrr .Op Cm ,ser= Ar # .Op Cm ,rev= Ar # .Op Cm ,model= Ar # .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm nmrr Nominal Media Rotation Rate, also known as RPM. A value of 1 indicates the device is a Solid State Disk. The default value of 0 means the rate is not reported. .It Cm ser Serial Number with maximum 20 characters. .It Cm rev Revision Number with maximum 8 characters. .It Cm model Model Number with maximum 40 characters. .El .Ss HD Audio device backends .Bl -bullet .It .Sm off .Op Cm play= Ar playback .Op Cm ,rec= Ar recording .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm play Playback device, typically .Ar /dev/dsp0 . .It Cm rec Recording device, typically .Ar /dev/dsp0 . .El .Sh CONFIGURATION VARIABLES .Nm uses an internal tree of configuration variables to describe global and per-device settings. When .Nm starts, it parses command line options (including config files) in the order given on the command line. Each command line option sets one or more configuration variables. For example, the .Fl s option creates a new tree node for a PCI device and sets one or more variables under that node, including the device model and device model-specific variables. Variables may be set multiple times during this parsing stage, with the final value overriding previous values.
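.Pp For example, a disk device given with .Fl s might correspond to a small subtree of variables; the names below are illustrative, see .Xr bhyve_config 5 for the authoritative list:
.Bd -literal -offset indent
# -s 2:0,ahci-hd,./vm0.img roughly corresponds to:
pci.0.2.0.device=ahci-hd
pci.0.2.0.path=./vm0.img
.Ed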
.Pp Once all of the command line options have been processed, the configuration values are frozen. .Nm then uses the configuration values to initialize device models and global settings. .Pp More details on configuration variables can be found in .Xr bhyve_config 5 . .Sh CONFIGURATION FILE CREATION The .Fl k flag allows one to provide a path to a configuration file holding all settings, which otherwise would need to be defined by providing a long list of program arguments to .Nm . .Pp There is a very simple way to translate a complex set of program arguments to an equivalent configuration file in .Xr bhyve_config 5 format. .Pp Use .Fl o .Ar config.dump=1 to make .Nm dump a configuration file representing the used flags and arguments to stdout. You can pipe the output into a file to persist the generated settings. .Pp Make sure to remove the .Ar config.dump line from the resulting configuration file before using it to start .Nm . .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are supported on Intel CPUs that support single stepping. Note that continuing from a breakpoint while interrupts are enabled in the guest may not work as expected due to timer interrupts firing while single stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width SIGTERM -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 -triple fault +triple fault (amd64 only) .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. When a boot ROM is used, no separate boot loader is needed.
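.Pp For instance, a guest like the one in the first example below could be prepared with an .Xr bhyveload 8 invocation along these lines (illustrative; the disk image and VM name are placeholders):
.Bd -literal -offset indent
bhyveload -m 1G -d /my/image vm1
.Ed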
.Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -H -P -m 1G vm1 .Ed .Pp To do the same on arm64: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,virtio-blk,/my/image -o console=stdio \\ -o bootrom=/usr/local/share/u-boot/u-boot-bhyve-arm64/u-boot.bin -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device: .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at 0.0.0.0:5900, or via a serial console over TCP at 127.0.0.1:1234 (unsafe if the serial console is exposed without protection): .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,tcp=127.0.0.1:1234 \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900 and a serial I/O port bound to TCP port 1234 of the loopback address (again, unsafe if the serial console is exposed without protection): .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,tcp=[::1]:1234 \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VARS file to save EFI variables. Note that .Nm will write guest modifications to the given VARS file. Be sure to create a per-guest copy of the template VARS file from .Pa /usr . .Bd -literal -offset indent bhyve -c 2 -m 4g -w -H \\ -s 0,hostbridge \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI_CODE.fd,BHYVE_UEFI_VARS.fd uefivm .Ed .Pp To create a configuration file .Pa configfile for a virtual machine, use .Fl o .Ar config.dump=1 : .Bd -literal -offset indent /usr/sbin/bhyve -c 2 -m 256 -H -P \\ -s 0:0,hostbridge -s 1:0,virtio-net,tap0 \\ -s 2:0,ahci-hd,./vm0.img \\ -s 31,lpc -l com1,stdio \\ -o config.dump=1 vm0 > configfile .Ed .Pp Then use an editor of your choice to remove the line "config.dump=1" from the newly generated .Pa configfile .
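.Pp For reference, the generated .Pa configfile holds one .Ar var=value pair per line; for the invocation above it might begin with entries such as the following (illustrative, the exact set of variables can differ):
.Bd -literal -offset indent
config.dump=1
name=vm0
cpus=2
memory.size=256M
pci.0.0.0.device=hostbridge
pci.0.1.0.device=virtio-net
pci.0.1.0.backend=tap0
.Ed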
.Pp To start .Nm using this configuration file, use the .Fl k flag: .Bd -literal -offset indent /usr/sbin/bhyve -k configfile vm0 .Ed .Pp Run a UEFI virtual machine with four CPUs and two emulated NUMA domains: .Bd -literal -offset indent bhyve -c 4 -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ -n id=0,size=4G,cpus=0-1 \\ -n id=1,size=4G,cpus=2-3 \\ numavm .Ed .Pp Assuming a host machine with two NUMA domains, run a UEFI virtual machine with two CPUs using a .Ar prefer .Xr domainset 9 policy to allocate guest memory from the first host NUMA domain only. .Bd -literal -offset indent bhyve -c 2 -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ -n id=0,size=4G,cpus=0-1,domain_policy=prefer:0 \\ numavm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr netgraph 4 , .Xr ng_socket 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr bhyve_config 5 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 , .Xr domainset 9 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer's Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 9ead49582a7d..bfc0b949a75d 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1024 +1,1022 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #include #include "acpi.h" #include "bhyverun.h" #include "bootrom.h" #include "config.h" #include "debug.h" #ifdef BHYVE_GDB #include "gdb.h" #endif #include "mem.h" #include "mevent.h" #include "pci_emul.h" #ifdef __amd64__ #include "amd64/pci_lpc.h" #endif #include "qemu_fwcfg.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "tpm_device.h" #include "vmgenc.h" #include "vmexit.h" #define MB (1024UL * 1024) #define GB (1024UL * MB) int guest_ncpus; uint16_t cpu_cores, cpu_sockets, cpu_threads; int raw_stdio = 0; #ifdef BHYVE_SNAPSHOT char *restore_file; #endif static const int BSP = 0; static cpuset_t cpumask; static struct vm_mem_domain guest_domains[VM_MAXMEMDOM]; static int guest_ndomains = 0; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); static struct vcpu_info { struct vmctx *ctx; struct vcpu *vcpu; int vcpuid; } *vcpu_info; static cpuset_t **vcpumap; /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. */ int bhyve_topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } int bhyve_numa_parse(const char *opt) { int id = -1; nvlist_t *nvl; char *cp, *str, *tofree; char pathbuf[64] = { 0 }; char *size = NULL, *cpus = NULL, *domain_policy = NULL; if (*opt == '\0') { return (-1); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "id=", strlen("id=")) == 0) id = parse_int_value("id", cp + strlen("id="), 0, UINT8_MAX); else if (strncmp(cp, "size=", strlen("size=")) == 0) size = cp + strlen("size="); else if (strncmp(cp, "domain_policy=", strlen("domain_policy=")) == 0) domain_policy = cp + strlen("domain_policy="); else if (strncmp(cp, "cpus=", 
strlen("cpus=")) == 0) cpus = cp + strlen("cpus="); } if (id == -1) { EPRINTLN("Missing NUMA domain ID in '%s'", opt); goto out; } snprintf(pathbuf, sizeof(pathbuf), "domains.%d", id); nvl = find_config_node(pathbuf); if (nvl == NULL) nvl = create_config_node(pathbuf); if (size != NULL) set_config_value_node(nvl, "size", size); if (domain_policy != NULL) set_config_value_node(nvl, "domain_policy", domain_policy); if (cpus != NULL) set_config_value_node(nvl, "cpus", cpus); free(tofree); return (0); out: free(tofree); return (-1); } static void calc_mem_affinity(size_t vm_memsize) { int i; nvlist_t *nvl; bool need_recalc; const char *value; struct vm_mem_domain *dom; char pathbuf[64] = { 0 }; need_recalc = false; for (i = 0; i < VM_MAXMEMDOM; i++) { dom = &guest_domains[i]; snprintf(pathbuf, sizeof(pathbuf), "domains.%d", i); nvl = find_config_node(pathbuf); if (nvl == NULL) { break; } value = get_config_value_node(nvl, "size"); need_recalc |= value == NULL; if (value != NULL && vm_parse_memsize(value, &dom->size)) { errx(EX_USAGE, "invalid memsize for domain %d: '%s'", i, value); } dom->ds_mask = calloc(1, sizeof(domainset_t)); if (dom->ds_mask == NULL) { errx(EX_OSERR, "Failed to allocate domainset mask"); } dom->ds_size = sizeof(domainset_t); value = get_config_value_node(nvl, "domain_policy"); if (value == NULL) { dom->ds_policy = DOMAINSET_POLICY_INVALID; DOMAINSET_ZERO(dom->ds_mask); } else if (domainset_parselist(value, dom->ds_mask, &dom->ds_policy) != CPUSET_PARSE_OK) { errx(EX_USAGE, "failed to parse domain policy '%s'", value); } } guest_ndomains = i; if (guest_ndomains == 0) { /* * No domains were specified - create domain * 0 holding all CPUs and memory. */ guest_ndomains = 1; guest_domains[0].size = vm_memsize; } else if (need_recalc) { warnx("At least one domain memory size was not specified, distributing" " total VM memory size across all domains"); for (i = 0; i < guest_ndomains; i++) { guest_domains[i].size = vm_memsize / guest_ndomains; } } } /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. * * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. 
*/ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } int bhyve_pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? "," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } static void set_vcpu_affinities(void) { int cpu, error; nvlist_t *nvl = NULL; cpuset_t cpus; const char *value; char pathbuf[64] = { 0 }; for (int dom = 0; dom < guest_ndomains; dom++) { snprintf(pathbuf, sizeof(pathbuf), "domains.%d", dom); nvl = find_config_node(pathbuf); if (nvl == NULL) break; value = get_config_value_node(nvl, "cpus"); if (value == NULL) { EPRINTLN("Missing CPU set for domain %d", dom); exit(4); } parse_cpuset(dom, value, &cpus); CPU_FOREACH_ISSET(cpu, &cpus) { error = acpi_add_vcpu_affinity(cpu, dom); if (error) { EPRINTLN( "Unable to set vCPU %d affinity for domain %d: %s", cpu, dom, strerror(errno)); exit(4); } } } if (guest_ndomains > 1 || nvl != NULL) return; /* * If we're dealing with one domain and no cpuset was provided, create a * default one holding all cpus. 
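 * (This point is only reached when at most one domain is configured
 * and no "domains.N" config node exists; every vCPU is then bound to
 * domain 0 below.)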
*/ for (cpu = 0; cpu < guest_ncpus; cpu++) { error = acpi_add_vcpu_affinity(cpu, 0); if (error) { EPRINTLN( "Unable to set vCPU %d affinity for domain %d: %s", cpu, 0, strerror(errno)); exit(4); } } } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return (get_config_bool_default("virtio_msix", true)); } struct vcpu * fbsdrun_vcpu(int vcpuid) { return (vcpu_info[vcpuid].vcpu); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct vcpu_info *vi = param; int error; snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); pthread_set_name_np(pthread_self(), tname); if (vcpumap[vi->vcpuid] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vi->vcpuid]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vi->vcpuid); #endif #ifdef BHYVE_GDB gdb_cpu_add(vi->vcpu); #endif vm_loop(vi->ctx, vi->vcpu); - - /* not reached */ - exit(1); - return (NULL); + /* We get here if the VM was destroyed asynchronously. */ + exit(4); } void fbsdrun_addcpu(int vcpuid) { struct vcpu_info *vi; pthread_t thr; int error; vi = &vcpu_info[vcpuid]; error = vm_activate_cpu(vi->vcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); CPU_SET_ATOMIC(vcpuid, &cpumask); error = vm_suspend_cpu(vi->vcpu); assert(error == 0); error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); assert(error == 0); } void fbsdrun_deletecpu(int vcpu) { static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; pthread_mutex_lock(&resetcpu_mtx); if (!CPU_ISSET(vcpu, &cpumask)) { EPRINTLN("Attempting to delete unknown cpu %d", vcpu); exit(4); } CPU_CLR(vcpu, &cpumask); if (vcpu != BSP) { pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); /* NOTREACHED */ } while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); } int fbsdrun_suspendcpu(int vcpuid) { return (vm_suspend_cpu(vcpu_info[vcpuid].vcpu)); } static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu) { struct vm_exit vme; struct vm_run vmrun; int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus, dmask; error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); vmrun.vm_exit = &vme; vmrun.cpuset = &dmask; vmrun.cpusetsize = sizeof(dmask); while (1) { error = vm_run(vcpu, &vmrun); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || vmexit_handlers[exitcode] == NULL) { warnx("vm_loop: unexpected exitcode 0x%x", exitcode); exit(4); } rc = (*vmexit_handlers[exitcode])(ctx, vcpu, &vmrun); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } EPRINTLN("vm_run error %d, errno %d", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. 
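 * If the capability query fails, conservatively report a limit of a
 * single vCPU.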
*/ error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool romboot; romboot = bootrom_boot(); /* * If we don't have a boot ROM, the guest context must have been * initialized by bhyveload(8) or equivalent. */ ctx = vm_openf(vmname, romboot ? VMMAPI_OPEN_REINIT : 0); if (ctx == NULL) { if (errno != ENOENT) err(4, "vm_openf"); if (!romboot) errx(4, "no bootrom was configured"); ctx = vm_openf(vmname, VMMAPI_OPEN_CREATE); if (ctx == NULL) err(4, "vm_openf"); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } bool bhyve_parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); free(path); return (true); } void bhyve_parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!bhyve_parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } #ifdef BHYVE_GDB void bhyve_parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } #endif int main(int argc, char *argv[]) { int error; int max_vcpus, memflags; struct vcpu *bsp; struct vmctx *ctx; size_t memsize; const char *value, *vmname; #ifdef BHYVE_SNAPSHOT struct restore_state rstate; #endif bhyve_init_config(); bhyve_optparse(argc, argv); argc -= optind; argv += optind; if (argc > 1) bhyve_usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) bhyve_usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif bsp = vm_vcpu_open(ctx, BSP); max_vcpus = num_vcpus_allowed(ctx, bsp); if (guest_ncpus > 
max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } bhyve_init_vcpu(bsp); /* Allocate per-VCPU resources. */ vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { vcpu_info[vcpuid].ctx = ctx; vcpu_info[vcpuid].vcpuid = vcpuid; if (vcpuid == BSP) vcpu_info[vcpuid].vcpu = bsp; else vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); } calc_mem_affinity(memsize); memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); error = vm_setup_memory_domains(ctx, VM_MMAP_ALL, guest_domains, guest_ndomains); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } set_vcpu_affinities(); init_mem(guest_ncpus); init_bootrom(ctx); if (bhyve_init_platform(ctx, bsp) != 0) exit(4); if (qemu_fwcfg_init(ctx) != 0) { fprintf(stderr, "qemu fwcfg initialization error\n"); exit(4); } if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), &guest_ncpus) != 0) { fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu\n"); exit(4); } /* * Exit if a device emulation finds an error in its initialization */ if (init_pci(ctx) != 0) { EPRINTLN("Device emulation initialization error: %s", strerror(errno)); exit(4); } if (init_tpm(ctx) != 0) { EPRINTLN("Failed to init TPM device"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); #ifdef BHYVE_GDB init_gdb(ctx); #endif /* * Add all vCPUs. */ for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) bhyve_start_vcpu(vcpu_info[vcpuid].vcpu, vcpuid == BSP); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { FPRINTLN(stdout, "Pausing pci devs..."); if (vm_pause_devices() != 0) { EPRINTLN("Failed to pause PCI device state."); exit(1); } FPRINTLN(stdout, "Restoring vm mem..."); if (restore_vm_mem(ctx, &rstate) != 0) { EPRINTLN("Failed to restore VM memory."); exit(1); } FPRINTLN(stdout, "Restoring pci devs..."); if (vm_restore_devices(&rstate) != 0) { EPRINTLN("Failed to restore PCI device state."); exit(1); } FPRINTLN(stdout, "Restoring kernel structs..."); if (vm_restore_kern_structs(ctx, &rstate) != 0) { EPRINTLN("Failed to restore kernel structs."); exit(1); } FPRINTLN(stdout, "Resuming pci devs..."); if (vm_resume_devices() != 0) { EPRINTLN("Failed to resume PCI device state."); exit(1); } } #endif if (bhyve_init_platform_late(ctx, bsp) != 0) exit(4); /* * Change the proc title to include the VM name. 
*/ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { destroy_restore_state(&rstate); if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) vm_resume_cpu(vcpu_info[vcpuid].vcpu); } else #endif vm_resume_cpu(bsp); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/riscv/vmexit.c b/usr.sbin/bhyve/riscv/vmexit.c index 3bc83b3bef4e..985f8e4e9065 100644 --- a/usr.sbin/bhyve/riscv/vmexit.c +++ b/usr.sbin/bhyve/riscv/vmexit.c @@ -1,363 +1,365 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2024 Ruslan Bukin * * This software was developed by the University of Cambridge Computer * Laboratory (Department of Computer Science and Technology) under Innovate * UK project 105694, "Digital Security by Design (DSbD) Technology Platform * Prototype". * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "mem.h" #include "vmexit.h" #include "riscv.h" #define BHYVE_VERSION ((uint64_t)__FreeBSD_version) #define SBI_VERS_MAJOR 2 #define SBI_VERS_MINOR 0 static cpuset_t running_hartmask = CPUSET_T_INITIALIZER(0); void vmexit_set_bsp(int hart_id) { CPU_SET_ATOMIC(hart_id, &running_hartmask); } static int vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; struct vie *vie; int err; vme = vmrun->vm_exit; vie = &vme->u.inst_emul.vie; err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction "); FPRINTLN(stderr, "at 0x%lx", vme->pc); return (VMEXIT_ABORT); } static int vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; enum vm_suspend_how how; int vcpuid = vcpu_id(vcpu); vme = vmrun->vm_exit; how = vme->u.suspended.how; fbsdrun_deletecpu(vcpuid); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); + case VM_SUSPEND_DESTROY: + exit(4); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } /* NOT REACHED. */ return (0); } static int vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the * SBI_HSM_HART_START request. */ usleep(1000); return (VMEXIT_CONTINUE); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { return (VMEXIT_CONTINUE); } static int vmm_sbi_probe_extension(int ext_id) { switch (ext_id) { case SBI_EXT_ID_HSM: case SBI_EXT_ID_TIME: case SBI_EXT_ID_IPI: case SBI_EXT_ID_RFNC: case SBI_EXT_ID_SRST: case SBI_CONSOLE_PUTCHAR: case SBI_CONSOLE_GETCHAR: break; default: return (0); } return (1); } static int vmexit_ecall_hsm(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_exit *vme) { struct vcpu *newvcpu; uint64_t hart_id; int func_id; int error; hart_id = vme->u.ecall.args[0]; func_id = vme->u.ecall.args[6]; if (HART_TO_CPU(hart_id) >= (uint64_t)guest_ncpus) return (SBI_ERR_INVALID_PARAM); newvcpu = fbsdrun_vcpu(HART_TO_CPU(hart_id)); assert(newvcpu != NULL); switch (func_id) { case SBI_HSM_HART_START: if (CPU_ISSET(hart_id, &running_hartmask)) break; /* Set hart ID. */ error = vm_set_register(newvcpu, VM_REG_GUEST_A0, hart_id); assert(error == 0); /* Set PC. */ error = vm_set_register(newvcpu, VM_REG_GUEST_SEPC, vme->u.ecall.args[1]); assert(error == 0); /* Pass private data. */ error = vm_set_register(newvcpu, VM_REG_GUEST_A1, vme->u.ecall.args[2]); assert(error == 0); vm_resume_cpu(newvcpu); CPU_SET_ATOMIC(hart_id, &running_hartmask); break; case SBI_HSM_HART_STOP: if (!CPU_ISSET(hart_id, &running_hartmask)) break; CPU_CLR_ATOMIC(hart_id, &running_hartmask); vm_suspend_cpu(newvcpu); break; case SBI_HSM_HART_STATUS: /* TODO. 
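 * A full implementation would report the target hart's state
 * (e.g. started or stopped) as defined by the SBI HSM extension.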
*/ break; default: return (SBI_ERR_NOT_SUPPORTED); } return (SBI_SUCCESS); } static int vmexit_ecall_base(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { int sbi_function_id; uint32_t val; int ext_id; int error; sbi_function_id = vme->u.ecall.args[6]; switch (sbi_function_id) { case SBI_BASE_GET_SPEC_VERSION: val = SBI_VERS_MAJOR << SBI_SPEC_VERS_MAJOR_OFFSET; val |= SBI_VERS_MINOR << SBI_SPEC_VERS_MINOR_OFFSET; break; case SBI_BASE_GET_IMPL_ID: val = SBI_IMPL_ID_BHYVE; break; case SBI_BASE_GET_IMPL_VERSION: val = BHYVE_VERSION; break; case SBI_BASE_PROBE_EXTENSION: ext_id = vme->u.ecall.args[0]; val = vmm_sbi_probe_extension(ext_id); break; case SBI_BASE_GET_MVENDORID: val = MVENDORID_UNIMPL; break; case SBI_BASE_GET_MARCHID: val = MARCHID_UNIMPL; break; case SBI_BASE_GET_MIMPID: val = 0; break; default: return (SBI_ERR_NOT_SUPPORTED); } error = vm_set_register(vcpu, VM_REG_GUEST_A1, val); assert(error == 0); return (SBI_SUCCESS); } static int vmexit_ecall_srst(struct vmctx *ctx, struct vm_exit *vme) { enum vm_suspend_how how; int func_id; int type; func_id = vme->u.ecall.args[6]; type = vme->u.ecall.args[0]; switch (func_id) { case SBI_SRST_SYSTEM_RESET: switch (type) { case SBI_SRST_TYPE_SHUTDOWN: case SBI_SRST_TYPE_COLD_REBOOT: case SBI_SRST_TYPE_WARM_REBOOT: how = VM_SUSPEND_POWEROFF; vm_suspend(ctx, how); break; default: return (SBI_ERR_NOT_SUPPORTED); } break; default: return (SBI_ERR_NOT_SUPPORTED); } return (SBI_SUCCESS); } static int vmexit_ecall(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { int sbi_extension_id; struct vm_exit *vme; int error; int ret; vme = vmrun->vm_exit; sbi_extension_id = vme->u.ecall.args[7]; switch (sbi_extension_id) { case SBI_EXT_ID_SRST: ret = vmexit_ecall_srst(ctx, vme); break; case SBI_EXT_ID_BASE: ret = vmexit_ecall_base(ctx, vcpu, vme); break; case SBI_EXT_ID_HSM: ret = vmexit_ecall_hsm(ctx, vcpu, vme); break; case SBI_CONSOLE_PUTCHAR: case SBI_CONSOLE_GETCHAR: default: /* Unknown SBI extension. */ ret = SBI_ERR_NOT_SUPPORTED; break; } error = vm_set_register(vcpu, VM_REG_GUEST_A0, ret); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_hyp(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; printf("unhandled exception: scause %#lx\n", vme->u.hyp.scause); return (VMEXIT_ABORT); } const vmexit_handler_t vmexit_handlers[VM_EXITCODE_MAX] = { [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_HYP] = vmexit_hyp, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_ECALL] = vmexit_ecall, };
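For reference, the exit statuses used by the suspend handler above line up with the EXIT STATUS table in the manual page. The following is a simplified sketch of that mapping; suspend_exit_status() is a hypothetical helper, not part of the source, and the real vmexit_suspend() additionally honors destroy_on_poweroff and removes the exiting vCPU first. (VM_SUSPEND_TRIPLEFAULT, exit status 3, is handled only on amd64.)

#include <machine/vmm.h>	/* enum vm_suspend_how */

/*
 * Sketch: map a suspend cause to the bhyve exit status documented in
 * the EXIT STATUS section of the manual page.
 */
static int
suspend_exit_status(enum vm_suspend_how how)
{
	switch (how) {
	case VM_SUSPEND_RESET:
		return (0);	/* rebooted */
	case VM_SUSPEND_POWEROFF:
		return (1);	/* powered off */
	case VM_SUSPEND_HALT:
		return (2);	/* halted */
	case VM_SUSPEND_DESTROY:
		return (4);	/* VM destroyed asynchronously */
	default:
		return (100);	/* invalid suspend reason */
	}
}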