diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 55ca7395d63b..d2ee9cae127b 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -1,278 +1,279 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _VMMAPI_H_
 #define	_VMMAPI_H_
 
 #include <sys/param.h>
 #include <sys/cpuset.h>
 #include <machine/vmm_dev.h>
 
 #include <stdbool.h>
 
 /*
  * API version, so that out-of-tree consumers like grub-bhyve can make
  * compile-time decisions.
  */
 #define	VMMAPI_VERSION	0103	/* 2 digit major followed by 2 digit minor */
 
 struct iovec;
 struct vmctx;
 struct vm_snapshot_meta;
 enum x2apic_state;
 
 /*
  * Different styles of mapping the memory assigned to a VM into the address
  * space of the controlling process.
  */
 enum vm_mmap_style {
 	VM_MMAP_NONE,		/* no mapping */
 	VM_MMAP_ALL,		/* fully and statically mapped */
 	VM_MMAP_SPARSE,		/* mappings created on-demand */
 };
 
 /*
  * 'flags' value passed to 'vm_set_memflags()'.
  */
 #define	VM_MEM_F_INCORE	0x01	/* include guest memory in core file */
 #define	VM_MEM_F_WIRED	0x02	/* guest memory is wired */
 
 /*
  * Identifiers for memory segments:
  * - vm_setup_memory() uses VM_SYSMEM for the system memory segment.
  * - the remaining identifiers can be used to create devmem segments.
  */
 enum {
 	VM_SYSMEM,
 	VM_BOOTROM,
 	VM_FRAMEBUFFER,
+	VM_PCIROM,
 };
 
 /*
  * Get the length and name of the memory segment identified by 'ident'.
  * Note that system memory segments are identified by a NUL name.
  *
  * Returns 0 on success and non-zero otherwise.
  */
 int	vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name,
 	    size_t namesiz);
 
 /*
  * Iterate over the guest address space. This function finds an address range
  * that starts at an address >= *gpa.
  *
  * Returns 0 if the next address range was found and non-zero otherwise.
  */
 int	vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
 	    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
 
 int	vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
 				 size_t *lowmem_size, size_t *highmem_size);
 
 /*
  * Create a device memory segment identified by 'segid'.
  *
  * Returns a pointer to the memory segment on success and MAP_FAILED otherwise.
  */
 void	*vm_create_devmem(struct vmctx *ctx, int segid, const char *name,
 	    size_t len);
 
 /*
  * Map the memory segment identified by 'segid' into the guest address space
  * at [gpa,gpa+len) with protection 'prot'.
  */
 int	vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid,
 	    vm_ooffset_t segoff, size_t len, int prot);
 
 int	vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len);
 
 int	vm_create(const char *name);
 int	vm_get_device_fd(struct vmctx *ctx);
 struct vmctx *vm_open(const char *name);
 void	vm_destroy(struct vmctx *ctx);
 int	vm_parse_memsize(const char *optarg, size_t *memsize);
 int	vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
 void	*vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
 /* inverse operation to vm_map_gpa - extract guest address from host pointer */
 vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr);
 int	vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
 int	vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
 		   uint64_t gla, int prot, uint64_t *gpa, int *fault);
 int	vm_gla2gpa_nofault(struct vmctx *, int vcpuid,
 		   struct vm_guest_paging *paging, uint64_t gla, int prot,
 		   uint64_t *gpa, int *fault);
 uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
 void	vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
 void	vm_set_memflags(struct vmctx *ctx, int flags);
 int	vm_get_memflags(struct vmctx *ctx);
 int	vm_get_name(struct vmctx *ctx, char *buffer, size_t max_len);
 size_t	vm_get_lowmem_size(struct vmctx *ctx);
 size_t	vm_get_highmem_size(struct vmctx *ctx);
 int	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
 		    uint64_t base, uint32_t limit, uint32_t access);
 int	vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
 		    uint64_t *base, uint32_t *limit, uint32_t *access);
 int	vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
 			struct seg_desc *seg_desc);
 int	vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
 int	vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
 int	vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
     const int *regnums, uint64_t *regvals);
 int	vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
     const int *regnums, uint64_t *regvals);
 int	vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit);
 int	vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
 int	vm_reinit(struct vmctx *ctx);
 int	vm_apicid2vcpu(struct vmctx *ctx, int apicid);
 int	vm_inject_exception(struct vmctx *ctx, int vcpu, int vector,
     int errcode_valid, uint32_t errcode, int restart_instruction);
 int	vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
 int	vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector);
 int	vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg);
 int	vm_ioapic_assert_irq(struct vmctx *ctx, int irq);
 int	vm_ioapic_deassert_irq(struct vmctx *ctx, int irq);
 int	vm_ioapic_pulse_irq(struct vmctx *ctx, int irq);
 int	vm_ioapic_pincount(struct vmctx *ctx, int *pincount);
 int	vm_readwrite_kernemu_device(struct vmctx *ctx, int vcpu,
 	    vm_paddr_t gpa, bool write, int size, uint64_t *value);
 int	vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
 int	vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
 int	vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
 int	vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
 	    enum vm_intr_trigger trigger);
 int	vm_inject_nmi(struct vmctx *ctx, int vcpu);
 int	vm_capability_name2type(const char *capname);
 const char *vm_capability_type2name(int type);
 int	vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
 			  int *retval);
 int	vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
 			  int val);
 int	vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
 int	vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
 int	vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
 			   vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
 int	vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
 			     vm_paddr_t gpa, size_t len);
 int	vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot,
 	    int func, uint64_t addr, uint64_t msg, int numvec);
 int	vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
 	    int func, int idx, uint64_t addr, uint64_t msg,
 	    uint32_t vector_control);
 int	vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func);
 
 int	vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2);
 int	vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo);
 
 const cap_ioctl_t *vm_get_ioctls(size_t *len);
 
 /*
  * Return a pointer to the statistics buffer. Note that this is not MT-safe.
  */
 uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
 		       int *ret_entries);
 const char *vm_get_stat_desc(struct vmctx *ctx, int index);
 
 int	vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s);
 int	vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
 
 int	vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
 
 /*
  * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
  * The 'iovcnt' should be big enough to accommodate all GPA segments.
  *
  * retval	fault		Interpretation
  *   0		  0		Success
  *   0		  1		An exception was injected into the guest
  * EFAULT	 N/A		Error
  */
 int	vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
 	    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
 	    int *fault);
 void	vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
 	    void *host_dst, size_t len);
 void	vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
 	    struct iovec *guest_iov, size_t len);
 void	vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov,
 	    int iovcnt);
 
 /* RTC */
 int	vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value);
 int	vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval);
 int	vm_rtc_settime(struct vmctx *ctx, time_t secs);
 int	vm_rtc_gettime(struct vmctx *ctx, time_t *secs);
 
 /* Reset vcpu register state */
 int	vcpu_reset(struct vmctx *ctx, int vcpu);
 
 int	vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus);
 int	vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus);
 int	vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus);
 int	vm_activate_cpu(struct vmctx *ctx, int vcpu);
 int	vm_suspend_cpu(struct vmctx *ctx, int vcpu);
 int	vm_resume_cpu(struct vmctx *ctx, int vcpu);
 
 /* CPU topology */
 int	vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores,
 	    uint16_t threads, uint16_t maxcpus);
 int	vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores,
 	    uint16_t *threads, uint16_t *maxcpus);
 
 /*
  * FreeBSD specific APIs
  */
 int	vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
 				uint64_t rip, uint64_t cr3, uint64_t gdtbase,
 				uint64_t rsp);
 int	vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu,
 					uint32_t eip, uint32_t gdtbase,
 					uint32_t esp);
 void	vm_setup_freebsd_gdt(uint64_t *gdtr);
 
 /*
  * Save and restore
  */
 
 #define MAX_SNAPSHOT_VMNAME 100
 
 enum checkpoint_opcodes {
 	START_CHECKPOINT = 0,
 	START_SUSPEND = 1,
 };
 
 struct checkpoint_op {
 	unsigned int op;
 	char snapshot_filename[MAX_SNAPSHOT_VMNAME];
 };
 
 int	vm_snapshot_req(struct vm_snapshot_meta *meta);
 int	vm_restore_time(struct vmctx *ctx);
 
 #endif	/* _VMMAPI_H_ */
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 5a5744a3fb16..3d6fca939498 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -1,2951 +1,2951 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bhyve_snapshot.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vnode_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <machine/md_var.h>
 #include <x86/psl.h>
 #include <x86/apicreg.h>
 #include <x86/ifunc.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
 #include <machine/vmm_snapshot.h>
 
 #include "vmm_ioport.h"
 #include "vmm_ktr.h"
 #include "vmm_host.h"
 #include "vmm_mem.h"
 #include "vmm_util.h"
 #include "vatpic.h"
 #include "vatpit.h"
 #include "vhpet.h"
 #include "vioapic.h"
 #include "vlapic.h"
 #include "vpmtmr.h"
 #include "vrtc.h"
 #include "vmm_stat.h"
 #include "vmm_lapic.h"
 
 #include "io/ppt.h"
 #include "io/iommu.h"
 
 struct vlapic;
 
 /*
  * Initialization:
  * (a) allocated when vcpu is created
  * (i) initialized when vcpu is created and when it is reinitialized
  * (o) initialized the first time the vcpu is created
  * (x) initialized before use
  */
 struct vcpu {
 	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
 	enum vcpu_state	state;		/* (o) vcpu state */
 	int		hostcpu;	/* (o) vcpu's host cpu */
 	int		reqidle;	/* (i) request vcpu to idle */
 	struct vlapic	*vlapic;	/* (i) APIC device model */
 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
 	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
 	int		nmi_pending;	/* (i) NMI pending */
 	int		extint_pending;	/* (i) INTR pending */
 	int	exception_pending;	/* (i) exception pending */
 	int	exc_vector;		/* (x) exception collateral */
 	int	exc_errcode_valid;
 	uint32_t exc_errcode;
 	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
 	void		*stats;		/* (a,i) statistics */
 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
 	uint64_t	nextrip;	/* (x) next instruction to execute */
 	uint64_t	tsc_offset;	/* (o) TSC offsetting */
 };
 
 /*
  * Convenience wrappers around the per-vcpu mutex 'mtx', which is
  * initialized and acquired as a spin mutex.
  */
 #define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
 #define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
 
 /*
  * A guest memory segment.  A slot whose 'object' is NULL is unallocated.
  */
 struct mem_seg {
 	size_t	len;			/* segment length in bytes */
 	bool	sysmem;			/* system memory (vs. device memory) */
 	struct vm_object *object;	/* backing VM object */
 };
-#define	VM_MAX_MEMSEGS	3
+#define	VM_MAX_MEMSEGS	4
 
 /*
  * One mapping of (part of) a memory segment into the guest physical
  * address space.  A slot with 'len' == 0 is unused.
  */
 struct mem_map {
 	vm_paddr_t	gpa;		/* guest physical start address */
 	size_t		len;		/* length in bytes; 0 if slot is free */
 	vm_ooffset_t	segoff;		/* offset into the backing segment */
 	int		segid;		/* index into vm->mem_segs[] */
 	int		prot;		/* VM_PROT_* protection bits */
 	int		flags;		/* VM_MEMMAP_F_* flags */
 };
 #define	VM_MAX_MEMMAPS	8
 
 /*
  * Initialization:
  * (o) initialized the first time the VM is created
  * (i) initialized when VM is created and when it is reinitialized
  * (x) initialized before use
  */
 struct vm {
 	void		*cookie;		/* (i) cpu-specific data */
 	void		*iommu;			/* (x) iommu-specific data */
 	struct vhpet	*vhpet;			/* (i) virtual HPET */
 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
 	struct vatpic	*vatpic;		/* (i) virtual atpic */
 	struct vatpit	*vatpit;		/* (i) virtual atpit */
 	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
 	struct vrtc	*vrtc;			/* (o) virtual RTC */
 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
 	int		suspend;		/* (i) stop VM execution */
 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
 	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
 	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
 	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
 	vm_rendezvous_func_t rendezvous_func;
 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 	struct vmspace	*vmspace;		/* (o) guest's address space */
 	char		name[VM_MAX_NAMELEN+1];	/* (o) virtual machine name */
 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
 	/* The following describe the vm cpu topology */
 	uint16_t	sockets;		/* (o) num of sockets */
 	uint16_t	cores;			/* (o) num of cores/socket */
 	uint16_t	threads;		/* (o) num of threads/core */
 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
 };
 
 static int vmm_initialized;
 
 static void	vmmops_panic(void);
 
 /*
  * Fallback resolved by DEFINE_VMMOPS_IFUNC() when neither vmm_is_intel()
  * nor vmm_is_svm() is true; calling it panics.
  */
 static void
 vmmops_panic(void)
 {
 	panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()");
 }
 
 #define	DEFINE_VMMOPS_IFUNC(ret_type, opname, args)			\
     DEFINE_IFUNC(static, ret_type, vmmops_##opname, args)		\
     {									\
     	if (vmm_is_intel())						\
     		return (vmm_ops_intel.opname);				\
     	else if (vmm_is_svm())						\
     		return (vmm_ops_amd.opname);				\
     	else								\
     		return ((ret_type (*)args)vmmops_panic);		\
     }
 
 DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum))
 DEFINE_VMMOPS_IFUNC(int, modcleanup, (void))
 DEFINE_VMMOPS_IFUNC(void, modresume, (void))
 DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap))
 DEFINE_VMMOPS_IFUNC(int, run, (void *vmi, int vcpu, register_t rip,
     struct pmap *pmap, struct vm_eventinfo *info))
 DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi))
 DEFINE_VMMOPS_IFUNC(int, getreg, (void *vmi, int vcpu, int num,
     uint64_t *retval))
 DEFINE_VMMOPS_IFUNC(int, setreg, (void *vmi, int vcpu, int num,
     uint64_t val))
 DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vmi, int vcpu, int num,
     struct seg_desc *desc))
 DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vmi, int vcpu, int num,
     struct seg_desc *desc))
 DEFINE_VMMOPS_IFUNC(int, getcap, (void *vmi, int vcpu, int num, int *retval))
 DEFINE_VMMOPS_IFUNC(int, setcap, (void *vmi, int vcpu, int num, int val))
 DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min,
     vm_offset_t max))
 DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace))
 DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vmi, int vcpu))
 DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (void *vmi, struct vlapic *vlapic))
 #ifdef BHYVE_SNAPSHOT
 DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta
     *meta))
 DEFINE_VMMOPS_IFUNC(int, vmcx_snapshot, (void *vmi, struct vm_snapshot_meta
     *meta, int vcpu))
 DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vmi, int vcpuid, uint64_t now))
 #endif
 
 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
 #define	fpu_stop_emulating()	clts()
 
 SDT_PROVIDER_DEFINE(vmm);
 
 static MALLOC_DEFINE(M_VM, "vm", "vm");
 
 /* statistics */
 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 
 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
     NULL);
 
 /*
  * Halt the guest if all vcpus are executing a HLT instruction with
  * interrupts disabled.
  */
 static int halt_detection_enabled = 1;
 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
     &halt_detection_enabled, 0,
     "Halt VM if all vcpus execute HLT with interrupts disabled");
 
 static int vmm_ipinum;
 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
     "IPI vector used for vcpu notifications");
 
 static int trace_guest_exceptions;
 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
     &trace_guest_exceptions, 0,
     "Trap into hypervisor on all guest exceptions and reflect them back");
 
 static void vm_free_memmap(struct vm *vm, int ident);
 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
 
 #ifdef KTR
 /*
  * Translate a vcpu state into a short human-readable label.  States that
  * are not recognized map to "unknown".
  */
 static const char *
 vcpu_state2str(enum vcpu_state state)
 {
 	const char *label;
 
 	if (state == VCPU_IDLE)
 		label = "idle";
 	else if (state == VCPU_FROZEN)
 		label = "frozen";
 	else if (state == VCPU_RUNNING)
 		label = "running";
 	else if (state == VCPU_SLEEPING)
 		label = "sleeping";
 	else
 		label = "unknown";
 
 	return (label);
 }
 #endif
 
 /*
  * Release the resources of vcpu 'i'.  The vlapic is always torn down; the
  * statistics buffer and the guest FPU save area are freed only when
  * 'destroy' is true.
  */
 static void
 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 {
 	struct vcpu *vc;
 
 	vc = &vm->vcpu[i];
 	vmmops_vlapic_cleanup(vm->cookie, vc->vlapic);
 	if (!destroy)
 		return;
 
 	vmm_stat_free(vc->stats);
 	fpu_save_area_free(vc->guestfpu);
 }
 
 /*
  * Initialize vcpu 'vcpu_id'.  When 'create' is true the one-time state
  * (lock, run state, FPU save area, statistics buffer, TSC offset) is set
  * up first; the remaining state is (re)initialized on every call.
  */
 static void
 vcpu_init(struct vm *vm, int vcpu_id, bool create)
 {
 	struct vcpu *vc;
 
 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
 
 	vc = &vm->vcpu[vcpu_id];
 
 	/* One-time setup, performed only when the vcpu is first created. */
 	if (create) {
 		KASSERT(!vcpu_lock_initialized(vc), ("vcpu %d already "
 		    "initialized", vcpu_id));
 		vcpu_lock_init(vc);
 		vc->state = VCPU_IDLE;
 		vc->hostcpu = NOCPU;
 		vc->guestfpu = fpu_save_area_alloc();
 		vc->stats = vmm_stat_alloc();
 		vc->tsc_offset = 0;
 	}
 
 	/* State that is reset on creation and on every reinitialization. */
 	vc->vlapic = vmmops_vlapic_init(vm->cookie, vcpu_id);
 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
 	vc->reqidle = 0;
 	vc->exitintinfo = 0;
 	vc->nmi_pending = 0;
 	vc->extint_pending = 0;
 	vc->exception_pending = 0;
 	vc->guest_xcr0 = XFEATURE_ENABLED_X87;
 	fpu_save_area_reset(vc->guestfpu);
 	vmm_stat_init(vc->stats);
 }
 
 /*
  * Return the value of the hw.vmm.trace_guest_exceptions tunable.  The
  * 'vm' and 'vcpuid' arguments are not consulted.
  */
 int
 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
 {
 
 	return (trace_guest_exceptions);
 }
 
 /*
  * Return a pointer to the exit information of vcpu 'cpuid'.  Panics if
  * 'cpuid' is outside [0, vm->maxcpus).
  */
 struct vm_exit *
 vm_exitinfo(struct vm *vm, int cpuid)
 {
 
 	if (cpuid < 0 || cpuid >= vm->maxcpus)
 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
 
 	return (&vm->vcpu[cpuid].exitinfo);
 }
 
 /*
  * One-time module initialization: set up host state, pick the IPI vector
  * used for vcpu notifications, initialize the memory subsystem and hand
  * over to the CPU-specific modinit routine.
  *
  * Returns 0 on success and an errno value otherwise.
  */
 static int
 vmm_init(void)
 {
 	int rc;
 
 	if (!vmm_is_hw_supported())
 		return (ENXIO);
 
 	vmm_host_state_init();
 
 	if (pti)
 		vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn1_pti));
 	else
 		vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn));
 	/* Fall back to IPI_AST when lapic_ipi_alloc() returns a negative value. */
 	if (vmm_ipinum < 0)
 		vmm_ipinum = IPI_AST;
 
 	rc = vmm_mem_init();
 	if (rc != 0)
 		return (rc);
 
 	vmm_resume_p = vmmops_modresume;
 
 	return (vmmops_modinit(vmm_ipinum));
 }
 
 /*
  * Module event handler.
  *
  * MOD_LOAD fails with ENXIO when the hardware is not supported; otherwise
  * it initializes the device layer and the module proper, and records
  * success in 'vmm_initialized'.  MOD_UNLOAD is a no-op on unsupported
  * hardware and otherwise tears everything down again.  All other events
  * succeed without doing anything.
  */
 static int
 vmm_handler(module_t mod, int what, void *arg)
 {
 	int error;
 
 	error = 0;
 	switch (what) {
 	case MOD_LOAD:
 		if (!vmm_is_hw_supported()) {
 			error = ENXIO;
 			break;
 		}
 		vmmdev_init();
 		error = vmm_init();
 		if (error == 0)
 			vmm_initialized = 1;
 		break;
 	case MOD_UNLOAD:
 		if (!vmm_is_hw_supported())
 			break;
 		error = vmmdev_cleanup();
 		if (error != 0)
 			break;
 		vmm_resume_p = NULL;
 		iommu_cleanup();
 		/* IPI_AST is the fallback vector and was never allocated. */
 		if (vmm_ipinum != IPI_AST)
 			lapic_ipi_free(vmm_ipinum);
 		error = vmmops_modcleanup();
 		/*
 		 * Something bad happened - prevent new
 		 * VMs from being created
 		 */
 		if (error != 0)
 			vmm_initialized = 0;
 		break;
 	default:
 		break;
 	}
 	return (error);
 }
 
 /*
  * Module descriptor handed to DECLARE_MODULE(): the module name, the
  * event handler, and a NULL third field.
  */
 static moduledata_t vmm_kmod = {
 	"vmm",
 	vmm_handler,
 	NULL
 };
 
 /*
  * vmm initialization has the following dependencies:
  *
  * - VT-x initialization requires smp_rendezvous() and therefore must happen
  *   after SMP is fully functional (after SI_SUB_SMP).
  */
 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
 MODULE_VERSION(vmm, 1);
 
 /*
  * (Re)initialize the per-VM state: the CPU-specific cookie, the virtual
  * devices, the cpu sets and every vcpu.  The virtual RTC is created only
  * when 'create' is true.
  */
 static void
 vm_init(struct vm *vm, bool create)
 {
 	int vcpuid;
 
 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
 	vm->iommu = NULL;
 
 	/* Virtual devices. */
 	vm->vioapic = vioapic_init(vm);
 	vm->vhpet = vhpet_init(vm);
 	vm->vatpic = vatpic_init(vm);
 	vm->vatpit = vatpit_init(vm);
 	vm->vpmtmr = vpmtmr_init(vm);
 	if (create)
 		vm->vrtc = vrtc_init(vm);
 
 	/* No vcpu is active, stopped for debug or suspended yet. */
 	CPU_ZERO(&vm->active_cpus);
 	CPU_ZERO(&vm->debug_cpus);
 
 	vm->suspend = 0;
 	CPU_ZERO(&vm->suspended_cpus);
 
 	for (vcpuid = 0; vcpuid < vm->maxcpus; vcpuid++)
 		vcpu_init(vm, vcpuid, create);
 }
 
 /*
  * The default CPU topology is a single thread per package.
  */
 u_int cores_per_package = 1;
 u_int threads_per_core = 1;
 
 /*
  * Create a virtual machine called 'name' and return it in '*retvm'.
  *
  * Returns 0 on success, ENXIO if vmm.ko was not initialized successfully,
  * EINVAL if 'name' is NULL or longer than VM_MAX_NAMELEN, and ENOMEM if
  * the guest vmspace cannot be allocated.
  */
 int
 vm_create(const char *name, struct vm **retvm)
 {
 	struct vmspace *vmspace;
 	struct vm *newvm;
 
 	/*
 	 * If vmm.ko could not be successfully initialized then don't attempt
 	 * to create the virtual machine.
 	 */
 	if (!vmm_initialized)
 		return (ENXIO);
 
 	if (name == NULL)
 		return (EINVAL);
 	if (strnlen(name, VM_MAX_NAMELEN + 1) == VM_MAX_NAMELEN + 1)
 		return (EINVAL);
 
 	vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48);
 	if (vmspace == NULL)
 		return (ENOMEM);
 
 	newvm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 	/* The length check above guarantees that 'name' fits. */
 	strcpy(newvm->name, name);
 	newvm->vmspace = vmspace;
 	mtx_init(&newvm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
 
 	newvm->sockets = 1;
 	newvm->cores = cores_per_package;	/* XXX backwards compatibility */
 	newvm->threads = threads_per_core;	/* XXX backwards compatibility */
 	newvm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
 
 	vm_init(newvm, true);
 
 	*retvm = newvm;
 	return (0);
 }
 
 /*
  * Copy the VM's CPU topology (sockets, cores per socket, threads per core
  * and maximum number of cpus) into the caller-supplied locations.  All
  * four pointers are dereferenced unconditionally.
  */
 void
 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
     uint16_t *threads, uint16_t *maxcpus)
 {
 	*sockets = vm->sockets;
 	*cores = vm->cores;
 	*threads = vm->threads;
 	*maxcpus = vm->maxcpus;
 }
 
 /*
  * Return the VM's 'maxcpus' value, which bounds the valid vcpu ids.
  */
 uint16_t
 vm_get_maxcpus(struct vm *vm)
 {
 	return (vm->maxcpus);
 }
 
 /*
  * Set the guest CPU topology.
  *
  * 'maxcpus' must be 0 (changing it is not supported yet) and the product
  * sockets * cores * threads must not exceed the VM's current 'maxcpus'.
  *
  * Returns 0 on success and EINVAL otherwise.
  */
 int
 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
     uint16_t threads, uint16_t maxcpus)
 {
 	uint64_t ncpus;
 
 	if (maxcpus != 0)
 		return (EINVAL);	/* XXX remove when supported */
 	/*
 	 * Compute the product in 64 bits.  The uint16_t operands would
 	 * otherwise be promoted to int, where the multiplication can
 	 * overflow and let an oversized topology slip past the check.
 	 */
 	ncpus = (uint64_t)sockets * cores * threads;
 	if (ncpus > vm->maxcpus)
 		return (EINVAL);
 	/* XXX need to check sockets * cores * threads == vCPU, how? */
 	vm->sockets = sockets;
 	vm->cores = cores;
 	vm->threads = threads;
 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
 	return (0);
 }
 
 /*
  * Tear down the state set up by vm_init().
  *
  * When 'destroy' is false the virtual RTC is reset instead of freed, and
  * the system memory mappings, the memory segments and the vmspace are
  * all retained.  When 'destroy' is true everything is released.
  */
 static void
 vm_cleanup(struct vm *vm, bool destroy)
 {
 	struct mem_map *mm;
 	int i;
 
 	ppt_unassign_all(vm);
 
 	if (vm->iommu != NULL)
 		iommu_destroy_domain(vm->iommu);
 
 	if (destroy)
 		vrtc_cleanup(vm->vrtc);
 	else
 		vrtc_reset(vm->vrtc);
 	vpmtmr_cleanup(vm->vpmtmr);
 	vatpit_cleanup(vm->vatpit);
 	vhpet_cleanup(vm->vhpet);
 	vatpic_cleanup(vm->vatpic);
 	vioapic_cleanup(vm->vioapic);
 
 	for (i = 0; i < vm->maxcpus; i++)
 		vcpu_cleanup(vm, i, destroy);
 
 	vmmops_cleanup(vm->cookie);
 
 	/*
 	 * System memory is removed from the guest address space only when
 	 * the VM is destroyed. This is because the mapping remains the same
 	 * across VM reset.
 	 *
 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
 	 * so those mappings are removed on a VM reset.
 	 */
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (destroy || !sysmem_mapping(vm, mm))
 			vm_free_memmap(vm, i);
 	}
 
 	/* Segments and the vmspace itself go away only on destroy. */
 	if (destroy) {
 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
 			vm_free_memseg(vm, i);
 
 		vmmops_vmspace_free(vm->vmspace);
 		vm->vmspace = NULL;
 	}
 }
 
 /*
  * Release all resources held by 'vm' and free the structure itself.  The
  * pointer must not be used afterwards.
  */
 void
 vm_destroy(struct vm *vm)
 {
 	vm_cleanup(vm, true);
 	free(vm, M_VM);
 }
 
 /*
  * Reset a virtual machine.
  *
  * Returns 0 on success and EBUSY if the set of suspended vcpus differs
  * from the set of active vcpus.
  */
 int
 vm_reinit(struct vm *vm)
 {
 
 	/*
 	 * A virtual machine can be reset only if all vcpus are suspended.
 	 */
 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0)
 		return (EBUSY);
 
 	vm_cleanup(vm, false);
 	vm_init(vm, false);
 	return (0);
 }
 
 /*
  * Return the VM's name.  The returned pointer refers to storage inside
  * 'vm' itself.
  */
 const char *
 vm_name(struct vm *vm)
 {
 	return (vm->name);
 }
 
 /*
  * Map the host physical range starting at 'hpa' into the guest address
  * space at [gpa, gpa + len).
  *
  * Returns 0 on success and ENOMEM if vmm_mmio_alloc() fails.
  */
 int
 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 {
 
 	if (vmm_mmio_alloc(vm->vmspace, gpa, len, hpa) == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /*
  * Remove the MMIO mapping at [gpa, gpa + len) from the guest address
  * space.  Always returns 0.
  */
 int
 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 {
 
 	vmm_mmio_free(vm->vmspace, gpa, len);
 	return (0);
 }
 
 /*
  * Return 'true' if 'gpa' is allocated in the guest address space.
  *
  * This function is called in the context of a running vcpu which acts as
  * an implicit lock on 'vm->mem_maps[]'.
  */
 bool
 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
 {
 	struct mem_map *map;
 	int slot;
 
 #ifdef INVARIANTS
 	int hostcpu, state;
 	state = vcpu_get_state(vm, vcpuid, &hostcpu);
 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
 #endif
 
 	/* Look for an in-use mapping that covers 'gpa'. */
 	for (slot = 0; slot < VM_MAX_MEMMAPS; slot++) {
 		map = &vm->mem_maps[slot];
 		if (map->len == 0)
 			continue;
 		if (gpa >= map->gpa && gpa < map->gpa + map->len)
 			return (true);		/* 'gpa' is sysmem or devmem */
 	}
 
 	if (ppt_is_mmio(vm, gpa))
 		return (true);			/* 'gpa' is pci passthru mmio */
 
 	return (false);
 }
 
 /*
  * Allocate the memory segment 'ident' with a page-aligned, non-zero
  * length 'len'.
  *
  * Returns 0 on success, EINVAL for an invalid 'ident' or 'len' or when
  * the segment already exists with different attributes, EEXIST when it
  * already exists with identical attributes, and ENOMEM if the backing
  * object cannot be allocated.
  */
 int
 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
 {
 	struct mem_seg *ms;
 	vm_object_t backing;
 
 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 	if (len == 0 || (len & PAGE_MASK) != 0)
 		return (EINVAL);
 
 	ms = &vm->mem_segs[ident];
 	if (ms->object != NULL) {
 		if (ms->len != len || ms->sysmem != sysmem)
 			return (EINVAL);
 		return (EEXIST);
 	}
 
 	backing = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
 	if (backing == NULL)
 		return (ENOMEM);
 
 	ms->object = backing;
 	ms->len = len;
 	ms->sysmem = sysmem;
 	return (0);
 }
 
 /*
  * Report the attributes of memory segment 'ident'.  Each of 'len',
  * 'sysmem' and 'objptr' may be NULL when the caller is not interested in
  * that attribute.
  *
  * Returns 0 on success and EINVAL if 'ident' is out of range.
  */
 int
 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
     vm_object_t *objptr)
 {
 	struct mem_seg *ms;
 
 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 
 	ms = &vm->mem_segs[ident];
 	if (len != NULL)
 		*len = ms->len;
 	if (sysmem != NULL)
 		*sysmem = ms->sysmem;
 	if (objptr != NULL)
 		*objptr = ms->object;
 	return (0);
 }
 
 /*
  * Drop the reference on the object backing memory segment 'ident' and
  * clear the slot.  Does nothing if the segment is not allocated.
  */
 void
 vm_free_memseg(struct vm *vm, int ident)
 {
 	struct mem_seg *ms;
 
 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
 	    ("%s: invalid memseg ident %d", __func__, ident));
 
 	ms = &vm->mem_segs[ident];
 	if (ms->object == NULL)
 		return;
 
 	vm_object_deallocate(ms->object);
 	bzero(ms, sizeof(struct mem_seg));
 }
 
 /*
  * Map [first, first + len) of memory segment 'segid' into the guest
  * address space at 'gpa' with protection 'prot', optionally wiring it.
  *
  * Returns 0 on success, EINVAL for invalid arguments, ENOSPC if every
  * mem_maps[] slot is in use, EFAULT if the range cannot be mapped or
  * wired, and ENOMEM if wiring fails with KERN_RESOURCE_SHORTAGE.
  */
 int
 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
     size_t len, int prot, int flags)
 {
 	struct mem_seg *seg;
 	struct mem_map *m, *map;
 	vm_ooffset_t last;
 	int i, error;
 
 	/* At least one protection bit, and nothing outside VM_PROT_ALL. */
 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
 		return (EINVAL);
 
 	/* VM_MEMMAP_F_WIRED is the only flag accepted from callers. */
 	if (flags & ~VM_MEMMAP_F_WIRED)
 		return (EINVAL);
 
 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 
 	/* The segment must have been allocated already. */
 	seg = &vm->mem_segs[segid];
 	if (seg->object == NULL)
 		return (EINVAL);
 
 	/*
 	 * The range must be non-empty and lie within the segment; the
 	 * 'first >= last' test also rejects a wrapped 'first + len'.
 	 */
 	last = first + len;
 	if (first < 0 || first >= last || last > seg->len)
 		return (EINVAL);
 
 	/* Everything must be page aligned. */
 	if ((gpa | first | last) & PAGE_MASK)
 		return (EINVAL);
 
 	/* Find a free mem_maps[] slot (len == 0 marks a free slot). */
 	map = NULL;
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		m = &vm->mem_maps[i];
 		if (m->len == 0) {
 			map = m;
 			break;
 		}
 	}
 
 	if (map == NULL)
 		return (ENOSPC);
 
 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
 	if (error != KERN_SUCCESS)
 		return (EFAULT);
 
 	/*
 	 * NOTE(review): presumably the new map entry holds its own
 	 * reference on the object - confirm against vm_map_find(9).
 	 */
 	vm_object_reference(seg->object);
 
 	if (flags & VM_MEMMAP_F_WIRED) {
 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 		if (error != KERN_SUCCESS) {
 			/* Undo the mapping before reporting the failure. */
 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
 			return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
 			    EFAULT);
 		}
 	}
 
 	/* Record the mapping only once it is fully established. */
 	map->gpa = gpa;
 	map->len = len;
 	map->segoff = first;
 	map->segid = segid;
 	map->prot = prot;
 	map->flags = flags;
 	return (0);
 }
 
 /*
  * Remove the memory mapping that exactly matches 'gpa' and 'len'.
  * Mappings flagged VM_MEMMAP_F_IOMMU are never selected.
  *
  * Returns 0 if a mapping was removed and EINVAL if none matched.
  */
 int
 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
 {
 	const struct mem_map *mm;
 	int idx;
 
 	for (idx = 0; idx < VM_MAX_MEMMAPS; idx++) {
 		mm = &vm->mem_maps[idx];
 		if (mm->gpa != gpa || mm->len != len)
 			continue;
 		if ((mm->flags & VM_MEMMAP_F_IOMMU) != 0)
 			continue;
 		vm_free_memmap(vm, idx);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Find the in-use memory mapping with the lowest guest physical address
  * that is greater than or equal to '*gpa'.
  *
  * On success '*gpa' is updated to the mapping's address and every non-NULL
  * output pointer receives the corresponding attribute; 0 is returned.
  * ENOENT is returned when no such mapping exists, in which case nothing is
  * written.
  */
 int
 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
 {
 	struct mem_map *cand, *best;
 	int idx;
 
 	best = NULL;
 	for (idx = 0; idx < VM_MAX_MEMMAPS; idx++) {
 		cand = &vm->mem_maps[idx];
 		if (cand->len != 0 && cand->gpa >= *gpa &&
 		    (best == NULL || cand->gpa < best->gpa))
 			best = cand;
 	}
 
 	if (best == NULL)
 		return (ENOENT);
 
 	*gpa = best->gpa;
 	if (segid != NULL)
 		*segid = best->segid;
 	if (segoff != NULL)
 		*segoff = best->segoff;
 	if (len != NULL)
 		*len = best->len;
 	if (prot != NULL)
 		*prot = best->prot;
 	if (flags != NULL)
 		*flags = best->flags;
 	return (0);
 }
 
 static void
 vm_free_memmap(struct vm *vm, int ident)
 {
 	struct mem_map *mm;
 	int error;
 
 	mm = &vm->mem_maps[ident];
 	if (mm->len) {
 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
 		    mm->gpa + mm->len);
 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
 		    __func__, error));
 		bzero(mm, sizeof(struct mem_map));
 	}
 }
 
 /*
  * Return true if 'mm' is an in-use mapping whose backing segment is
  * flagged as system memory.
  */
 static __inline bool
 sysmem_mapping(struct vm *vm, struct mem_map *mm)
 {
 
 	return (mm->len != 0 && vm->mem_segs[mm->segid].sysmem);
 }
 
 /*
  * Return the highest end address (gpa + len) over all system memory
  * mappings, or 0 if there are none.
  */
 vm_paddr_t
 vmm_sysmem_maxaddr(struct vm *vm)
 {
 	struct mem_map *mm;
 	vm_paddr_t end, highest;
 	int idx;
 
 	highest = 0;
 	for (idx = 0; idx < VM_MAX_MEMMAPS; idx++) {
 		mm = &vm->mem_maps[idx];
 		if (!sysmem_mapping(vm, mm))
 			continue;
 		end = mm->gpa + mm->len;
 		if (end > highest)
 			highest = end;
 	}
 	return (highest);
 }
 
 /*
  * Add (map == true) or remove (map == false) the guest-physical to
  * host-physical translations of every system memory mapping in the VM's
  * IOMMU domain ('vm->iommu'), one page at a time.
  *
  * When mapping, only mappings flagged VM_MEMMAP_F_WIRED are processed and
  * they are then tagged with VM_MEMMAP_F_IOMMU.  When unmapping, only
  * mappings carrying VM_MEMMAP_F_IOMMU are processed and that flag is
  * cleared.
  */
 static void
 vm_iommu_modify(struct vm *vm, bool map)
 {
 	int i, sz;
 	vm_paddr_t gpa, hpa;
 	struct mem_map *mm;
 	void *vp, *cookie, *host_domain;
 
 	sz = PAGE_SIZE;
 	host_domain = iommu_host_domain();
 
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (!sysmem_mapping(vm, mm))
 			continue;
 
 		if (map) {
 			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
 			    ("iommu map found invalid memmap %#lx/%#lx/%#x",
 			    mm->gpa, mm->len, mm->flags));
 			/* Unwired mappings are not entered into the IOMMU. */
 			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
 				continue;
 			mm->flags |= VM_MEMMAP_F_IOMMU;
 		} else {
 			/* Only undo what an earlier map pass set up. */
 			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
 				continue;
 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
 			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
 			    ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
 			    mm->gpa, mm->len, mm->flags));
 		}
 
 		/* Walk the mapping a page at a time. */
 		gpa = mm->gpa;
 		while (gpa < mm->gpa + mm->len) {
 			vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
 					 &cookie);
 			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
 			    vm_name(vm), gpa));
 
 			/*
 			 * The hold is dropped immediately; 'vp' is only used
 			 * below to derive the host physical address.
 			 */
 			vm_gpa_release(cookie);
 
 			hpa = DMAP_TO_PHYS((uintptr_t)vp);
 			if (map) {
 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
 			} else {
 				iommu_remove_mapping(vm->iommu, gpa, sz);
 			}
 
 			gpa += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * Invalidate the cached translations associated with the domain
 	 * from which pages were removed.
 	 */
 	if (map)
 		iommu_invalidate_tlb(host_domain);
 	else
 		iommu_invalidate_tlb(vm->iommu);
 }
 
 /* Convenience wrappers selecting the direction of vm_iommu_modify(). */
 #define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), false)
 #define	vm_iommu_map(vm)	vm_iommu_modify((vm), true)
 
 /*
  * Detach the passthrough device at bus/slot/func from the VM.  Once the
  * VM has no passthrough devices left its IOMMU mappings are removed.
  *
  * Returns 0 on success or the error from ppt_unassign_device().
  */
 int
 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 	int error;
 
 	error = ppt_unassign_device(vm, bus, slot, func);
 	if (error == 0 && ppt_assigned_devices(vm) == 0)
 		vm_iommu_unmap(vm);
 
 	return (error);
 }
 
 /*
  * Attach the passthrough device at bus/slot/func to the VM.
  *
  * When this is the VM's first passthrough device an IOMMU domain sized by
  * vmm_sysmem_maxaddr() is created and populated before the device is
  * assigned.
  *
  * Returns ENXIO if the IOMMU domain cannot be created, otherwise the
  * result of ppt_assign_device().
  */
 int
 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 
 	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
 	if (ppt_assigned_devices(vm) == 0) {
 		KASSERT(vm->iommu == NULL,
 		    ("vm_assign_pptdev: iommu must be NULL"));
 		vm->iommu = iommu_create_domain(vmm_sysmem_maxaddr(vm));
 		if (vm->iommu == NULL)
 			return (ENXIO);
 		vm_iommu_map(vm);
 	}
 
 	return (ppt_assign_device(vm, bus, slot, func));
 }
 
 /*
  * Hold the page backing guest physical address 'gpa' and return a kernel
  * direct-map pointer to that address.
  *
  * The range [gpa, gpa + len) must not cross a page boundary; otherwise
  * the function panics.  'reqprot' is the access passed to
  * vm_fault_quick_hold_pages().  'vcpuid' is only examined under
  * INVARIANTS, where -1 means "check every vcpu".
  *
  * On success '*cookie' is set to the held page and must later be passed
  * to vm_gpa_release().  If 'gpa' is not covered by any memory mapping or
  * the page cannot be held, '*cookie' is set to NULL and NULL is returned.
  */
 void *
 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
 	    void **cookie)
 {
 	int i, count, pageoff;
 	struct mem_map *mm;
 	vm_page_t m;
 #ifdef INVARIANTS
 	/*
 	 * All vcpus are frozen by ioctls that modify the memory map
 	 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
 	 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
 	 */
 	int state;
 	KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
 	    __func__, vcpuid));
 	for (i = 0; i < vm->maxcpus; i++) {
 		if (vcpuid != -1 && vcpuid != i)
 			continue;
 		state = vcpu_get_state(vm, i, NULL);
 		KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
 		    __func__, state));
 	}
 #endif
 	/* The request must fit within the page that contains 'gpa'. */
 	pageoff = gpa & PAGE_MASK;
 	if (len > PAGE_SIZE - pageoff)
 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
 
 	/* 'count' stays 0 (and 'm' unset) if no mapping covers 'gpa'. */
 	count = 0;
 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 		mm = &vm->mem_maps[i];
 		if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
 			break;
 		}
 	}
 
 	if (count == 1) {
 		*cookie = m;
 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
 	} else {
 		*cookie = NULL;
 		return (NULL);
 	}
 }
 
 void
 vm_gpa_release(void *cookie)
 {
 	vm_page_t m = cookie;
 
 	vm_page_unwire(m, PQ_ACTIVE);
 }
 
 /*
  * Read register 'reg' of vcpu 'vcpu' into '*retval'.
  *
  * Returns EINVAL if 'vcpu' is out of range or 'reg' is not below
  * VM_REG_LAST (only the upper bound of 'reg' is checked here); otherwise
  * the result of vmmops_getreg().
  */
 int
 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
 {
 
 	if (vcpu < 0 || vcpu >= vm->maxcpus || reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	return (vmmops_getreg(vm->cookie, vcpu, reg, retval));
 }
 
 /*
  * Write 'val' to register 'reg' of vcpu 'vcpuid'.
  *
  * A successful write to VM_REG_GUEST_RIP also updates the vcpu's 'nextrip'
  * so that it matches the new %rip.
  *
  * Returns EINVAL if 'vcpuid' is out of range or 'reg' is not below
  * VM_REG_LAST; otherwise the result of vmmops_setreg().
  */
 int
 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
 {
 	int error;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus || reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	error = vmmops_setreg(vm->cookie, vcpuid, reg, val);
 	if (error == 0 && reg == VM_REG_GUEST_RIP) {
 		/* Set 'nextrip' to match the value of %rip */
 		VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
 		vm->vcpu[vcpuid].nextrip = val;
 	}
 	return (error);
 }
 
 /* Return true if 'reg' names the guest IDTR or GDTR. */
 static bool
 is_descriptor_table(int reg)
 {
 
 	return (reg == VM_REG_GUEST_IDTR || reg == VM_REG_GUEST_GDTR);
 }
 
 static bool
 is_segment_register(int reg)
 {
 
 	switch (reg) {
 	case VM_REG_GUEST_ES:
 	case VM_REG_GUEST_CS:
 	case VM_REG_GUEST_SS:
 	case VM_REG_GUEST_DS:
 	case VM_REG_GUEST_FS:
 	case VM_REG_GUEST_GS:
 	case VM_REG_GUEST_TR:
 	case VM_REG_GUEST_LDTR:
 		return (true);
 	default:
 		return (false);
 	}
 }
 
 /*
  * Fetch the descriptor of segment register or descriptor table 'reg' of
  * vcpu 'vcpu' into '*desc'.
  *
  * Returns EINVAL if 'vcpu' is out of range or 'reg' is neither a segment
  * register nor a descriptor table; otherwise the result of
  * vmmops_getdesc().
  */
 int
 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
 {
 
 	if (vcpu < 0 || vcpu >= vm->maxcpus)
 		return (EINVAL);
 
 	if (is_segment_register(reg) || is_descriptor_table(reg))
 		return (vmmops_getdesc(vm->cookie, vcpu, reg, desc));
 
 	return (EINVAL);
 }
 
 /*
  * Load '*desc' into segment register or descriptor table 'reg' of vcpu
  * 'vcpu'.
  *
  * Returns EINVAL if 'vcpu' is out of range or 'reg' is neither a segment
  * register nor a descriptor table; otherwise the result of
  * vmmops_setdesc().
  */
 int
 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
 {
 
 	if (vcpu < 0 || vcpu >= vm->maxcpus)
 		return (EINVAL);
 
 	if (is_segment_register(reg) || is_descriptor_table(reg))
 		return (vmmops_setdesc(vm->cookie, vcpu, reg, desc));
 
 	return (EINVAL);
 }
 
 /*
  * Load the guest's FPU state (and guest XCR0 when the host has XSAVE
  * enabled) onto the current CPU, then re-enable FPU emulation so that any
  * host use of the FPU traps.  The order of the steps below matters.
  */
 static void
 restore_guest_fpustate(struct vcpu *vcpu)
 {
 
 	/* flush host state to the pcb */
 	fpuexit(curthread);
 
 	/* restore guest FPU state */
 	fpu_stop_emulating();
 	fpurestore(vcpu->guestfpu);
 
 	/* restore guest XCR0 if XSAVE is enabled in the host */
 	if (rcr4() & CR4_XSAVE)
 		load_xcr(0, vcpu->guest_xcr0);
 
 	/*
 	 * The FPU is now "dirty" with the guest's state so turn on emulation
 	 * to trap any access to the FPU by the host.
 	 */
 	fpu_start_emulating();
 }
 
 /*
  * Save the guest's FPU state into 'vcpu->guestfpu' and, when the host has
  * XSAVE enabled, record the guest XCR0 and reinstate the host XCR0.
  *
  * Panics if CR0.TS is clear on entry, i.e. if FPU emulation is not
  * enabled in the host.
  */
 static void
 save_guest_fpustate(struct vcpu *vcpu)
 {
 
 	if ((rcr0() & CR0_TS) == 0)
 		panic("fpu emulation not enabled in host!");
 
 	/* save guest XCR0 and restore host XCR0 */
 	if (rcr4() & CR4_XSAVE) {
 		vcpu->guest_xcr0 = rxcr(0);
 		load_xcr(0, vmm_get_host_xcr0());
 	}
 
 	/* save guest FPU state */
 	fpu_stop_emulating();
 	fpusave(vcpu->guestfpu);
 	fpu_start_emulating();
 }
 
 /* Ticks a vcpu thread spent asleep in vm_handle_hlt(). */
 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
 
 /*
  * Move vcpu 'vcpuid' to 'newstate'.  The vcpu lock must be held.
  *
  * With 'from_idle' set the call first waits for the vcpu to reach
  * VCPU_IDLE, repeatedly requesting it to go idle; without it the vcpu
  * must not currently be idle.
  *
  * Returns 0 on success and EBUSY if the requested transition is not one
  * of those listed in the comment above the switch below.
  */
 static int
 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
     bool from_idle)
 {
 	struct vcpu *vcpu;
 	int error;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu_assert_locked(vcpu);
 
 	/*
 	 * State transitions from the vmmdev_ioctl() must always begin from
 	 * the VCPU_IDLE state. This guarantees that there is only a single
 	 * ioctl() operating on a vcpu at any point.
 	 */
 	if (from_idle) {
 		while (vcpu->state != VCPU_IDLE) {
 			vcpu->reqidle = 1;
 			vcpu_notify_event_locked(vcpu, false);
 			VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
 			    "idle requested", vcpu_state2str(vcpu->state));
 			/* Woken by the wakeup() below, or after 'hz' ticks. */
 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
 		}
 	} else {
 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
 		    "vcpu idle state"));
 	}
 
 	/* 'hostcpu' is only meaningful while the vcpu is running. */
 	if (vcpu->state == VCPU_RUNNING) {
 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
 	} else {
 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
 		    "vcpu that is not running", vcpu->hostcpu));
 	}
 
 	/*
 	 * The following state transitions are allowed:
 	 * IDLE -> FROZEN -> IDLE
 	 * FROZEN -> RUNNING -> FROZEN
 	 * FROZEN -> SLEEPING -> FROZEN
 	 */
 	switch (vcpu->state) {
 	case VCPU_IDLE:
 	case VCPU_RUNNING:
 	case VCPU_SLEEPING:
 		error = (newstate != VCPU_FROZEN);
 		break;
 	case VCPU_FROZEN:
 		error = (newstate == VCPU_FROZEN);
 		break;
 	default:
 		error = 1;
 		break;
 	}
 
 	if (error)
 		return (EBUSY);
 
 	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
 	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
 
 	vcpu->state = newstate;
 	if (newstate == VCPU_RUNNING)
 		vcpu->hostcpu = curcpu;
 	else
 		vcpu->hostcpu = NOCPU;
 
 	/* Release anyone waiting in the 'from_idle' loop above. */
 	if (newstate == VCPU_IDLE)
 		wakeup(&vcpu->state);
 
 	return (0);
 }
 
 static void
 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
 		panic("Error %d setting state to %d\n", error, newstate);
 }
 
 static void
 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
 		panic("Error %d setting state to %d", error, newstate);
 }
 
 /*
  * Emit a rendezvous trace record.  A non-negative 'vcpuid' produces a
  * per-vcpu record via VCPU_CTR0(); a negative one (a caller that is not a
  * vcpu) produces a VM-wide record via VM_CTR0().
  *
  * 'vcpuid' is parenthesized in the comparison so that an expression
  * argument is evaluated as a unit.  'fmt' is deliberately passed through
  * untouched: presumably the CTR macros paste it onto a string literal --
  * confirm before wrapping it in parentheses.
  */
 #define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
 	do {								\
 		if ((vcpuid) >= 0)					\
 			VCPU_CTR0(vm, vcpuid, fmt);			\
 		else							\
 			VM_CTR0(vm, fmt);				\
 	} while (0)
 
 /*
  * Take part in, or wait for completion of, the rendezvous in progress.
  *
  * 'vcpuid' is either a valid vcpu id or -1 for a caller that is not a
  * vcpu.  A vcpu that is part of 'rendezvous_req_cpus' and has not yet run
  * the rendezvous function runs it once and is recorded in
  * 'rendezvous_done_cpus'.  The call then sleeps until every requested
  * vcpu is done; whichever caller observes completion clears
  * 'rendezvous_func' and wakes the other waiters.
  *
  * Returns 0, or the non-zero error from thread_check_susp() if the thread
  * must stop waiting (the rendezvous mutex is not held in either case).
  */
 static int
 vm_handle_rendezvous(struct vm *vm, int vcpuid)
 {
 	struct thread *td;
 	int error;
 
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
 	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
 
 	td = curthread;
 	mtx_lock(&vm->rendezvous_mtx);
 	while (vm->rendezvous_func != NULL) {
 		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
 		CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus,
 		    &vm->active_cpus);
 
 		if (vcpuid != -1 &&
 		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
 		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
 			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
 			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
 			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
 		}
 		if (CPU_CMP(&vm->rendezvous_req_cpus,
 		    &vm->rendezvous_done_cpus) == 0) {
 			/*
 			 * 'vcpuid' may be -1 here, so use the tracing macro
 			 * that copes with a non-vcpu caller.
 			 */
 			RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous completed");
 			vm->rendezvous_func = NULL;
 			wakeup(&vm->rendezvous_func);
 			break;
 		}
 		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
 		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
 		    "vmrndv", hz);
 		if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
 			/* Drop the mutex around the suspension check. */
 			mtx_unlock(&vm->rendezvous_mtx);
 			error = thread_check_susp(td, true);
 			if (error != 0)
 				return (error);
 			mtx_lock(&vm->rendezvous_mtx);
 		}
 	}
 	mtx_unlock(&vm->rendezvous_mtx);
 	return (0);
 }
 
 /*
  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
  *
  * 'intr_disabled' tells whether the guest halted with interrupts
  * disabled.  In that case pending external/local APIC interrupts do not
  * end the halt and, if halt detection is enabled, the vcpu is recorded in
  * 'vm->halted_cpus'; once that set equals 'vm->active_cpus' the VM is
  * suspended with VM_SUSPEND_HALT.
  *
  * The vcpu's bit in 'vm->halted_cpus' is always cleared again before
  * returning, including on the error path.
  *
  * 'retu' is not modified.  Returns 0, or the non-zero error from
  * thread_check_susp() if the thread must stop waiting.
  */
 static int
 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
 {
 	struct vcpu *vcpu;
 	const char *wmesg;
 	struct thread *td;
 	int error, t, vcpu_halted, vm_halted;
 
 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
 
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu_halted = 0;
 	vm_halted = 0;
 	error = 0;
 	td = curthread;
 
 	vcpu_lock(vcpu);
 	while (1) {
 		/*
 		 * Do a final check for pending NMI or interrupts before
 		 * really putting this thread to sleep. Also check for
 		 * software events that would cause this vcpu to wakeup.
 		 *
 		 * These interrupts/events could have happened after the
 		 * vcpu returned from vmmops_run() and before it acquired the
 		 * vcpu lock above.
 		 */
 		if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
 			break;
 		if (vm_nmi_pending(vm, vcpuid))
 			break;
 		if (!intr_disabled) {
 			if (vm_extint_pending(vm, vcpuid) ||
 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
 				break;
 			}
 		}
 
 		/* Don't go to sleep if the vcpu thread needs to yield */
 		if (vcpu_should_yield(vm, vcpuid))
 			break;
 
 		if (vcpu_debugged(vm, vcpuid))
 			break;
 
 		/*
 		 * Some Linux guests implement "halt" by having all vcpus
 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
 		 * track of the vcpus that have entered this state. When all
 		 * vcpus enter the halted state the virtual machine is halted.
 		 */
 		if (intr_disabled) {
 			wmesg = "vmhalt";
 			VCPU_CTR0(vm, vcpuid, "Halted");
 			if (!vcpu_halted && halt_detection_enabled) {
 				vcpu_halted = 1;
 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
 			}
 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
 				vm_halted = 1;
 				break;
 			}
 		} else {
 			wmesg = "vmidle";
 		}
 
 		t = ticks;
 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
 		/*
 		 * XXX msleep_spin() cannot be interrupted by signals so
 		 * wake up periodically to check pending signals.
 		 */
 		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
 		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
 		if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
 			vcpu_unlock(vcpu);
 			error = thread_check_susp(td, false);
 			if (error != 0) {
 				/*
 				 * Do not leave this vcpu marked as halted:
 				 * a stale bit would trip the assertion at
 				 * the top of this function on the next call
 				 * and could make the VM look fully halted.
 				 */
 				if (vcpu_halted) {
 					CPU_CLR_ATOMIC(vcpuid,
 					    &vm->halted_cpus);
 				}
 				return (error);
 			}
 			vcpu_lock(vcpu);
 		}
 	}
 
 	if (vcpu_halted)
 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
 
 	vcpu_unlock(vcpu);
 
 	if (vm_halted)
 		vm_suspend(vm, VM_SUSPEND_HALT);
 
 	return (0);
 }
 
 /*
  * Resolve a nested paging fault recorded in the vcpu's exit information.
  *
  * Read and write faults are first offered to
  * pmap_emulate_accessed_dirty(); if that does not resolve the fault (and
  * for all execute faults) the fault is passed to vm_fault() on the guest
  * address space.
  *
  * 'retu' is not modified.  Returns 0 if the fault was resolved and EFAULT
  * otherwise.
  */
 static int
 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vm_exit *vme;
 	int ftype, rv;
 
 	vme = &vm->vcpu[vcpuid].exitinfo;
 
 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
 	    __func__, vme->inst_length));
 
 	ftype = vme->u.paging.fault_type;
 	KASSERT(ftype == VM_PROT_READ ||
 	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
 	    ("vm_handle_paging: invalid fault_type %d", ftype));
 
 	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
 		    vme->u.paging.gpa, ftype);
 		if (rv == 0) {
 			VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
 			    ftype == VM_PROT_READ ? "accessed" : "dirty",
 			    vme->u.paging.gpa);
 			return (0);
 		}
 	}
 
 	rv = vm_fault(&vm->vmspace->vm_map, vme->u.paging.gpa, ftype,
 	    VM_FAULT_NORMAL, NULL);
 
 	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
 	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
 
 	return (rv == KERN_SUCCESS ? 0 : EFAULT);
 }
 
 /*
  * Handle an instruction emulation exit for vcpu 'vcpuid'.
  *
  * The faulting instruction is fetched (unless its bytes are already in
  * 'vie'), decoded, and 'nextrip' is advanced past it.  Accesses that fall
  * within the local APIC, I/O APIC or HPET ranges are emulated here; any
  * other address, or a decode failure, sets '*retu' so that the exit is
  * completed in userspace.
  *
  * Returns 0, or the error from instruction fetch or emulation.
  */
 static int
 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vie *vie;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
 	uint64_t gla, gpa, cs_base;
 	struct vm_guest_paging *paging;
 	mem_region_read_t mread;
 	mem_region_write_t mwrite;
 	enum vm_cpu_mode cpu_mode;
 	int cs_d, error, fault;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
 	    __func__, vme->inst_length));
 
 	gla = vme->u.inst_emul.gla;
 	gpa = vme->u.inst_emul.gpa;
 	cs_base = vme->u.inst_emul.cs_base;
 	cs_d = vme->u.inst_emul.cs_d;
 	vie = &vme->u.inst_emul.vie;
 	paging = &vme->u.inst_emul.paging;
 	cpu_mode = paging->cpu_mode;
 
 	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
 
 	/* Fetch, decode and emulate the faulting instruction */
 	if (vie->num_valid == 0) {
 		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
 		    cs_base, VIE_INST_SIZE, vie, &fault);
 	} else {
 		/*
 		 * The instruction bytes have already been copied into 'vie'
 		 */
 		error = fault = 0;
 	}
 	/* With 'fault' set and no error this returns 0 without emulating. */
 	if (error || fault)
 		return (error);
 
 	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
 		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
 		    vme->rip + cs_base);
 		*retu = true;	    /* dump instruction bytes in userspace */
 		return (0);
 	}
 
 	/*
 	 * Update 'nextrip' based on the length of the emulated instruction.
 	 */
 	vme->inst_length = vie->num_processed;
 	vcpu->nextrip += vie->num_processed;
 	VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
 	    "decoding", vcpu->nextrip);
 
 	/* return to userland unless this is an in-kernel emulated device */
 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
 		mread = lapic_mmio_read;
 		mwrite = lapic_mmio_write;
 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
 		mread = vioapic_mmio_read;
 		mwrite = vioapic_mmio_write;
 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
 		mread = vhpet_mmio_read;
 		mwrite = vhpet_mmio_write;
 	} else {
 		*retu = true;
 		return (0);
 	}
 
 	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
 	    mread, mwrite, retu);
 
 	return (error);
 }
 
 /*
  * Handle a suspend exit for vcpu 'vcpuid'.
  *
  * The vcpu is added to 'vm->suspended_cpus' and then waits until that set
  * equals 'vm->active_cpus', servicing any rendezvous that is requested in
  * the meantime.  Afterwards every suspended vcpu is notified and '*retu'
  * is set so the exit is reported to userspace.
  *
  * Returns 0, or the error that ended the wait (from thread_check_susp()
  * or vm_handle_rendezvous()).
  */
 static int
 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
 {
 	int error, i;
 	struct vcpu *vcpu;
 	struct thread *td;
 
 	error = 0;
 	vcpu = &vm->vcpu[vcpuid];
 	td = curthread;
 
 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
 
 	/*
 	 * Wait until all 'active_cpus' have suspended themselves.
 	 *
 	 * Since a VM may be suspended at any time including when one or
 	 * more vcpus are doing a rendezvous we need to call the rendezvous
 	 * handler while we are waiting to prevent a deadlock.
 	 */
 	vcpu_lock(vcpu);
 	while (error == 0) {
 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
 			break;
 		}
 
 		if (vm->rendezvous_func == NULL) {
 			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
 			vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
 			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
 			vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
 			if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
 				/* The vcpu lock is dropped around the check. */
 				vcpu_unlock(vcpu);
 				error = thread_check_susp(td, false);
 				vcpu_lock(vcpu);
 			}
 		} else {
 			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
 			/* The rendezvous handler runs without the vcpu lock. */
 			vcpu_unlock(vcpu);
 			error = vm_handle_rendezvous(vm, vcpuid);
 			vcpu_lock(vcpu);
 		}
 	}
 	vcpu_unlock(vcpu);
 
 	/*
 	 * Wakeup the other sleeping vcpus and return to userspace.
 	 */
 	for (i = 0; i < vm->maxcpus; i++) {
 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
 			vcpu_notify_event(vm, i, false);
 		}
 	}
 
 	*retu = true;
 	return (error);
 }
 
 /*
  * Acknowledge a pending "request idle" on vcpu 'vcpuid': clear the
  * vcpu's 'reqidle' flag under the vcpu lock and set '*retu' so the exit
  * is reported to userspace.  Always returns 0.
  */
 static int
 vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vcpu *vcpu;
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
 	vcpu->reqidle = 0;
 	vcpu_unlock(vcpu);
 
 	*retu = true;
 	return (0);
 }
 
 /*
  * Request suspension of the virtual machine for reason 'how'.
  *
  * Only the first request takes effect: 'vm->suspend' is changed from 0 to
  * 'how' atomically and all active vcpus are then notified.
  *
  * Returns EINVAL if 'how' is not strictly between VM_SUSPEND_NONE and
  * VM_SUSPEND_LAST, EALREADY if the VM was already suspended, and 0
  * otherwise.
  */
 int
 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 {
 	int vcpuid;
 
 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
 		return (EINVAL);
 
 	if (!atomic_cmpset_int(&vm->suspend, 0, how)) {
 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
 		    vm->suspend, how);
 		return (EALREADY);
 	}
 
 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
 
 	/*
 	 * Notify all active vcpus that they are now suspended.
 	 */
 	for (vcpuid = 0; vcpuid < vm->maxcpus; vcpuid++) {
 		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 			continue;
 		vcpu_notify_event(vm, vcpuid, false);
 	}
 
 	return (0);
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' for a
  * VM_EXITCODE_SUSPENDED exit at 'rip', recording the current suspend
  * reason.  The VM must already be suspended.
  */
 void
 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_SUSPENDED;
 	vme->u.suspended.how = vm->suspend;
 	vme->rip = rip;
 	vme->inst_length = 0;
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' for a VM_EXITCODE_DEBUG
  * exit at 'rip'.
  */
 void
 vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_DEBUG;
 	vme->rip = rip;
 	vme->inst_length = 0;
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' for a
  * VM_EXITCODE_RENDEZVOUS exit at 'rip' and count the exit.  A rendezvous
  * must be in progress.
  */
 void
 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_RENDEZVOUS;
 	vme->rip = rip;
 	vme->inst_length = 0;
 
 	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' for a
  * VM_EXITCODE_REQIDLE exit at 'rip' and count the exit.
  */
 void
 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_REQIDLE;
 	vme->rip = rip;
 	vme->inst_length = 0;
 
 	vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
 }
 
 /*
  * Fill in the exit information of vcpu 'vcpuid' for an exit at 'rip'
  * caused by a pending AST.  The exit is reported with
  * VM_EXITCODE_BOGUS and counted under VMEXIT_ASTPENDING.
  */
 void
 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_BOGUS;
 	vme->rip = rip;
 	vme->inst_length = 0;
 
 	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
 }
 
 /*
  * Run vcpu 'vmrun->cpuid' until an exit has to be completed in userspace
  * or an error occurs.
  *
  * Exits that the switch below handles in the kernel without setting
  * 'retu' cause the guest to be resumed.  Before returning, the vcpu's
  * exit information is copied to 'vmrun->vm_exit'.
  *
  * Returns EINVAL if the vcpu id is out of range, the vcpu is not active,
  * or the vcpu is already suspended; otherwise 0 or the error from
  * vmmops_run() or from the exit handler.
  */
 int
 vm_run(struct vm *vm, struct vm_run *vmrun)
 {
 	struct vm_eventinfo evinfo;
 	int error, vcpuid;
 	struct vcpu *vcpu;
 	struct pcb *pcb;
 	uint64_t tscval;
 	struct vm_exit *vme;
 	bool retu, intr_disabled;
 	pmap_t pmap;
 
 	vcpuid = vmrun->cpuid;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 		return (EINVAL);
 
 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
 		return (EINVAL);
 
 	pmap = vmspace_pmap(vm->vmspace);
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 	/* Event sources handed to vmmops_run(). */
 	evinfo.rptr = &vm->rendezvous_func;
 	evinfo.sptr = &vm->suspend;
 	evinfo.iptr = &vcpu->reqidle;
 restart:
 	critical_enter();
 
 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 	    ("vm_run: absurd pm_active"));
 
 	tscval = rdtsc();
 
 	pcb = PCPU_GET(curpcb);
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 
 	restore_guest_fpustate(vcpu);
 
 	/* The vcpu is RUNNING only for the duration of vmmops_run(). */
 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
 	error = vmmops_run(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
 	save_guest_fpustate(vcpu);
 
 	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
 
 	critical_exit();
 
 	if (error == 0) {
 		retu = false;
 		/* Default resume point; some handlers adjust 'nextrip'. */
 		vcpu->nextrip = vme->rip + vme->inst_length;
 		switch (vme->exitcode) {
 		case VM_EXITCODE_REQIDLE:
 			error = vm_handle_reqidle(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_SUSPENDED:
 			error = vm_handle_suspend(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_IOAPIC_EOI:
 			vioapic_process_eoi(vm, vcpuid,
 			    vme->u.ioapic_eoi.vector);
 			break;
 		case VM_EXITCODE_RENDEZVOUS:
 			error = vm_handle_rendezvous(vm, vcpuid);
 			break;
 		case VM_EXITCODE_HLT:
 			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
 			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
 			break;
 		case VM_EXITCODE_PAGING:
 			error = vm_handle_paging(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INST_EMUL:
 			error = vm_handle_inst_emul(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INOUT:
 		case VM_EXITCODE_INOUT_STR:
 			error = vm_handle_inout(vm, vcpuid, vme, &retu);
 			break;
 		case VM_EXITCODE_MONITOR:
 		case VM_EXITCODE_MWAIT:
 		case VM_EXITCODE_VMINSN:
 			vm_inject_ud(vm, vcpuid);
 			break;
 		default:
 			retu = true;	/* handled in userland */
 			break;
 		}
 	}
 
 	/* Resume the guest unless userspace or an error needs attention. */
 	if (error == 0 && retu == false)
 		goto restart;
 
 	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
 
 	/* copy the exit information */
 	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
 	return (error);
 }
 
 /*
  * Arrange for vcpu 'vcpuid' of the VM passed in 'arg' to re-execute the
  * current instruction instead of advancing past it.
  *
  * The vcpu must be in the VCPU_RUNNING or VCPU_FROZEN state; any other
  * state panics.  Returns EINVAL if 'vcpuid' is out of range, otherwise 0.
  */
 int
 vm_restart_instruction(void *arg, int vcpuid)
 {
 	struct vm *vm = arg;
 	struct vcpu *vcpu;
 	enum vcpu_state state;
 	uint64_t rip;
 	int error;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 	state = vcpu_get_state(vm, vcpuid, NULL);
 	switch (state) {
 	case VCPU_RUNNING:
 		/*
 		 * When a vcpu is "running" the next instruction is determined
 		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
 		 * Thus setting 'inst_length' to zero will cause the current
 		 * instruction to be restarted.
 		 */
 		vcpu->exitinfo.inst_length = 0;
 		VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
 		    "setting inst_length to zero", vcpu->exitinfo.rip);
 		break;
 	case VCPU_FROZEN:
 		/*
 		 * When a vcpu is "frozen" it is outside the critical section
 		 * around vmmops_run() and 'nextrip' points to the next
 		 * instruction. Thus instruction restart is achieved by setting
 		 * 'nextrip' to the vcpu's %rip.
 		 */
 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
 		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
 		VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
 		    "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
 		vcpu->nextrip = rip;
 		break;
 	default:
 		panic("%s: invalid state %d", __func__, state);
 	}
 	return (0);
 }
 
 /*
  * Record 'info' as the event that was being delivered when vcpu 'vcpuid'
  * exited.  An 'info' without VM_INTINFO_VALID is stored as 0.
  *
  * Returns EINVAL if 'vcpuid' is out of range or if a valid 'info' is
  * malformed: an NMI whose vector is not IDT_NMI, a hardware exception
  * with a vector of 32 or above, or any reserved bit set.  Returns 0
  * otherwise.
  */
 int
 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
 {
 	int type, vector;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if ((info & VM_INTINFO_VALID) == 0) {
 		info = 0;
 	} else {
 		type = info & VM_INTINFO_TYPE;
 		vector = info & 0xff;
 		if ((type == VM_INTINFO_NMI && vector != IDT_NMI) ||
 		    (type == VM_INTINFO_HWEXCEPTION && vector >= 32) ||
 		    (info & VM_INTINFO_RSVD) != 0)
 			return (EINVAL);
 	}
 
 	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
 	vm->vcpu[vcpuid].exitintinfo = info;
 	return (0);
 }
 
 /*
  * Exception classes used by nested_fault() to decide whether two
  * back-to-back events combine into a double fault.
  */
 enum exc_class {
 	EXC_BENIGN,
 	EXC_CONTRIBUTORY,
 	EXC_PAGEFAULT
 };
 
 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
 
 /*
  * Classify the event described by the valid interrupt information 'info'
  * as benign, contributory or page-fault class.
  */
 static enum exc_class
 exception_class(uint64_t info)
 {
 	int type, vector;
 
 	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
 	type = info & VM_INTINFO_TYPE;
 	vector = info & 0xff;
 
 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
 	if (type == VM_INTINFO_HWINTR || type == VM_INTINFO_SWINTR ||
 	    type == VM_INTINFO_NMI)
 		return (EXC_BENIGN);
 
 	/*
 	 * Hardware exception.
 	 *
 	 * SVM and VT-x use identical type values to represent NMI,
 	 * hardware interrupt and software interrupt.
 	 *
 	 * SVM uses type '3' for all exceptions. VT-x uses type '3'
 	 * for exceptions except #BP and #OF. #BP and #OF use a type
 	 * value of '5' or '6'. Therefore we don't check for explicit
 	 * values of 'type' to classify 'intinfo' into a hardware
 	 * exception.
 	 */
 	if (vector == IDT_PF || vector == IDT_VE)
 		return (EXC_PAGEFAULT);
 
 	if (vector == IDT_DE || vector == IDT_TS || vector == IDT_NP ||
 	    vector == IDT_SS || vector == IDT_GP)
 		return (EXC_CONTRIBUTORY);
 
 	return (EXC_BENIGN);
 }
 
 /*
  * Combine two valid events, where 'info2' was raised while 'info1' was
  * being delivered, into the single event to inject next.
  *
  * If 'info1' is itself a double fault the VM is suspended with
  * VM_SUSPEND_TRIPLEFAULT, '*retinfo' is set to 0 and 0 is returned.
  * Otherwise '*retinfo' is set either to a double fault or to 'info2',
  * and 1 is returned.
  */
 static int
 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
     uint64_t *retinfo)
 {
 	enum exc_class first, second;
 	uint64_t df;
 	int type1, vector1;
 
 	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
 	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
 
 	/*
 	 * If an exception occurs while attempting to call the double-fault
 	 * handler the processor enters shutdown mode (aka triple fault).
 	 */
 	type1 = info1 & VM_INTINFO_TYPE;
 	vector1 = info1 & 0xff;
 	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
 		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
 		    info1, info2);
 		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
 		*retinfo = 0;
 		return (0);
 	}
 
 	/*
 	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
 	 */
 	first = exception_class(info1);
 	second = exception_class(info2);
 	if ((first == EXC_CONTRIBUTORY && second == EXC_CONTRIBUTORY) ||
 	    (first == EXC_PAGEFAULT && second != EXC_BENIGN)) {
 		/* Convert nested fault into a double fault. */
 		df = IDT_DF;
 		df |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 		df |= VM_INTINFO_DEL_ERRCODE;
 		*retinfo = df;
 		return (1);
 	}
 
 	/* Handle exceptions serially */
 	*retinfo = info2;
 	return (1);
 }
 
 /*
  * Encode the vcpu's pending exception as interrupt information: the
  * vector in the low 8 bits, flagged valid and as a hardware exception,
  * plus the error code in the upper 32 bits when one is to be delivered.
  * Returns 0 if no exception is pending.
  */
 static uint64_t
 vcpu_exception_intinfo(struct vcpu *vcpu)
 {
 	uint64_t info;
 
 	if (!vcpu->exception_pending)
 		return (0);
 
 	info = vcpu->exc_vector & 0xff;
 	info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 	if (vcpu->exc_errcode_valid) {
 		info |= VM_INTINFO_DEL_ERRCODE;
 		info |= (uint64_t)vcpu->exc_errcode << 32;
 	}
 	return (info);
 }
 
 /*
  * Determine the event to inject into vcpu 'vcpuid' on the next entry.
  *
  * Both the saved exit interrupt information and any pending exception
  * are consumed.  If both are valid they are merged by nested_fault();
  * if only one is valid it is used as is.
  *
  * Returns non-zero with '*retinfo' set when there is an event to inject
  * and 0 when there is none.
  */
 int
 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
 {
 	struct vcpu *vcpu;
 	uint64_t info1, info2;
 	bool have1, have2;
 	int valid;
 
 	KASSERT(vcpuid >= 0 &&
 	    vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	/* Event that was in flight when the vcpu last exited. */
 	info1 = vcpu->exitintinfo;
 	vcpu->exitintinfo = 0;
 
 	/* Exception queued for injection, if any. */
 	info2 = 0;
 	if (vcpu->exception_pending) {
 		info2 = vcpu_exception_intinfo(vcpu);
 		vcpu->exception_pending = 0;
 		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
 		    vcpu->exc_vector, info2);
 	}
 
 	have1 = (info1 & VM_INTINFO_VALID) != 0;
 	have2 = (info2 & VM_INTINFO_VALID) != 0;
 	if (have1 && have2) {
 		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
 	} else if (have1 || have2) {
 		*retinfo = have1 ? info1 : info2;
 		valid = 1;
 	} else {
 		valid = 0;
 	}
 
 	if (valid) {
 		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
 		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
 	}
 
 	return (valid);
 }
 
 /*
  * Report, without consuming them, the saved exit interrupt information
  * ('*info1') and the encoded pending exception ('*info2') of vcpu
  * 'vcpuid'.
  *
  * Returns EINVAL if 'vcpuid' is out of range, otherwise 0.
  */
 int
 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
 {
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	*info1 = vm->vcpu[vcpuid].exitintinfo;
 	*info2 = vcpu_exception_intinfo(&vm->vcpu[vcpuid]);
 	return (0);
 }
 
 /*
  * Queue exception 'vector' for injection into vcpu 'vcpuid'.
  *
  * 'errcode' is delivered only if 'errcode_valid' is set and the guest has
  * CR0.PE set.  The guest's interrupt shadow is cleared and, if
  * 'restart_instruction' is set, the current instruction is restarted.
  *
  * Returns EINVAL if 'vcpuid' is out of range, 'vector' is outside
  * [0, 32) or is IDT_DF; EBUSY if an exception is already pending; and 0
  * on success.
  */
 int
 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
     uint32_t errcode, int restart_instruction)
 {
 	struct vcpu *vcpu;
 	uint64_t cr0;
 	int error;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	/*
 	 * A double fault exception should never be injected directly into
 	 * the guest. It is a derived exception that results from specific
 	 * combinations of nested faults.
 	 */
 	if (vector < 0 || vector >= 32 || vector == IDT_DF)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->exception_pending) {
 		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
 		    "pending exception %d", vector, vcpu->exc_vector);
 		return (EBUSY);
 	}
 
 	if (errcode_valid) {
 		/*
 		 * Exceptions don't deliver an error code in real mode.
 		 */
 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
 		KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
 		if ((cr0 & CR0_PE) == 0)
 			errcode_valid = 0;
 	}
 
 	/*
 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
 	 *
 	 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
 	 * one instruction or incurs an exception.
 	 */
 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
 	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
 	    __func__, error));
 
 	if (restart_instruction)
 		vm_restart_instruction(vm, vcpuid);
 
 	vcpu->exc_vector = vector;
 	vcpu->exc_errcode = errcode;
 	vcpu->exc_errcode_valid = errcode_valid;
 	vcpu->exception_pending = 1;
 	VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
 	return (0);
 }
 
 /*
  * Inject exception 'vector' into 'vcpuid' and request that the current
  * instruction be restarted.  'vmarg' is the 'struct vm *' passed as an opaque
  * pointer.  A failure of vm_inject_exception() trips a KASSERT.
  */
 void
 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
     int errcode)
 {
 	struct vm *vm = vmarg;
 	int rc;
 
 	/* The trailing 1 is the 'restart_instruction' argument. */
 	rc = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
 	    errcode, 1);
 	KASSERT(rc == 0, ("vm_inject_exception error %d", rc));
 }
 
 /*
  * Inject a page fault into 'vcpuid': the guest's CR2 is set to 'cr2' and
  * IDT_PF is then injected with 'error_code' as a valid error code.
  * 'vmarg' is the 'struct vm *' passed as an opaque pointer.
  */
 void
 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
 {
 	struct vm *vm = vmarg;
 	int rc;
 
 	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
 	    error_code, cr2);
 
 	/* CR2 must be updated before the fault is queued. */
 	rc = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
 	KASSERT(rc == 0, ("vm_set_register(cr2) error %d", rc));
 
 	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
 }
 
 /*
  * Per-vcpu statistic; vm_nmi_clear() adds 1 to it each time it clears a
  * pending NMI.
  */
 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
 
 /*
  * Mark an NMI as pending on 'vcpuid' and notify the vcpu.
  *
  * Returns EINVAL if 'vcpuid' is out of range, 0 otherwise.
  */
 int
 vm_inject_nmi(struct vm *vm, int vcpuid)
 {
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].nmi_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 	return (0);
 }
 
 /*
  * Return the 'nmi_pending' flag of 'vcpuid'.  Panics if 'vcpuid' is out of
  * range.
  */
 int
 vm_nmi_pending(struct vm *vm, int vcpuid)
 {
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 
 	return (vm->vcpu[vcpuid].nmi_pending);
 }
 
 /*
  * Clear the pending NMI of 'vcpuid' and account for it in VCPU_NMI_COUNT.
  *
  * Panics if 'vcpuid' is out of range or if no NMI is pending.
  */
 void
 vm_nmi_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->nmi_pending == 0)
 		panic("vm_nmi_clear: inconsistent nmi_pending state");
 
 	vcpu->nmi_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
 }
 
 /*
  * Per-vcpu statistic; vm_extint_clear() adds 1 to it each time it clears a
  * pending ExtINT.
  */
 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
 
 /*
  * Mark an ExtINT as pending on 'vcpuid' and notify the vcpu.
  *
  * Returns EINVAL if 'vcpuid' is out of range, 0 otherwise.
  */
 int
 vm_inject_extint(struct vm *vm, int vcpuid)
 {
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].extint_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 	return (0);
 }
 
 /*
  * Return the 'extint_pending' flag of 'vcpuid'.  Panics if 'vcpuid' is out
  * of range.
  */
 int
 vm_extint_pending(struct vm *vm, int vcpuid)
 {
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
 
 	return (vm->vcpu[vcpuid].extint_pending);
 }
 
 /*
  * Clear the pending ExtINT of 'vcpuid' and account for it in
  * VCPU_EXTINT_COUNT.
  *
  * Panics if 'vcpuid' is out of range or if no ExtINT is pending.
  */
 void
 vm_extint_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->extint_pending == 0)
 		panic("vm_extint_clear: inconsistent extint_pending state");
 
 	vcpu->extint_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
 }
 
 /*
  * Query capability 'type' of 'vcpu' through vmmops_getcap().
  *
  * Returns EINVAL if 'vcpu' or 'type' is out of range, otherwise whatever
  * vmmops_getcap() returns.
  */
 int
 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
 {
 	if (vcpu < 0 || vcpu >= vm->maxcpus ||
 	    type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (vmmops_getcap(vm->cookie, vcpu, type, retval));
 }
 
 /*
  * Set capability 'type' of 'vcpu' to 'val' through vmmops_setcap().
  *
  * Returns EINVAL if 'vcpu' or 'type' is out of range, otherwise whatever
  * vmmops_setcap() returns.
  */
 int
 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
 {
 	if (vcpu < 0 || vcpu >= vm->maxcpus ||
 	    type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (vmmops_setcap(vm->cookie, vcpu, type, val));
 }
 
 struct vlapic *
 vm_lapic(struct vm *vm, int cpu)
 {
 	return (vm->vcpu[cpu].vlapic);
 }
 
 /*
  * Return the vioapic pointer stored in 'vm'.
  */
 struct vioapic *
 vm_ioapic(struct vm *vm)
 {
 	struct vioapic *ioapic = vm->vioapic;
 
 	return (ioapic);
 }
 
 /*
  * Return the vhpet pointer stored in 'vm'.
  */
 struct vhpet *
 vm_hpet(struct vm *vm)
 {
 	struct vhpet *hpet = vm->vhpet;
 
 	return (hpet);
 }
 
 /*
  * Return true if 'bus'/'slot'/'func' is listed in one of the kernel
  * environment variables "pptdevs", "pptdevs2" or "pptdevs3".
  *
  * Each variable holds space separated "bus/slot/func" triples, e.g.
  * pptdevs="1/2/3 4/5/6 7/8/9 10/11/12".
  */
 bool
 vmm_is_pptdev(int bus, int slot, int func)
 {
 	int b, f, i, n, s;
 	char *val, *cp, *cp2;
 	bool found;
 
 	/*
 	 * XXX
 	 * The length of an environment variable is limited to 128 bytes which
 	 * puts an upper limit on the number of passthru devices that may be
 	 * specified using a single environment variable.
 	 *
 	 * Work around this by scanning multiple environment variable
 	 * names instead of a single one - yuck!
 	 */
 	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
 
 	found = false;
 	for (i = 0; !found && names[i] != NULL; i++) {
 		val = kern_getenv(names[i]);
 		for (cp = val; cp != NULL && *cp != '\0'; cp = cp2) {
 			/* Temporarily terminate the current triple. */
 			cp2 = strchr(cp, ' ');
 			if (cp2 != NULL)
 				*cp2 = '\0';
 
 			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
 			if (n == 3 && bus == b && slot == s && func == f) {
 				found = true;
 				break;
 			}
 
 			/* Put the separator back and step past it. */
 			if (cp2 != NULL) {
 				*cp2 = ' ';
 				cp2++;
 			}
 		}
 		freeenv(val);
 	}
 	return (found);
 }
 
 /*
  * Return the opaque 'iommu' pointer stored in 'vm'.
  */
 void *
 vm_iommu_domain(struct vm *vm)
 {
 	void *domain = vm->iommu;
 
 	return (domain);
 }
 
 /*
  * Change the state of 'vcpuid' to 'newstate' by calling
  * vcpu_set_state_locked() with the vcpu lock held.  'from_idle' is passed
  * through unchanged.
  *
  * Panics if 'vcpuid' is out of range; otherwise returns the result of
  * vcpu_set_state_locked().
  */
 int
 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
     bool from_idle)
 {
 	int error;
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
 	vcpu_unlock(vcpu);
 
 	return (error);
 }
 
 /*
  * Return the current state of 'vcpuid'.  If 'hostcpu' is not NULL the
  * vcpu's 'hostcpu' field is stored in '*hostcpu'.  Both values are read
  * with the vcpu lock held.
  *
  * Panics if 'vcpuid' is out of range.
  */
 enum vcpu_state
 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
 {
 	struct vcpu *vcpu;
 	enum vcpu_state state;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	state = vcpu->state;
 	if (hostcpu != NULL)
 		*hostcpu = vcpu->hostcpu;
 	vcpu_unlock(vcpu);
 
 	return (state);
 }
 
 /*
  * Add 'vcpuid' to the set of active cpus.
  *
  * Returns EINVAL if 'vcpuid' is out of range, EBUSY if it is already
  * active, and 0 on success.
  */
 int
 vm_activate_cpu(struct vm *vm, int vcpuid)
 {
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if (!CPU_ISSET(vcpuid, &vm->active_cpus)) {
 		VCPU_CTR0(vm, vcpuid, "activated");
 		CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
 		return (0);
 	}
 	return (EBUSY);
 }
 
 /*
  * Add 'vcpuid' to the set of debug cpus and notify it.  A 'vcpuid' of -1
  * selects every active vcpu.
  *
  * Returns EINVAL if 'vcpuid' is out of range or names a vcpu that is not
  * active, and 0 otherwise.
  */
 int
 vm_suspend_cpu(struct vm *vm, int vcpuid)
 {
 	int cpu;
 
 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if (vcpuid != -1) {
 		/* Single vcpu. */
 		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 			return (EINVAL);
 
 		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
 		vcpu_notify_event(vm, vcpuid, false);
 		return (0);
 	}
 
 	/* All active vcpus. */
 	vm->debug_cpus = vm->active_cpus;
 	for (cpu = 0; cpu < vm->maxcpus; cpu++) {
 		if (!CPU_ISSET(cpu, &vm->active_cpus))
 			continue;
 		vcpu_notify_event(vm, cpu, false);
 	}
 	return (0);
 }
 
 /*
  * Remove 'vcpuid' from the set of debug cpus.  A 'vcpuid' of -1 empties
  * the whole set.
  *
  * Returns EINVAL if 'vcpuid' is out of range or names a vcpu that is not in
  * the debug set, and 0 otherwise.
  */
 int
 vm_resume_cpu(struct vm *vm, int vcpuid)
 {
 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
 		return (EINVAL);
 
 	if (vcpuid == -1) {
 		CPU_ZERO(&vm->debug_cpus);
 		return (0);
 	}
 
 	if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
 		return (EINVAL);
 
 	CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
 	return (0);
 }
 
 /*
  * Return non-zero if 'vcpuid' is a member of the debug cpu set.  'vcpuid'
  * is not range checked here.
  */
 int
 vcpu_debugged(struct vm *vm, int vcpuid)
 {
 	int debugged;
 
 	debugged = CPU_ISSET(vcpuid, &vm->debug_cpus);
 	return (debugged);
 }
 
 /*
  * Return a copy of the set of active vcpus.
  */
 cpuset_t
 vm_active_cpus(struct vm *vm)
 {
 	cpuset_t cpus = vm->active_cpus;
 
 	return (cpus);
 }
 
 /*
  * Return a copy of the set of vcpus in the debug set.
  */
 cpuset_t
 vm_debug_cpus(struct vm *vm)
 {
 	cpuset_t cpus = vm->debug_cpus;
 
 	return (cpus);
 }
 
 /*
  * Return a copy of the set of suspended vcpus.
  */
 cpuset_t
 vm_suspended_cpus(struct vm *vm)
 {
 	cpuset_t cpus = vm->suspended_cpus;
 
 	return (cpus);
 }
 
 void *
 vcpu_stats(struct vm *vm, int vcpuid)
 {
 
 	return (vm->vcpu[vcpuid].stats);
 }
 
 /*
  * Store the x2apic state of 'vcpuid' in '*state'.
  *
  * Returns EINVAL if 'vcpuid' is out of range, 0 otherwise.
  */
 int
 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
 {
 	if (vcpuid >= 0 && vcpuid < vm->maxcpus) {
 		*state = vm->vcpu[vcpuid].x2apic_state;
 		return (0);
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Record 'state' as the x2apic state of 'vcpuid' and pass it on to
  * vlapic_set_x2apic_state().
  *
  * Returns EINVAL if 'vcpuid' is out of range or 'state' is not below
  * X2APIC_STATE_LAST, and 0 otherwise.
  */
 int
 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
 {
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus ||
 	    state >= X2APIC_STATE_LAST)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].x2apic_state = state;
 	vlapic_set_x2apic_state(vm, vcpuid, state);
 
 	return (0);
 }
 
 /*
  * Make sure that 'vcpu' notices a pending event as soon as possible.  The
  * vcpu lock is expected to be held by the caller (see vcpu_notify_event()).
  *
  * - A sleeping vcpu thread is woken up.
  * - A vcpu running on a host cpu other than 'curcpu' is interrupted, either
  *   through vlapic_post_intr() when 'lapic_intr' is true or with a plain
  *   IPI otherwise, so that it traps into the hypervisor.
  * - A vcpu running on 'curcpu' needs no action.
  */
 static void
 vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
 {
 	int hostcpu;
 
 	hostcpu = vcpu->hostcpu;
 
 	if (vcpu->state != VCPU_RUNNING) {
 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
 		    "with hostcpu %d", vcpu->state, hostcpu));
 		if (vcpu->state == VCPU_SLEEPING)
 			wakeup_one(vcpu);
 		return;
 	}
 
 	KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
 
 	/*
 	 * If the 'vcpu' is running on 'curcpu' then it must be sending a
 	 * notification to itself (e.g. SELF_IPI).  The pending event will
 	 * be picked up when the vcpu transitions back to guest context.
 	 */
 	if (hostcpu == curcpu)
 		return;
 
 	if (lapic_intr)
 		vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum);
 	else
 		ipi_cpu(hostcpu, vmm_ipinum);
 }
 
 /*
  * Locked wrapper around vcpu_notify_event_locked() for 'vcpuid'.
  * 'vcpuid' is not range checked here.
  */
 void
 vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
 {
 	struct vcpu *target;
 
 	target = &vm->vcpu[vcpuid];
 
 	vcpu_lock(target);
 	vcpu_notify_event_locked(target, lapic_intr);
 	vcpu_unlock(target);
 }
 
 /*
  * Return the vmspace pointer stored in 'vm'.
  */
 struct vmspace *
 vm_get_vmspace(struct vm *vm)
 {
 	struct vmspace *space = vm->vmspace;
 
 	return (space);
 }
 
 /*
  * Translate an APIC id into a vcpu id.  'vm' is not consulted.
  */
 int
 vm_apicid2vcpuid(struct vm *vm, int apicid)
 {
 	int vcpuid;
 
 	/*
 	 * XXX apic id is assumed to be numerically identical to vcpu id
 	 */
 	vcpuid = apicid;
 	return (vcpuid);
 }
 
 /*
  * Start a rendezvous in which 'func' is called with 'arg' for the vcpus in
  * 'dest', then take part in it on behalf of 'vcpuid' (-1 is accepted as
  * 'vcpuid').
  *
  * If another rendezvous is already in progress it is serviced through
  * vm_handle_rendezvous() first and the attempt is retried.  Must be called
  * without any locks held.
  *
  * Returns the first non-zero result of vm_handle_rendezvous(), or the result
  * of the final vm_handle_rendezvous() call.
  */
 int
 vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg)
 {
 	int cpu, error;
 
 	/*
 	 * Enforce that this function is called without any locks
 	 */
 	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
 	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
 
 	/* This loop is left with 'rendezvous_mtx' held. */
 	for (;;) {
 		mtx_lock(&vm->rendezvous_mtx);
 		if (vm->rendezvous_func == NULL)
 			break;
 
 		/*
 		 * If a rendezvous is already in progress then we need to
 		 * call the rendezvous handler in case this 'vcpuid' is one
 		 * of the targets of the rendezvous.
 		 */
 		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
 		mtx_unlock(&vm->rendezvous_mtx);
 		error = vm_handle_rendezvous(vm, vcpuid);
 		if (error != 0)
 			return (error);
 	}
 	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
 	    "rendezvous is still in progress"));
 
 	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
 	vm->rendezvous_req_cpus = dest;
 	CPU_ZERO(&vm->rendezvous_done_cpus);
 	vm->rendezvous_arg = arg;
 	vm->rendezvous_func = func;
 	mtx_unlock(&vm->rendezvous_mtx);
 
 	/*
 	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
 	 * vcpus so they handle the rendezvous as soon as possible.
 	 */
 	for (cpu = 0; cpu < vm->maxcpus; cpu++) {
 		if (!CPU_ISSET(cpu, &dest))
 			continue;
 		vcpu_notify_event(vm, cpu, false);
 	}
 
 	return (vm_handle_rendezvous(vm, vcpuid));
 }
 
 /*
  * Return the vatpic pointer stored in 'vm'.
  */
 struct vatpic *
 vm_atpic(struct vm *vm)
 {
 	struct vatpic *atpic = vm->vatpic;
 
 	return (atpic);
 }
 
 /*
  * Return the vatpit pointer stored in 'vm'.
  */
 struct vatpit *
 vm_atpit(struct vm *vm)
 {
 	struct vatpit *atpit = vm->vatpit;
 
 	return (atpit);
 }
 
 /*
  * Return the vpmtmr pointer stored in 'vm'.
  */
 struct vpmtmr *
 vm_pmtmr(struct vm *vm)
 {
 	struct vpmtmr *pmtmr = vm->vpmtmr;
 
 	return (pmtmr);
 }
 
 /*
  * Return the vrtc pointer stored in 'vm'.
  */
 struct vrtc *
 vm_rtc(struct vm *vm)
 {
 	struct vrtc *rtc = vm->vrtc;
 
 	return (rtc);
 }
 
 /*
  * Map segment encoding 'seg' (0 through 5) to the matching guest segment
  * register name.  An out-of-range 'seg' trips a KASSERT.
  */
 enum vm_reg_name
 vm_segment_name(int seg)
 {
 	/* Indexed by segment encoding: ES, CS, SS, DS, FS, GS. */
 	static enum vm_reg_name encoding_to_reg[] = {
 		VM_REG_GUEST_ES,
 		VM_REG_GUEST_CS,
 		VM_REG_GUEST_SS,
 		VM_REG_GUEST_DS,
 		VM_REG_GUEST_FS,
 		VM_REG_GUEST_GS
 	};
 
 	KASSERT(seg >= 0 && seg < nitems(encoding_to_reg),
 	    ("%s: invalid segment encoding %d", __func__, seg));
 	return (encoding_to_reg[seg]);
 }
 
 /*
  * Undo vm_copy_setup(): call vm_gpa_release() on every non-NULL cookie in
  * 'copyinfo' and then zero all 'num_copyinfo' entries.
  */
 void
 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     int num_copyinfo)
 {
 	int i;
 
 	for (i = 0; i < num_copyinfo; i++) {
 		if (copyinfo[i].cookie == NULL)
 			continue;
 		vm_gpa_release(copyinfo[i].cookie);
 	}
 	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
 }
 
 /*
  * Prepare 'copyinfo' for copying 'len' bytes at guest linear address 'gla'.
  *
  * The range is first split at page boundaries and every piece translated
  * with vm_gla2gpa(); then every piece is held with vm_gpa_hold() and its
  * host address and cookie recorded.
  *
  * Returns:
  *   0 with '*fault' set to 0 on success,
  *   the vm_gla2gpa() result (possibly 0) as soon as a translation fails or
  *   sets '*fault',
  *   EFAULT, after vm_copy_teardown(), if a piece cannot be held.
  */
 int
 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
     int num_copyinfo, int *fault)
 {
 	uint64_t gpa;
 	size_t chunk, left, pgoff;
 	void *cookie, *hva;
 	int error, i, nused;
 
 	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
 
 	/* Pass 1: translate, never crossing a page boundary in one entry. */
 	nused = 0;
 	for (left = len; left > 0; left -= chunk) {
 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
 		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
 		if (error || *fault)
 			return (error);
 		pgoff = gpa & PAGE_MASK;
 		chunk = min(left, PAGE_SIZE - pgoff);
 		copyinfo[nused].gpa = gpa;
 		copyinfo[nused].len = chunk;
 		gla += chunk;
 		nused++;
 	}
 
 	/* Pass 2: hold each piece; give everything back on failure. */
 	for (i = 0; i < nused; i++) {
 		hva = vm_gpa_hold(vm, vcpuid, copyinfo[i].gpa,
 		    copyinfo[i].len, prot, &cookie);
 		if (hva == NULL) {
 			vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
 			return (EFAULT);
 		}
 		copyinfo[i].hva = hva;
 		copyinfo[i].cookie = cookie;
 	}
 
 	*fault = 0;
 	return (0);
 }
 
 /*
  * Copy 'len' bytes from the regions described by 'copyinfo' into the
  * buffer at 'kaddr'.  Entries are consumed in order, each contributing its
  * full 'len', until 'len' bytes have been copied.
  */
 void
 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
     size_t len)
 {
 	struct vm_copyinfo *ci;
 	char *dst;
 
 	dst = kaddr;
 	for (ci = copyinfo; len > 0; ci++) {
 		bcopy(ci->hva, dst, ci->len);
 		len -= ci->len;
 		dst += ci->len;
 	}
 }
 
 /*
  * Copy 'len' bytes from the buffer at 'kaddr' into the regions described
  * by 'copyinfo'.  Entries are filled in order, each receiving its full
  * 'len', until 'len' bytes have been copied.
  */
 void
 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
     struct vm_copyinfo *copyinfo, size_t len)
 {
 	struct vm_copyinfo *ci;
 	const char *src;
 
 	src = kaddr;
 	for (ci = copyinfo; len > 0; ci++) {
 		bcopy(src, ci->hva, ci->len);
 		len -= ci->len;
 		src += ci->len;
 	}
 }
 
 /*
  * Return the amount of in-use and wired memory for the VM. Since
  * these are global stats, only report the values for vCPU 0.
  */
 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
 VMM_STAT_DECLARE(VMM_MEM_WIRED);
 
 /*
  * Set VMM_MEM_RESIDENT to the resident page count of the VM's vmspace
  * multiplied by PAGE_SIZE.  Does nothing unless 'vcpu' is 0.
  */
 static void
 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 	if (vcpu != 0)
 		return;
 
 	vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
 	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
 }
 
 /*
  * Set VMM_MEM_WIRED to the wired page count of the VM's pmap multiplied by
  * PAGE_SIZE.  Does nothing unless 'vcpu' is 0.
  */
 static void
 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 	if (vcpu != 0)
 		return;
 
 	vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
 	    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
 }
 
 /*
  * Tie each memory statistic to the function that computes it.
  * NOTE(review): presumably the function runs when the statistic is read;
  * confirm against the VMM_STAT_FUNC definition.
  */
 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
 
 #ifdef BHYVE_SNAPSHOT
 /*
  * Run the per-vcpu fields listed below through SNAPSHOT_VAR_OR_LEAVE() for
  * all VM_MAXCPU vcpus, in a fixed order.
  *
  * Returns the value left in 'ret' by the last SNAPSHOT_VAR_OR_LEAVE().
  * NOTE(review): the macro presumably jumps to 'done' with a non-zero 'ret'
  * on failure; confirm against its definition.
  */
 static int
 vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta)
 {
 	int ret;
 	int i;
 	struct vcpu *vcpu;
 
 	for (i = 0; i < VM_MAXCPU; i++) {
 		vcpu = &vm->vcpu[i];
 
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done);
 		/* XXX we're cheating here, since the value of tsc_offset as
 		 * saved here is actually the value of the guest's TSC value.
 		 *
 		 * It will be turned back into an actual offset when the
 		 * TSC restore function is called
 		 */
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->tsc_offset, meta, ret, done);
 	}
 
 done:
 	return (ret);
 }
 
 /*
  * Snapshot the per-vcpu state kept in 'struct vm'.
  *
  * For a VM_SNAPSHOT_SAVE operation every vcpu's 'tsc_offset' is temporarily
  * biased by the current host TSC so that the saved value is the TSC as seen
  * by the guest.  The bias is removed again before returning, whether or not
  * vm_snapshot_vcpus() succeeded, so a failed save cannot leave the running
  * guest with a corrupted TSC offset.
  *
  * Returns the result of vm_snapshot_vcpus().
  */
 static int
 vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta)
 {
 	int ret;
 	int i;
 	uint64_t now;
 
 	now = rdtsc();
 
 	if (meta->op == VM_SNAPSHOT_SAVE) {
 		/* XXX make tsc_offset take the value TSC proper as seen by the
 		 * guest
 		 */
 		for (i = 0; i < VM_MAXCPU; i++)
 			vm->vcpu[i].tsc_offset += now;
 	}
 
 	ret = vm_snapshot_vcpus(vm, meta);
 
 	if (meta->op == VM_SNAPSHOT_SAVE) {
 		/* XXX turn tsc_offset back into an offset; actual value is only
 		 * required for restore; using it otherwise would be wrong.
 		 * This must also happen when the snapshot failed.
 		 */
 		for (i = 0; i < VM_MAXCPU; i++)
 			vm->vcpu[i].tsc_offset -= now;
 	}
 
 	if (ret != 0)
 		printf("%s: failed to copy vm data to user buffer\n", __func__);
 
 	return (ret);
 }
 
 static int
 vm_snapshot_vmcx(struct vm *vm, struct vm_snapshot_meta *meta)
 {
 	int i, error;
 
 	error = 0;
 
 	for (i = 0; i < VM_MAXCPU; i++) {
 		error = vmmops_vmcx_snapshot(vm->cookie, meta, i);
 		if (error != 0) {
 			printf("%s: failed to snapshot vmcs/vmcb data for "
 			       "vCPU: %d; error: %d\n", __func__, i, error);
 			goto done;
 		}
 	}
 
 done:
 	return (error);
 }
 
 /*
  * Save kernel-side structures to user-space for snapshotting.
  *
  * Dispatches on 'meta->dev_req' to the matching snapshot routine and returns
  * its result.  An unknown request type is logged and yields EINVAL.
  */
 int
 vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta)
 {
 	switch (meta->dev_req) {
 	case STRUCT_VMX:
 		return (vmmops_snapshot(vm->cookie, meta));
 	case STRUCT_VMCX:
 		return (vm_snapshot_vmcx(vm, meta));
 	case STRUCT_VM:
 		return (vm_snapshot_vm(vm, meta));
 	case STRUCT_VIOAPIC:
 		return (vioapic_snapshot(vm_ioapic(vm), meta));
 	case STRUCT_VLAPIC:
 		return (vlapic_snapshot(vm, meta));
 	case STRUCT_VHPET:
 		return (vhpet_snapshot(vm_hpet(vm), meta));
 	case STRUCT_VATPIC:
 		return (vatpic_snapshot(vm_atpic(vm), meta));
 	case STRUCT_VATPIT:
 		return (vatpit_snapshot(vm_atpit(vm), meta));
 	case STRUCT_VPMTMR:
 		return (vpmtmr_snapshot(vm_pmtmr(vm), meta));
 	case STRUCT_VRTC:
 		return (vrtc_snapshot(vm_rtc(vm), meta));
 	default:
 		break;
 	}
 
 	printf("%s: failed to find the requested type %#x\n",
 	       __func__, meta->dev_req);
 	return (EINVAL);
 }
 
 /*
  * Store 'offset' as the 'tsc_offset' of 'vcpuid'.
  *
  * Returns EINVAL if 'vcpuid' is not in [0, VM_MAXCPU), 0 otherwise.
  */
 int
 vm_set_tsc_offset(struct vm *vm, int vcpuid, uint64_t offset)
 {
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].tsc_offset = offset;
 
 	return (0);
 }
 
 /*
  * Restore time-related state: calls vhpet_restore_time() and then, for
  * every element of 'vm->vcpu', vmmops_restore_tsc() with that vcpu's
  * 'tsc_offset' minus the host TSC sampled on entry.
  *
  * Returns the first non-zero error encountered, or 0.
  */
 int
 vm_restore_time(struct vm *vm)
 {
 	uint64_t now;
 	int cpu, rc;
 
 	/* Sample the host TSC once, before anything else. */
 	now = rdtsc();
 
 	rc = vhpet_restore_time(vm_hpet(vm));
 	for (cpu = 0; rc == 0 && cpu < nitems(vm->vcpu); cpu++) {
 		rc = vmmops_restore_tsc(vm->cookie, cpu,
 		    vm->vcpu[cpu].tsc_offset - now);
 	}
 
 	return (rc);
 }
 #endif
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index 4f5b544cfc65..f0c61228cf77 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -1,918 +1,923 @@
 .\" Copyright (c) 2013 Peter Grehan
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
 .Dd October 13, 2021
 .Dt BHYVE 8
 .Os
 .Sh NAME
 .Nm bhyve
 .Nd "run a guest operating system inside a virtual machine"
 .Sh SYNOPSIS
 .Nm
 .Op Fl AaCDeHhPSuWwxY
 .Oo
 .Sm off
 .Fl c\~
 .Oo
 .Op Cm cpus=
 .Ar numcpus
 .Oc
 .Op Cm ,sockets= Ar n
 .Op Cm ,cores= Ar n
 .Op Cm ,threads= Ar n
 .Oc
 .Sm on
 .Op Fl G Ar port
 .Op Fl k Ar config_file
 .Oo Fl l
 .Sm off
 .Ar lpcdev Op Cm \&, Ar conf
 .Sm on
 .Oc
 .Oo Fl m
 .Sm off
 .Ar memsize
 .Oo
 .Cm K | Cm k | Cm M | Cm m | Cm G | Cm g | Cm T | Cm t
 .Oc
 .Sm on
 .Oc
 .Op Fl o Ar var Ns Cm = Ns Ar value
 .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu
 .Op Fl r Ar file
 .Sm off
 .Oo Fl s\~
 .Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf
 .Sm on
 .Oc
 .Op Fl U Ar uuid
 .Ar vmname
 .Nm
 .Fl l Cm help
 .Nm
 .Fl s Cm help
 .Sh DESCRIPTION
 .Nm
 is a hypervisor that runs guest operating systems inside a
 virtual machine.
 .Pp
 Parameters such as the number of virtual CPUs, amount of guest memory, and
 I/O connectivity can be specified with command-line parameters.
 .Pp
 If not using a boot ROM, the guest operating system must be loaded with
 .Xr bhyveload 8
 or a similar boot loader before running
 .Nm ,
 otherwise, it is enough to run
 .Nm
 with a boot ROM of choice.
 .Pp
 .Nm
 runs until the guest operating system reboots or an unhandled hypervisor
 exit is detected.
 .Sh OPTIONS
 .Bl -tag -width 10n
 .It Fl A
 Generate ACPI tables.
 Required for
 .Fx Ns /amd64
 guests.
 .It Fl a
 The guest's local APIC is configured in xAPIC mode.
 The xAPIC mode is the default setting so this option is redundant.
 It will be deprecated in a future version.
 .It Fl C
 Include guest memory in core file.
 .It Fl c Op Ar setting ...
 Number of guest virtual CPUs
 and/or the CPU topology.
 The default value for each of
 .Ar numcpus ,
 .Ar sockets ,
 .Ar cores ,
 and
 .Ar threads
 is 1.
 The current maximum number of guest virtual CPUs is 16.
 If
 .Ar numcpus
 is not specified then it will be calculated from the other arguments.
 The topology must be consistent in that the
 .Ar numcpus
 must equal the product of
 .Ar sockets ,
 .Ar cores ,
 and
 .Ar threads .
 If a
 .Ar setting
 is specified more than once the last one has precedence.
 .It Fl D
 Destroy the VM on guest initiated power-off.
 .It Fl e
 Force
 .Nm
 to exit when a guest issues an access to an I/O port that is not emulated.
 This is intended for debug purposes.
 .It Fl G Ar port
 Start a debug server that uses the GDB protocol to export guest state to a
 debugger.
 An IPv4 TCP socket will be bound to the supplied
 .Ar port
 to listen for debugger connections.
 Only a single debugger may be attached to the debug server at a time.
 If
 .Ar port
 begins with
 .Sq w ,
 .Nm
 will pause execution at the first instruction waiting for a debugger to attach.
 .It Fl H
 Yield the virtual CPU thread when a HLT instruction is detected.
 If this option is not specified, virtual CPUs will use 100% of a host CPU.
 .It Fl h
 Print help message and exit.
 .It Fl k Ar config_file
 Set configuration variables from a simple, key-value config file.
 Each line of the config file is expected to consist of a config variable
 name, an equals sign
 .Pq Sq = ,
 and a value.
 No spaces are permitted between the variable name, equals sign, or
 value.
 Blank lines and lines starting with
 .Sq #
 are ignored.
 See
 .Xr bhyve_config 5
 for more details.
 .It Fl l Cm help
 Print a list of supported LPC devices.
 .It Fl l Ar lpcdev Ns Op Cm \&, Ns Ar conf
 Allow devices behind the LPC PCI-ISA bridge to be configured.
 The only supported devices are the TTY-class devices
 .Cm com1 , com2 , com3 ,
 and
 .Cm com4 ,
 the boot ROM device
 .Cm bootrom ,
 and the debug/test device
 .Cm pc-testdev .
 .Pp
 The possible values for the
 .Ar conf
 argument are listed in the
 .Fl s
 flag description.
 .It Xo
 .Fl m Ar memsize Ns Oo
 .Sm off
 .Cm K | k | M | m | G | g | T | t
 .Sm on
 .Oc
 .Xc
 Set the guest physical memory size.
 This must be the same size that was given to
 .Xr bhyveload 8 .
 .Pp
 The size argument may be suffixed with one of
 .Cm K , M , G
 or
 .Cm T
 (either upper or lower case)
 to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes.
 If no suffix is given, the value is assumed to be in megabytes.
 .Pp
 The default is 256M.
 .It Fl o Ar var Ns Cm = Ns Ar value
 Set the configuration variable
 .Ar var
 to
 .Ar value .
 .It Fl P
 Force the guest virtual CPU to exit when a PAUSE instruction is detected.
 .It Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu
 Pin guest's virtual CPU
 .Em vcpu
 to
 .Em hostcpu .
 .It Fl r Ar file
 Resume a guest from a snapshot.
 The guest memory contents are restored from
 .Ar file ,
 and the guest device and vCPU state are restored from the file
 .Dq Ar file Ns .kern .
 .Pp
 Note that the current snapshot file format requires that the configuration of
 devices in the new VM match the VM from which the snapshot was taken by specifying the
 same
 .Fl s
 and
 .Fl l
 options.
 The count of vCPUs and memory configuration are read from the snapshot.
 .It Fl S
 Wire guest memory.
 .It Fl s Cm help
 Print a list of supported PCI devices.
 .It Fl s Ar slot Ns Cm \&, Ns Ar emulation Ns Op Cm \&, Ns Ar conf
 Configure a virtual PCI slot and function.
 .Pp
 .Nm
 provides PCI bus emulation and virtual devices that can be attached to
 slots on the bus.
 There are 32 available slots, with the option of providing up to 8 functions
 per slot.
 .Pp
 The
 .Ar slot
 can be specified in one of the following formats:
 .Pp
 .Bl -bullet -compact
 .It
 .Ar pcislot
 .It
 .Sm off
 .Ar pcislot Cm \&: Ar function
 .Sm on
 .It
 .Sm off
 .Ar bus Cm \&: Ar pcislot Cm \&: Ar function
 .Sm on
 .El
 .Pp
 The
 .Ar pcislot
 value is 0 to 31.
 The optional
 .Ar function
 value is 0 to 7.
 The optional
 .Ar bus
 value is 0 to 255.
 If not specified, the
 .Ar function
 value defaults to 0.
 If not specified, the
 .Ar bus
 value defaults to 0.
 .Pp
 The
 .Ar emulation
 argument
 can be one of the following:
 .Bl -tag -width "amd_hostbridge"
 .It Cm hostbridge
 A simple host bridge.
 This is usually configured at slot 0, and is required by most guest
 operating systems.
 .It Cm amd_hostbridge
 Emulation identical to
 .Cm hostbridge
 using a PCI vendor ID of AMD.
 .It Cm passthru
 PCI pass-through device.
 .It Cm virtio-net
 Virtio network interface.
 .It Cm virtio-blk
 Virtio block storage interface.
 .It Cm virtio-scsi
 Virtio SCSI interface.
 .It Cm virtio-9p
 Virtio 9p (VirtFS) interface.
 .It Cm virtio-rnd
 Virtio RNG interface.
 .It Cm virtio-console
 Virtio console interface, which exposes multiple ports
 to the guest in the form of simple char devices for simple IO
 between the guest and host userspaces.
 .It Cm ahci
 AHCI controller attached to arbitrary devices.
 .It Cm ahci-cd
 AHCI controller attached to an ATAPI CD/DVD.
 .It Cm ahci-hd
 AHCI controller attached to a SATA hard drive.
 .It Cm e1000
 Intel e82545 network interface.
 .It Cm uart
 PCI 16550 serial device.
 .It Cm lpc
 LPC PCI-ISA bridge with COM1, COM2, COM3, and COM4 16550 serial ports,
 a boot ROM, and,
 optionally, the debug/test device.
 The LPC bridge emulation can only be configured on bus 0.
 .It Cm fbuf
 Raw framebuffer device attached to VNC server.
 .It Cm xhci
 eXtensible Host Controller Interface (xHCI) USB controller.
 .It Cm nvme
 NVM Express (NVMe) controller.
 .It Cm hda
 High Definition Audio Controller.
 .El
 .Pp
 The optional parameter
 .Ar conf
 describes the backend for device emulations.
 If
 .Ar conf
 is not specified, the device emulation has no backend and can be
 considered unconnected.
 .Pp
 Network device backends:
 .Sm off
 .Bl -bullet
 .It
 .Xo
 .Cm tap Ar N
 .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx
 .Op Cm \&,mtu= Ar N
 .Xc
 .It
 .Xo
 .Cm vmnet Ar N
 .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx
 .Op Cm \&,mtu= Ar N
 .Xc
 .It
 .Xo
 .Cm netgraph,path= Ar ADDRESS Cm \&,peerhook= Ar HOOK
 .Op Cm \&,socket= Ar NAME
 .Op Cm \&,hook= Ar HOOK
 .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx
 .Op Cm \&,mtu= Ar N
 .Xc
 .El
 .Sm on
 If
 .Cm mac
 is not specified, the MAC address is derived from a fixed OUI and the
 remaining bytes from an MD5 hash of the slot and function numbers and
 the device name.
 .Pp
 The MAC address is an ASCII string in
 .Xr ethers 5
 format.
 .Pp
 With
 .Cm virtio-net
 devices, the
 .Cm mtu
 parameter can be specified to inform the guest about the largest MTU
 that should be allowed, expressed in bytes.
 .Pp
 With
 .Cm netgraph
 backend, the
 .Cm path
 and
 .Cm peerhook
 parameters must be specified to set the destination node and corresponding hook.
 The optional parameters
 .Cm socket
 and
 .Cm hook
 may be used to set the
 .Xr ng_socket 4
 node name and source hook.
 The
 .Ar ADDRESS ,
 .Ar HOOK ,
 and
 .Ar NAME
 must comply with
 .Xr netgraph 4
 addressing rules.
 .Pp
 Block storage device backends:
 .Sm off
 .Bl -bullet
 .It
 .Ar /filename Op Cm \&, Ar block-device-options
 .It
 .Ar /dev/xxx Op Cm \&, Ar block-device-options
 .El
 .Sm on
 .Pp
 The
 .Ar block-device-options
 are:
 .Bl -tag -width 10n
 .It Cm nocache
 Open the file with
 .Dv O_DIRECT .
 .It Cm direct
 Open the file using
 .Dv O_SYNC .
 .It Cm ro
 Force the file to be opened read-only.
 .It Cm sectorsize= Ns Ar logical Ns Oo Cm \&/ Ns Ar physical Oc
 Specify the logical and physical sector sizes of the emulated disk.
 The physical sector size is optional and is equal to the logical sector size
 if not explicitly specified.
 .It Cm nodelete
 Disable emulation of guest trim requests via
 .Dv DIOCGDELETE
 requests.
 .El
 .Pp
 SCSI device backends:
 .Sm off
 .Bl -bullet
 .It
 .Pa /dev/cam/ctl Oo Ar pp Cm \&. Ar vp Oc Oo Cm \&, Ar scsi-device-options Oc
 .El
 .Sm on
 .Pp
 The
 .Ar scsi-device-options
 are:
 .Bl -tag -width 10n
 .It Cm iid= Ns Ar IID
 Initiator ID to use when sending requests to specified CTL port.
 The default value is 0.
 .El
 .Pp
 9P device backends:
 .Sm off
 .Bl -bullet
 .It
 .Ar sharename Cm = Ar /path/to/share Op Cm \&, Ar 9p-device-options
 .El
 .Sm on
 .Pp
 The
 .Ar 9p-device-options
 are:
 .Bl -tag -width 10n
 .It Cm ro
 Expose the share in read-only mode.
 .El
 .Pp
 TTY device backends:
 .Bl -tag -width 10n
 .It Cm stdio
 Connect the serial port to the standard input and output of
 the
 .Nm
 process.
 .It Ar /dev/xxx
 Use the host TTY device for serial port I/O.
 .El
 .Pp
 Boot ROM device backends:
 .Bl -tag -width 10n
 .It Ar romfile
 Map
 .Ar romfile
 in the guest address space reserved for boot firmware.
 .El
 .Pp
 Pass-through device backends:
 .Bl -tag -width 10n
 .It Ns Ar bus Ns Cm \&/ Ns Ar slot Ns Cm \&/ Ns Ar function
 Connect to a PCI device on the host at the selector described by
 .Ar bus ,
 .Ar slot ,
 and
 .Ar function
 numbers.
+.It Cm rom= Ns Ar romfile
+Add
+.Ar romfile
+as option ROM to the PCI device.
+The ROM will be loaded by firmware and should be capable of initializing the device.
 .El
 .Pp
 Guest memory must be wired using the
 .Fl S
 option when a pass-through device is configured.
 .Pp
 The host device must have been reserved at boot-time using the
 .Va pptdevs
 loader variable as described in
 .Xr vmm 4 .
 .Pp
 Virtio console device backends:
 .Bl -bullet
 .Sm off
 .It
 .Cm port1= Ns Ar /path/to/port1.sock Ns Op Cm ,port Ns Ar N Cm \&= Ns Ar /path/to/port2.sock No \~ Ar ...
 .Sm on
 .El
 .Pp
 A maximum of 16 ports per device can be created.
 Every port is named and corresponds to a Unix domain socket created by
 .Nm .
 .Nm
 accepts at most one connection per port at a time.
 .Pp
 Limitations:
 .Bl -bullet
 .It
 Due to lack of destructors in
 .Nm ,
 sockets on the filesystem must be cleaned up manually after
 .Nm
 exits.
 .It
 There is no way to use the
 .Dq console port
 feature, nor the console port
 resize at present.
 .It
 Emergency write is advertised, but no-op at present.
 .El
 .Pp
 Framebuffer device backends:
 .Bl -bullet
 .Sm off
 .It
 .Op Cm rfb= Ar ip-and-port
 .Op Cm ,w= Ar width
 .Op Cm ,h= Ar height
 .Op Cm ,vga= Ar vgaconf
 .Op Cm ,wait
 .Op Cm ,password= Ar password
 .Sm on
 .El
 .Pp
 Configuration options are defined as follows:
 .Bl -tag -width 10n
 .It Cm rfb= Ns Ar ip-and-port Pq or Cm tcp= Ns Ar ip-and-port
 An IP address and a port VNC should listen on.
 There are two formats:
 .Pp
 .Bl -bullet -compact
 .It
 .Sm off
 .Op Ar IPv4 Cm \&:
 .Ar port
 .Sm on
 .It
 .Sm off
 .Cm \&[ Ar IPv6%zone Cm \&] Cm \&: Ar port
 .Sm on
 .El
 .Pp
 The default is to listen on localhost IPv4 address and default VNC port 5900.
 An IPv6 address must be enclosed in square brackets and may contain an
 optional zone identifier.
 .It Cm w= Ns Ar width No and Cm h= Ns Ar height
 A display resolution, width and height, respectively.
 If not specified, a default resolution of 1024x768 pixels will be used.
 Minimal supported resolution is 640x480 pixels,
 and maximum is 1920x1200 pixels.
 .It Cm vga= Ns Ar vgaconf
 Possible values for this option are
 .Cm io
 (default),
 .Cm on ,
 and
 .Cm off .
 PCI graphics cards have a dual personality in that they are
 standard PCI devices with BAR addressing, but may also
 implicitly decode legacy VGA I/O space
 .Pq Ad 0x3c0-3df
 and memory space
 .Pq 64KB at Ad 0xA0000 .
 The default
 .Cm io
 option should be used for guests that attempt to issue BIOS calls which result
 in I/O port queries, and fail to boot if I/O decode is disabled.
 .Pp
 The
 .Cm on
 option should be used along with the CSM BIOS capability in UEFI
 to boot traditional BIOS guests that require the legacy VGA I/O and
 memory regions to be available.
 .Pp
 The
 .Cm off
 option should be used for UEFI guests that assume that a
 VGA adapter is present if they detect the I/O ports.
 An example of such a guest is
 .Ox
 in UEFI mode.
 .Pp
 Please refer to the
 .Nm
 .Fx
 wiki page
 .Pq Lk https://wiki.freebsd.org/bhyve
 for configuration notes of particular guests.
 .It Cm wait
 Instruct
 .Nm
 to only boot upon the initiation of a VNC connection, simplifying the
 installation of operating systems that require immediate keyboard input.
 This can be removed for post-installation use.
 .It Cm password= Ns Ar password
 This type of authentication is known to be cryptographically weak and is not
 intended for use on untrusted networks.
 Many implementations will want to use stronger security, such as running
 the session over an encrypted channel provided by IPsec or SSH.
 .El
 .Pp
 xHCI USB device backends:
 .Bl -tag -width 10n
 .It Cm tablet
 A USB tablet device which provides precise cursor synchronization
 when using VNC.
 .El
 .Pp
 NVMe device backends:
 .Bl -bullet
 .Sm off
 .It
 .Ar devpath
 .Op Cm ,maxq= Ar #
 .Op Cm ,qsz= Ar #
 .Op Cm ,ioslots= Ar #
 .Op Cm ,sectsz= Ar #
 .Op Cm ,ser= Ar #
 .Op Cm ,eui64= Ar #
 .Op Cm ,dsm= Ar opt
 .Sm on
 .El
 .Pp
 Configuration options are defined as follows:
 .Bl -tag -width 10n
 .It Ar devpath
 Accepted device paths are:
 .Ar /dev/blockdev
 or
 .Ar /path/to/image
 or
 .Cm ram= Ns Ar size_in_MiB .
 .It Cm maxq
 Max number of queues.
 .It Cm qsz
 Max elements in each queue.
 .It Cm ioslots
 Max number of concurrent I/O requests.
 .It Cm sectsz
 Sector size (defaults to blockif sector size).
 .It Cm ser
 Serial number with maximum 20 characters.
 .It Cm eui64
 IEEE Extended Unique Identifier (8 byte value).
 .It Cm dsm
 DataSet Management support.
 Supported values are:
 .Cm auto , enable ,
 and
 .Cm disable .
 .El
 .Pp
 AHCI device backends:
 .Bl -bullet
 .It
 .Sm off
 .Op Oo Cm hd\&: | cd\&: Oc Ar path
 .Op Cm ,nmrr= Ar nmrr
 .Op Cm ,ser= Ar #
 .Op Cm ,rev= Ar #
 .Op Cm ,model= Ar #
 .Sm on
 .El
 .Pp
 Configuration options are defined as follows:
 .Bl -tag -width 10n
 .It Cm nmrr
 Nominal Media Rotation Rate, known as RPM.
 Value 1 will indicate device as Solid State Disk.
 The default value is 0, which means the rate is not reported.
 .It Cm ser
 Serial Number with maximum 20 characters.
 .It Cm rev
 Revision Number with maximum 8 characters.
 .It Cm model
 Model Number with maximum 40 characters.
 .El
 .Pp
 HD Audio device backends:
 .Bl -bullet
 .It
 .Sm off
 .Op Cm play= Ar playback
 .Op Cm ,rec= Ar recording
 .Sm on
 .El
 .Pp
 Configuration options are defined as follows:
 .Bl -tag -width 10n
 .It Cm play
 Playback device, typically
 .Ar /dev/dsp0 .
 .It Cm rec
 Recording device, typically
 .Ar /dev/dsp0 .
 .El
 .It Fl U Ar uuid
 Set the universally unique identifier
 .Pq UUID
 in the guest's System Management BIOS System Information structure.
 By default a UUID is generated from the host's hostname and
 .Ar vmname .
 .It Fl u
 RTC keeps UTC time.
 .It Fl W
 Force virtio PCI device emulations to use MSI interrupts instead of MSI-X
 interrupts.
 .It Fl w
 Ignore accesses to unimplemented Model Specific Registers (MSRs).
 This is intended for debug purposes.
 .It Fl x
 The guest's local APIC is configured in x2APIC mode.
 .It Fl Y
 Disable MPtable generation.
 .It Ar vmname
 Alphanumeric name of the guest.
 This should be the same as that created by
 .Xr bhyveload 8 .
 .El
 .Sh CONFIGURATION VARIABLES
 .Nm
 uses an internal tree of configuration variables to describe global and
 per-device settings.
 When
 .Nm
 starts,
 it parses command line options (including config files) in the order given
 on the command line.
 Each command line option sets one or more configuration variables.
 For example,
 the
 .Fl s
 option creates a new tree node for a PCI device and sets one or more variables
 under that node including the device model and device model-specific variables.
 Variables may be set multiple times during this parsing stage with the final
 value overriding previous values.
 .Pp
 Once all of the command line options have been processed,
 the configuration values are frozen.
 .Nm
 then uses the value of configuration values to initialize device models
 and global settings.
 .Pp
 More details on configuration variables can be found in
 .Xr bhyve_config 5 .
 .Sh DEBUG SERVER
 The current debug server provides limited support for debuggers.
 .Ss Registers
 Each virtual CPU is exposed to the debugger as a thread.
 .Pp
 General purpose registers can be queried for each virtual CPU, but other
 registers such as floating-point and system registers cannot be queried.
 .Ss Memory
 Memory (including memory mapped I/O regions) can be read and written by the debugger.
 Memory operations use virtual addresses that are resolved to physical addresses
 via the current virtual CPU's active address translation.
 .Ss Control
 The running guest can be interrupted by the debugger at any time
 .Pq for example, by pressing Ctrl-C in the debugger .
 .Pp
 Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit.
 .Pp
 Breakpoints are supported on Intel CPUs that support single stepping.
 Note that continuing from a breakpoint while interrupts are enabled in the
 guest may not work as expected due to timer interrupts firing while single
 stepping over the breakpoint.
 .Sh SIGNAL HANDLING
 .Nm
 deals with the following signals:
 .Pp
 .Bl -tag -width SIGTERM -compact
 .It SIGTERM
 Trigger ACPI poweroff for a VM
 .El
 .Sh EXIT STATUS
 Exit status indicates how the VM was terminated:
 .Pp
 .Bl -tag -width indent -compact
 .It 0
 rebooted
 .It 1
 powered off
 .It 2
 halted
 .It 3
 triple fault
 .It 4
 exited due to an error
 .El
 .Sh EXAMPLES
 If not using a boot ROM, the guest operating system must have been loaded with
 .Xr bhyveload 8
 or a similar boot loader before
 .Xr bhyve 4
 can be run.
 Otherwise, the boot loader is not needed.
 .Pp
 To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio
 block device backed by the
 .Pa /my/image
 filesystem image, and a serial port for the console:
 .Bd -literal -offset indent
 bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\
   -l com1,stdio -A -H -P -m 1G vm1
 .Ed
 .Pp
 Run a 24GB single-CPU virtual machine with three network ports, one of which
 has a MAC address specified:
 .Bd -literal -offset indent
 bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\
   -s 2:1,virtio-net,tap1 \\
   -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\
   -s 3,virtio-blk,/my/image -l com1,stdio \\
   -A -H -P -m 24G bigvm
 .Ed
 .Pp
 Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI
 CD-ROM, a single virtio network port, an AMD hostbridge, and the console
 port connected to an
 .Xr nmdm 4
 null-modem device.
 .Bd -literal -offset indent
 bhyve -c 4 \\
   -s 0,amd_hostbridge -s 1,lpc \\
   -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\
 hd:/images/disk.3,hd:/images/disk.4,\\
 hd:/images/disk.5,hd:/images/disk.6,\\
 hd:/images/disk.7,hd:/images/disk.8,\\
 cd:/images/install.iso \\
   -s 3,virtio-net,tap0 \\
   -l com1,/dev/nmdm0A \\
   -A -H -P -m 8G
 .Ed
 .Pp
 Run a UEFI virtual machine with a display resolution of 800 by 600 pixels
 that can be accessed via VNC at: 0.0.0.0:5900.
 .Bd -literal -offset indent
 bhyve -c 2 -m 4G -w -H \\
   -s 0,hostbridge \\
   -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\
   -s 4,ahci-hd,disk.img \\
   -s 5,virtio-net,tap0 \\
   -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\
   -s 30,xhci,tablet \\
   -s 31,lpc -l com1,stdio \\
   -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
    uefivm
 .Ed
 .Pp
 Run a UEFI virtual machine with a VNC display that is bound to all IPv6
 addresses on port 5900.
 .Bd -literal -offset indent
 bhyve -c 2 -m 4G -w -H \\
   -s 0,hostbridge \\
   -s 4,ahci-hd,disk.img \\
   -s 5,virtio-net,tap0 \\
   -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\
   -s 30,xhci,tablet \\
   -s 31,lpc -l com1,stdio \\
   -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
    uefivm
 .Ed
 .Sh SEE ALSO
 .Xr bhyve 4 ,
 .Xr netgraph 4 ,
 .Xr ng_socket 4 ,
 .Xr nmdm 4 ,
 .Xr vmm 4 ,
 .Xr bhyve_config 5 ,
 .Xr ethers 5 ,
 .Xr bhyvectl 8 ,
 .Xr bhyveload 8
 .Pp
 .Rs
 .%A Intel
 .%B 64 and IA-32 Architectures Software Developer’s Manual
 .%V Volume 3
 .Re
 .Sh HISTORY
 .Nm
 first appeared in
 .Fx 10.0 .
 .Sh AUTHORS
 .An Neel Natu Aq Mt neel@freebsd.org
 .An Peter Grehan Aq Mt grehan@freebsd.org
diff --git a/usr.sbin/bhyve/bhyve_config.5 b/usr.sbin/bhyve/bhyve_config.5
index a9cea3849c01..77d0bbaedaca 100644
--- a/usr.sbin/bhyve/bhyve_config.5
+++ b/usr.sbin/bhyve/bhyve_config.5
@@ -1,572 +1,574 @@
 .\" SPDX-License-Identifier: BSD-2-Clause
 .\"
 .\" Copyright (c) 2021 John H. Baldwin <jhb@FreeBSD.org>
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .Dd September 17, 2021
 .Dt BHYVE_CONFIG 5
 .Os
 .Sh NAME
 .Nm bhyve_config
 .Nd "bhyve configuration variables"
 .Sh DESCRIPTION
 .Xr bhyve 8
 uses a hierarchical tree of configuration variables to describe global and
 per-device settings.
 Internal nodes in this tree do not have a value,
 only leaf nodes have values.
 This manual describes the configuration variables understood by
 .Xr bhyve 8 .
 If additional variables are defined,
 .Xr bhyve 8
 will ignore them and will not emit errors for unknown variables.
 However, these additional variables can be referenced by other
 variables as described below.
 .Sh VARIABLE VALUES
 Configuration variable values are stored as strings.
 A configuration variable value may refer to one or more other
 configuration values by name.
 Instances of the pattern
 .Sq % Ns Pq Ar var
 are replaced by the value of the configuration variable
 .Va var .
 To avoid unwanted expansion,
 .Sq %
 characters can be escaped by a leading
 .Sq % .
 For example,
 if a configuration variable
 .Va disk
 uses the value
 .Pa /dev/zvol/bhyve/%(name) ,
 then the final value of the
 .Va disk
 variable will be set to the path of a ZFS volume whose name matches
 the name of the virtual machine on the pool
 .Pa bhyve .
 .Pp
 Some configuration variables may be interpreted as a boolean value.
 For those variables the following case-insensitive values may be used to
 indicate true:
 .Pp
 .Bl -bullet -offset indent -compact
 .It
 true
 .It
 on
 .It
 yes
 .It
 1
 .El
 .Pp
 The following values may be used to indicate false:
 .Pp
 .Bl -bullet -offset indent -compact
 .It
 false
 .It
 off
 .It
 no
 .It
 0
 .El
 .Pp
 Some configuration variables may be interpreted as an integer.
 For those variables,
 any syntax supported by
 .Xr strtol 3
 may be used.
 .Sh GLOBAL SETTINGS
 .Ss Architecture Neutral Settings
 .Bl -column "memory.guest_in_core" "integer" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va name Ta string Ta Ta
 The name of the VM.
 .It Va cpus Ta integer Ta 1 Ta
 The total number of virtual CPUs.
 .It Va cores Ta integer Ta 1 Ta
 The number of virtual cores in each virtual socket.
 .It Va threads Ta integer Ta 1 Ta
 The number of virtual CPUs in each virtual core.
 .It Va sockets Ta integer Ta 1 Ta
 The number of virtual sockets.
 .It Va memory.guest_in_core Ta bool Ta false Ta
 Include guest memory in core file.
 .It Va memory.size Ta string Ta 256M Ta
 Guest physical memory size in bytes.
 The value must be formatted as described in
 .Xr expand_number 3 .
 .It Va memory.wired Ta bool Ta false Ta
 Wire guest memory.
 .It Va acpi_tables Ta bool Ta false Ta
 Generate ACPI tables.
 .It Va destroy_on_poweroff Ta bool Ta false Ta
 Destroy the VM on guest-initiated power-off.
 .It Va gdb.port Ta integer Ta 0 Ta
 TCP port number for the debug server.
 If this is set to a non-zero value, a debug server
 will listen for connections on this port.
 .It Va gdb.wait Ta bool Ta false Ta
 If the debug server is enabled, wait for a debugger to connect
 before starting the guest.
 .It Va rtc.use_localtime Ta bool Ta true Ta
 The real time clock uses the local time of the host.
 If this is set to false, the real time clock uses UTC.
 .It Va uuid Ta string Ta Ta
 The universally unique identifier (UUID) to use in the guest's
 System Management BIOS System Information structure.
 If an explicit value is not set, a valid UUID is generated from
 the host's hostname and the VM name.
 .It Va virtio_msix Ta bool Ta true Ta
 Use MSI-X interrupts for PCI VirtIO devices.
 If set to false, MSI interrupts are used instead.
 .It Va config.dump Ta bool Ta false Ta
 If this value is set to true after
 .Xr bhyve 8
 has finished parsing command line options,
 then
 .Xr bhyve 8
 will write all of its configuration variables to stdout and exit.
 No VM will be started.
 .El
 .Ss x86-Specific Settings
 .Bl -column "x86.vmexit_on_pause" "integer" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va x86.mptable Ta bool Ta true Ta
 Generate an MPTable.
 .It Va x86.x2apic Ta bool Ta false Ta
 Configure guest's local APICs in x2APIC mode.
 .It Va x86.strictio Ta bool Ta false Ta
 Exit if a guest accesses an I/O port that is not emulated.
 By default, writes are ignored and reads return all bits set.
 .It Va x86.strictmsr Ta bool Ta true Ta
 Inject a general protection fault if a guest accesses a Model Specific
 Register (MSR) that is not emulated.
 If this is false, writes are ignored and reads return zero.
 .It Va x86.vmexit_on_hlt Ta bool Ta false Ta
 Force a VM exit when a guest CPU executes the
 .Dv HLT
 instruction.
 This allows idle guest CPUs to yield the host CPU.
 .It Va x86.vmexit_on_pause Ta bool Ta false Ta
 Force a VM exit when a guest CPU executes the
 .Dv PAUSE
 instruction.
 .El
 .Sh DEVICE SETTINGS
 Device settings are stored under a device node.
 The device node's name is set by the parent bus of the device.
 .Ss PCI Device Settings
 PCI devices are described by a device node named
 .Dq pci Ns Ar bus . Ns Ar slot . Ns Ar function
 where each of
 .Ar bus ,
 .Ar slot ,
 and
 .Ar function
 are formatted as decimal values with no padding.
 All PCI device nodes must contain a configuration variable named
 .Dq device
 which specifies the device model to use.
 The following PCI device models are supported:
 .Bl -tag -indent
 .It Li hostbridge
 Provide a simple PCI-Host bridge device.
 This is usually configured at pci0:0:0 and is required by most guest
 operating systems.
 .It Li ahci
 AHCI storage controller.
 .It Li e1000
 Intel e82545 network interface.
 .It Li fbuf
 VGA framebuffer device attached to VNC server.
 .It Li lpc
 LPC PCI-ISA bridge with COM1-COM4 16550 serial ports,
 a boot ROM,
 and an optional debug/test device.
 This device must be configured on bus 0.
 .It Li hda
 High Definition audio controller.
 .It Li nvme
 NVM Express (NVMe) controller.
 .It Li passthru
 PCI pass-through device.
 .It Li uart
 PCI 16550 serial device.
 .It Li virtio-9p
 VirtIO 9p (VirtFS) interface.
 .It Li virtio-blk
 VirtIO block storage interface.
 .It Li virtio-console
 VirtIO console interface.
 .It Li virtio-net
 VirtIO network interface.
 .It Li virtio-rnd
 VirtIO RNG interface.
 .It Li virtio-scsi
 VirtIO SCSI interface.
 .It Li xhci
 Extensible Host Controller Interface (XHCI) USB controller.
 .El
 .Ss USB Device Settings
 USB controller devices contain zero or more child USB devices
 attached to slots.
 Each USB device stores its settings in a node named
 .Dq slot. Ns Va N
 under the controller's device node.
 .Va N
 is the number of the slot to which the USB device is attached.
 Note that USB slot numbers begin at 1.
 All USB device nodes must contain a configuration variable named
 .Dq device
 which specifies the device model to use.
 The following USB device models are supported:
 .Bl -tag -indent
 .It Li tablet
 A USB tablet device which provides precise cursor synchronization
 when using VNC.
 .El
 .Ss Block Device Settings
 Block devices use the following settings to configure their backing store.
 These settings are stored in the configuration node of the respective device.
 .Bl -column "sectorsize" "logical[/physical]" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It path Ta string Ta Ta
 The path of the file or disk device to use as the backing store.
 .It nocache Ta bool Ta false Ta
 Disable caching on the backing file by opening the backing file with
 .Dv O_DIRECT .
 .It nodelete Ta bool Ta false Ta
 Disable emulation of guest trim requests via
 .Dv DIOCGDELETE
 requests.
 .It sync Ta bool Ta false Ta
 Write changes to the backing file with synchronous writes.
 .It direct Ta bool Ta false Ta
 An alias for
 .Va sync .
 .It ro Ta bool Ta false Ta
 Disable writes to the backing file.
 .It sectorsize Ta Va logical Ns Op / Ns Va physical Ta Ta
 Specify the logical and physical sector size of the emulated disk.
 If the physical size is not specified,
 it is equal to the logical size.
 .El
 .Ss Network Backend Settings
 Network devices use the following settings to configure their backend.
 The backend is responsible for passing packets between the device model
 and a desired destination.
 Configuring a backend requires setting the
 .Va backend
 variable to one of the following values:
 .Bl -tag
 .It tap Ns Va N
 Use the named
 .Xr tap 4
 interface as the backend.
 .It vmnet Ns Va N
 Use the named
 .Xr vmnet 4
 interface as the backend.
 .It netgraph
 Use a
 .Xr netgraph 4
 socket hook as the backend.
 This backend uses the following additional variables:
 .Bl -column "peerhook" "Format" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va path Ta string Ta Ta
 The name of the
 .Xr netgraph 4
 destination node.
 .It Va peerhook Ta string Ta Ta
 The name of the destination hook.
 .It Va socket Ta string Ta Ta
 The name of the created
 .Xr ng_socket 4
 node.
 .It Va hook Ta string Ta vmlink Ta
 The name of the source hook on the created
 .Xr ng_socket 4
 node.
 .El
 .It netmap: Ns Va interface
 Use
 .Xr netmap 4
 on a network interface as the backend.
 .It vale Ns Va bridge : Ns Va port
 Use a port on a
 .Xr vale 4
 bridge as the backend.
 .El
 .Ss UART Device Settings
 .Bl -column "Name" "Format" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va path Ta path Ta Ta
 Backend device for the serial port.
 Either the pathname of a character device or
 .Dq stdio
 to use standard input and output of the
 .Xr bhyve 8
 process.
 .El
 .Ss Host Bridge Settings
 .Bl -column "vendor" "integer" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va vendor Ta integer Ta 0x1275 Ta
 PCI vendor ID.
 .It Va devid Ta integer Ta 0x1275 Ta
 PCI device ID.
 .El
 .Ss AHCI Controller Settings
 AHCI controller devices contain zero or more ports each of which
 provides a storage device.
 Each port stores its settings in a node named
 .Dq port. Ns Va N
 under the controller's device node.
 The
 .Va N
 values are formatted as successive decimal values starting with 0.
 In addition to the block device settings described above, each
 port supports the following settings:
 .Bl -column "model" "integer" "generated"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va type Ta string Ta Ta
 The type of storage device to emulate.
 Must be set to either
 .Dq cd
 or
 .Dq hd .
 .It Va nmrr Ta integer Ta 0 Ta
 Nominal Media Rotation Rate, also known as RPM.
 A value of 1 indicates a device with no rate such as a Solid State Disk.
 .It Va ser Ta string Ta generated Ta
 Serial number of up to twenty characters.
 A default serial number is generated using a hash of the backing
 store's pathname.
 .It Va rev Ta string Ta 001 Ta
 Revision number of up to eight characters.
 .It Va model Ta string Ta Ta
 Model number of up to forty characters.
 Separate default model strings are used for
 .Dq cd
 and
 .Dq hd
 device types.
 .El
 .Ss e1000 Settings
 In addition to the network backend settings,
 Intel e82545 network interfaces support the following variables:
 .Bl -column "Name" "MAC address" "generated"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va mac Ta MAC address Ta generated Ta
 MAC address.
 If an explicit address is not provided,
 a MAC address is generated from a hash of the device's PCI address.
 .El
 .Ss Frame Buffer Settings
 .Bl -column "password" "[IP:]port" "127.0.0.1:5900"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va wait Ta bool Ta false Ta
 Wait for a remote connection before starting the VM.
 .It Va rfb Ta Oo Ar IP Ns : Oc Ns Ar port Ta 127.0.0.1:5900 Ta
 TCP address to listen on for remote connections.
 The IP address must be given as a numeric address.
 IPv6 addresses must be enclosed in square brackets and
 support scoped identifiers as described in
 .Xr getaddrinfo 3 .
 A bare port number may be given in which case the IPv4
 localhost address is used.
 .It Va vga Ta string Ta io Ta
 VGA configuration.
 More details are provided in
 .Xr bhyve 8 .
 .It Va w Ta integer Ta 1024 Ta
 Frame buffer width in pixels.
 .It Va h Ta integer Ta 768 Ta
 Frame buffer height in pixels.
 .It Va password Ta string Ta Ta
 Password to use for VNC authentication.
 This type of authentication is known to be cryptographically weak and is not
 intended for use on untrusted networks.
 .El
 .Ss High Definition Audio Settings
 .Bl -column "Name" "Format" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va play Ta path Ta Ta
 Host playback device,
 typically
 .Pa /dev/dsp0 .
 .It Va rec Ta path Ta Ta
 Host recording device,
 typically
 .Pa /dev/dsp0 .
 .El
 .Ss LPC Device Settings
 The LPC bridge stores its configuration under a top-level
 .Va lpc
 node rather than under the PCI LPC device's node.
 The following nodes are available under
 .Va lpc :
 .Bl -column "pc-testdev" "Format" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va bootrom Ta path Ta Ta
 Path to a boot ROM.
 The contents of this file are copied into the guest's
 memory ending just before the 4GB physical address.
 If a boot ROM is present, a firmware interface device is
 also enabled for use by the boot ROM.
 .It Va com1 Ta node Ta Ta
 Settings for the COM1 serial port device.
 .It Va com2 Ta node Ta Ta
 Settings for the COM2 serial port device.
 .It Va com3 Ta node Ta Ta
 Settings for the COM3 serial port device.
 .It Va com4 Ta node Ta Ta
 Settings for the COM4 serial port device.
 .It Va pc-testdev Ta bool Ta false Ta
 Enable the PC debug/test device.
 .El
 .Ss NVMe Controller Settings
 Each NVMe controller supports a single storage device.
 The device can be backed either by a memory disk described by the
 .Va ram
 variable, or a block device using the block device settings described above.
 In addition, each controller supports the following settings:
 .Bl -column "ioslots" "Format" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va maxq Ta integer Ta 16 Ta
 Maximum number of I/O submission and completion queue pairs.
 .It Va qsz Ta integer Ta 2058 Ta
 Number of elements in each I/O queue.
 .It Va ioslots Ta integer Ta 8 Ta
 Maximum number of concurrent I/O requests.
 .It Va sectsz Ta integer Ta Ta
 Sector size.
 Can be one of 512, 4096, or 8192.
 Devices backed by a memory disk use 4096 as the default.
 Devices backed by a block device use the block device's sector size
 as the default.
 .It Va ser Ta string Ta Ta
 Serial number of up to twenty characters.
 A default serial number is generated using a hash of the device's PCI address.
 .It Va eui64 Ta integer Ta Ta
 IEEE Extended Unique Identifier.
 If an EUI is not provided, a default is generated using a checksum of the
 device's PCI address.
 .It Va dsm Ta string Ta auto Ta
 Whether or not to advertise DataSet Management support.
 One of
 .Dq auto ,
 .Dq enable ,
 or
 .Dq disable .
 The
 .Dq auto
 setting only advertises support if the backing store supports
 resource freeing, for example via TRIM.
 .It Va ram Ta integer Ta Ta
 If set, allocate a memory disk as the backing store.
 The value of this variable is the size of the memory disk in megabytes.
 .El
 .Ss PCI Passthrough Settings
 .Bl -column "Name" "integer" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va bus Ta integer Ta Ta
 Host PCI bus address of device to pass through.
 .It Va slot Ta integer Ta Ta
 Host PCI slot address of device to pass through.
 .It Va func Ta integer Ta Ta
 Host PCI function address of device to pass through.
+.It Va rom Ta path Ta Ta
+ROM file of the device, which will be executed by OVMF to initialize the device.
 .El
 .Ss VirtIO 9p Settings
 Each VirtIO 9p device exposes a single filesystem from a host path.
 .Bl -column "sharename" "Format" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va sharename Ta string Ta Ta
 The share name exposed to the guest.
 .It Va path Ta path Ta Ta
 The path of a directory on the host to export to the guest.
 .It Va ro Ta bool Ta false Ta
 If true, the guest filesystem is read-only.
 .El
 .Ss VirtIO Block Device Settings
 In addition to the block device settings described above, each
 VirtIO block device supports the following settings:
 .Bl -column "model" "integer" "generated"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va ser Ta string Ta generated Ta
 Serial number of up to twenty characters.
 A default serial number is generated using a hash of the backing
 store's pathname.
 .El
 .Ss VirtIO Console Device Settings
 Each VirtIO Console device contains one or more console ports.
 Each port stores its settings in a node named
 .Dq port. Ns Va N
 under the controller's device node.
 The
 .Va N
 values are formatted as successive decimal values starting with 0.
 Each port supports the following settings:
 .Bl -column "Name" "Format" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va name Ta string Ta Ta
 The name of the port exposed to the guest.
 .It Va path Ta path Ta Ta
 The path of a UNIX domain socket providing the host connection for the port.
 .El
 .Ss VirtIO Network Interface Settings
 In addition to the network backend settings,
 VirtIO network interfaces support the following variables:
 .Bl -column "Name" "MAC address" "generated"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va mac Ta MAC address Ta generated Ta
 MAC address.
 If an explicit address is not provided,
 a MAC address is generated from a hash of the device's PCI address.
 .It Va mtu Ta integer Ta 1500 Ta
 The largest supported MTU advertised to the guest.
 .El
 .Ss VirtIO SCSI Settings
 .Bl -column "Name" "integer" "Default"
 .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
 .It Va dev Ta path Ta Ta
 The path of a CAM target layer (CTL) device to export:
 .Pa /dev/cam/ctl Ns Oo Ar pp . Ns Ar vp Oc .
 .It Va iid Ta integer Ta 0 Ta
 Initiator ID to use when sending requests to the CTL port.
 .El
 .Sh SEE ALSO
 .Xr expand_number 3 ,
 .Xr getaddrinfo 3 ,
 .Xr strtol 3 ,
 .Xr netgraph 4 ,
 .Xr netmap 4 ,
 .Xr ng_socket 4 ,
 .Xr tap 4 ,
 .Xr vale 4 ,
 .Xr vmnet 4 ,
 .Xr bhyve 8
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index e9c88922b0ef..7b62b09b53f7 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -1,2517 +1,2622 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
+#include <sys/mman.h>
 
 #include <ctype.h>
 #include <err.h>
 #include <errno.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <strings.h>
 #include <assert.h>
 #include <stdbool.h>
 #include <sysexits.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_snapshot.h>
 #include <vmmapi.h>
 
 #include "acpi.h"
 #include "bhyverun.h"
 #include "config.h"
 #include "debug.h"
 #include "inout.h"
 #include "ioapic.h"
 #include "mem.h"
 #include "pci_emul.h"
 #include "pci_irq.h"
 #include "pci_lpc.h"
 
 /*
  * NOTE(review): the CONF1_* constants are not referenced in the visible part
  * of this file; presumably they describe the legacy config-space address/data
  * I/O ports and the enable bit of the address register -- confirm at the
  * point of use.
  */
 #define CONF1_ADDR_PORT	   0x0cf8
 #define CONF1_DATA_PORT	   0x0cfc
 
 #define CONF1_ENABLE	   0x80000000ul
 
 /*
  * Number of valid bus, slot and function numbers. They size the businfo,
  * slotinfo and funcinfo tables and bound the values accepted by
  * pci_parse_slot().
  */
 #define	MAXBUSES	(PCI_BUSMAX + 1)
 #define MAXSLOTS	(PCI_SLOTMAX + 1)
 #define	MAXFUNCS	(PCI_FUNCMAX + 1)
 
 /* One gigabyte, as an unsigned long. */
 #define GB		(1024 * 1024 * 1024UL)
 
 /* Per-function bookkeeping; one entry exists for every function of a slot. */
 struct funcinfo {
 	nvlist_t *fi_config;		/* handed to pe_init by pci_emul_init() */
 	struct pci_devemu *fi_pde;
 	struct pci_devinst *fi_devi;	/* set by pci_emul_init() on success */
 };
 
 /*
  * NOTE(review): presumably bookkeeping for one legacy interrupt pin of a
  * slot (see si_intpins in struct slotinfo); the fields are not used in the
  * visible part of this file -- confirm against the pci_lintr_* code.
  */
 struct intxinfo {
 	int	ii_count;
 	int	ii_pirq_pin;
 	int	ii_ioapic_irq;
 };
 
 /* Per-slot state: four interrupt pin records plus one entry per function. */
 struct slotinfo {
 	struct intxinfo si_intpins[4];
 	struct funcinfo si_funcs[MAXFUNCS];
 };
 
 /*
  * Per-bus state: the resource windows tracked for the bus and the table of
  * its slots. init_pci() records the allocation cursors in the *base fields
  * before the devices of the bus are initialized.
  */
 struct businfo {
 	uint16_t iobase, iolimit;		/* I/O window */
 	uint32_t membase32, memlimit32;		/* mmio window below 4GB */
 	uint64_t membase64, memlimit64;		/* mmio window above 4GB */
 	struct slotinfo slotinfo[MAXSLOTS];
 };
 
 /* Indexed by bus number; init_pci() allocates entries for configured buses. */
 static struct businfo *pci_businfo[MAXBUSES];
 
 SET_DECLARE(pci_devemu_set, struct pci_devemu);
 
 /*
  * Allocation cursors. Each *base variable holds the next free address of its
  * region and is advanced by pci_emul_alloc_resource().
  */
 static uint64_t pci_emul_iobase;
+static uint8_t *pci_emul_rombase;	/* start of the ROM segment, set on first pci_emul_alloc_rom() */
+static uint64_t pci_emul_romoffset;	/* offset of the next free byte in the ROM segment */
+static uint8_t *pci_emul_romlim;	/* pci_emul_rombase + PCI_EMUL_ROMSIZE */
 static uint64_t pci_emul_membase32;
 static uint64_t pci_emul_membase64;
 static uint64_t pci_emul_memlim64;	/* end of the 64-bit MMIO window */
 
 /*
  * A BAR request queued by pci_emul_alloc_bar(). Requests are kept on
  * 'pci_bars' ordered from largest to smallest size.
  */
 struct pci_bar_allocation {
 	TAILQ_ENTRY(pci_bar_allocation) chain;
 	struct pci_devinst *pdi;	/* device that owns the BAR */
 	int idx;			/* BAR index within the device */
 	enum pcibar_type type;
 	uint64_t size;			/* size after rounding in pci_emul_alloc_bar() */
 };
 TAILQ_HEAD(pci_bar_list, pci_bar_allocation) pci_bars = TAILQ_HEAD_INITIALIZER(
     pci_bars);
 
 /* I/O port range handed out to I/O BARs: [PCI_EMUL_IOBASE, PCI_EMUL_IOLIMIT). */
 #define	PCI_EMUL_IOBASE		0x2000
 #define	PCI_EMUL_IOLIMIT	0x10000
 
+#define PCI_EMUL_ROMSIZE 0x10000000	/* size of the shared ROM segment (256MB) */
+
 #define	PCI_EMUL_ECFG_BASE	0xE0000000		    /* 3.5GB */
 #define	PCI_EMUL_ECFG_SIZE	(MAXBUSES * 1024 * 1024)    /* 1MB per bus */
 SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
 
 /*
  * OVMF always uses 0xC0000000 as base address for 32 bit PCI MMIO. Don't
  * change this address without changing it in OVMF.
  */
 #define PCI_EMUL_MEMBASE32 0xC0000000
 /* The 32-bit MMIO window ends where the extended config region starts. */
 #define	PCI_EMUL_MEMLIMIT32	PCI_EMUL_ECFG_BASE
 /* Size of, and alignment used for, the 64-bit MMIO window in init_pci(). */
 #define PCI_EMUL_MEMSIZE64	(32*GB)
 
 static struct pci_devemu *pci_emul_finddev(const char *name);
 static void pci_lintr_route(struct pci_devinst *pi);
 static void pci_lintr_update(struct pci_devinst *pi);
 static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot,
     int func, int coff, int bytes, uint32_t *val);
 
 /*
  * Store 'val' at config-space offset 'coff' of 'pi'. The access is one or
  * two bytes wide when 'bytes' says so; any other width is written as a
  * 32-bit value.
  */
 static __inline void
 CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes)
 {
 	switch (bytes) {
 	case 1:
 		pci_set_cfgdata8(pi, coff, val);
 		break;
 	case 2:
 		pci_set_cfgdata16(pi, coff, val);
 		break;
 	default:
 		pci_set_cfgdata32(pi, coff, val);
 		break;
 	}
 }
 
 /*
  * Fetch the value at config-space offset 'coff' of 'pi'. One- and two-byte
  * reads are honoured; any other 'bytes' value results in a 32-bit read.
  */
 static __inline uint32_t
 CFGREAD(struct pci_devinst *pi, int coff, int bytes)
 {
 	switch (bytes) {
 	case 1:
 		return (pci_get_cfgdata8(pi, coff));
 	case 2:
 		return (pci_get_cfgdata16(pi, coff));
 	default:
 		return (pci_get_cfgdata32(pi, coff));
 	}
 }
 
 /*
  * I/O access
  */
 
 /*
  * Complain about a malformed PCI slot option.
  *
  * A slot option is expected to have one of these shapes:
  *
  *  <bus>:<slot>:<func>,<emul>[,<config>]
  *  <slot>[:<func>],<emul>[,<config>]
  *
  *  slot   - 0..31
  *  func   - 0..7
  *  emul   - name of the PCI device model, e.g. virtio-net
  *  config - optional device-specific configuration string
  *
  * For example:
  *     1,virtio-net,tap0
  *     3:0,dummy
  */
 static void
 pci_parse_slot_usage(char *aopt)
 {
 	EPRINTLN("Invalid PCI slot info field \"%s\"", aopt);
 }
 
 /*
  * Helper function to parse a list of comma-separated options where
  * each option is formatted as "name[=value]".  If no value is
  * provided, the option is treated as a boolean and is given a value
  * of true.
  *
  * nvl: config node that receives one entry per option.
  * opt: option string; NULL means "no options" and is not an error.
  *
  * Returns 0 on success and -1 if the working copy of 'opt' cannot be
  * allocated (previously that failure was silently reported as success and
  * the options were dropped).
  */
 int
 pci_parse_legacy_config(nvlist_t *nvl, const char *opt)
 {
 	char *config, *name, *tofree, *value;
 
 	if (opt == NULL)
 		return (0);
 
 	/* strsep() modifies its input, so work on a private copy. */
 	config = tofree = strdup(opt);
 	if (tofree == NULL)
 		return (-1);
 
 	while ((name = strsep(&config, ",")) != NULL) {
 		value = strchr(name, '=');
 		if (value != NULL) {
 			/* Split "name=value" in place. */
 			*value = '\0';
 			value++;
 			set_config_value_node(nvl, name, value);
 		} else
 			set_config_bool_node(nvl, name, true);
 	}
 	free(tofree);
 	return (0);
 }
 
 /*
  * PCI device configuration is stored in MIBs that encode the device's
  * location:
  *
  * pci.<bus>.<slot>.<func>
  *
  * Where "bus", "slot", and "func" are all decimal values without
  * leading zeroes.  Each valid device must have a "device" node which
  * identifies the driver model of the device.
  *
  * Device backends can provide a parser for the "config" string.  If
  * a custom parser is not provided, pci_parse_legacy_config() is used
  * to parse the string.
  *
  * opt: slot option of the form "[<bus>:]<slot>[:<func>],<emul>[,<config>]".
  *
  * Returns the result of the config parser on success and -1 if the option
  * is malformed, names an unknown device model, targets an occupied slot or
  * cannot be copied.
  */
 int
 pci_parse_slot(char *opt)
 {
 	char node_name[sizeof("pci.XXX.XX.X")];
 	struct pci_devemu *pde;
 	char *emul, *config, *str, *cp;
 	int error, bnum, snum, fnum;
 	nvlist_t *nvl;
 
 	error = -1;
 	str = strdup(opt);
 	/* Do not hand a NULL copy to strchr() below. */
 	if (str == NULL)
 		return (error);
 
 	/* Split "<location>,<emul>[,<config>]" in place. */
 	emul = config = NULL;
 	if ((cp = strchr(str, ',')) != NULL) {
 		*cp = '\0';
 		emul = cp + 1;
 		if ((cp = strchr(emul, ',')) != NULL) {
 			*cp = '\0';
 			config = cp + 1;
 		}
 	} else {
 		pci_parse_slot_usage(opt);
 		goto done;
 	}
 
 	/* <bus>:<slot>:<func> */
 	if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) {
 		bnum = 0;
 		/* <slot>:<func> */
 		if (sscanf(str, "%d:%d", &snum, &fnum) != 2) {
 			fnum = 0;
 			/* <slot> */
 			if (sscanf(str, "%d", &snum) != 1) {
 				/* Force the range check below to fail. */
 				snum = -1;
 			}
 		}
 	}
 
 	if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS ||
 	    fnum < 0 || fnum >= MAXFUNCS) {
 		pci_parse_slot_usage(opt);
 		goto done;
 	}
 
 	pde = pci_emul_finddev(emul);
 	if (pde == NULL) {
 		EPRINTLN("pci slot %d:%d:%d: unknown device \"%s\"", bnum, snum,
 		    fnum, emul);
 		goto done;
 	}
 
 	snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bnum, snum,
 	    fnum);
 	nvl = find_config_node(node_name);
 	if (nvl != NULL) {
 		EPRINTLN("pci slot %d:%d:%d already occupied!", bnum, snum,
 		    fnum);
 		goto done;
 	}
 	nvl = create_config_node(node_name);
 	/* Record the alias, when the model has one, as the device name. */
 	if (pde->pe_alias != NULL)
 		set_config_value_node(nvl, "device", pde->pe_alias);
 	else
 		set_config_value_node(nvl, "device", pde->pe_emu);
 
 	if (pde->pe_legacy_config != NULL)
 		error = pde->pe_legacy_config(nvl, config);
 	else
 		error = pci_parse_legacy_config(nvl, config);
 done:
 	free(str);
 	return (error);
 }
 
 /* Print the name of every registered PCI device model, one per line. */
 void
 pci_print_supported_devices()
 {
 	struct pci_devemu **iter;
 
 	SET_FOREACH(iter, pci_devemu_set) {
 		printf("%s\n", (*iter)->pe_emu);
 	}
 }
 
 /*
  * Return 1 when 'offset' lies inside the MSI-X PBA of 'pi', i.e. in
  * [pba_offset, pba_offset + pba_size), and 0 otherwise.
  */
 static int
 pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
 {
 	return (offset >= pi->pi_msix.pba_offset &&
 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size);
 }
 
 /*
  * Handle a guest write of 'size' bytes at byte 'offset' of the MSI-X table.
  *
  * Only naturally aligned 4- and 8-byte writes that hit an existing table
  * entry are accepted. Returns 0 when the value was stored and -1 when the
  * write was rejected.
  */
 int
 pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
 		     uint64_t value)
 {
 	int entry, entry_off;
 	char *target;
 
 	if (size != 4 && size != 8)
 		return (-1);
 
 	entry = offset / MSIX_TABLE_ENTRY_SIZE;
 	entry_off = offset % MSIX_TABLE_ENTRY_SIZE;
 
 	/* Reject entries the device does not have and unaligned writes. */
 	if (entry >= pi->pi_msix.table_count || (entry_off % size) != 0)
 		return (-1);
 
 	target = (char *)&pi->pi_msix.table[entry] + entry_off;
 	if (size == 4) {
 		*((uint32_t *)target) = value;
 	} else {
 		*((uint64_t *)target) = value;
 	}
 
 	return (0);
 }
 
 /*
  * Handle a guest read of 'size' bytes at byte 'offset' of the BAR holding
  * the MSI-X table.
  *
  * Returns the table contents for an access that hits a table entry, 0 for
  * an access inside the PBA, and all-ones for anything else (unsupported
  * size, unaligned access, or an offset outside both regions).
  */
 uint64_t
 pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
 {
 	char *dest;
 	int msix_entry_offset;
 	int tab_index;
 	uint64_t retval = ~0;	/* all-ones unless a valid access is found */
 
 	/*
 	 * The PCI standard only allows 4 and 8 byte accesses to the MSI-X
 	 * table but we also allow 1 byte access to accommodate reads from
 	 * ddb.
 	 */
 	if (size != 1 && size != 4 && size != 8)
 		return (retval);
 
 	msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
 
 	/* support only aligned reads */
 	if ((msix_entry_offset % size) != 0) {
 		return (retval);
 	}
 
 	tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
 
 	if (tab_index < pi->pi_msix.table_count) {
 		/* valid MSI-X Table access */
 		dest = (char *)(pi->pi_msix.table + tab_index);
 		dest += msix_entry_offset;
 
 		if (size == 1)
 			retval = *((uint8_t *)dest);
 		else if (size == 4)
 			retval = *((uint32_t *)dest);
 		else
 			retval = *((uint64_t *)dest);
 	} else if (pci_valid_pba_offset(pi, offset)) {
 		/* return 0 for PBA access */
 		retval = 0;
 	}
 
 	return (retval);
 }
 
 /*
  * Return the index of the BAR that holds the MSI-X table of 'pi', or -1
  * when the device has no MSI-X table.
  */
 int
 pci_msix_table_bar(struct pci_devinst *pi)
 {
 	if (pi->pi_msix.table == NULL)
 		return (-1);
 
 	return (pi->pi_msix.table_bar);
 }
 
 /*
  * Return the index of the BAR that holds the MSI-X PBA of 'pi', or -1 when
  * the device has no MSI-X table.
  */
 int
 pci_msix_pba_bar(struct pci_devinst *pi)
 {
 	if (pi->pi_msix.table == NULL)
 		return (-1);
 
 	return (pi->pi_msix.pba_bar);
 }
 
 /*
  * I/O port handler installed for the I/O BARs of a device ('arg' is the
  * device instance).
  *
  * Looks for the I/O BAR that fully contains [port, port + bytes) and
  * forwards the access to the device model's BAR read or write callback.
  * Returns 0 when a BAR handled the access and -1 when none matched.
  */
 static int
 pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
 		    uint32_t *eax, void *arg)
 {
 	struct pci_devinst *pdi = arg;
 	struct pci_devemu *pe = pdi->pi_d;
 	uint64_t offset;
 	int i;
 
 	for (i = 0; i <= PCI_BARMAX; i++) {
 		if (pdi->pi_bar[i].type != PCIBAR_IO)
 			continue;
 		if (port < pdi->pi_bar[i].addr ||
 		    port + bytes > pdi->pi_bar[i].addr + pdi->pi_bar[i].size)
 			continue;
 
 		offset = port - pdi->pi_bar[i].addr;
 		if (in) {
 			*eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, offset,
 			    bytes);
 		} else {
 			(*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, bytes,
 			    *eax);
 		}
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * MMIO handler installed for the memory BARs of a device.
  *
  * arg1 is the device instance and arg2 the index of the BAR being accessed.
  * The access is translated into an offset within the BAR and forwarded to
  * the device model's BAR read/write callback. 8-byte accesses are split
  * into two 4-byte callback invocations (low half first). Always returns 0.
  */
 static int
 pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
 		     int size, uint64_t *val, void *arg1, long arg2)
 {
 	struct pci_devinst *pdi = arg1;
 	struct pci_devemu *pe = pdi->pi_d;
 	uint64_t offset;
 	int bidx = (int) arg2;
 
 	/* The access must fall entirely inside a 32- or 64-bit memory BAR. */
 	assert(bidx <= PCI_BARMAX);
 	assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 ||
 	       pdi->pi_bar[bidx].type == PCIBAR_MEM64);
 	assert(addr >= pdi->pi_bar[bidx].addr &&
 	       addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);
 
 	offset = addr - pdi->pi_bar[bidx].addr;
 
 	if (dir == MEM_F_WRITE) {
 		if (size == 8) {
 			/* low dword first, then high dword */
 			(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset,
 					   4, *val & 0xffffffff);
 			(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4,
 					   4, *val >> 32);
 		} else {
 			(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset,
 					   size, *val);
 		}
 	} else {
 		if (size == 8) {
 			/* assemble the result from two 4-byte reads */
 			*val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
 						 offset, 4);
 			*val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
 						  offset + 4, 4) << 32;
 		} else {
 			*val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
 						 offset, size);
 		}
 	}
 
 	return (0);
 }
 
 
 /*
  * Carve 'size' bytes (a power of 2) out of the region tracked by '*baseptr'.
  *
  * The allocation is aligned to its own size. On success the start address
  * is stored in '*addr', '*baseptr' is advanced past the allocation and 0 is
  * returned. If the allocation would extend beyond 'limit', nothing is
  * modified and -1 is returned.
  */
 static int
 pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
 			uint64_t *addr)
 {
 	uint64_t start;
 
 	assert((size & (size - 1)) == 0);	/* must be a power of 2 */
 
 	start = roundup2(*baseptr, size);
 	if (start + size > limit)
 		return (-1);
 
 	*addr = start;
 	*baseptr = start + size;
 	return (0);
 }
 
 /*
  * Register (or unregister) the MMIO or I/O region associated with the BAR
  * register 'idx' of an emulated pci device.
  *
  * 'registration' is non-zero to register and zero to unregister. After the
  * handler has been (un)registered, the device model's optional pe_baraddr
  * callback is told about the change. ROM BARs have no handler; only the
  * callback is invoked for them. Any failure trips the assertion at the end.
  */
 static void
 modify_bar_registration(struct pci_devinst *pi, int idx, int registration)
 {
 	struct pci_devemu *pe;
 	int error;
 	struct inout_port iop;
 	struct mem_range mr;
 
 	pe = pi->pi_d;
 	switch (pi->pi_bar[idx].type) {
 	case PCIBAR_IO:
 		bzero(&iop, sizeof(struct inout_port));
 		iop.name = pi->pi_name;
 		iop.port = pi->pi_bar[idx].addr;
 		iop.size = pi->pi_bar[idx].size;
 		if (registration) {
 			iop.flags = IOPORT_F_INOUT;
 			iop.handler = pci_emul_io_handler;
 			iop.arg = pi;
 			error = register_inout(&iop);
 		} else
 			error = unregister_inout(&iop);
 		if (pe->pe_baraddr != NULL)
 			(*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration,
 					  pi->pi_bar[idx].addr);
 		break;
 	case PCIBAR_MEM32:
 	case PCIBAR_MEM64:
 		bzero(&mr, sizeof(struct mem_range));
 		mr.name = pi->pi_name;
 		mr.base = pi->pi_bar[idx].addr;
 		mr.size = pi->pi_bar[idx].size;
 		if (registration) {
 			mr.flags = MEM_F_RW;
 			mr.handler = pci_emul_mem_handler;
 			mr.arg1 = pi;
 			mr.arg2 = idx;	/* lets the handler find the BAR */
 			error = register_mem(&mr);
 		} else
 			error = unregister_mem(&mr);
 		if (pe->pe_baraddr != NULL)
 			(*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration,
 					  pi->pi_bar[idx].addr);
 		break;
+	case PCIBAR_ROM:
+		error = 0;	/* nothing to (un)register for a ROM BAR */
+		if (pe->pe_baraddr != NULL)
+			(*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration,
+			    pi->pi_bar[idx].addr);
+		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	assert(error == 0);
 }
 
 /* Stop decoding the region described by BAR 'idx' of 'pi'. */
 static void
 unregister_bar(struct pci_devinst *pi, int idx)
 {
 	const int registration = 0;
 
 	modify_bar_registration(pi, idx, registration);
 }
 
 /* Start decoding the region described by BAR 'idx' of 'pi'. */
 static void
 register_bar(struct pci_devinst *pi, int idx)
 {
 	const int registration = 1;
 
 	modify_bar_registration(pi, idx, registration);
 }
 
+/*
+ * Report whether the ROM BAR of the emulated pci device has its enable bit
+ * set: 1 if so, 0 otherwise.
+ */
+static int
+romen(struct pci_devinst *pi)
+{
+	const int enabled = (pi->pi_bar[PCI_ROM_IDX].lobits &
+	    PCIM_BIOS_ENABLE) == PCIM_BIOS_ENABLE;
+
+	return (enabled);
+}
+
 /*
  * Non-zero when the command register of the emulated pci device has i/o
  * port decoding enabled.
  */
 static int
 porten(struct pci_devinst *pi)
 {
 	const uint16_t command = pci_get_cfgdata16(pi, PCIR_COMMAND);
 
 	return (command & PCIM_CMD_PORTEN);
 }
 
 /*
  * Non-zero when the command register of the emulated pci device has memory
  * decoding enabled.
  */
 static int
 memen(struct pci_devinst *pi)
 {
 	const uint16_t command = pci_get_cfgdata16(pi, PCIR_COMMAND);
 
 	return (command & PCIM_CMD_MEMEN);
 }
 
 /*
  * Change the MMIO or I/O address decoded by BAR 'idx' of 'pi'.
  *
  * 'type' selects how 'addr' is merged into the stored address: I/O and
  * 32-bit memory BARs take it as the whole address, PCIBAR_MEM64 replaces
  * the low 32 bits and PCIBAR_MEMHI64 the bits above them.
  *
  * While the device has the matching address space decoding enabled, the
  * old range is unregistered before the update and the new one registered
  * afterwards.
  */
 static void
 update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type)
 {
 	const int decode = (pi->pi_bar[idx].type == PCIBAR_IO) ?
 	    porten(pi) : memen(pi);
 
 	if (decode)
 		unregister_bar(pi, idx);
 
 	switch (type) {
 	case PCIBAR_IO:
 	case PCIBAR_MEM32:
 		pi->pi_bar[idx].addr = addr;
 		break;
 	case PCIBAR_MEM64:
 		/* keep the upper half, replace the lower one */
 		pi->pi_bar[idx].addr =
 		    (pi->pi_bar[idx].addr & ~0xffffffffUL) | addr;
 		break;
 	case PCIBAR_MEMHI64:
 		/* keep the lower half, replace the upper one */
 		pi->pi_bar[idx].addr =
 		    (pi->pi_bar[idx].addr & 0xffffffff) | addr;
 		break;
 	default:
 		assert(0);
 	}
 
 	if (decode)
 		register_bar(pi, idx);
 }
 
 /*
  * Request a BAR of the given type and size for device 'pdi'.
  *
  * The size is rounded up to a power of 2 and to the minimum for the BAR
  * type. No address is assigned here: the request is queued on 'pci_bars'
  * (largest first) and the matching enable bit is set in the device's
  * command register. ROM BARs must use index PCI_ROM_IDX. Always returns 0.
  *
  * NOTE(review): the result of malloc() is used without a NULL check.
  */
 int
 pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
     uint64_t size)
 {
-	assert(idx >= 0 && idx <= PCI_BARMAX);
+	assert((type == PCIBAR_ROM) || (idx >= 0 && idx <= PCI_BARMAX));
+	assert((type != PCIBAR_ROM) || (idx == PCI_ROM_IDX));
 
 	if ((size & (size - 1)) != 0)
 		size = 1UL << flsl(size);	/* round up to a power of 2 */
 
 	/* Enforce minimum BAR sizes required by the PCI standard */
 	if (type == PCIBAR_IO) {
 		if (size < 4)
 			size = 4;
+	} else if (type == PCIBAR_ROM) {
+		if (size < ~PCIM_BIOS_ADDR_MASK + 1)
+			size = ~PCIM_BIOS_ADDR_MASK + 1;
 	} else {
 		if (size < 16)
 			size = 16;
 	}
 
 	/*
 	 * To reduce fragmentation of the MMIO space, we allocate the BARs by
 	 * size. Therefore, don't allocate the BAR yet. We create a list of all
 	 * BAR allocation which is sorted by BAR size. When all PCI devices are
 	 * initialized, we will assign an address to the BARs.
 	 */
 
 	/* create a new list entry */
 	struct pci_bar_allocation *const new_bar = malloc(sizeof(*new_bar));
 	memset(new_bar, 0, sizeof(*new_bar));
 	new_bar->pdi = pdi;
 	new_bar->idx = idx;
 	new_bar->type = type;
 	new_bar->size = size;
 
 	/*
 	 * Search for a BAR which size is lower than the size of our newly
 	 * allocated BAR.
 	 */
 	struct pci_bar_allocation *bar = NULL;
 	TAILQ_FOREACH(bar, &pci_bars, chain) {
 		if (bar->size < size) {
 			break;
 		}
 	}
 
 	if (bar == NULL) {
 		/*
 		 * Either the list is empty or new BAR is the smallest BAR of
 		 * the list. Append it to the end of our list.
 		 */
 		TAILQ_INSERT_TAIL(&pci_bars, new_bar, chain);
 	} else {
 		/*
 		 * The found BAR is smaller than our new BAR. For that reason,
 		 * insert our new BAR before the found BAR.
 		 */
 		TAILQ_INSERT_BEFORE(bar, new_bar, chain);
 	}
 
 	/*
 	 * pci_passthru devices synchronize their physical and virtual command
 	 * register on init. For that reason, the virtual cmd reg should be
 	 * updated as early as possible.
 	 */
 	uint16_t enbit = 0;
 	switch (type) {
 	case PCIBAR_IO:
 		enbit = PCIM_CMD_PORTEN;
 		break;
 	case PCIBAR_MEM64:
 	case PCIBAR_MEM32:
 		enbit = PCIM_CMD_MEMEN;
 		break;
 	default:
 		/* other types (e.g. ROM) do not touch the command register */
 		enbit = 0;
 		break;
 	}
 
 	const uint16_t cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND);
 	pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit);
 
 	return (0);
 }
 
 /*
  * Assign an address to BAR 'idx' of 'pdi' and initialize its config-space
  * register.
  *
  * I/O and memory BARs get an address from the matching allocation region
  * and are registered for decoding. ROM BARs get no address here (address 0
  * is recorded) and are not registered.
  *
  * Returns 0 on success or the error from pci_emul_alloc_resource() when
  * the region is exhausted.
  */
 static int
 pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx,
     const enum pcibar_type type, const uint64_t size)
 {
 	int error;
 	uint64_t *baseptr, limit, addr, mask, lobits, bar;
 
 	switch (type) {
 	case PCIBAR_NONE:
 		baseptr = NULL;
 		addr = mask = lobits = 0;
 		break;
 	case PCIBAR_IO:
 		baseptr = &pci_emul_iobase;
 		limit = PCI_EMUL_IOLIMIT;
 		mask = PCIM_BAR_IO_BASE;
 		lobits = PCIM_BAR_IO_SPACE;
 		break;
 	case PCIBAR_MEM64:
 		/*
 		 * XXX
 		 * Some drivers do not work well if the 64-bit BAR is allocated
 		 * above 4GB. Allow for this by allocating small requests under
 		 * 4GB unless then allocation size is larger than some arbitrary
 		 * number (128MB currently).
 		 */
 		if (size > 128 * 1024 * 1024) {
 			baseptr = &pci_emul_membase64;
 			limit = pci_emul_memlim64;
 			mask = PCIM_BAR_MEM_BASE;
 			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
 				 PCIM_BAR_MEM_PREFETCH;
 		} else {
 			baseptr = &pci_emul_membase32;
 			limit = PCI_EMUL_MEMLIMIT32;
 			mask = PCIM_BAR_MEM_BASE;
 			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
 		}
 		break;
 	case PCIBAR_MEM32:
 		baseptr = &pci_emul_membase32;
 		limit = PCI_EMUL_MEMLIMIT32;
 		mask = PCIM_BAR_MEM_BASE;
 		lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
 		break;
+	case PCIBAR_ROM:
+		/* do not claim memory for ROM. OVMF will do it for us. */
+		baseptr = NULL;
+		limit = 0;
+		/*
+		 * No allocation happens below when baseptr is NULL, so give
+		 * addr a defined value instead of reading it uninitialized.
+		 */
+		addr = 0;
+		mask = PCIM_BIOS_ADDR_MASK;
+		lobits = 0;
+		break;
 	default:
 		printf("pci_emul_alloc_base: invalid bar type %d\n", type);
 		assert(0);
 	}
 
 	if (baseptr != NULL) {
 		error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
 		if (error != 0)
 			return (error);
 	}
 
 	pdi->pi_bar[idx].type = type;
 	pdi->pi_bar[idx].addr = addr;
 	pdi->pi_bar[idx].size = size;
 	/*
 	 * passthru devices are using same lobits as physical device they set
 	 * this property
 	 */
 	if (pdi->pi_bar[idx].lobits != 0) {
 		lobits = pdi->pi_bar[idx].lobits;
 	} else {
 		pdi->pi_bar[idx].lobits = lobits;
 	}
 
 	/* Initialize the BAR register in config space */
 	bar = (addr & mask) | lobits;
 	pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
 
 	if (type == PCIBAR_MEM64) {
 		/* a 64-bit BAR also occupies the next BAR register */
 		assert(idx + 1 <= PCI_BARMAX);
 		pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
 		pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
 	}
 
-	register_bar(pdi, idx);
+	if (type != PCIBAR_ROM) {
+		register_bar(pdi, idx);
+	}
+
+	return (0);
+}
+
+/*
+ * Reserve space for a ROM image of 'size' bytes in the shared ROM segment
+ * and request a ROM BAR for 'pdi'.
+ *
+ * The ROM segment is created on the first successful call. The reserved
+ * size is 'size' rounded up to a power of 2, with a minimum of
+ * ~PCIM_BIOS_ADDR_MASK + 1 bytes. On success '*addr' points at the reserved
+ * space inside the segment, the offset is saved in pdi->pi_romoffset and 0
+ * is returned. Returns -1 if the segment cannot be created or is full, or
+ * the error from pci_emul_alloc_bar().
+ */
+int
+pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size,
+    void **const addr)
+{
+	/* allocate ROM space once on first call */
+	if (pci_emul_rombase == NULL) {
+		/*
+		 * Only publish the mapping once it is known to be valid, so
+		 * that a failed attempt is retried on the next call instead
+		 * of leaving MAP_FAILED behind as the segment base.
+		 */
+		uint8_t *const rombase = vm_create_devmem(pdi->pi_vmctx,
+		    VM_PCIROM, "pcirom", PCI_EMUL_ROMSIZE);
+		if (rombase == MAP_FAILED) {
+			warnx("%s: failed to create rom segment", __func__);
+			return (-1);
+		}
+		pci_emul_rombase = rombase;
+		pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE;
+		pci_emul_romoffset = 0;
+	}
+
+	/*
+	 * ROM size should be a power of 2 and at least the minimum ROM BAR
+	 * size. Sizes that already are a power of 2 are kept as they are,
+	 * matching the rounding done by pci_emul_alloc_bar().
+	 */
+	uint64_t rom_size = size;
+	if ((rom_size & (rom_size - 1)) != 0)
+		rom_size = 1UL << flsl(rom_size);
+	rom_size = MAX(rom_size, ~PCIM_BIOS_ADDR_MASK + 1);
+
+	/* check if ROM fits into ROM space */
+	if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) {
+		warnx("%s: no space left in rom segment:", __func__);
+		warnx("%16lu bytes left",
+		    PCI_EMUL_ROMSIZE - pci_emul_romoffset);
+		warnx("%16lu bytes required by %d/%d/%d", rom_size, pdi->pi_bus,
+		    pdi->pi_slot, pdi->pi_func);
+		return (-1);
+	}
+
+	/* allocate ROM BAR */
+	const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM,
+	    rom_size);
+	if (error)
+		return (error);
+
+	/* return address */
+	*addr = pci_emul_rombase + pci_emul_romoffset;
+
+	/* save offset into ROM Space */
+	pdi->pi_romoffset = pci_emul_romoffset;
+
+	/* increase offset for next ROM */
+	pci_emul_romoffset += rom_size;
 
 	return (0);
 }
 
 /* Config-space offset at which the first capability is placed. */
 #define	CAP_START_OFFSET	0x40
 /*
  * Append a capability of 'caplen' bytes ('capdata') to the capability list
  * of 'pi'.
  *
  * The capability is placed directly after the previous one (or at
  * CAP_START_OFFSET for the first) and occupies 'caplen' rounded up to a
  * multiple of 4 bytes. Returns 0 on success and -1 when it does not fit
  * below PCI_REGMAX + 1.
  */
 static int
 pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
 {
 	int i, capoff, reallen;
 	uint16_t sts;
 
 	assert(caplen > 0);
 
 	reallen = roundup2(caplen, 4);		/* dword aligned */
 
 	sts = pci_get_cfgdata16(pi, PCIR_STATUS);
 	if ((sts & PCIM_STATUS_CAPPRESENT) == 0)
 		capoff = CAP_START_OFFSET;
 	else
 		capoff = pi->pi_capend + 1;
 
 	/* Check if we have enough space */
 	if (capoff + reallen > PCI_REGMAX + 1)
 		return (-1);
 
 	/* Set the previous capability pointer */
 	if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
 		/* first capability: point the list head at it */
 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
 		pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
 	} else
 		pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff);
 
 	/* Copy the capability */
 	for (i = 0; i < caplen; i++)
 		pci_set_cfgdata8(pi, capoff + i, capdata[i]);
 
 	/* Set the next capability pointer */
 	pci_set_cfgdata8(pi, capoff + 1, 0);
 
 	/* remember where this capability starts and ends for the next call */
 	pi->pi_prevcap = capoff;
 	pi->pi_capend = capoff + reallen - 1;
 	return (0);
 }
 
 /*
  * Look up the registered device model whose pe_emu name equals 'name'.
  * Returns NULL when no such model exists.
  */
 static struct pci_devemu *
 pci_emul_finddev(const char *name)
 {
 	struct pci_devemu **iter;
 
 	SET_FOREACH(iter, pci_devemu_set) {
 		if (strcmp((*iter)->pe_emu, name) == 0)
 			return (*iter);
 	}
 
 	return (NULL);
 }
 
 /*
  * Create the device instance for model 'pde' at bus/slot/func and run the
  * model's pe_init callback with the function's config node.
  *
  * On success the instance is stored in fi->fi_devi; on failure it is freed.
  * Returns the value returned by pe_init.
  */
 static int
 pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot,
     int func, struct funcinfo *fi)
 {
 	struct pci_devinst *inst;
 	int rc;
 
 	inst = calloc(1, sizeof(struct pci_devinst));
 
 	/* identity of the instance */
 	inst->pi_vmctx = ctx;
 	inst->pi_d = pde;
 	inst->pi_bus = bus;
 	inst->pi_slot = slot;
 	inst->pi_func = func;
 	snprintf(inst->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
 
 	/* legacy interrupt state */
 	pthread_mutex_init(&inst->pi_lintr.lock, NULL);
 	inst->pi_lintr.pin = 0;
 	inst->pi_lintr.state = IDLE;
 	inst->pi_lintr.pirq_pin = 0;
 	inst->pi_lintr.ioapic_irq = 0;
 
 	/* Disable legacy interrupts */
 	pci_set_cfgdata8(inst, PCIR_INTLINE, 255);
 	pci_set_cfgdata8(inst, PCIR_INTPIN, 0);
 
 	pci_set_cfgdata8(inst, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN);
 
 	rc = (*pde->pe_init)(ctx, inst, fi->fi_config);
 	if (rc != 0) {
 		free(inst);
 		return (rc);
 	}
 
 	fi->fi_devi = inst;
 	return (rc);
 }
 
 /*
  * Fill in 'msicap' as an MSI capability for 'msgnum' messages whose next
  * pointer is 'nextptr'. The message count is stored as its base-2
  * logarithm, shifted left by one, in the message control field together
  * with the 64-bit flag.
  */
 void
 pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
 {
 	int log2_msgs;
 
 	/* Number of msi messages must be a power of 2 between 1 and 32 */
 	assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
 	log2_msgs = ffs(msgnum) - 1;
 
 	memset(msicap, 0, sizeof(struct msicap));
 	msicap->capid = PCIY_MSI;
 	msicap->nextptr = nextptr;
 	msicap->msgctrl = PCIM_MSICTRL_64BIT | (log2_msgs << 1);
 }
 
 int
 pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
 {
 	struct msicap msicap;
 
 	pci_populate_msicap(&msicap, msgnum, 0);
 
 	return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
 }
 
 /*
  * Fill in 'msixcap' as an MSI-X capability with 'msgnum' table entries.
  *
  * Both the table and the PBA live in BAR 'barnum': the table at offset 0
  * and the PBA at offset 'msix_tab_size', which must be a multiple of 4096.
  */
 static void
 pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum,
 		     uint32_t msix_tab_size)
 {
 	const uint32_t bir = barnum & PCIM_MSIX_BIR_MASK;
 
 	assert(msix_tab_size % 4096 == 0);
 
 	memset(msixcap, 0, sizeof(struct msixcap));
 	msixcap->capid = PCIY_MSIX;
 
 	/*
 	 * Message Control: everything is zero except the Table Size field,
 	 * which encodes N entries as N-1.
 	 */
 	msixcap->msgctrl = msgnum - 1;
 
 	/* table at offset 0, PBA right after the (4K aligned) table */
 	msixcap->table_info = bir;
 	msixcap->pba_info = msix_tab_size | bir;
 }
 
 /*
  * Allocate a zeroed MSI-X table with 'table_entries' entries for 'pi' and
  * set the mask bit in the vector control word of every entry.
  */
 static void
 pci_msix_table_init(struct pci_devinst *pi, int table_entries)
 {
 	int bytes, entry;
 
 	assert(table_entries > 0);
 	assert(table_entries <= MAX_MSIX_TABLE_ENTRIES);
 
 	bytes = table_entries * MSIX_TABLE_ENTRY_SIZE;
 	pi->pi_msix.table = calloc(1, bytes);
 
 	/* every vector starts out masked */
 	for (entry = 0; entry < table_entries; entry++) {
 		pi->pi_msix.table[entry].vector_control |=
 		    PCIM_MSIX_VCTRL_MASK;
 	}
 }
 
 /*
  * Add an MSI-X capability with 'msgnum' vectors to 'pi', placing both the
  * MSI-X table and the PBA in BAR 'barnum'.
  *
  * Records the layout in pi->pi_msix, allocates the table, requests a 32-bit
  * memory BAR large enough for table plus PBA and appends the capability.
  * Returns the result of pci_emul_add_capability().
  */
 int
 pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum)
 {
 	uint32_t tab_size;
 	struct msixcap msixcap;
 
 	assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES);
 	assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0);
 
 	tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE;
 
 	/* Align table size to nearest 4K */
 	tab_size = roundup2(tab_size, 4096);
 
 	pi->pi_msix.table_bar = barnum;
 	pi->pi_msix.pba_bar   = barnum;
 	pi->pi_msix.table_offset = 0;
 	pi->pi_msix.table_count = msgnum;
 	pi->pi_msix.pba_offset = tab_size;	/* PBA follows the table */
 	pi->pi_msix.pba_size = PBA_SIZE(msgnum);
 
 	pci_msix_table_init(pi, msgnum);
 
 	pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size);
 
 	/* allocate memory for MSI-X Table and PBA */
 	pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32,
 				tab_size + pi->pi_msix.pba_size);
 
 	return (pci_emul_add_capability(pi, (u_char *)&msixcap,
 					sizeof(msixcap)));
 }
 
 /*
  * Handle a config-space write that targets the MSI-X capability located at
  * 'capoff'.
  *
  * A 2-byte write to the Message Control register only changes the enable
  * and function-mask bits; the cached pi_msix state is refreshed and the
  * legacy interrupt state updated. All other writes are stored unmodified.
  */
 static void
 msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
 		 int bytes, uint32_t val)
 {
 	uint16_t msgctrl, rwmask;
 	int off;
 
 	off = offset - capoff;
 	/* Message Control Register */
 	if (off == 2 && bytes == 2) {
 		rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
 		/* merge the writable bits into the current register value */
 		msgctrl = pci_get_cfgdata16(pi, offset);
 		msgctrl &= ~rwmask;
 		msgctrl |= val & rwmask;
 		val = msgctrl;
 
 		pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
 		pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK;
 		pci_lintr_update(pi);
 	}
 
 	CFGWRITE(pi, offset, val, bytes);
 }
 
 /*
  * Handle a config-space write that targets the MSI capability located at
  * 'capoff'.
  *
  * The write is stored (with read-only message control bits preserved) and
  * the cached pi_msi state is then rebuilt from the capability registers.
  */
 static void
 msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
 		int bytes, uint32_t val)
 {
 	uint16_t msgctrl, rwmask, msgdata, mme;
 	uint32_t addrlo;
 
 	/*
 	 * If guest is writing to the message control register make sure
 	 * we do not overwrite read-only fields.
 	 */
 	if ((offset - capoff) == 2 && bytes == 2) {
 		rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
 		msgctrl = pci_get_cfgdata16(pi, offset);
 		msgctrl &= ~rwmask;
 		msgctrl |= val & rwmask;
 		val = msgctrl;
 	}
 	CFGWRITE(pi, offset, val, bytes);
 
 	/* re-read the capability to refresh the cached MSI state */
 	msgctrl = pci_get_cfgdata16(pi, capoff + 2);
 	addrlo = pci_get_cfgdata32(pi, capoff + 4);
 	/* the message data register sits at a different offset with 64-bit addressing */
 	if (msgctrl & PCIM_MSICTRL_64BIT)
 		msgdata = pci_get_cfgdata16(pi, capoff + 12);
 	else
 		msgdata = pci_get_cfgdata16(pi, capoff + 8);
 
 	mme = msgctrl & PCIM_MSICTRL_MME_MASK;
 	pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
 	if (pi->pi_msi.enabled) {
 		pi->pi_msi.addr = addrlo;
 		pi->pi_msi.msg_data = msgdata;
 		/* the masked MME field, shifted down by 4, is the log2 of the count */
 		pi->pi_msi.maxmsgnum = 1 << (mme >> 4);
 	} else {
 		pi->pi_msi.maxmsgnum = 0;
 	}
 	pci_lintr_update(pi);
 }
 
 /*
  * Handle a config-space write that targets the PCI Express capability.
  * The value is stored as-is; 'capoff' is unused.
  */
 void
 pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
 		 int bytes, uint32_t val)
 {
 	/* XXX read-only parts of the capability are not protected yet */
 	CFGWRITE(pi, offset, val, bytes);
 }
 
 #define	PCIECAP_VERSION	0x2
 int
 pci_emul_add_pciecap(struct pci_devinst *pi, int type)
 {
 	int err;
 	struct pciecap pciecap;
 
 	bzero(&pciecap, sizeof(pciecap));
 
 	/*
 	 * Use the integrated endpoint type for endpoints on a root complex bus.
 	 *
 	 * NB: bhyve currently only supports a single PCI bus that is the root
 	 * complex bus, so all endpoints are integrated.
 	 */
 	if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0))
 		type = PCIEM_TYPE_ROOT_INT_EP;
 
 	pciecap.capid = PCIY_EXPRESS;
 	pciecap.pcie_capabilities = PCIECAP_VERSION | type;
 	if (type != PCIEM_TYPE_ROOT_INT_EP) {
 		pciecap.link_capabilities = 0x411;	/* gen1, x1 */
 		pciecap.link_status = 0x11;		/* gen1, x1 */
 	}
 
 	err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap));
 	return (err);
 }
 
 /*
  * This function assumes that 'coff' is in the capabilities region of the
  * config space. A capoff parameter of zero will force a search for the
  * offset and type.
  *
  * Dispatches a config-space write of 'bytes' bytes at 'offset' to the
  * handler of the capability that contains it (MSI, MSI-X or PCI Express).
  * Unaligned writes, writes to the read-only capability header and writes
  * to other capability types are ignored.
  */
 void
 pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val,
     uint8_t capoff, int capid)
 {
 	uint8_t nextoff;
 
 	/* Do not allow un-aligned writes */
 	if ((offset & (bytes - 1)) != 0)
 		return;
 
 	if (capoff == 0) {
 		/* Find the capability that we want to update */
 		capoff = CAP_START_OFFSET;
 		while (1) {
 			/* stop at the last capability or the one spanning 'offset' */
 			nextoff = pci_get_cfgdata8(pi, capoff + 1);
 			if (nextoff == 0)
 				break;
 			if (offset >= capoff && offset < nextoff)
 				break;
 
 			capoff = nextoff;
 		}
 		assert(offset >= capoff);
 		capid = pci_get_cfgdata8(pi, capoff);
 	}
 
 	/*
 	 * Capability ID and Next Capability Pointer are readonly.
 	 * However, some o/s's do 4-byte writes that include these.
 	 * For this case, trim the write back to 2 bytes and adjust
 	 * the data.
 	 */
 	if (offset == capoff || offset == capoff + 1) {
 		if (offset == capoff && bytes == 4) {
 			bytes = 2;
 			offset += 2;
 			val >>= 16;
 		} else
 			return;
 	}
 
 	switch (capid) {
 	case PCIY_MSI:
 		msicap_cfgwrite(pi, capoff, offset, bytes, val);
 		break;
 	case PCIY_MSIX:
 		msixcap_cfgwrite(pi, capoff, offset, bytes, val);
 		break;
 	case PCIY_EXPRESS:
 		pciecap_cfgwrite(pi, capoff, offset, bytes, val);
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Return 1 when config-space 'offset' lies inside the capability area of
  * 'pi' (CAP_START_OFFSET up to and including pi_capend) and the device
  * advertises a capability list; return 0 otherwise.
  */
 static int
 pci_emul_iscap(struct pci_devinst *pi, int offset)
 {
 	const uint16_t status = pci_get_cfgdata16(pi, PCIR_STATUS);
 
 	if ((status & PCIM_STATUS_CAPPRESENT) == 0)
 		return (0);
 
 	if (offset < CAP_START_OFFSET || offset > pi->pi_capend)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Memory handler that discards writes and answers every read with
  * all-ones. Always returns 0.
  */
 static int
 pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
 			  int size, uint64_t *val, void *arg1, long arg2)
 {
 	/* Writes are ignored. */
 	if (dir != MEM_F_READ)
 		return (0);
 
 	/*
 	 * Reads see 0xff in every byte; the mem read code takes care of
 	 * truncating to the correct size.
 	 */
 	*val = 0xffffffffffffffff;
 	return (0);
 }
 
 static int
 pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
     int bytes, uint64_t *val, void *arg1, long arg2)
 {
 	int is_read, bus, slot, func, reg;
 
 	/* Decode the address bits into bus/slot/function/register. */
 	bus = (addr >> 20) & 0xff;
 	slot = (addr >> 15) & 0x1f;
 	func = (addr >> 12) & 0x7;
 	reg = addr & 0xfff;
 
 	is_read = (dir == MEM_F_READ);
 	if (is_read) {
 		/* Preset all ones before pci_cfgrw() fills in the result. */
 		*val = ~0UL;
 	}
 	pci_cfgrw(ctx, vcpu, is_read, bus, slot, func, reg, bytes,
 	    (uint32_t *)val);
 	return (0);
 }
 
 /*
  * Return the guest physical base address of the PCI extended config window.
  */
 uint64_t
 pci_ecfg_base(void)
 {
 	const uint64_t base = PCI_EMUL_ECFG_BASE;
 
 	return (base);
 }
 
 /*
  * Per-bus slop and alignment used by init_pci(): each value is first added
  * to the corresponding allocation base and the base is then rounded up to a
  * multiple of it before being recorded as the bus limit.
  */
 #define	BUSIO_ROUNDUP		32
 #define	BUSMEM32_ROUNDUP	(1024 * 1024)
 #define	BUSMEM64_ROUNDUP	(512 * 1024 * 1024)
 
 /*
  * Instantiate every PCI device named in the configuration tree, assign
  * their BARs, route INTx interrupts and register the fallback and extended
  * config space memory handlers.
  *
  * Returns 0 on success.  Returns EINVAL when a configured slot has a
  * missing, unknown or legacy "device" value, ENOMEM when the per-bus
  * bookkeeping cannot be allocated, or the error from pci_emul_init().
  */
 int
 init_pci(struct vmctx *ctx)
 {
 	char node_name[sizeof("pci.XXX.XX.X")];
 	struct mem_range mr;
 	struct pci_devemu *pde;
 	struct businfo *bi;
 	struct slotinfo *si;
 	struct funcinfo *fi;
 	nvlist_t *nvl;
 	const char *emul;
 	size_t lowmem;
 	int bus, slot, func;
 	int error;
 
 	if (vm_get_lowmem_limit(ctx) > PCI_EMUL_MEMBASE32)
 		errx(EX_OSERR, "Invalid lowmem limit");
 
 	pci_emul_iobase = PCI_EMUL_IOBASE;
 	pci_emul_membase32 = PCI_EMUL_MEMBASE32;
 
 	/* The 64-bit window starts above high memory, aligned to its size. */
 	pci_emul_membase64 = 4*GB + vm_get_highmem_size(ctx);
 	pci_emul_membase64 = roundup2(pci_emul_membase64, PCI_EMUL_MEMSIZE64);
 	pci_emul_memlim64 = pci_emul_membase64 + PCI_EMUL_MEMSIZE64;
 
 	for (bus = 0; bus < MAXBUSES; bus++) {
 		snprintf(node_name, sizeof(node_name), "pci.%d", bus);
 		nvl = find_config_node(node_name);
 		if (nvl == NULL)
 			continue;
 		pci_businfo[bus] = calloc(1, sizeof(struct businfo));
 		bi = pci_businfo[bus];
 		if (bi == NULL) {
 			/* Do not dereference a failed allocation below. */
 			EPRINTLN("pci bus %d: out of memory", bus);
 			return (ENOMEM);
 		}
 
 		/*
 		 * Keep track of the i/o and memory resources allocated to
 		 * this bus.
 		 */
 		bi->iobase = pci_emul_iobase;
 		bi->membase32 = pci_emul_membase32;
 		bi->membase64 = pci_emul_membase64;
 
 		/* first run: init devices */
 		for (slot = 0; slot < MAXSLOTS; slot++) {
 			si = &bi->slotinfo[slot];
 			for (func = 0; func < MAXFUNCS; func++) {
 				fi = &si->si_funcs[func];
 				snprintf(node_name, sizeof(node_name),
 				    "pci.%d.%d.%d", bus, slot, func);
 				nvl = find_config_node(node_name);
 				if (nvl == NULL)
 					continue;
 
 				fi->fi_config = nvl;
 				emul = get_config_value_node(nvl, "device");
 				if (emul == NULL) {
 					EPRINTLN("pci slot %d:%d:%d: missing "
 					    "\"device\" value", bus, slot, func);
 					return (EINVAL);
 				}
 				pde = pci_emul_finddev(emul);
 				if (pde == NULL) {
 					EPRINTLN("pci slot %d:%d:%d: unknown "
 					    "device \"%s\"", bus, slot, func,
 					    emul);
 					return (EINVAL);
 				}
 				if (pde->pe_alias != NULL) {
 					EPRINTLN("pci slot %d:%d:%d: legacy "
 					    "device \"%s\", use \"%s\" instead",
 					    bus, slot, func, emul,
 					    pde->pe_alias);
 					return (EINVAL);
 				}
 				fi->fi_pde = pde;
 				error = pci_emul_init(ctx, pde, bus, slot,
 				    func, fi);
 				if (error)
 					return (error);
 			}
 		}
 
 		/* second run: assign BARs and free list */
 		struct pci_bar_allocation *bar;
 		struct pci_bar_allocation *bar_tmp;
 		TAILQ_FOREACH_SAFE(bar, &pci_bars, chain, bar_tmp) {
 			pci_emul_assign_bar(bar->pdi, bar->idx, bar->type,
 			    bar->size);
 			free(bar);
 		}
 		TAILQ_INIT(&pci_bars);
 
 		/*
 		 * Add some slop to the I/O and memory resources decoded by
 		 * this bus to give a guest some flexibility if it wants to
 		 * reprogram the BARs.
 		 */
 		pci_emul_iobase += BUSIO_ROUNDUP;
 		pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP);
 		bi->iolimit = pci_emul_iobase;
 
 		pci_emul_membase32 += BUSMEM32_ROUNDUP;
 		pci_emul_membase32 = roundup2(pci_emul_membase32,
 		    BUSMEM32_ROUNDUP);
 		bi->memlimit32 = pci_emul_membase32;
 
 		pci_emul_membase64 += BUSMEM64_ROUNDUP;
 		pci_emul_membase64 = roundup2(pci_emul_membase64,
 		    BUSMEM64_ROUNDUP);
 		bi->memlimit64 = pci_emul_membase64;
 	}
 
 	/*
 	 * PCI backends are initialized before routing INTx interrupts
 	 * so that LPC devices are able to reserve ISA IRQs before
 	 * routing PIRQ pins.
 	 */
 	for (bus = 0; bus < MAXBUSES; bus++) {
 		if ((bi = pci_businfo[bus]) == NULL)
 			continue;
 
 		for (slot = 0; slot < MAXSLOTS; slot++) {
 			si = &bi->slotinfo[slot];
 			for (func = 0; func < MAXFUNCS; func++) {
 				fi = &si->si_funcs[func];
 				if (fi->fi_devi == NULL)
 					continue;
 				pci_lintr_route(fi->fi_devi);
 			}
 		}
 	}
 	lpc_pirq_routed();
 
 	/*
 	 * The guest physical memory map looks like the following:
 	 * [0,		    lowmem)		guest system memory
 	 * [lowmem,	    0xC0000000)		memory hole (may be absent)
 	 * [0xC0000000,     0xE0000000)		PCI hole (32-bit BAR allocation)
 	 * [0xE0000000,	    0xF0000000)		PCI extended config window
 	 * [0xF0000000,	    4GB)		LAPIC, IOAPIC, HPET, firmware
 	 * [4GB,	    4GB + highmem)
 	 */
 
 	/*
 	 * Accesses to memory addresses that are not allocated to system
 	 * memory or PCI devices return 0xff's.
 	 */
 	lowmem = vm_get_lowmem_size(ctx);
 	bzero(&mr, sizeof(struct mem_range));
 	mr.name = "PCI hole";
 	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
 	mr.base = lowmem;
 	mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
 	mr.handler = pci_emul_fallback_handler;
 	error = register_mem_fallback(&mr);
 	assert(error == 0);
 
 	/* PCI extended config space */
 	bzero(&mr, sizeof(struct mem_range));
 	mr.name = "PCI ECFG";
 	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
 	mr.base = PCI_EMUL_ECFG_BASE;
 	mr.size = PCI_EMUL_ECFG_SIZE;
 	mr.handler = pci_emul_ecfg_handler;
 	error = register_mem(&mr);
 	assert(error == 0);
 
 	return (0);
 }
 
 /*
  * pci_walk_lintr() callback: emit one _PRT package that maps a slot's
  * interrupt pin directly to an I/O APIC interrupt number.
  */
 static void
 pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
     void *arg)
 {
 	const int address = slot << 16 | 0xffff;
 	const int intpin = pin - 1;
 
 	dsdt_line("  Package ()");
 	dsdt_line("  {");
 	dsdt_line("    0x%X,", address);
 	dsdt_line("    0x%02X,", intpin);
 	dsdt_line("    Zero,");
 	dsdt_line("    0x%X", ioapic_irq);
 	dsdt_line("  },");
 }
 
 /*
  * pci_walk_lintr() callback: emit one _PRT package that maps a slot's
  * interrupt pin to the named PIRQ link device.  Nothing is emitted when
  * lpc_pirq_name() returns NULL for the pin.
  */
 static void
 pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
     void *arg)
 {
 	char *link;
 
 	link = lpc_pirq_name(pirq_pin);
 	if (link != NULL) {
 		dsdt_line("  Package ()");
 		dsdt_line("  {");
 		dsdt_line("    0x%X,", slot << 16 | 0xffff);
 		dsdt_line("    0x%02X,", pin - 1);
 		dsdt_line("    %s,", link);
 		dsdt_line("    0x00");
 		dsdt_line("  },");
 		/* The name string is released here once it has been used. */
 		free(link);
 	}
 }
 
 /*
  * A bhyve virtual machine has a flat PCI hierarchy with a root port
  * corresponding to each PCI bus.
  *
  * Emit the DSDT "Device (PCxx)" node for 'bus': its _HID, _BBN and _CRS
  * (bus number, I/O and memory windows), the interrupt routing tables when
  * any INTx pin is in use, and finally whatever each device on the bus adds
  * through its pe_write_dsdt callback.  Nothing is written for a bus other
  * than 0 that has no businfo.
  */
 static void
 pci_bus_write_dsdt(int bus)
 {
 	struct businfo *bi;
 	struct slotinfo *si;
 	struct pci_devinst *pi;
 	int count, func, slot;
 
 	/*
 	 * If there are no devices on this 'bus' then just return.
 	 */
 	if ((bi = pci_businfo[bus]) == NULL) {
 		/*
 		 * Bus 0 is special because it decodes the I/O ports used
 		 * for PCI config space access even if there are no devices
 		 * on it.
 		 */
 		if (bus != 0)
 			return;
 	}
 
 	dsdt_line("  Device (PC%02X)", bus);
 	dsdt_line("  {");
 	dsdt_line("    Name (_HID, EisaId (\"PNP0A03\"))");
 
 	dsdt_line("    Method (_BBN, 0, NotSerialized)");
 	dsdt_line("    {");
 	dsdt_line("        Return (0x%08X)", bus);
 	dsdt_line("    }");
 	dsdt_line("    Name (_CRS, ResourceTemplate ()");
 	dsdt_line("    {");
 	dsdt_line("      WordBusNumber (ResourceProducer, MinFixed, "
 	    "MaxFixed, PosDecode,");
 	dsdt_line("        0x0000,             // Granularity");
 	dsdt_line("        0x%04X,             // Range Minimum", bus);
 	dsdt_line("        0x%04X,             // Range Maximum", bus);
 	dsdt_line("        0x0000,             // Translation Offset");
 	dsdt_line("        0x0001,             // Length");
 	dsdt_line("        ,, )");
 
 	if (bus == 0) {
 		/* Config address/data ports, then the I/O space around them. */
 		dsdt_indent(3);
 		dsdt_fixed_ioport(0xCF8, 8);
 		dsdt_unindent(3);
 
 		dsdt_line("      WordIO (ResourceProducer, MinFixed, MaxFixed, "
 		    "PosDecode, EntireRange,");
 		dsdt_line("        0x0000,             // Granularity");
 		dsdt_line("        0x0000,             // Range Minimum");
 		dsdt_line("        0x0CF7,             // Range Maximum");
 		dsdt_line("        0x0000,             // Translation Offset");
 		dsdt_line("        0x0CF8,             // Length");
 		dsdt_line("        ,, , TypeStatic)");
 
 		dsdt_line("      WordIO (ResourceProducer, MinFixed, MaxFixed, "
 		    "PosDecode, EntireRange,");
 		dsdt_line("        0x0000,             // Granularity");
 		dsdt_line("        0x0D00,             // Range Minimum");
 		dsdt_line("        0x%04X,             // Range Maximum",
 		    PCI_EMUL_IOBASE - 1);
 		dsdt_line("        0x0000,             // Translation Offset");
 		dsdt_line("        0x%04X,             // Length",
 		    PCI_EMUL_IOBASE - 0x0D00);
 		dsdt_line("        ,, , TypeStatic)");
 
 		/* An empty bus 0 has no windows or devices: close and finish. */
 		if (bi == NULL) {
 			dsdt_line("    })");
 			goto done;
 		}
 	}
 	assert(bi != NULL);
 
 	/* i/o window */
 	dsdt_line("      WordIO (ResourceProducer, MinFixed, MaxFixed, "
 	    "PosDecode, EntireRange,");
 	dsdt_line("        0x0000,             // Granularity");
 	dsdt_line("        0x%04X,             // Range Minimum", bi->iobase);
 	dsdt_line("        0x%04X,             // Range Maximum",
 	    bi->iolimit - 1);
 	dsdt_line("        0x0000,             // Translation Offset");
 	dsdt_line("        0x%04X,             // Length",
 	    bi->iolimit - bi->iobase);
 	dsdt_line("        ,, , TypeStatic)");
 
 	/*
 	 * NOTE(review): the format strings in the two memory windows below
 	 * end in "\n" while every other dsdt_line() call does not; confirm
 	 * whether dsdt_line() already terminates the line.
 	 */
 	/* mmio window (32-bit) */
 	dsdt_line("      DWordMemory (ResourceProducer, PosDecode, "
 	    "MinFixed, MaxFixed, NonCacheable, ReadWrite,");
 	dsdt_line("        0x00000000,         // Granularity");
 	dsdt_line("        0x%08X,         // Range Minimum\n", bi->membase32);
 	dsdt_line("        0x%08X,         // Range Maximum\n",
 	    bi->memlimit32 - 1);
 	dsdt_line("        0x00000000,         // Translation Offset");
 	dsdt_line("        0x%08X,         // Length\n",
 	    bi->memlimit32 - bi->membase32);
 	dsdt_line("        ,, , AddressRangeMemory, TypeStatic)");
 
 	/* mmio window (64-bit) */
 	dsdt_line("      QWordMemory (ResourceProducer, PosDecode, "
 	    "MinFixed, MaxFixed, NonCacheable, ReadWrite,");
 	dsdt_line("        0x0000000000000000, // Granularity");
 	dsdt_line("        0x%016lX, // Range Minimum\n", bi->membase64);
 	dsdt_line("        0x%016lX, // Range Maximum\n",
 	    bi->memlimit64 - 1);
 	dsdt_line("        0x0000000000000000, // Translation Offset");
 	dsdt_line("        0x%016lX, // Length\n",
 	    bi->memlimit64 - bi->membase64);
 	dsdt_line("        ,, , AddressRangeMemory, TypeStatic)");
 	dsdt_line("    })");
 
 	/*
 	 * Interrupt routing: PPRT lists the PIRQ link entries, APRT the
 	 * I/O APIC entries, and _PRT returns one of them based on PICM.
 	 */
 	count = pci_count_lintr(bus);
 	if (count != 0) {
 		dsdt_indent(2);
 		dsdt_line("Name (PPRT, Package ()");
 		dsdt_line("{");
 		pci_walk_lintr(bus, pci_pirq_prt_entry, NULL);
 		dsdt_line("})");
 		dsdt_line("Name (APRT, Package ()");
 		dsdt_line("{");
 		pci_walk_lintr(bus, pci_apic_prt_entry, NULL);
 		dsdt_line("})");
 		dsdt_line("Method (_PRT, 0, NotSerialized)");
 		dsdt_line("{");
 		dsdt_line("  If (PICM)");
 		dsdt_line("  {");
 		dsdt_line("    Return (APRT)");
 		dsdt_line("  }");
 		dsdt_line("  Else");
 		dsdt_line("  {");
 		dsdt_line("    Return (PPRT)");
 		dsdt_line("  }");
 		dsdt_line("}");
 		dsdt_unindent(2);
 	}
 
 	/* Let every device that has a pe_write_dsdt hook add its own nodes. */
 	dsdt_indent(2);
 	for (slot = 0; slot < MAXSLOTS; slot++) {
 		si = &bi->slotinfo[slot];
 		for (func = 0; func < MAXFUNCS; func++) {
 			pi = si->si_funcs[func].fi_devi;
 			if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL)
 				pi->pi_d->pe_write_dsdt(pi);
 		}
 	}
 	dsdt_unindent(2);
 done:
 	dsdt_line("  }");
 }
 
 /*
  * Emit the PCI part of the DSDT: the PICM variable with its _PIC method,
  * followed by a _SB scope holding one node per PCI bus.
  */
 void
 pci_write_dsdt(void)
 {
 	int busno;
 
 	dsdt_indent(1);
 
 	/* _PIC stores its argument in PICM. */
 	dsdt_line("Name (PICM, 0x00)");
 	dsdt_line("Method (_PIC, 1, NotSerialized)");
 	dsdt_line("{");
 	dsdt_line("  Store (Arg0, PICM)");
 	dsdt_line("}");
 	dsdt_line("");
 
 	dsdt_line("Scope (_SB)");
 	dsdt_line("{");
 	busno = 0;
 	while (busno < MAXBUSES) {
 		pci_bus_write_dsdt(busno);
 		busno++;
 	}
 	dsdt_line("}");
 
 	dsdt_unindent(1);
 }
 
 /*
  * Return 1 if a businfo exists for 'bus', 0 otherwise.  'bus' must lie in
  * [0, MAXBUSES).
  */
 int
 pci_bus_configured(int bus)
 {
 	assert(bus >= 0 && bus < MAXBUSES);
 	if (pci_businfo[bus] == NULL)
 		return (0);
 	return (1);
 }
 
 /*
  * Report the device's MSI enable state as stored in pi_msi.enabled.
  */
 int
 pci_msi_enabled(struct pci_devinst *pi)
 {
 
 	return (pi->pi_msi.enabled);
 }
 
 /*
  * Return the stored MSI message count while MSI is enabled, else 0.
  */
 int
 pci_msi_maxmsgnum(struct pci_devinst *pi)
 {
 
 	return (pi->pi_msi.enabled ? pi->pi_msi.maxmsgnum : 0);
 }
 
 /*
  * MSI-X counts as enabled only while MSI itself is not enabled.
  */
 int
 pci_msix_enabled(struct pci_devinst *pi)
 {
 
 	if (pi->pi_msi.enabled)
 		return (0);
 	return (pi->pi_msix.enabled != 0);
 }
 
 /*
  * Deliver the MSI-X message in table slot 'index'.  Nothing is sent when
  * MSI-X is not enabled, the function mask is set, 'index' is at or beyond
  * the table size, or the entry's own mask bit is set.
  */
 void
 pci_generate_msix(struct pci_devinst *pi, int index)
 {
 	struct msix_table_entry *entry;
 
 	if (!pci_msix_enabled(pi) || pi->pi_msix.function_mask ||
 	    index >= pi->pi_msix.table_count)
 		return;
 
 	entry = &pi->pi_msix.table[index];
 	if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) != 0)
 		return;
 
 	/* XXX Set PBA bit if interrupt is disabled */
 	vm_lapic_msi(pi->pi_vmctx, entry->addr, entry->msg_data);
 }
 
 /*
  * Deliver MSI message number 'index'; the message data is the programmed
  * base value plus 'index'.  Ignored when MSI is disabled or 'index' is not
  * below pci_msi_maxmsgnum().
  */
 void
 pci_generate_msi(struct pci_devinst *pi, int index)
 {
 
 	if (!pci_msi_enabled(pi) || index >= pci_msi_maxmsgnum(pi))
 		return;
 
 	vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr,
 	    pi->pi_msi.msg_data + index);
 }
 
 /*
  * INTx may be signalled only while MSI and MSI-X are both disabled and the
  * INTx disable bit in the command register is clear.
  */
 static bool
 pci_lintr_permitted(struct pci_devinst *pi)
 {
 	uint16_t command;
 
 	command = pci_get_cfgdata16(pi, PCIR_COMMAND);
 	if (pi->pi_msi.enabled || pi->pi_msix.enabled)
 		return (false);
 	if (command & PCIM_CMD_INTxDIS)
 		return (false);
 	return (true);
 }
 
 /*
  * Give the device a legacy interrupt pin: the pin of its slot with the
  * fewest users so far (the lowest-numbered pin wins ties).  The choice is
  * recorded in the device state and in the PCIR_INTPIN config register.
  */
 void
 pci_lintr_request(struct pci_devinst *pi)
 {
 	struct businfo *bi;
 	struct slotinfo *si;
 	int candidate, chosen;
 
 	bi = pci_businfo[pi->pi_bus];
 	assert(bi != NULL);
 
 	/*
 	 * Only the pin is picked here; IRQs are attached to it later when
 	 * interrupts are routed.
 	 */
 	si = &bi->slotinfo[pi->pi_slot];
 	chosen = 0;
 	for (candidate = 1; candidate < 4; candidate++) {
 		if (si->si_intpins[candidate].ii_count <
 		    si->si_intpins[chosen].ii_count)
 			chosen = candidate;
 	}
 
 	si->si_intpins[chosen].ii_count++;
 	pi->pi_lintr.pin = chosen + 1;
 	pci_set_cfgdata8(pi, PCIR_INTPIN, chosen + 1);
 }
 
 /*
  * Bind the device's interrupt pin to an I/O APIC IRQ and a PIRQ pin,
  * allocating either one if the slot's pin does not have it yet, and store
  * the resulting legacy IRQ in PCIR_INTLINE.  Devices without a pin are
  * left untouched.
  */
 static void
 pci_lintr_route(struct pci_devinst *pi)
 {
 	struct businfo *bi;
 	struct intxinfo *intx;
 
 	if (pi->pi_lintr.pin == 0)
 		return;
 
 	bi = pci_businfo[pi->pi_bus];
 	assert(bi != NULL);
 	intx = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1];
 
 	/* Allocate an I/O APIC pin on first use of this intpin. */
 	if (intx->ii_ioapic_irq == 0) {
 		intx->ii_ioapic_irq = ioapic_pci_alloc_irq(pi);
 	}
 	assert(intx->ii_ioapic_irq > 0);
 
 	/* Likewise allocate a PIRQ pin on first use of this intpin. */
 	if (intx->ii_pirq_pin == 0) {
 		intx->ii_pirq_pin = pirq_alloc_pin(pi);
 	}
 	assert(intx->ii_pirq_pin > 0);
 
 	pi->pi_lintr.ioapic_irq = intx->ii_ioapic_irq;
 	pi->pi_lintr.pirq_pin = intx->ii_pirq_pin;
 	pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(intx->ii_pirq_pin));
 }
 
 /*
  * Raise the device's INTx line.  From IDLE the line becomes ASSERTED when
  * INTx is currently permitted and PENDING otherwise; other states are left
  * unchanged.
  */
 void
 pci_lintr_assert(struct pci_devinst *pi)
 {
 
 	assert(pi->pi_lintr.pin > 0);
 
 	pthread_mutex_lock(&pi->pi_lintr.lock);
 	if (pi->pi_lintr.state == IDLE) {
 		if (!pci_lintr_permitted(pi)) {
 			pi->pi_lintr.state = PENDING;
 		} else {
 			pi->pi_lintr.state = ASSERTED;
 			pci_irq_assert(pi);
 		}
 	}
 	pthread_mutex_unlock(&pi->pi_lintr.lock);
 }
 
 /*
  * Lower the device's INTx line.  Both ASSERTED and PENDING return to IDLE;
  * only an ASSERTED line is actually deasserted.
  */
 void
 pci_lintr_deassert(struct pci_devinst *pi)
 {
 
 	assert(pi->pi_lintr.pin > 0);
 
 	pthread_mutex_lock(&pi->pi_lintr.lock);
 	if (pi->pi_lintr.state == PENDING) {
 		pi->pi_lintr.state = IDLE;
 	} else if (pi->pi_lintr.state == ASSERTED) {
 		pi->pi_lintr.state = IDLE;
 		pci_irq_deassert(pi);
 	}
 	pthread_mutex_unlock(&pi->pi_lintr.lock);
 }
 
 /*
  * Re-evaluate the INTx state: an ASSERTED line that is no longer permitted
  * is deasserted and parked as PENDING, and a PENDING line that is now
  * permitted is asserted.
  */
 static void
 pci_lintr_update(struct pci_devinst *pi)
 {
 
 	pthread_mutex_lock(&pi->pi_lintr.lock);
 	if (pi->pi_lintr.state == ASSERTED) {
 		if (!pci_lintr_permitted(pi)) {
 			pci_irq_deassert(pi);
 			pi->pi_lintr.state = PENDING;
 		}
 	} else if (pi->pi_lintr.state == PENDING) {
 		if (pci_lintr_permitted(pi)) {
 			pi->pi_lintr.state = ASSERTED;
 			pci_irq_assert(pi);
 		}
 	}
 	pthread_mutex_unlock(&pi->pi_lintr.lock);
 }
 
 int
 pci_count_lintr(int bus)
 {
 	int count, slot, pin;
 	struct slotinfo *slotinfo;
 
 	count = 0;
 	if (pci_businfo[bus] != NULL) {
 		for (slot = 0; slot < MAXSLOTS; slot++) {
 			slotinfo = &pci_businfo[bus]->slotinfo[slot];
 			for (pin = 0; pin < 4; pin++) {
 				if (slotinfo->si_intpins[pin].ii_count != 0)
 					count++;
 			}
 		}
 	}
 	return (count);
 }
 
 /*
  * Call 'cb' once for every (slot, pin) pair on 'bus' that has at least one
  * user, passing the 1-based pin number together with the PIRQ pin and
  * I/O APIC IRQ recorded for it.  A bus without a businfo is skipped.
  */
 void
 pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg)
 {
 	struct businfo *bi;
 	struct intxinfo *intx;
 	int slot, pin;
 
 	bi = pci_businfo[bus];
 	if (bi == NULL)
 		return;
 
 	for (slot = 0; slot < MAXSLOTS; slot++) {
 		for (pin = 0; pin < 4; pin++) {
 			intx = &bi->slotinfo[slot].si_intpins[pin];
 			if (intx->ii_count == 0)
 				continue;
 			cb(bus, slot, pin + 1, intx->ii_pirq_pin,
 			    intx->ii_ioapic_irq, arg);
 		}
 	}
 }
 
 /*
  * Return 1 if more than one function is instantiated in 'slot' of 'bus',
  * i.e. the slot holds a multi-function device.  Return 0 otherwise,
  * including when the bus has no businfo.
  */
 static int
 pci_emul_is_mfdev(int bus, int slot)
 {
 	struct businfo *bi;
 	struct slotinfo *si;
 	int func, present;
 
 	bi = pci_businfo[bus];
 	if (bi == NULL)
 		return (0);
 
 	si = &bi->slotinfo[slot];
 	present = 0;
 	for (func = 0; func < MAXFUNCS; func++)
 		present += (si->si_funcs[func].fi_devi != NULL);
 
 	return (present > 1);
 }
 
 /*
  * Ensure that the PCIM_MFDEV bit in a config read result is set when a
  * multi-function device is being emulated in the pci 'slot' and clear
  * otherwise.  Only reads that cover PCIR_HDRTYPE are adjusted.
  */
 static void
 pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
 {
 	uint32_t mfbit;
 
 	/* The access does not include the header type byte. */
 	if (off > PCIR_HDRTYPE || off + bytes <= PCIR_HDRTYPE)
 		return;
 
 	/*
 	 * 1- and 2-byte reads carry the header type in the low byte of the
 	 * result; a 4-byte read carries it 16 bits higher.
 	 */
 	if (bytes == 1 || bytes == 2)
 		mfbit = PCIM_MFDEV;
 	else if (bytes == 4)
 		mfbit = PCIM_MFDEV << 16;
 	else
 		return;
 
 	if (pci_emul_is_mfdev(bus, slot))
 		*rv |= mfbit;
 	else
 		*rv &= ~mfbit;
 }
 
 /*
  * Update device state in response to changes to the PCI command
  * register.
  *
  * 'old' is the command register value before the write; the current value
  * is read back from config space.  BARs whose address-space enable bit
  * toggled are registered or unregistered, and the INTx state is then
  * re-evaluated.
  */
 void
 pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old)
 {
 	int i;
 	uint16_t changed, new;
 
 	new = pci_get_cfgdata16(pi, PCIR_COMMAND);
 	changed = old ^ new;	/* bits that differ between old and new */
 
 	/*
 	 * If the MMIO or I/O address space decoding has changed then
 	 * register/unregister all BARs that decode that address space.
 	 */
-	for (i = 0; i <= PCI_BARMAX; i++) {
+	for (i = 0; i <= PCI_BARMAX_WITH_ROM; i++) {
 		switch (pi->pi_bar[i].type) {
 			case PCIBAR_NONE:
 			case PCIBAR_MEMHI64:
 				break;
 			case PCIBAR_IO:
 				/* I/O address space decoding changed? */
 				if (changed & PCIM_CMD_PORTEN) {
 					if (new & PCIM_CMD_PORTEN)
 						register_bar(pi, i);
 					else
 						unregister_bar(pi, i);
 				}
 				break;
+			case PCIBAR_ROM:
+				/* skip (un-)register of ROM if it is disabled */
+				if (!romen(pi))
+					break;
+				/* fallthrough */
 			case PCIBAR_MEM32:
 			case PCIBAR_MEM64:
 				/* MMIO address space decoding changed? */
 				if (changed & PCIM_CMD_MEMEN) {
 					if (new & PCIM_CMD_MEMEN)
 						register_bar(pi, i);
 					else
 						unregister_bar(pi, i);
 				}
 				break;
 			default:
 				assert(0);
 		}
 	}
 
 	/*
 	 * If INTx has been unmasked and is pending, assert the
 	 * interrupt.
 	 */
 	pci_lintr_update(pi);
 }
 
 /*
  * Apply a guest write of 'bytes' bytes at 'coff' within the command/status
  * registers, preserving the read-only bits, and then propagate any command
  * register change.
  */
 static void
 pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes)
 {
 	uint32_t prev_cmd, current, ro_mask;
 
 	/* Remember the command register as it was before the write. */
 	prev_cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
 
 	/*
 	 * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3.
 	 *
 	 * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are
 	 * 'write 1 to clear'. However these bits are not set to '1' by
 	 * any device emulation so it is simpler to treat them as readonly.
 	 *
 	 * The mask is shifted down by the byte offset of the access within
 	 * the 32-bit command/status dword.
 	 */
 	ro_mask = 0xFFFFF880 >> ((coff & 0x3) * 8);
 
 	/* Keep the read-only bits from config space, take the rest as written. */
 	current = CFGREAD(pi, coff, bytes);
 	new = (new & ~ro_mask) | (current & ro_mask);
 	CFGWRITE(pi, coff, new, bytes);
 
 	pci_emul_cmd_changed(pi, prev_cmd);
 }
 
 /*
  * Perform a config space read ('in' non-zero) or write of 'bytes' bytes at
  * offset 'coff' of the device at bus/slot/func.  '*eax' receives the value
  * on reads and supplies it on writes.
  *
  * Reads that cannot be serviced return all ones, except for the first
  * dword past PCI_REGMAX which reads as zero.  Writes to BAR and ROM
  * registers, to the capability region and to the command/status registers
  * get special handling; all other writes go straight to config space.
  */
 static void
 pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
     int coff, int bytes, uint32_t *eax)
 {
 	struct businfo *bi;
 	struct slotinfo *si;
 	struct pci_devinst *pi;
 	struct pci_devemu *pe;
 	int idx, needcfg;
 	uint64_t addr, bar, mask;
 
 	if ((bi = pci_businfo[bus]) != NULL) {
 		si = &bi->slotinfo[slot];
 		pi = si->si_funcs[func].fi_devi;
 	} else
 		pi = NULL;
 
 	/*
 	 * Just return if there is no device at this slot:func or if the
 	 * guest is doing an un-aligned access.
 	 */
 	if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
 	    (coff & (bytes - 1)) != 0) {
 		if (in)
 			*eax = 0xffffffff;
 		return;
 	}
 
 	/*
 	 * Ignore all writes beyond the standard config space and return all
 	 * ones on reads.
 	 */
 	if (coff >= PCI_REGMAX + 1) {
 		if (in) {
 			*eax = 0xffffffff;
 			/*
 			 * Extended capabilities begin at offset 256 in config
 			 * space. Absence of extended capabilities is signaled
 			 * with all 0s in the extended capability header at
 			 * offset 256.
 			 */
 			if (coff <= PCI_REGMAX + 4)
 				*eax = 0x00000000;
 		}
 		return;
 	}
 
 	pe = pi->pi_d;
 
 	/*
 	 * Config read
 	 */
 	if (in) {
 		/* Let the device emulation override the default handler */
 		if (pe->pe_cfgread != NULL) {
 			needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
 			    eax);
 		} else {
 			needcfg = 1;
 		}
 
 		/* A non-zero needcfg means the default read is still wanted. */
 		if (needcfg)
 			*eax = CFGREAD(pi, coff, bytes);
 
 		pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
 	} else {
 		/* Let the device emulation override the default handler */
 		if (pe->pe_cfgwrite != NULL &&
 		    (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
 			return;
 
 		/*
-		 * Special handling for write to BAR registers
+		 * Special handling for write to BAR and ROM registers
 		 */
-		if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+		if ((coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) ||
+		    (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4)) {
 			/*
 			 * Ignore writes to BAR registers that are not
 			 * 4-byte aligned.
 			 */
 			if (bytes != 4 || (coff & 0x3) != 0)
 				return;
-			idx = (coff - PCIR_BAR(0)) / 4;
+			if (coff != PCIR_BIOS) {
+				idx = (coff - PCIR_BAR(0)) / 4;
+			} else {
+				idx = PCI_ROM_IDX;
+			}
 			/* Address bits below the BAR size are not writable. */
 			mask = ~(pi->pi_bar[idx].size - 1);
 			switch (pi->pi_bar[idx].type) {
 			case PCIBAR_NONE:
 				pi->pi_bar[idx].addr = bar = 0;
 				break;
 			case PCIBAR_IO:
 				addr = *eax & mask;
 				addr &= 0xffff;
 				bar = addr | pi->pi_bar[idx].lobits;
 				/*
 				 * Register the new BAR value for interception
 				 */
 				if (addr != pi->pi_bar[idx].addr) {
 					update_bar_address(pi, addr, idx,
 							   PCIBAR_IO);
 				}
 				break;
 			case PCIBAR_MEM32:
 				addr = bar = *eax & mask;
 				bar |= pi->pi_bar[idx].lobits;
 				if (addr != pi->pi_bar[idx].addr) {
 					update_bar_address(pi, addr, idx,
 							   PCIBAR_MEM32);
 				}
 				break;
 			case PCIBAR_MEM64:
 				/* Low dword: compare against the low 32 bits only. */
 				addr = bar = *eax & mask;
 				bar |= pi->pi_bar[idx].lobits;
 				if (addr != (uint32_t)pi->pi_bar[idx].addr) {
 					update_bar_address(pi, addr, idx,
 							   PCIBAR_MEM64);
 				}
 				break;
 			case PCIBAR_MEMHI64:
 				/* High dword: size and address live in entry idx - 1. */
 				mask = ~(pi->pi_bar[idx - 1].size - 1);
 				addr = ((uint64_t)*eax << 32) & mask;
 				bar = addr >> 32;
 				if (bar != pi->pi_bar[idx - 1].addr >> 32) {
 					update_bar_address(pi, addr, idx - 1,
 							   PCIBAR_MEMHI64);
 				}
 				break;
+			case PCIBAR_ROM:
+				addr = bar = *eax & mask;
+				if (memen(pi) && romen(pi)) {
+					unregister_bar(pi, idx);
+				}
+				pi->pi_bar[idx].addr = addr;
+				pi->pi_bar[idx].lobits = *eax &
+				    PCIM_BIOS_ENABLE;
+				/* romen could have changed its value */
+				if (memen(pi) && romen(pi)) {
+					register_bar(pi, idx);
+				}
+				bar |= pi->pi_bar[idx].lobits;
+				break;
 			default:
 				assert(0);
 			}
 			pci_set_cfgdata32(pi, coff, bar);
 
 		} else if (pci_emul_iscap(pi, coff)) {
 			pci_emul_capwrite(pi, coff, bytes, *eax, 0, 0);
 		} else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) {
 			pci_emul_cmdsts_write(pi, coff, *eax, bytes);
 		} else {
 			CFGWRITE(pi, coff, *eax, bytes);
 		}
 	}
 }
 
 /* Fields latched from the last 32-bit write to the config address port. */
 static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
 
 /*
  * I/O handler for the config address port.  A 4-byte write latches the
  * enable bit and the bus/slot/function/offset fields; a 4-byte read
  * returns them re-encoded.  Narrower reads return all ones and narrower
  * writes are ignored.
  */
 static int
 pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
 		 uint32_t *eax, void *arg)
 {
 	uint32_t reg;
 
 	if (bytes != 4) {
 		if (in) {
 			if (bytes == 2)
 				*eax = 0xffff;
 			else
 				*eax = 0xff;
 		}
 		return (0);
 	}
 
 	if (!in) {
 		reg = *eax;
 		cfgenable = (reg & CONF1_ENABLE) == CONF1_ENABLE;
 		cfgoff = (reg & PCI_REGMAX) & ~0x03;
 		cfgfunc = (reg >> 8) & PCI_FUNCMAX;
 		cfgslot = (reg >> 11) & PCI_SLOTMAX;
 		cfgbus = (reg >> 16) & PCI_BUSMAX;
 		return (0);
 	}
 
 	reg = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
 	if (cfgenable)
 		reg |= CONF1_ENABLE;
 	*eax = reg;
 
 	return (0);
 }
 INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
 
 /*
  * I/O handler for the four config data ports.  While config access is
  * enabled the access is forwarded to pci_cfgrw() at the latched offset
  * plus the byte offset of the port; otherwise reads return all ones and
  * writes are ignored.
  */
 static int
 pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
 		 uint32_t *eax, void *arg)
 {
 	int coff;
 
 	assert(bytes == 1 || bytes == 2 || bytes == 4);
 
 	if (!cfgenable) {
 		/* Ignore accesses to cfgdata if not enabled by cfgaddr */
 		if (in)
 			*eax = 0xffffffff;
 		return (0);
 	}
 
 	coff = cfgoff + (port - CONF1_DATA_PORT);
 	pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax);
 	return (0);
 }
 
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
 
 #ifdef BHYVE_SNAPSHOT
 /*
  * Saves/restores PCI device emulated state. Returns 0 on success.
  *
  * The device instance is taken from meta->dev_data.  Fields are processed
  * in a fixed order: MSI state, MSI-X state, the raw config space, the
  * type/size/address of every BAR, and finally the MSI-X table entries.
  * Each SNAPSHOT_*_OR_LEAVE step jumps to 'done' on failure.
  */
 static int
 pci_snapshot_pci_dev(struct vm_snapshot_meta *meta)
 {
 	struct pci_devinst *pi;
 	int i;
 	int ret;
 
 	pi = meta->dev_data;
 
 	/* MSI state. */
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done);
 
 	/* MSI-X state (the table itself follows further down). */
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done);
 
 	/* Entire config space contents. */
 	SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata),
 			      meta, ret, done);
 
 	/* NOTE(review): the per-BAR 'lobits' field is not included here. */
 	for (i = 0; i < nitems(pi->pi_bar); i++) {
 		SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done);
 	}
 
 	/* Restore MSI-X table. */
 	for (i = 0; i < pi->pi_msix.table_count; i++) {
 		SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr,
 				      meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data,
 				      meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control,
 				      meta, ret, done);
 	}
 
 done:
 	return (ret);
 }
 
 /*
  * Look up the first configured function, in bus/slot/function order,
  * whose emulation name equals 'dev_name'.  On a match '*pde' and '*pdi'
  * receive its emulation and device instance and 0 is returned; EINVAL is
  * returned when nothing matches.
  */
 static int
 pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde,
 		     struct pci_devinst **pdi)
 {
 	struct businfo *bi;
 	struct funcinfo *fi;
 	int bus, slot, func;
 
 	assert(dev_name != NULL);
 	assert(pde != NULL);
 	assert(pdi != NULL);
 
 	for (bus = 0; bus < MAXBUSES; bus++) {
 		bi = pci_businfo[bus];
 		if (bi == NULL)
 			continue;
 
 		for (slot = 0; slot < MAXSLOTS; slot++) {
 			for (func = 0; func < MAXFUNCS; func++) {
 				fi = &bi->slotinfo[slot].si_funcs[func];
 				if (fi->fi_pde != NULL &&
 				    strcmp(dev_name, fi->fi_pde->pe_emu) == 0) {
 					*pde = fi->fi_pde;
 					*pdi = fi->fi_devi;
 					return (0);
 				}
 			}
 		}
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Snapshot entry point for the PCI device named by meta->dev_name: the
  * generic PCI state is processed first and then the device's own
  * pe_snapshot callback.  An unknown name zeroes the snapshot buffer and
  * returns 0; a device without pe_snapshot, or a failure in the generic
  * part, returns -1.
  */
 int
 pci_snapshot(struct vm_snapshot_meta *meta)
 {
 	struct pci_devemu *emu;
 	struct pci_devinst *inst;
 
 	assert(meta->dev_name != NULL);
 
 	if (pci_find_slotted_dev(meta->dev_name, &emu, &inst) != 0) {
 		fprintf(stderr, "%s: no such name: %s\r\n",
 			__func__, meta->dev_name);
 		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
 		return (0);
 	}
 
 	meta->dev_data = inst;
 
 	if (emu->pe_snapshot == NULL) {
 		fprintf(stderr, "%s: not implemented yet for: %s\r\n",
 			__func__, meta->dev_name);
 		return (-1);
 	}
 
 	if (pci_snapshot_pci_dev(meta) != 0) {
 		fprintf(stderr, "%s: failed to snapshot pci dev\r\n",
 			__func__);
 		return (-1);
 	}
 
 	return ((*emu->pe_snapshot)(meta));
 }
 
 /*
  * Invoke the pe_pause callback of the device named 'dev_name'.  An unknown
  * device or a device without the callback is reported on stderr and
  * treated as success (0).
  */
 int
 pci_pause(struct vmctx *ctx, const char *dev_name)
 {
 	struct pci_devemu *emu;
 	struct pci_devinst *inst;
 
 	assert(dev_name != NULL);
 
 	/*
 	 * Callers may get here without first checking that the device is
 	 * inserted, so a failed lookup is not an error.
 	 */
 	if (pci_find_slotted_dev(dev_name, &emu, &inst) != 0) {
 		fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name);
 		return (0);
 	}
 
 	/* The pause/resume functionality is optional. */
 	if (emu->pe_pause == NULL) {
 		fprintf(stderr, "%s: not implemented for: %s\n",
 			__func__, dev_name);
 		return (0);
 	}
 
 	return (*emu->pe_pause)(ctx, inst);
 }
 
 /*
  * Invoke the pe_resume callback of the device named 'dev_name'.  An
  * unknown device or a device without the callback is reported on stderr
  * and treated as success (0).
  */
 int
 pci_resume(struct vmctx *ctx, const char *dev_name)
 {
 	struct pci_devemu *emu;
 	struct pci_devinst *inst;
 
 	assert(dev_name != NULL);
 
 	/*
 	 * Callers may get here without first checking that the device is
 	 * inserted, so a failed lookup is not an error.
 	 */
 	if (pci_find_slotted_dev(dev_name, &emu, &inst) != 0) {
 		fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name);
 		return (0);
 	}
 
 	/* The pause/resume functionality is optional. */
 	if (emu->pe_resume == NULL) {
 		fprintf(stderr, "%s: not implemented for: %s\n",
 			__func__, dev_name);
 		return (0);
 	}
 
 	return (*emu->pe_resume)(ctx, inst);
 }
 #endif
 
 #define PCI_EMUL_TEST
 #ifdef PCI_EMUL_TEST
 /*
  * Define a dummy test device
  */
 #define DIOSZ	8		/* size in bytes of the dummy I/O BAR (BAR 0) */
 #define DMEMSZ	4096		/* size in bytes of each dummy memory BAR */
 /* Backing store for the dummy device's register space. */
 struct pci_emul_dsoftc {
 	uint8_t   ioregs[DIOSZ];	/* contents of BAR 0 */
 	uint8_t	  memregs[2][DMEMSZ];	/* contents of BAR 1 ([0]) and BAR 2 ([1]) */
 };
 
 /* Message count passed to pci_emul_add_msicap() by the dummy device. */
 #define	PCI_EMUL_MSI_MSGS	 4
 #define	PCI_EMUL_MSIX_MSGS	16
 
 /*
  * pe_init callback of the dummy test device: allocate its softc, fill in
  * the identification registers, add an MSI capability and allocate one
  * I/O BAR and two 32-bit memory BARs.
  *
  * Returns 0 on success or ENOMEM if the softc cannot be allocated.
  */
 static int
 pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
 {
 	int error;
 	struct pci_emul_dsoftc *sc;
 
 	sc = calloc(1, sizeof(struct pci_emul_dsoftc));
 	if (sc == NULL) {
 		/* The BAR handlers dereference pi_arg without checking it. */
 		return (ENOMEM);
 	}
 
 	pi->pi_arg = sc;
 
 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
 	pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
 
 	error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS);
 	assert(error == 0);
 
 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ);
 	assert(error == 0);
 
 	error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
 	assert(error == 0);
 
 	error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ);
 	assert(error == 0);
 
 	return (0);
 }
 
 /*
  * pe_barwrite callback of the dummy test device.  BAR 0 writes go to the
  * I/O register array and may trigger MSIs; BAR 1 and BAR 2 writes go to
  * the matching memory register bank.  Out-of-range accesses and unknown
  * BAR indexes are reported on stdout and otherwise ignored.
  */
 static void
 pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
 	      uint64_t offset, int size, uint64_t value)
 {
 	struct pci_emul_dsoftc *sc = pi->pi_arg;
 	int bank, msg;
 
 	if (baridx > 2 || baridx < 0) {
 		printf("diow: unknown bar idx %d\n", baridx);
 		return;
 	}
 
 	if (baridx == 0) {
 		if (offset + size > DIOSZ) {
 			printf("diow: iow too large, offset %ld size %d\n",
 			       offset, size);
 			return;
 		}
 
 		switch (size) {
 		case 1:
 			sc->ioregs[offset] = value & 0xff;
 			break;
 		case 2:
 			*(uint16_t *)&sc->ioregs[offset] = value & 0xffff;
 			break;
 		case 4:
 			*(uint32_t *)&sc->ioregs[offset] = value;
 			break;
 		default:
 			printf("diow: iow unknown size %d\n", size);
 			break;
 		}
 
 		/*
 		 * A 4-byte write at offset 4 fires the MSI selected by the
 		 * written value while MSI is enabled.
 		 */
 		if (offset == 4 && size == 4 && pci_msi_enabled(pi))
 			pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi));
 
 		/* The magic value 0xabcdef fires every available MSI. */
 		if (value == 0xabcdef) {
 			for (msg = 0; msg < pci_msi_maxmsgnum(pi); msg++)
 				pci_generate_msi(pi, msg);
 		}
 		return;
 	}
 
 	/* baridx is 1 or 2 from here on. */
 	if (offset + size > DMEMSZ) {
 		printf("diow: memw too large, offset %ld size %d\n",
 		       offset, size);
 		return;
 	}
 
 	bank = baridx - 1;		/* 'memregs' index */
 
 	switch (size) {
 	case 1:
 		sc->memregs[bank][offset] = value;
 		break;
 	case 2:
 		*(uint16_t *)&sc->memregs[bank][offset] = value;
 		break;
 	case 4:
 		*(uint32_t *)&sc->memregs[bank][offset] = value;
 		break;
 	case 8:
 		*(uint64_t *)&sc->memregs[bank][offset] = value;
 		break;
 	default:
 		printf("diow: memw unknown size %d\n", size);
 		break;
 	}
 
 	/*
 	 * magic interrupt ??
 	 */
 }
 
 /*
  * pe_barread callback of the dummy test device.  Returns the 'size' bytes
  * stored at 'offset' of the register array backing BAR 'baridx' (0 is the
  * I/O BAR, 1 and 2 are the memory BARs).  Out-of-range accesses, unknown
  * sizes and unknown BAR indexes are reported on stdout and read as 0.
  */
 static uint64_t
 pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
 	      uint64_t offset, int size)
 {
 	struct pci_emul_dsoftc *sc = pi->pi_arg;
 	uint64_t value;
 	int i;
 
 	/*
 	 * 64 bits wide so that 8-byte memory reads are not truncated, and
 	 * zeroed up front so that every path returns a defined value.
 	 */
 	value = 0;
 
 	if (baridx == 0) {
 		if (offset + size > DIOSZ) {
 			printf("dior: ior too large, offset %ld size %d\n",
 			       offset, size);
 			return (0);
 		}
 
 		if (size == 1) {
 			value = sc->ioregs[offset];
 		} else if (size == 2) {
 			value = *(uint16_t *) &sc->ioregs[offset];
 		} else if (size == 4) {
 			value = *(uint32_t *) &sc->ioregs[offset];
 		} else {
 			printf("dior: ior unknown size %d\n", size);
 		}
 	}
 
 	if (baridx == 1 || baridx == 2) {
 		if (offset + size > DMEMSZ) {
 			printf("dior: memr too large, offset %ld size %d\n",
 			       offset, size);
 			return (0);
 		}
 
 		i = baridx - 1;		/* 'memregs' index */
 
 		if (size == 1) {
 			value = sc->memregs[i][offset];
 		} else if (size == 2) {
 			value = *(uint16_t *) &sc->memregs[i][offset];
 		} else if (size == 4) {
 			value = *(uint32_t *) &sc->memregs[i][offset];
 		} else if (size == 8) {
 			value = *(uint64_t *) &sc->memregs[i][offset];
 		} else {
 			printf("dior: memr unknown size %d\n", size);
 		}
 	}
 
 	if (baridx > 2 || baridx < 0) {
 		printf("dior: unknown bar idx %d\n", baridx);
 		return (0);
 	}
 
 	return (value);
 }
 
 #ifdef BHYVE_SNAPSHOT
 /*
  * pe_snapshot callback of the dummy test device: nothing is saved or
  * restored and success is always reported.
  */
 int
 pci_emul_snapshot(struct vm_snapshot_meta *meta)
 {
 	const int ok = 0;
 
 	return (ok);
 }
 #endif
 
 /*
  * Emulation descriptor for the "dummy" test device, added to the set of
  * known emulations through PCI_EMUL_SET().
  */
 struct pci_devemu pci_dummy = {
 	.pe_emu = "dummy",
 	.pe_init = pci_emul_dinit,
 	.pe_barwrite = pci_emul_diow,
 	.pe_barread = pci_emul_dior,
 #ifdef BHYVE_SNAPSHOT
 	.pe_snapshot = pci_emul_snapshot,
 #endif
 };
 PCI_EMUL_SET(pci_dummy);
 
 #endif /* PCI_EMUL_TEST */
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index 6eac0720f09f..c10c4af4698b 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -1,308 +1,315 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _PCI_EMUL_H_
 #define _PCI_EMUL_H_
 
 #include <sys/types.h>
 #include <sys/queue.h>
 #include <sys/kernel.h>
 #include <sys/nv.h>
 #include <sys/_pthreadtypes.h>
 
 #include <dev/pci/pcireg.h>
 
 #include <assert.h>
 
 #define	PCI_BARMAX	PCIR_MAX_BAR_0	/* BAR registers in a Type 0 header */
+#define PCI_BARMAX_WITH_ROM (PCI_BARMAX + 1)	/* highest index incl. ROM */
+#define PCI_ROM_IDX (PCI_BARMAX + 1)		/* ROM slot in BAR arrays */
 
 struct vmctx;
 struct pci_devinst;
 struct memory_region;
 struct vm_snapshot_meta;
 
 struct pci_devemu {
 	char      *pe_emu;		/* Name of device emulation */
 
 	/* instance creation */
 	int       (*pe_init)(struct vmctx *, struct pci_devinst *,
 			     nvlist_t *);
 	int	(*pe_legacy_config)(nvlist_t *, const char *);
 	const char *pe_alias;
 
 	/* ACPI DSDT enumeration */
 	void	(*pe_write_dsdt)(struct pci_devinst *);
 
 	/* config space read/write callbacks */
 	int	(*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
 			       struct pci_devinst *pi, int offset,
 			       int bytes, uint32_t val);
 	int	(*pe_cfgread)(struct vmctx *ctx, int vcpu,
 			      struct pci_devinst *pi, int offset,
 			      int bytes, uint32_t *retval);
 
 	/* BAR read/write callbacks */
 	void      (*pe_barwrite)(struct vmctx *ctx, int vcpu,
 				 struct pci_devinst *pi, int baridx,
 				 uint64_t offset, int size, uint64_t value);
 	uint64_t  (*pe_barread)(struct vmctx *ctx, int vcpu,
 				struct pci_devinst *pi, int baridx,
 				uint64_t offset, int size);
 
 	void	(*pe_baraddr)(struct vmctx *ctx, struct pci_devinst *pi,
 			      int baridx, int enabled, uint64_t address);
 
 	/* Save/restore device state */
 	int	(*pe_snapshot)(struct vm_snapshot_meta *meta);
 	int	(*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi);
 	int	(*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi);
 
 };
 #define PCI_EMUL_SET(x)   DATA_SET(pci_devemu_set, x);
 
 enum pcibar_type {
 	PCIBAR_NONE,
 	PCIBAR_IO,
 	PCIBAR_MEM32,
 	PCIBAR_MEM64,
-	PCIBAR_MEMHI64
+	PCIBAR_MEMHI64,		/* slot after a PCIBAR_MEM64 BAR */
+	PCIBAR_ROM,		/* type of the PCI_ROM_IDX slot */
 };
 
 struct pcibar {
 	enum pcibar_type	type;		/* io or memory */
 	uint64_t		size;
 	uint64_t		addr;
 	uint8_t			lobits;
 };
 
 #define PI_NAMESZ	40
 
 struct msix_table_entry {
 	uint64_t	addr;
 	uint32_t	msg_data;
 	uint32_t	vector_control;
 } __packed;
 
 /* 
  * In case the structure is modified to hold extra information, use a define
  * for the size that should be emulated.
  */
 #define	MSIX_TABLE_ENTRY_SIZE	16
 #define MAX_MSIX_TABLE_ENTRIES	2048
 #define	PBA_SIZE(msgnum)	(roundup2((msgnum), 64) / 8)
 
 enum lintr_stat {
 	IDLE,
 	ASSERTED,
 	PENDING
 };
 
 struct pci_devinst {
 	struct pci_devemu *pi_d;
 	struct vmctx *pi_vmctx;
 	uint8_t	  pi_bus, pi_slot, pi_func;
 	char	  pi_name[PI_NAMESZ];
 	int	  pi_bar_getsize;
 	int	  pi_prevcap;
 	int	  pi_capend;
 
 	struct {
 		int8_t    	pin;
 		enum lintr_stat	state;
 		int		pirq_pin;
 		int	  	ioapic_irq;
 		pthread_mutex_t	lock;
 	} pi_lintr;
 
 	struct {
 		int		enabled;
 		uint64_t	addr;
 		uint64_t	msg_data;
 		int		maxmsgnum;
 	} pi_msi;
 
 	struct {
 		int	enabled;
 		int	table_bar;
 		int	pba_bar;
 		uint32_t table_offset;
 		int	table_count;
 		uint32_t pba_offset;
 		int	pba_size;
 		int	function_mask; 	
 		struct msix_table_entry *table;	/* allocated at runtime */
 		uint8_t *mapped_addr;
 		size_t	mapped_size;
 	} pi_msix;
 
 	void      *pi_arg;		/* devemu-private data */
 
 	u_char	  pi_cfgdata[PCI_REGMAX + 1];
-	struct pcibar pi_bar[PCI_BARMAX + 1];
+	/* BARs 0..PCI_BARMAX plus the ROM (PCI_ROM_IDX), handled like a BAR */
+	struct pcibar pi_bar[PCI_BARMAX_WITH_ROM + 1];
+	uint64_t pi_romoffset;
 };
 
 struct msicap {
 	uint8_t		capid;
 	uint8_t		nextptr;
 	uint16_t	msgctrl;
 	uint32_t	addrlo;
 	uint32_t	addrhi;
 	uint16_t	msgdata;
 } __packed;
 static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed");
 
 struct msixcap {
 	uint8_t		capid;
 	uint8_t		nextptr;
 	uint16_t	msgctrl;
 	uint32_t	table_info;	/* bar index and offset within it */
 	uint32_t	pba_info;	/* bar index and offset within it */
 } __packed;
 static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed");
 
 struct pciecap {
 	uint8_t		capid;
 	uint8_t		nextptr;
 	uint16_t	pcie_capabilities;
 
 	uint32_t	dev_capabilities;	/* all devices */
 	uint16_t	dev_control;
 	uint16_t	dev_status;
 
 	uint32_t	link_capabilities;	/* devices with links */
 	uint16_t	link_control;
 	uint16_t	link_status;
 
 	uint32_t	slot_capabilities;	/* ports with slots */
 	uint16_t	slot_control;
 	uint16_t	slot_status;
 
 	uint16_t	root_control;		/* root ports */
 	uint16_t	root_capabilities;
 	uint32_t	root_status;
 
 	uint32_t	dev_capabilities2;	/* all devices */
 	uint16_t	dev_control2;
 	uint16_t	dev_status2;
 
 	uint32_t	link_capabilities2;	/* devices with links */
 	uint16_t	link_control2;
 	uint16_t	link_status2;
 
 	uint32_t	slot_capabilities2;	/* ports with slots */
 	uint16_t	slot_control2;
 	uint16_t	slot_status2;
 } __packed;
 static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed");
 
 typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin,
     int ioapic_irq, void *arg);
 
 int	init_pci(struct vmctx *ctx);
 void	pci_callback(void);
 int	pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
 	    enum pcibar_type type, uint64_t size);
+int	pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size,
+	    void **const addr);	/* *addr: buffer for the image */
 int	pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
 int	pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
 void	pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes,
 	    uint32_t val, uint8_t capoff, int capid);
 void	pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old);
 void	pci_generate_msi(struct pci_devinst *pi, int msgnum);
 void	pci_generate_msix(struct pci_devinst *pi, int msgnum);
 void	pci_lintr_assert(struct pci_devinst *pi);
 void	pci_lintr_deassert(struct pci_devinst *pi);
 void	pci_lintr_request(struct pci_devinst *pi);
 int	pci_msi_enabled(struct pci_devinst *pi);
 int	pci_msix_enabled(struct pci_devinst *pi);
 int	pci_msix_table_bar(struct pci_devinst *pi);
 int	pci_msix_pba_bar(struct pci_devinst *pi);
 int	pci_msi_maxmsgnum(struct pci_devinst *pi);
 int	pci_parse_legacy_config(nvlist_t *nvl, const char *opt);
 int	pci_parse_slot(char *opt);
 void    pci_print_supported_devices();
 void	pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
 int	pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);
 int	pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
 			     uint64_t value);
 uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
 int	pci_count_lintr(int bus);
 void	pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
 void	pci_write_dsdt(void);
 uint64_t pci_ecfg_base(void);
 int	pci_bus_configured(int bus);
 #ifdef BHYVE_SNAPSHOT
 int	pci_snapshot(struct vm_snapshot_meta *meta);
 int	pci_pause(struct vmctx *ctx, const char *dev_name);
 int	pci_resume(struct vmctx *ctx, const char *dev_name);
 #endif
 
 static __inline void 
 pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
 {
 	assert(offset <= PCI_REGMAX);
 	*(uint8_t *)(pi->pi_cfgdata + offset) = val;
 }
 
 static __inline void 
 pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
 {
 	assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
 	*(uint16_t *)(pi->pi_cfgdata + offset) = val;
 }
 
 static __inline void 
 pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
 {
 	assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
 	*(uint32_t *)(pi->pi_cfgdata + offset) = val;
 }
 
 static __inline uint8_t
 pci_get_cfgdata8(struct pci_devinst *pi, int offset)
 {
 	assert(offset <= PCI_REGMAX);
 	return (*(uint8_t *)(pi->pi_cfgdata + offset));
 }
 
 static __inline uint16_t
 pci_get_cfgdata16(struct pci_devinst *pi, int offset)
 {
 	assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
 	return (*(uint16_t *)(pi->pi_cfgdata + offset));
 }
 
 static __inline uint32_t
 pci_get_cfgdata32(struct pci_devinst *pi, int offset)
 {
 	assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
 	return (*(uint32_t *)(pi->pi_cfgdata + offset));
 }
 
 #endif /* _PCI_EMUL_H_ */
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
index 3fe446f42eee..56f049a6d312 100644
--- a/usr.sbin/bhyve/pci_passthru.c
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -1,1043 +1,1139 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #ifndef WITHOUT_CAPSICUM
 #include <sys/capsicum.h>
 #endif
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/pciio.h>
 #include <sys/ioctl.h>
+#include <sys/stat.h>
 
 #include <dev/io/iodev.h>
 #include <dev/pci/pcireg.h>
 
 #include <vm/vm.h>
 
 #include <machine/iodev.h>
 #include <machine/vm.h>
 
 #ifndef WITHOUT_CAPSICUM
 #include <capsicum_helpers.h>
 #endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <sysexits.h>
 #include <unistd.h>
 
 #include <machine/vmm.h>
 
 #include "config.h"
 #include "debug.h"
 #include "mem.h"
 #include "pci_passthru.h"
 
 #ifndef _PATH_DEVPCI
 #define	_PATH_DEVPCI	"/dev/pci"
 #endif
 
 #define	LEGACY_SUPPORT	1
 
 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
 #define MSIX_CAPLEN 12
 
 static int pcifd = -1;
 
 struct passthru_softc {
 	struct pci_devinst *psc_pi;
-	struct pcibar psc_bar[PCI_BARMAX + 1];
+	/* BARs 0..PCI_BARMAX plus the ROM (PCI_ROM_IDX), handled like a BAR */
+	struct pcibar psc_bar[PCI_BARMAX_WITH_ROM + 1];
 	struct {
 		int		capoff;
 		int		msgctrl;
 		int		emulated;
 	} psc_msi;
 	struct {
 		int		capoff;
 	} psc_msix;
 	struct pcisel psc_sel;
 };
 
 static int
 msi_caplen(int msgctrl)
 {
 	int len;
 	
 	len = 10;		/* minimum length of msi capability */
 
 	if (msgctrl & PCIM_MSICTRL_64BIT)
 		len += 4;
 
 #if 0
 	/*
 	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
 	 * We'll let the guest manipulate them directly.
 	 */
 	if (msgctrl & PCIM_MSICTRL_VECTOR)
 		len += 10;
 #endif
 
 	return (len);
 }
 
 static int
 pcifd_init() {
 	pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
 	if (pcifd < 0) {
 		warn("failed to open %s", _PATH_DEVPCI);
 		return (1);
 	}
 
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_t pcifd_rights;
 	cap_rights_init(&pcifd_rights, CAP_IOCTL, CAP_READ, CAP_WRITE);
 	if (caph_rights_limit(pcifd, &pcifd_rights) == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 
 	const cap_ioctl_t pcifd_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR,
 		PCIOCBARIO, PCIOCBARMMAP };
 	if (caph_ioctls_limit(pcifd, pcifd_ioctls, nitems(pcifd_ioctls)) == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 #endif
 
 	return (0);
 }
 
 uint32_t
 read_config(const struct pcisel *sel, long reg, int width)
 {
 	if (pcifd < 0 && pcifd_init()) {
 		return (0);
 	}
 
 	struct pci_io pi;
 
 	bzero(&pi, sizeof(pi));
 	pi.pi_sel = *sel;
 	pi.pi_reg = reg;
 	pi.pi_width = width;
 
 	if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
 		return (0);				/* XXX */
 	else
 		return (pi.pi_data);
 }
 
 void
 write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
 {
 	if (pcifd < 0 && pcifd_init()) {
 		return;
 	}
 
 	struct pci_io pi;
 
 	bzero(&pi, sizeof(pi));
 	pi.pi_sel = *sel;
 	pi.pi_reg = reg;
 	pi.pi_width = width;
 	pi.pi_data = data;
 
 	(void)ioctl(pcifd, PCIOCWRITE, &pi);		/* XXX */
 }
 
 #ifdef LEGACY_SUPPORT
 static int
 passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
 {
 	int capoff, i;
 	struct msicap msicap;
 	u_char *capdata;
 
 	pci_populate_msicap(&msicap, msgnum, nextptr);
 
 	/*
 	 * XXX
 	 * Copy the msi capability structure in the last 16 bytes of the
 	 * config space. This is wrong because it could shadow something
 	 * useful to the device.
 	 */
 	capoff = 256 - roundup(sizeof(msicap), 4);
 	capdata = (u_char *)&msicap;
 	for (i = 0; i < sizeof(msicap); i++)
 		pci_set_cfgdata8(pi, capoff + i, capdata[i]);
 
 	return (capoff);
 }
 #endif	/* LEGACY_SUPPORT */
 
 static int
 cfginitmsi(struct passthru_softc *sc)
 {
 	int i, ptr, capptr, cap, sts, caplen, table_size;
 	uint32_t u32;
 	struct pcisel sel;
 	struct pci_devinst *pi;
 	struct msixcap msixcap;
 	uint32_t *msixcap_ptr;
 
 	pi = sc->psc_pi;
 	sel = sc->psc_sel;
 
 	/*
 	 * Parse the capabilities and cache the location of the MSI
 	 * and MSI-X capabilities.
 	 */
 	sts = read_config(&sel, PCIR_STATUS, 2);
 	if (sts & PCIM_STATUS_CAPPRESENT) {
 		ptr = read_config(&sel, PCIR_CAP_PTR, 1);
 		while (ptr != 0 && ptr != 0xff) {
 			cap = read_config(&sel, ptr + PCICAP_ID, 1);
 			if (cap == PCIY_MSI) {
 				/*
 				 * Copy the MSI capability into the config
 				 * space of the emulated pci device
 				 */
 				sc->psc_msi.capoff = ptr;
 				sc->psc_msi.msgctrl = read_config(&sel,
 								  ptr + 2, 2);
 				sc->psc_msi.emulated = 0;
 				caplen = msi_caplen(sc->psc_msi.msgctrl);
 				capptr = ptr;
 				while (caplen > 0) {
 					u32 = read_config(&sel, capptr, 4);
 					pci_set_cfgdata32(pi, capptr, u32);
 					caplen -= 4;
 					capptr += 4;
 				}
 			} else if (cap == PCIY_MSIX) {
 				/*
 				 * Copy the MSI-X capability 
 				 */
 				sc->psc_msix.capoff = ptr;
 				caplen = 12;
 				msixcap_ptr = (uint32_t*) &msixcap;
 				capptr = ptr;
 				while (caplen > 0) {
 					u32 = read_config(&sel, capptr, 4);
 					*msixcap_ptr = u32;
 					pci_set_cfgdata32(pi, capptr, u32);
 					caplen -= 4;
 					capptr += 4;
 					msixcap_ptr++;
 				}
 			}
 			ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
 		}
 	}
 
 	if (sc->psc_msix.capoff != 0) {
 		pi->pi_msix.pba_bar =
 		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
 		pi->pi_msix.pba_offset =
 		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
 		pi->pi_msix.table_bar =
 		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
 		pi->pi_msix.table_offset =
 		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
 		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
 		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
 
 		/* Allocate the emulated MSI-X table array */
 		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
 		pi->pi_msix.table = calloc(1, table_size);
 
 		/* Mask all table entries */
 		for (i = 0; i < pi->pi_msix.table_count; i++) {
 			pi->pi_msix.table[i].vector_control |=
 						PCIM_MSIX_VCTRL_MASK;
 		}
 	}
 
 #ifdef LEGACY_SUPPORT
 	/*
 	 * If the passthrough device does not support MSI then craft a
 	 * MSI capability for it. We link the new MSI capability at the
 	 * head of the list of capabilities.
 	 */
 	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
 		int origptr, msiptr;
 		origptr = read_config(&sel, PCIR_CAP_PTR, 1);
 		msiptr = passthru_add_msicap(pi, 1, origptr);
 		sc->psc_msi.capoff = msiptr;
 		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
 		sc->psc_msi.emulated = 1;
 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
 	}
 #endif
 
 	/* Make sure one of the capabilities is present */
 	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) 
 		return (-1);
 	else
 		return (0);
 }
 
 static uint64_t
 msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
 {
 	struct pci_devinst *pi;
 	struct msix_table_entry *entry;
 	uint8_t *src8;
 	uint16_t *src16;
 	uint32_t *src32;
 	uint64_t *src64;
 	uint64_t data;
 	size_t entry_offset;
 	uint32_t table_offset;
 	int index, table_count;
 
 	pi = sc->psc_pi;
 
 	table_offset = pi->pi_msix.table_offset;
 	table_count = pi->pi_msix.table_count;
 	if (offset < table_offset ||
 	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
 		switch (size) {
 		case 1:
 			src8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
 			data = *src8;
 			break;
 		case 2:
 			src16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset);
 			data = *src16;
 			break;
 		case 4:
 			src32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset);
 			data = *src32;
 			break;
 		case 8:
 			src64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset);
 			data = *src64;
 			break;
 		default:
 			return (-1);
 		}
 		return (data);
 	}
 
 	offset -= table_offset;
 	index = offset / MSIX_TABLE_ENTRY_SIZE;
 	assert(index < table_count);
 
 	entry = &pi->pi_msix.table[index];
 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
 
 	switch (size) {
 	case 1:
 		src8 = (uint8_t *)((uint8_t *)entry + entry_offset);
 		data = *src8;
 		break;
 	case 2:
 		src16 = (uint16_t *)((uint8_t *)entry + entry_offset);
 		data = *src16;
 		break;
 	case 4:
 		src32 = (uint32_t *)((uint8_t *)entry + entry_offset);
 		data = *src32;
 		break;
 	case 8:
 		src64 = (uint64_t *)((uint8_t *)entry + entry_offset);
 		data = *src64;
 		break;
 	default:
 		return (-1);
 	}
 
 	return (data);
 }
 
 static void
 msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
 		 uint64_t offset, int size, uint64_t data)
 {
 	struct pci_devinst *pi;
 	struct msix_table_entry *entry;
 	uint8_t *dest8;
 	uint16_t *dest16;
 	uint32_t *dest32;
 	uint64_t *dest64;
 	size_t entry_offset;
 	uint32_t table_offset, vector_control;
 	int index, table_count;
 
 	pi = sc->psc_pi;
 
 	table_offset = pi->pi_msix.table_offset;
 	table_count = pi->pi_msix.table_count;
 	if (offset < table_offset ||
 	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
 		switch (size) {
 		case 1:
 			dest8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
 			*dest8 = data;
 			break;
 		case 2:
 			dest16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset);
 			*dest16 = data;
 			break;
 		case 4:
 			dest32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset);
 			*dest32 = data;
 			break;
 		case 8:
 			dest64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset);
 			*dest64 = data;
 			break;
 		}
 		return;
 	}
 
 	offset -= table_offset;
 	index = offset / MSIX_TABLE_ENTRY_SIZE;
 	assert(index < table_count);
 
 	entry = &pi->pi_msix.table[index];
 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
 
 	/* Only 4 byte naturally-aligned writes are supported */
 	assert(size == 4);
 	assert(entry_offset % 4 == 0);
 
 	vector_control = entry->vector_control;
 	dest32 = (uint32_t *)((void *)entry + entry_offset);
 	*dest32 = data;
 	/* If MSI-X hasn't been enabled, do nothing */
 	if (pi->pi_msix.enabled) {
 		/* If the entry is masked, don't set it up */
 		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
 		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
 			(void)vm_setup_pptdev_msix(ctx, vcpu,
 			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
 			    sc->psc_sel.pc_func, index, entry->addr,
 			    entry->msg_data, entry->vector_control);
 		}
 	}
 }
 
 static int
 init_msix_table(struct vmctx *ctx, struct passthru_softc *sc)
 {
 	struct pci_devinst *pi = sc->psc_pi;
 	struct pci_bar_mmap pbm;
 	int b, s, f;
 	uint32_t table_size, table_offset;
 
 	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
 
 	b = sc->psc_sel.pc_bus;
 	s = sc->psc_sel.pc_dev;
 	f = sc->psc_sel.pc_func;
 
 	/*
 	 * Map the region of the BAR containing the MSI-X table.  This is
 	 * necessary for two reasons:
 	 * 1. The PBA may reside in the first or last page containing the MSI-X
 	 *    table.
 	 * 2. While PCI devices are not supposed to use the page(s) containing
 	 *    the MSI-X table for other purposes, some do in practice.
 	 */
 	memset(&pbm, 0, sizeof(pbm));
 	pbm.pbm_sel = sc->psc_sel;
 	pbm.pbm_flags = PCIIO_BAR_MMAP_RW;
 	pbm.pbm_reg = PCIR_BAR(pi->pi_msix.table_bar);
 	pbm.pbm_memattr = VM_MEMATTR_DEVICE;
 
 	if (ioctl(pcifd, PCIOCBARMMAP, &pbm) != 0) {
 		warn("Failed to map MSI-X table BAR on %d/%d/%d", b, s, f);
 		return (-1);
 	}
 	assert(pbm.pbm_bar_off == 0);
 	pi->pi_msix.mapped_addr = (uint8_t *)(uintptr_t)pbm.pbm_map_base;
 	pi->pi_msix.mapped_size = pbm.pbm_map_length;
 
 	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
 
 	table_size = pi->pi_msix.table_offset - table_offset;
 	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
 	table_size = roundup2(table_size, 4096);
 
 	/*
 	 * Unmap any pages not containing the table, we do not need to emulate
 	 * accesses to them.  Avoid releasing address space to help ensure that
 	 * a buggy out-of-bounds access causes a crash.
 	 */
 	if (table_offset != 0)
 		if (mprotect(pi->pi_msix.mapped_addr, table_offset,
 		    PROT_NONE) != 0)
 			warn("Failed to unmap MSI-X table BAR region");
 	if (table_offset + table_size != pi->pi_msix.mapped_size)
 		if (mprotect(
 		    pi->pi_msix.mapped_addr + table_offset + table_size,
 		    pi->pi_msix.mapped_size - (table_offset + table_size),
 		    PROT_NONE) != 0)
 			warn("Failed to unmap MSI-X table BAR region");
 
 	return (0);
 }
 
 static int
 cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
 {
 	int i, error;
 	struct pci_devinst *pi;
 	struct pci_bar_io bar;
 	enum pcibar_type bartype;
 	uint64_t base, size;
 
 	pi = sc->psc_pi;
 
 	/*
 	 * Initialize BAR registers
 	 */
 	for (i = 0; i <= PCI_BARMAX; i++) {
 		bzero(&bar, sizeof(bar));
 		bar.pbi_sel = sc->psc_sel;
 		bar.pbi_reg = PCIR_BAR(i);
 
 		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
 			continue;
 
 		if (PCI_BAR_IO(bar.pbi_base)) {
 			bartype = PCIBAR_IO;
 			base = bar.pbi_base & PCIM_BAR_IO_BASE;
 		} else {
 			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
 			case PCIM_BAR_MEM_64:
 				bartype = PCIBAR_MEM64;
 				break;
 			default:
 				bartype = PCIBAR_MEM32;
 				break;
 			}
 			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
 		}
 		size = bar.pbi_length;
 
 		if (bartype != PCIBAR_IO) {
 			if (((base | size) & PAGE_MASK) != 0) {
 				warnx("passthru device %d/%d/%d BAR %d: "
 				    "base %#lx or size %#lx not page aligned\n",
 				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
 				    sc->psc_sel.pc_func, i, base, size);
 				return (-1);
 			}
 		}
 
 		/* Cache information about the "real" BAR */
 		sc->psc_bar[i].type = bartype;
 		sc->psc_bar[i].size = size;
 		sc->psc_bar[i].addr = base;
 		sc->psc_bar[i].lobits = 0;
 
 		/* Allocate the BAR in the guest I/O or MMIO space */
 		error = pci_emul_alloc_bar(pi, i, bartype, size);
 		if (error)
 			return (-1);
 
 		/* Use same lobits as physical bar */
 		uint8_t lobits = read_config(&sc->psc_sel, PCIR_BAR(i), 0x01);
 		if (bartype == PCIBAR_MEM32 || bartype == PCIBAR_MEM64) {
 			lobits &= ~PCIM_BAR_MEM_BASE;
 		} else {
 			lobits &= ~PCIM_BAR_IO_BASE;
 		}
 		sc->psc_bar[i].lobits = lobits;
 		pi->pi_bar[i].lobits = lobits;
 
 		/* The MSI-X table needs special handling */
 		if (i == pci_msix_table_bar(pi)) {
 			error = init_msix_table(ctx, sc);
 			if (error) 
 				return (-1);
 		}
 
 		/*
 		 * 64-bit BAR takes up two slots so skip the next one.
 		 */
 		if (bartype == PCIBAR_MEM64) {
 			i++;
 			assert(i <= PCI_BARMAX);
 			sc->psc_bar[i].type = PCIBAR_MEMHI64;
 		}
 	}
 	return (0);
 }
 
 static int
 cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
 {
 	int error;
 	struct passthru_softc *sc;
 
 	error = 1;
 	sc = pi->pi_arg;
 
 	bzero(&sc->psc_sel, sizeof(struct pcisel));
 	sc->psc_sel.pc_bus = bus;
 	sc->psc_sel.pc_dev = slot;
 	sc->psc_sel.pc_func = func;
 
 	if (cfginitmsi(sc) != 0) {
 		warnx("failed to initialize MSI for PCI %d/%d/%d",
 		    bus, slot, func);
 		goto done;
 	}
 
 	if (cfginitbar(ctx, sc) != 0) {
 		warnx("failed to initialize BARs for PCI %d/%d/%d",
 		    bus, slot, func);
 		goto done;
 	}
 
 	write_config(&sc->psc_sel, PCIR_COMMAND, 2,
 	    pci_get_cfgdata16(pi, PCIR_COMMAND));
 
 	/*
 	 * We need to do this after PCIR_COMMAND got possibly updated, e.g.,
 	 * a BAR was enabled, as otherwise the PCIOCBARMMAP might fail on us.
 	 */
 	if (pci_msix_table_bar(pi) >= 0) {
 		error = init_msix_table(ctx, sc);
 		if (error != 0) {
 			warnx(
 			    "failed to initialize MSI-X table for PCI %d/%d/%d: %d",
 			    bus, slot, func, error);
 			goto done;
 		}
 	}
 
 	error = 0;				/* success */
 done:
 	return (error);
 }
 
 static int
 passthru_legacy_config(nvlist_t *nvl, const char *opts)
 {
 	char value[16];
 	int bus, slot, func;
 
 	if (opts == NULL)
 		return (0);
 
 	if (sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) {
 		EPRINTLN("passthru: invalid options \"%s\"", opts);
 		return (-1);
 	}
 
 	snprintf(value, sizeof(value), "%d", bus);
 	set_config_value_node(nvl, "bus", value);
 	snprintf(value, sizeof(value), "%d", slot);
 	set_config_value_node(nvl, "slot", value);
 	snprintf(value, sizeof(value), "%d", func);
 	set_config_value_node(nvl, "func", value);
+	/* strchr() yields the options from the first ',' on, or NULL. */
+	return (pci_parse_legacy_config(nvl, strchr(opts, ',')));
+}
+
+/*
+ * Copy the ROM image 'romfile', if set, into a pci_emul_alloc_rom() segment
+ * and record it in psc_bar[PCI_ROM_IDX].  Returns 0 on success, else non-zero.
+ */
+static int
+passthru_init_rom(struct vmctx *const ctx, struct passthru_softc *const sc,
+    const char *const romfile)
+{
+	if (romfile == NULL) {
+		return (0);
+	}
+
+	const int fd = open(romfile, O_RDONLY);
+	if (fd < 0) {
+		warn("%s: can't open romfile \"%s\"", __func__, romfile);
+		return (-1);
+	}
+
+	struct stat sbuf;
+	if (fstat(fd, &sbuf) < 0) {
+		warn("%s: can't fstat romfile \"%s\"", __func__, romfile);
+		close(fd);
+		return (-1);
+	}
+	const uint64_t rom_size = sbuf.st_size;
+	void *const rom_data = mmap(NULL, rom_size, PROT_READ, MAP_SHARED, fd,
+	    0);
+	if (rom_data == MAP_FAILED) {
+		warn("%s: unable to mmap romfile \"%s\"", __func__, romfile);
+		close(fd);
+		return (-1);
+	}
+
+	void *rom_addr;
+	int error = pci_emul_alloc_rom(sc->psc_pi, rom_size, &rom_addr);
+	if (error) {
+		warnx("%s: failed to alloc rom segment", __func__);
+		munmap(rom_data, rom_size);
+		close(fd);
+		return (error);
+	}
+	memcpy(rom_addr, rom_data, rom_size);
+	sc->psc_bar[PCI_ROM_IDX].type = PCIBAR_ROM;
+	sc->psc_bar[PCI_ROM_IDX].addr = (uint64_t)rom_addr;
+	sc->psc_bar[PCI_ROM_IDX].size = rom_size;
+	munmap(rom_data, rom_size);
+	close(fd);
+
 	return (0);
 }
 
 static int
 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
 {
 	int bus, slot, func, error, memflags;
 	struct passthru_softc *sc;
 	const char *value;
 
 	sc = NULL;
 	error = 1;
 
 	memflags = vm_get_memflags(ctx);
 	if (!(memflags & VM_MEM_F_WIRED)) {
 		warnx("passthru requires guest memory to be wired");
 		return (error);
 	}
 
 	if (pcifd < 0 && pcifd_init()) {
 		return (error);
 	}
 
 #define GET_INT_CONFIG(var, name) do {					\
 	value = get_config_value_node(nvl, name);			\
 	if (value == NULL) {						\
 		EPRINTLN("passthru: missing required %s setting", name); \
 		return (error);						\
 	}								\
 	var = atoi(value);						\
 } while (0)
 
 	GET_INT_CONFIG(bus, "bus");
 	GET_INT_CONFIG(slot, "slot");
 	GET_INT_CONFIG(func, "func");
 
 	if (vm_assign_pptdev(ctx, bus, slot, func) != 0) {
 		warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
 		    bus, slot, func);
 		goto done;
 	}
 
 	sc = calloc(1, sizeof(struct passthru_softc));
 
 	pi->pi_arg = sc;
 	sc->psc_pi = pi;
 
 	/* initialize config space */
-	error = cfginit(ctx, pi, bus, slot, func);
+	error = cfginit(ctx, pi, bus, slot, func);
+	if (error != 0)
+		goto done;
+
+	/* initialize ROM */
+	value = get_config_value_node(nvl, "rom");
+	error = passthru_init_rom(ctx, sc, value);
+
+	/* error is 0 on success; done only cleans up when it is non-zero */
 done:
 	if (error) {
 		free(sc);
 		vm_unassign_pptdev(ctx, bus, slot, func);
 	}
 	return (error);
 }
 
 static int
 bar_access(int coff)
 {
-	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
+	if ((coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) ||
+	    (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4))	/* ROM BAR */
 		return (1);
 	else
 		return (0);
 }
 
 static int
 msicap_access(struct passthru_softc *sc, int coff)
 {
 	int caplen;
 
 	if (sc->psc_msi.capoff == 0)
 		return (0);
 
 	caplen = msi_caplen(sc->psc_msi.msgctrl);
 
 	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
 		return (1);
 	else
 		return (0);
 }
 
 static int 
 msixcap_access(struct passthru_softc *sc, int coff)
 {
 	if (sc->psc_msix.capoff == 0) 
 		return (0);
 
 	return (coff >= sc->psc_msix.capoff && 
 	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
 }
 
 static int
 passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 		 int coff, int bytes, uint32_t *rv)
 {
 	struct passthru_softc *sc;
 
 	sc = pi->pi_arg;
 
 	/*
 	 * PCI BARs and MSI capability is emulated.
 	 */
 	if (bar_access(coff) || msicap_access(sc, coff) ||
 	    msixcap_access(sc, coff))
 		return (-1);
 
 #ifdef LEGACY_SUPPORT
 	/*
 	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
 	 * natively.
 	 */
 	if (sc->psc_msi.emulated) {
 		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
 			return (-1);
 	}
 #endif
 
 	/*
 	 * Emulate the command register.  If a single read reads both the
 	 * command and status registers, read the status register from the
 	 * device's config space.
 	 */
 	if (coff == PCIR_COMMAND) {
 		if (bytes <= 2)
 			return (-1);
 		*rv = read_config(&sc->psc_sel, PCIR_STATUS, 2) << 16 |
 		    pci_get_cfgdata16(pi, PCIR_COMMAND);
 		return (0);
 	}
 
 	/* Everything else just read from the device's config space */
 	*rv = read_config(&sc->psc_sel, coff, bytes);
 
 	return (0);
 }
 
 static int
 passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 		  int coff, int bytes, uint32_t val)
 {
 	int error, msix_table_entries, i;
 	struct passthru_softc *sc;
 	uint16_t cmd_old;
 
 	sc = pi->pi_arg;
 
 	/*
 	 * PCI BARs are emulated
 	 */
 	if (bar_access(coff))
 		return (-1);
 
 	/*
 	 * MSI capability is emulated
 	 */
 	if (msicap_access(sc, coff)) {
 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
 		    PCIY_MSI);
 		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
 			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
 			pi->pi_msi.addr, pi->pi_msi.msg_data,
 			pi->pi_msi.maxmsgnum);
 		if (error != 0)
 			err(1, "vm_setup_pptdev_msi");
 		return (0);
 	}
 
 	if (msixcap_access(sc, coff)) {
 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
 		    PCIY_MSIX);
 		if (pi->pi_msix.enabled) {
 			msix_table_entries = pi->pi_msix.table_count;
 			for (i = 0; i < msix_table_entries; i++) {
 				error = vm_setup_pptdev_msix(ctx, vcpu,
 				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, 
 				    sc->psc_sel.pc_func, i, 
 				    pi->pi_msix.table[i].addr,
 				    pi->pi_msix.table[i].msg_data,
 				    pi->pi_msix.table[i].vector_control);
 		
 				if (error)
 					err(1, "vm_setup_pptdev_msix");
 			}
 		} else {
 			error = vm_disable_pptdev_msix(ctx, sc->psc_sel.pc_bus,
 			    sc->psc_sel.pc_dev, sc->psc_sel.pc_func);
 			if (error)
 				err(1, "vm_disable_pptdev_msix");
 		}
 		return (0);
 	}
 
 #ifdef LEGACY_SUPPORT
 	/*
 	 * If this device does not support MSI natively then we cannot let
 	 * the guest disable legacy interrupts from the device. It is the
 	 * legacy interrupt that is triggering the virtual MSI to the guest.
 	 */
 	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
 		if (coff == PCIR_COMMAND && bytes == 2)
 			val &= ~PCIM_CMD_INTxDIS;
 	}
 #endif
 
 	write_config(&sc->psc_sel, coff, bytes, val);
 	if (coff == PCIR_COMMAND) {
 		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
 		if (bytes == 1)
 			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
 		else if (bytes == 2)
 			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
 		pci_emul_cmd_changed(pi, cmd_old);
 	}
 
 	return (0);
 }
 
 static void
 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
 	       uint64_t offset, int size, uint64_t value)
 {
 	struct passthru_softc *sc;
 	struct pci_bar_ioreq pio;
 
 	sc = pi->pi_arg;
 
 	if (baridx == pci_msix_table_bar(pi)) {
 		msix_table_write(ctx, vcpu, sc, offset, size, value);
 	} else {
 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
 		assert(size == 1 || size == 2 || size == 4);
 		assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX);
 
 		bzero(&pio, sizeof(pio));
 		pio.pbi_sel = sc->psc_sel;
 		pio.pbi_op = PCIBARIO_WRITE;
 		pio.pbi_bar = baridx;
 		pio.pbi_offset = (uint32_t)offset;
 		pio.pbi_width = size;
 		pio.pbi_value = (uint32_t)value;
 
 		(void)ioctl(pcifd, PCIOCBARIO, &pio);
 	}
 }
 
 static uint64_t
 passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
 	      uint64_t offset, int size)
 {
 	struct passthru_softc *sc;
 	struct pci_bar_ioreq pio;
 	uint64_t val;
 
 	sc = pi->pi_arg;
 
 	if (baridx == pci_msix_table_bar(pi)) {
 		val = msix_table_read(sc, offset, size);
 	} else {
 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
 		assert(size == 1 || size == 2 || size == 4);
 		assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX);
 
 		bzero(&pio, sizeof(pio));
 		pio.pbi_sel = sc->psc_sel;
 		pio.pbi_op = PCIBARIO_READ;
 		pio.pbi_bar = baridx;
 		pio.pbi_offset = (uint32_t)offset;
 		pio.pbi_width = size;
 
 		(void)ioctl(pcifd, PCIOCBARIO, &pio);
 
 		val = pio.pbi_value;
 	}
 
 	return (val);
 }
 
 static void
 passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
 		   int enabled, uint64_t address)
 {
 	struct passthru_softc *sc;
 	size_t remaining;
 	uint32_t table_size, table_offset;
 
 	sc = pi->pi_arg;
 	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
 	if (table_offset > 0) {
 		if (!enabled) {
 			if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
 						 sc->psc_sel.pc_dev,
 						 sc->psc_sel.pc_func, address,
 						 table_offset) != 0)
 				warnx("pci_passthru: unmap_pptdev_mmio failed");
 		} else {
 			if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
 					       sc->psc_sel.pc_dev,
 					       sc->psc_sel.pc_func, address,
 					       table_offset,
 					       sc->psc_bar[baridx].addr) != 0)
 				warnx("pci_passthru: map_pptdev_mmio failed");
 		}
 	}
 	table_size = pi->pi_msix.table_offset - table_offset;
 	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
 	table_size = roundup2(table_size, 4096);
 	remaining = pi->pi_bar[baridx].size - table_offset - table_size;
 	if (remaining > 0) {
 		address += table_offset + table_size;
 		if (!enabled) {
 			if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
 						 sc->psc_sel.pc_dev,
 						 sc->psc_sel.pc_func, address,
 						 remaining) != 0)
 				warnx("pci_passthru: unmap_pptdev_mmio failed");
 		} else {
 			if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
 					       sc->psc_sel.pc_dev,
 					       sc->psc_sel.pc_func, address,
 					       remaining,
 					       sc->psc_bar[baridx].addr +
 					       table_offset + table_size) != 0)
 				warnx("pci_passthru: map_pptdev_mmio failed");
 		}
 	}
 }
 
 static void
 passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
 		   int enabled, uint64_t address)
 {
 	struct passthru_softc *sc;
 
 	sc = pi->pi_arg;
 	if (!enabled) {
 		if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
 					 sc->psc_sel.pc_dev,
 					 sc->psc_sel.pc_func, address,
 					 sc->psc_bar[baridx].size) != 0)
 			warnx("pci_passthru: unmap_pptdev_mmio failed");
 	} else {
 		if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
 				       sc->psc_sel.pc_dev,
 				       sc->psc_sel.pc_func, address,
 				       sc->psc_bar[baridx].size,
 				       sc->psc_bar[baridx].addr) != 0)
 			warnx("pci_passthru: map_pptdev_mmio failed");
 	}
 }
 
 static void
-passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
-	      int enabled, uint64_t address)
+passthru_addr_rom(struct pci_devinst *const pi, const int idx,
+    const int enabled)
 {
+	const uint64_t addr = pi->pi_bar[idx].addr;
+	const uint64_t size = pi->pi_bar[idx].size;
 
-	if (pi->pi_bar[baridx].type == PCIBAR_IO)
-		return;
-	if (baridx == pci_msix_table_bar(pi))
-		passthru_msix_addr(ctx, pi, baridx, enabled, address);
-	else
-		passthru_mmio_addr(ctx, pi, baridx, enabled, address);
+	if (!enabled) {
+		if (vm_munmap_memseg(pi->pi_vmctx, addr, size) != 0) {
+			errx(4, "%s: munmap_memseg @ [%016lx - %016lx] failed",
+			    __func__, addr, addr + size);
+		}
+
+	} else {
+		if (vm_mmap_memseg(pi->pi_vmctx, addr, VM_PCIROM,
+			pi->pi_romoffset, size, PROT_READ | PROT_EXEC) != 0) {
+			errx(4, "%s: mmap_memseg @ [%016lx - %016lx] failed",
+			    __func__, addr, addr + size);
+		}
+	}
+}
+
+static void
+passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
+    int enabled, uint64_t address)
+{
+	switch (pi->pi_bar[baridx].type) {
+	case PCIBAR_IO:
+		/* IO BARs are emulated */
+		break;
+	case PCIBAR_ROM:
+		passthru_addr_rom(pi, baridx, enabled);
+		break;
+	case PCIBAR_MEM32:
+	case PCIBAR_MEM64:
+		if (baridx == pci_msix_table_bar(pi))
+			passthru_msix_addr(ctx, pi, baridx, enabled, address);
+		else
+			passthru_mmio_addr(ctx, pi, baridx, enabled, address);
+		break;
+	default:
+		errx(4, "%s: invalid BAR type %d", __func__,
+		    pi->pi_bar[baridx].type);
+	}
 }
 
 struct pci_devemu passthru = {
 	.pe_emu		= "passthru",
 	.pe_init	= passthru_init,
 	.pe_legacy_config = passthru_legacy_config,
 	.pe_cfgwrite	= passthru_cfgwrite,
 	.pe_cfgread	= passthru_cfgread,
 	.pe_barwrite 	= passthru_write,
 	.pe_barread    	= passthru_read,
 	.pe_baraddr	= passthru_addr,
 };
 PCI_EMUL_SET(passthru);